diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index f495e0845d..354b06a54a 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -109,7 +109,7 @@ Each command will have detailed information via `amd-smi [command] --help` For convenience, here is the help output for each command -``` bash +```bash ~$ amd-smi list --help usage: amd-smi list [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] @@ -123,10 +123,10 @@ GPU with some basic information for each VF. options: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -148,9 +148,9 @@ Command Modifiers: ```bash ~$ amd-smi static --help -usage: amd-smi static [-h] [-g GPU [GPU ...]] [-a] [-b] [-V] [-d] [-v] [-c] [-B] [-r] [-p] - [-l] [-P] [-x] [-s] [-u] [--json | --csv] [--file FILE] - [--loglevel LEVEL] +usage: amd-smi static [-h] [-g GPU [GPU ...] | -U CPU [CPU ...]] [-a] [-b] [-V] [-d] [-v] + [-c] [-B] [-R] [-r] [-p] [-l] [-P] [-x] [-u] [-s] [-i] + [--json | --csv] [--file FILE] [--loglevel LEVEL] If no GPU is specified, returns static information for all GPUs on the system. If no static argument is provided, all static information will be displayed. @@ -158,10 +158,10 @@ If no static argument is provided, all static information will be displayed. Static Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -176,10 +176,12 @@ Static Arguments: -v, --vram All vram information -c, --cache All cache information -B, --board All board information + -R, --process-isolation The process isolation status -r, --ras Displays RAS features information -p, --partition Partition information -l, --limit All limit metric values (i.e. power and thermal limits) - -s, --process-isolation The process isolation status + -P, --policy The available DPM policy + -x, --xgmi-plpd The available XGMI per-link power down policy -u, --numa All numa node information CPU Arguments: @@ -194,7 +196,7 @@ Command Modifiers: DEBUG, INFO, WARNING, ERROR, CRITICAL ``` -``` bash +```bash ~$ amd-smi firmware --help usage: amd-smi firmware [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-f] @@ -204,10 +206,10 @@ If no GPU is specified, return firmware information for all GPUs on the system. Firmware Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -239,10 +241,10 @@ If no GPU is specified, return bad page information for all GPUs on the system. Bad Pages Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -287,12 +289,12 @@ If no metric argument is provided all metric information will be displayed. Metric arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices - -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: + -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 ID: 1 ID: 2 @@ -368,10 +370,10 @@ If no process argument is provided all process information will be displayed. Process arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -409,10 +411,10 @@ If no GPU is specified, returns event information for all GPUs on the system. Event Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -444,10 +446,10 @@ If no topology argument is provided all topology information will be displayed. Topology arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -473,15 +475,16 @@ Command Modifiers: ``` ```bash +~$ amd-smi set --help usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION] - [-o WATTS] [-p POLICY] [-i STATUS] [--cpu-pwr-limit PWR_LIMIT] - [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] + [-o WATTS] [-p POLICY_ID] [-x POLICY_ID] [-R STATUS] + [--cpu-pwr-limit PWR_LIMIT] [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] [--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE] [--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE] [--cpu-df-pstate-range MAX_PSTATE MIN_PSTATE] [--cpu-enable-apb] [--cpu-disable-apb DF_PSTATE] [--soc-boost-limit BOOST_LIMIT] - [--core-boost-limit BOOST_LIMIT] [-c] [--json | --csv] [--file FILE] + [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE] [--loglevel LEVEL] A GPU must be specified to set a configuration. @@ -490,10 +493,10 @@ A set argument must be provided; Multiple set arguments are accepted Set Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -515,8 +518,7 @@ Set Arguments: -o, --power-cap WATTS Set power capacity limit -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id -x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id - -i, --process-isolation STATUS Enable or disable the GPU process isolation: 0 for disable and 1 for enable. - -c, --clear-sram-data Clear the GPU SRAM data + -R, --process-isolation STATUS Enable or disable the GPU process isolation: 0 for disable and 1 for enable. CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. @@ -546,7 +548,7 @@ Command Modifiers: ~$ amd-smi reset --help usage: amd-smi reset [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-G] [-c] - [-f] [-p] [-x] [-d] [-C] [-M] [-o] + [-f] [-p] [-x] [-d] [-C] [-M] [-o] [-l] A GPU must be specified to reset a configuration. A reset argument must be provided; Multiple reset arguments are accepted @@ -554,10 +556,10 @@ A reset argument must be provided; Multiple reset arguments are accepted Reset Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 71ff74a0-0000-1000-8066-0a3c71d5f817 - ID: 1 | BDF: 0001:01:00.0 | UUID: b4ff74a0-0000-1000-80b2-fa0be8628b1a - ID: 2 | BDF: 0002:01:00.0 | UUID: a9ff74a0-0000-1000-8007-3066a98ba4a6 - ID: 3 | BDF: 0003:01:00.0 | UUID: 53ff74a0-0000-1000-80a0-a1ff3830f499 + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 all | Selects all devices -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: ID: 0 @@ -577,6 +579,92 @@ Reset Arguments: -C, --compute-partition Reset compute partitions on the specified GPU -M, --memory-partition Reset memory partitions on the specified GPU -o, --power-cap Reset power capacity limit to max capable + -l, --clear-sram-data Clear the GPU SRAM data + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). + --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` + +```bash +~$ amd-smi monitor --help +usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] + [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n] + [-d] [-s] [-e] [-v] [-r] + +Monitor a target device for the specified arguments. +If no arguments are provided, all arguments will be enabled. +Use the watch arguments to run continuously + +Monitor Arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 + all | Selects all devices + -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: + ID: 0 + ID: 1 + ID: 2 + ID: 3 + all | Selects all devices + -O, --core CORE [CORE ...] Select a Core ID from the possible choices: + ID: 0 - 95 + all | Selects all devices + -w, --watch INTERVAL Reprint the command in a loop of INTERVAL seconds + -W, --watch_time TIME The total TIME to watch the given command + -i, --iterations ITERATIONS Total number of ITERATIONS to loop on the given command + -p, --power-usage Monitor power usage in Watts + -t, --temperature Monitor temperature in Celsius + -u, --gfx Monitor graphics utilization (%) and clock (MHz) + -m, --mem Monitor memory utilization (%) and clock (MHz) + -n, --encoder Monitor encoder utilization (%) and clock (MHz) + -d, --decoder Monitor decoder utilization (%) and clock (MHz) + -s, --throttle-status Monitor thermal throttle status + -e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts + -v, --vram-usage Monitor memory usage in MB + -r, --pcie Monitor PCIe bandwidth in Mb/s + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). + --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` + +```bash +~$ amd-smi xgmi --help +usage: amd-smi xgmi [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-m] + +If no GPU is specified, returns information for all GPUs on the system. +If no xgmi argument is provided all xgmi information will be displayed. + +XGMI arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:01:00.0 | UUID: 7eff74a0-0000-1000-808f-7e20764e2714 + ID: 1 | BDF: 0001:01:00.0 | UUID: b6ff74a0-0000-1000-80ae-7c8cefe1f084 + ID: 2 | BDF: 0002:01:00.0 | UUID: 36ff74a0-0000-1000-8071-25d815189854 + ID: 3 | BDF: 0003:01:00.0 | UUID: f4ff74a0-0000-1000-80c4-4c2be5e66537 + all | Selects all devices + -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: + ID: 0 + ID: 1 + ID: 2 + ID: 3 + all | Selects all devices + -O, --core CORE [CORE ...] Select a Core ID from the possible choices: + ID: 0 - 95 + all | Selects all devices + -m, --metric Metric XGMI information Command Modifiers: --json Displays output in JSON format (human readable by default). @@ -593,28 +681,29 @@ Here is some example output from the tool: ```bash ~$ amd-smi static CPU: 0 - SMU: - FW_VERSION: 85:81:0 - INTERFACE_VERSION: - PROTO VERSION: 6 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 CPU: 1 - SMU: - FW_VERSION: 85:81:0 - INTERFACE_VERSION: - PROTO VERSION: 6 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 CPU: 2 - SMU: - FW_VERSION: 85:81:0 - INTERFACE_VERSION: - PROTO VERSION: 6 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 CPU: 3 - SMU: - FW_VERSION: 85:81:0 - INTERFACE_VERSION: - PROTO VERSION: 6 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 + GPU: 0 ASIC: @@ -623,8 +712,8 @@ GPU: 0 VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] SUBVENDOR_ID: 0x1002 DEVICE_ID: 0x74a0 - REV_ID: 0x0 - ASIC_SERIAL: 0x71660A3C71D5F817 + REV_ID: 0x00 + ASIC_SERIAL: 0x7E8F7E20764E2714 OAM_ID: 0 BUS: BDF: 0000:01:00.0 @@ -636,25 +725,26 @@ GPU: 0 NAME: N/A BUILD_DATE: N/A PART_NUMBER: N/A - VERSION: N/A + VERSION: N/A LIMIT: MAX_POWER: 550 W + MIN_POWER: 0 W SOCKET_POWER: 550 W SLOWDOWN_EDGE_TEMPERATURE: N/A SLOWDOWN_HOTSPOT_TEMPERATURE: 100 °C - SLOWDOWN_VRAM_TEMPERATURE: 95 °C + SLOWDOWN_VRAM_TEMPERATURE: 105 °C SHUTDOWN_EDGE_TEMPERATURE: N/A SHUTDOWN_HOTSPOT_TEMPERATURE: 110 °C - SHUTDOWN_VRAM_TEMPERATURE: 105 °C + SHUTDOWN_VRAM_TEMPERATURE: 115 °C DRIVER: NAME: amdgpu - VERSION: 6.7.0 + VERSION: 6.9.0-rc5+ BOARD: MODEL_NUMBER: N/A PRODUCT_SERIAL: N/A FRU_ID: N/A - PRODUCT_NAME: N/A - MANUFACTURER_NAME: N/A + PRODUCT_NAME: Aqua Vanjaram [Instinct MI300A] + MANUFACTURER_NAME: Advanced Micro Devices, Inc. [AMD/ATI] RAS: EEPROM_VERSION: 0x0 PARITY_SCHEMA: DISABLED @@ -676,63 +766,49 @@ GPU: 0 MP0: DISABLED MP1: DISABLED FUSE: DISABLED + MCA: DISABLED + VCN: DISABLED + JPEG: DISABLED + IH: DISABLED + MPIO: DISABLED PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - DPM_POLICY: - NUM_SUPPORTED: 4 - CURRENT_ID: 1 - POLICIES: - POLICY_ID: 0 - POLICY_DESCRIPTION: pstate_default - POLICY_ID: 1 - POLICY_DESCRIPTION: soc_pstate_0 - POLICY_ID: 2 - POLICY_DESCRIPTION: soc_pstate_1 - POLICY_ID: 3 - POLICY_DESCRIPTION: soc_pstate_2 - XGMI_PLPD: - NUM_SUPPORTED: 3 - CURRENT_ID: 1 - PLPDS: - POLICY_ID: 0 - POLICY_DESCRIPTION: plpd_disallow - POLICY_ID: 1 - POLICY_DESCRIPTION: plpd_default - POLICY_ID: 2 - POLICY_DESCRIPTION: plpd_optimized + DPM_POLICY: N/A + XGMI_PLPD: N/A + PROCESS_ISOLATION: N/A NUMA: NODE: 0 AFFINITY: 0 VRAM: TYPE: HBM VENDOR: N/A - SIZE: 96432 MB - CACHE_INFO: - CACHE_0: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 464 - CACHE_1: - CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE - CACHE_SIZE: 64 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 160 - CACHE_2: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32768 KB - CACHE_LEVEL: 2 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 - CACHE_3: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 262144 KB - CACHE_LEVEL: 3 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 + SIZE: 64289 MB + CACHE_INFO: + CACHE_0: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 32 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 348 + CACHE_1: + CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE + CACHE_SIZE: 64 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 120 + CACHE_2: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 4096 KB + CACHE_LEVEL: 2 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 + CACHE_3: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 262144 KB + CACHE_LEVEL: 3 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 GPU: 1 ASIC: @@ -741,8 +817,8 @@ GPU: 1 VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] SUBVENDOR_ID: 0x1002 DEVICE_ID: 0x74a0 - REV_ID: 0x0 - ASIC_SERIAL: 0xB4B2FA0BE8628B1A + REV_ID: 0x00 + ASIC_SERIAL: 0xB6AE7C8CEFE1F084 OAM_ID: 1 BUS: BDF: 0001:01:00.0 @@ -754,25 +830,26 @@ GPU: 1 NAME: N/A BUILD_DATE: N/A PART_NUMBER: N/A - VERSION: N/A + VERSION: N/A LIMIT: MAX_POWER: 550 W + MIN_POWER: 0 W SOCKET_POWER: 550 W SLOWDOWN_EDGE_TEMPERATURE: N/A SLOWDOWN_HOTSPOT_TEMPERATURE: 100 °C - SLOWDOWN_VRAM_TEMPERATURE: 95 °C + SLOWDOWN_VRAM_TEMPERATURE: 105 °C SHUTDOWN_EDGE_TEMPERATURE: N/A SHUTDOWN_HOTSPOT_TEMPERATURE: 110 °C - SHUTDOWN_VRAM_TEMPERATURE: 105 °C + SHUTDOWN_VRAM_TEMPERATURE: 115 °C DRIVER: NAME: amdgpu - VERSION: 6.7.0 + VERSION: 6.9.0-rc5+ BOARD: MODEL_NUMBER: N/A PRODUCT_SERIAL: N/A FRU_ID: N/A - PRODUCT_NAME: N/A - MANUFACTURER_NAME: N/A + PRODUCT_NAME: Aqua Vanjaram [Instinct MI300A] + MANUFACTURER_NAME: Advanced Micro Devices, Inc. [AMD/ATI] RAS: EEPROM_VERSION: 0x0 PARITY_SCHEMA: DISABLED @@ -794,63 +871,49 @@ GPU: 1 MP0: DISABLED MP1: DISABLED FUSE: DISABLED + MCA: DISABLED + VCN: DISABLED + JPEG: DISABLED + IH: DISABLED + MPIO: DISABLED PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - DPM_POLICY: - NUM_SUPPORTED: 4 - CURRENT_ID: 1 - POLICIES: - POLICY_ID: 0 - POLICY_DESCRIPTION: pstate_default - POLICY_ID: 1 - POLICY_DESCRIPTION: soc_pstate_0 - POLICY_ID: 2 - POLICY_DESCRIPTION: soc_pstate_1 - POLICY_ID: 3 - POLICY_DESCRIPTION: soc_pstate_2 - XGMI_PLPD: - NUM_SUPPORTED: 3 - CURRENT_ID: 1 - PLPDS: - POLICY_ID: 0 - POLICY_DESCRIPTION: plpd_disallow - POLICY_ID: 1 - POLICY_DESCRIPTION: plpd_default - POLICY_ID: 2 - POLICY_DESCRIPTION: plpd_optimized + DPM_POLICY: N/A + XGMI_PLPD: N/A + PROCESS_ISOLATION: N/A NUMA: NODE: 1 AFFINITY: 1 VRAM: TYPE: HBM VENDOR: N/A - SIZE: 96432 MB - CACHE_INFO: - CACHE_0: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 464 - CACHE_1: - CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE - CACHE_SIZE: 64 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 160 - CACHE_2: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32768 KB - CACHE_LEVEL: 2 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 - CACHE_3: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 262144 KB - CACHE_LEVEL: 3 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 + SIZE: 64289 MB + CACHE_INFO: + CACHE_0: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 32 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 348 + CACHE_1: + CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE + CACHE_SIZE: 64 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 120 + CACHE_2: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 4096 KB + CACHE_LEVEL: 2 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 + CACHE_3: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 262144 KB + CACHE_LEVEL: 3 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 GPU: 2 ASIC: @@ -859,8 +922,8 @@ GPU: 2 VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] SUBVENDOR_ID: 0x1002 DEVICE_ID: 0x74a0 - REV_ID: 0x0 - ASIC_SERIAL: 0xA9073066A98BA4A6 + REV_ID: 0x00 + ASIC_SERIAL: 0x367125D815189854 OAM_ID: 2 BUS: BDF: 0002:01:00.0 @@ -872,25 +935,26 @@ GPU: 2 NAME: N/A BUILD_DATE: N/A PART_NUMBER: N/A - VERSION: N/A + VERSION: N/A LIMIT: MAX_POWER: 550 W + MIN_POWER: 0 W SOCKET_POWER: 550 W SLOWDOWN_EDGE_TEMPERATURE: N/A SLOWDOWN_HOTSPOT_TEMPERATURE: 100 °C - SLOWDOWN_VRAM_TEMPERATURE: 95 °C + SLOWDOWN_VRAM_TEMPERATURE: 105 °C SHUTDOWN_EDGE_TEMPERATURE: N/A SHUTDOWN_HOTSPOT_TEMPERATURE: 110 °C - SHUTDOWN_VRAM_TEMPERATURE: 105 °C + SHUTDOWN_VRAM_TEMPERATURE: 115 °C DRIVER: NAME: amdgpu - VERSION: 6.7.0 + VERSION: 6.9.0-rc5+ BOARD: MODEL_NUMBER: N/A PRODUCT_SERIAL: N/A FRU_ID: N/A - PRODUCT_NAME: N/A - MANUFACTURER_NAME: N/A + PRODUCT_NAME: Aqua Vanjaram [Instinct MI300A] + MANUFACTURER_NAME: Advanced Micro Devices, Inc. [AMD/ATI] RAS: EEPROM_VERSION: 0x0 PARITY_SCHEMA: DISABLED @@ -912,63 +976,49 @@ GPU: 2 MP0: DISABLED MP1: DISABLED FUSE: DISABLED + MCA: DISABLED + VCN: DISABLED + JPEG: DISABLED + IH: DISABLED + MPIO: DISABLED PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - DPM_POLICY: - NUM_SUPPORTED: 4 - CURRENT_ID: 1 - POLICIES: - POLICY_ID: 0 - POLICY_DESCRIPTION: pstate_default - POLICY_ID: 1 - POLICY_DESCRIPTION: soc_pstate_0 - POLICY_ID: 2 - POLICY_DESCRIPTION: soc_pstate_1 - POLICY_ID: 3 - POLICY_DESCRIPTION: soc_pstate_2 - XGMI_PLPD: - NUM_SUPPORTED: 3 - CURRENT_ID: 1 - PLPDS: - POLICY_ID: 0 - POLICY_DESCRIPTION: plpd_disallow - POLICY_ID: 1 - POLICY_DESCRIPTION: plpd_default - POLICY_ID: 2 - POLICY_DESCRIPTION: plpd_optimized + DPM_POLICY: N/A + XGMI_PLPD: N/A + PROCESS_ISOLATION: N/A NUMA: NODE: 2 AFFINITY: 2 VRAM: TYPE: HBM VENDOR: N/A - SIZE: 96432 MB - CACHE_INFO: - CACHE_0: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 464 - CACHE_1: - CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE - CACHE_SIZE: 64 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 160 - CACHE_2: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32768 KB - CACHE_LEVEL: 2 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 - CACHE_3: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 262144 KB - CACHE_LEVEL: 3 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 + SIZE: 64289 MB + CACHE_INFO: + CACHE_0: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 32 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 348 + CACHE_1: + CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE + CACHE_SIZE: 64 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 120 + CACHE_2: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 4096 KB + CACHE_LEVEL: 2 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 + CACHE_3: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 262144 KB + CACHE_LEVEL: 3 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 GPU: 3 ASIC: @@ -977,8 +1027,8 @@ GPU: 3 VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] SUBVENDOR_ID: 0x1002 DEVICE_ID: 0x74a0 - REV_ID: 0x0 - ASIC_SERIAL: 0x53A0A1FF3830F499 + REV_ID: 0x00 + ASIC_SERIAL: 0xF4C44C2BE5E66537 OAM_ID: 3 BUS: BDF: 0003:01:00.0 @@ -990,25 +1040,26 @@ GPU: 3 NAME: N/A BUILD_DATE: N/A PART_NUMBER: N/A - VERSION: N/A + VERSION: N/A LIMIT: MAX_POWER: 550 W + MIN_POWER: 0 W SOCKET_POWER: 550 W SLOWDOWN_EDGE_TEMPERATURE: N/A SLOWDOWN_HOTSPOT_TEMPERATURE: 100 °C - SLOWDOWN_VRAM_TEMPERATURE: 95 °C + SLOWDOWN_VRAM_TEMPERATURE: 105 °C SHUTDOWN_EDGE_TEMPERATURE: N/A SHUTDOWN_HOTSPOT_TEMPERATURE: 110 °C - SHUTDOWN_VRAM_TEMPERATURE: 105 °C + SHUTDOWN_VRAM_TEMPERATURE: 115 °C DRIVER: NAME: amdgpu - VERSION: 6.7.0 + VERSION: 6.9.0-rc5+ BOARD: MODEL_NUMBER: N/A PRODUCT_SERIAL: N/A FRU_ID: N/A - PRODUCT_NAME: N/A - MANUFACTURER_NAME: N/A + PRODUCT_NAME: Aqua Vanjaram [Instinct MI300A] + MANUFACTURER_NAME: Advanced Micro Devices, Inc. [AMD/ATI] RAS: EEPROM_VERSION: 0x0 PARITY_SCHEMA: DISABLED @@ -1030,63 +1081,49 @@ GPU: 3 MP0: DISABLED MP1: DISABLED FUSE: DISABLED + MCA: DISABLED + VCN: DISABLED + JPEG: DISABLED + IH: DISABLED + MPIO: DISABLED PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - DPM_POLICY: - NUM_SUPPORTED: 4 - CURRENT_ID: 1 - POLICIES: - POLICY_ID: 0 - POLICY_DESCRIPTION: pstate_default - POLICY_ID: 1 - POLICY_DESCRIPTION: soc_pstate_0 - POLICY_ID: 2 - POLICY_DESCRIPTION: soc_pstate_1 - POLICY_ID: 3 - POLICY_DESCRIPTION: soc_pstate_2 - XGMI_PLPD: - NUM_SUPPORTED: 3 - CURRENT_ID: 1 - PLPDS: - POLICY_ID: 0 - POLICY_DESCRIPTION: plpd_disallow - POLICY_ID: 1 - POLICY_DESCRIPTION: plpd_default - POLICY_ID: 2 - POLICY_DESCRIPTION: plpd_optimized + DPM_POLICY: N/A + XGMI_PLPD: N/A + PROCESS_ISOLATION: N/A NUMA: NODE: 3 AFFINITY: 3 VRAM: TYPE: HBM VENDOR: N/A - SIZE: 96432 MB - CACHE_INFO: - CACHE_0: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 464 - CACHE_1: - CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE - CACHE_SIZE: 64 KB - CACHE_LEVEL: 1 - MAX_NUM_CU_SHARED: 2 - NUM_CACHE_INSTANCE: 160 - CACHE_2: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 32768 KB - CACHE_LEVEL: 2 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 - CACHE_3: - CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE - CACHE_SIZE: 262144 KB - CACHE_LEVEL: 3 - MAX_NUM_CU_SHARED: 304 - NUM_CACHE_INSTANCE: 1 + SIZE: 64289 MB + CACHE_INFO: + CACHE_0: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 32 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 348 + CACHE_1: + CACHE_PROPERTIES: INST_CACHE, SIMD_CACHE + CACHE_SIZE: 64 KB + CACHE_LEVEL: 1 + MAX_NUM_CU_SHARED: 2 + NUM_CACHE_INSTANCE: 120 + CACHE_2: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 4096 KB + CACHE_LEVEL: 2 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 + CACHE_3: + CACHE_PROPERTIES: DATA_CACHE, SIMD_CACHE + CACHE_SIZE: 262144 KB + CACHE_LEVEL: 3 + MAX_NUM_CU_SHARED: 228 + NUM_CACHE_INSTANCE: 1 ``` diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 23b39cb408..653e5c42b8 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -291,10 +291,14 @@ class AMDSMICommands(): args.vram = vram if cache: args.cache = cache + if process_isolation: + args.process_isolation = process_isolation # Store args that are applicable to the current platform - current_platform_args = ["asic", "bus", "vbios", "driver", "vram", "cache", "board"] - current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.vram, args.cache, args.board] + current_platform_args = ["asic", "bus", "vbios", "driver", "vram", "cache", + "board", "process_isolation"] + current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.vram, args.cache, + args.board, args.process_isolation] if self.helpers.is_linux() and self.helpers.is_baremetal(): if ras: @@ -307,10 +311,9 @@ class AMDSMICommands(): args.policy = policy if xgmi_plpd: args.xgmi_plpd = xgmi_plpd - if process_isolation: - args.process_isolation = process_isolation - current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd", "process_isolation"] - current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd, args.process_isolation] + + current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -3385,7 +3388,7 @@ class AMDSMICommands(): def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None, - process_isolation=None, clear_sram_data = None): + process_isolation=None): """Issue reset commands to target gpu(s) Args: @@ -3402,7 +3405,6 @@ class AMDSMICommands(): dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. - clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None. Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3433,8 +3435,7 @@ class AMDSMICommands(): args.xgmi_plpd = xgmi_plpd if process_isolation: args.process_isolation = process_isolation - if clear_sram_data: - args.clear_sram_data = clear_sram_data + # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3456,8 +3457,7 @@ class AMDSMICommands(): args.power_cap is not None, args.dpm_policy is not None, args.xgmi_plpd is not None, - args.process_isolation is not None, - args.clear_sram_data]): + args.process_isolation is not None]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3583,16 +3583,6 @@ class AMDSMICommands(): raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e self.logger.store_output(args.gpu, 'process_isolation', result) - if args.clear_sram_data: - try: - # Only 1 can be used for now. - amdsmi_interface.amdsmi_set_gpu_clear_sram_data(args.gpu, 1) - result = 'Successfully clear GPU SRAM data' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to clear SRAM data on GPU {gpu_id}") from e - self.logger.store_output(args.gpu, 'clear_sram_data', result) if multiple_devices: self.logger.store_multiple_device_output() @@ -3608,7 +3598,7 @@ class AMDSMICommands(): cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None, - process_isolation=None, clear_sram_data=None): + process_isolation=None): """Issue reset commands to target gpu(s) Args: @@ -3640,7 +3630,6 @@ class AMDSMICommands(): dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. process_isolation (int, optional): Value override for args.process_isolation. Defaults to None. - clear_sram_data (int, optional): Value override for args.clear_sram_data. Defaults to None. Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3660,8 +3649,7 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd", "process_isolation", - "clear_sram_data"] + "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd", "process_isolation"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3717,7 +3705,7 @@ class AMDSMICommands(): self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, memory_partition, power_cap, dpm_policy, xgmi_plpd, - process_isolation, clear_sram_data) + process_isolation) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3737,12 +3725,12 @@ class AMDSMICommands(): self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, memory_partition, power_cap, dpm_policy, xgmi_plpd, - process_isolation, clear_sram_data) + process_isolation) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None, - compute_partition=None, memory_partition=None, power_cap=None): + compute_partition=None, memory_partition=None, power_cap=None, clear_sram_data=None): """Issue reset commands to target gpu(s) Args: @@ -3757,7 +3745,9 @@ class AMDSMICommands(): perf_determinism (bool, optional): Value override for args.perf_determinism. Defaults to None. compute_partition (bool, optional): Value override for args.compute_partition. Defaults to None. memory_partition (bool, optional): Value override for args.memory_partition. Defaults to None. - power_cap (int, optional): Value override for args.power_cap. Defaults to None. + power_cap (bool, optional): Value override for args.power_cap. Defaults to None. + clear_sram_data (bool, optional): Value override for args.clear_sram_data. Defaults to None. + Raises: ValueError: Value error if no gpu value is provided IndexError: Index error if gpu list is empty @@ -3786,6 +3776,8 @@ class AMDSMICommands(): args.memory_partition = memory_partition if power_cap: args.power_cap = power_cap + if clear_sram_data: + args.clear_sram_data = clear_sram_data # Handle No GPU passed if args.gpu == None: @@ -3804,7 +3796,7 @@ class AMDSMICommands(): # Error if no subcommand args are passed if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \ args.perf_determinism, args.compute_partition, args.memory_partition, \ - args.power_cap]): + args.power_cap, args.clear_sram_data]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3955,6 +3947,16 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to reset power cap to {default_power_cap_in_w} on GPU {gpu_id}") from e self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap_in_w}") + if args.clear_sram_data: + try: + # Only 1 can be used for now. + amdsmi_interface.amdsmi_set_gpu_clear_sram_data(args.gpu, 1) + result = 'Successfully clear GPU SRAM data' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to clear SRAM data on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'clear_sram_data', result) if multiple_devices: self.logger.store_multiple_device_output() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index c4bb1964ea..2ea1038700 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -579,6 +579,7 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-v', '--vram', action='store_true', required=False, help=vram_help) static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help) static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) + static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) # Options to display on Hypervisors and Baremetal if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): @@ -587,7 +588,6 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) - static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -985,7 +985,6 @@ class AMDSMIParser(argparse.ArgumentParser): set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)" set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value" - run_gpu_clear_sram_data_help = f"Clear the GPU SRAM data\n" # Help text for CPU Core set options set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value" @@ -1013,7 +1012,6 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS') - set_value_parser.add_argument('-c', '--clear-sram-data', action='store_true', required=False, help=run_gpu_clear_sram_data_help) if self.helpers.is_amd_hsmp_initialized(): if self.helpers.is_baremetal(): @@ -1039,8 +1037,8 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_reset_parser(self, subparsers, func): - if not(self.helpers.is_baremetal() and self.helpers.is_linux()): - # This subparser is only applicable to Baremetal Linux + if not self.helpers.is_linux(): + # This subparser is only applicable to Linux return if not self.helpers.is_amdgpu_initialized(): @@ -1063,6 +1061,7 @@ class AMDSMIParser(argparse.ArgumentParser): reset_compute_help = "Reset compute partitions on the specified GPU" reset_memory_help = "Reset memory partitions on the specified GPU" reset_power_cap_help = "Reset power capacity limit to max capable" + reset_gpu_clear_sram_data_help = "Clear the GPU SRAM data\n" # Create reset subparser reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help) @@ -1075,16 +1074,20 @@ class AMDSMIParser(argparse.ArgumentParser): # Device args are required as safeguard from the user applying the operation to all gpus unintentionally self._add_device_arguments(reset_parser, required=True) - # Add reset arguments - reset_parser.add_argument('-G', '--gpureset', action='store_true', required=False, help=gpureset_help) - reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=reset_clocks_help) - reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=reset_fans_help) - reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=reset_profile_help) - reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=reset_xgmierr_help) - reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help) - reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help) - reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help) - reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help) + if self.helpers.is_baremetal(): + # Add Baremetal reset arguments + reset_parser.add_argument('-G', '--gpureset', action='store_true', required=False, help=gpureset_help) + reset_parser.add_argument('-c', '--clocks', action='store_true', required=False, help=reset_clocks_help) + reset_parser.add_argument('-f', '--fans', action='store_true', required=False, help=reset_fans_help) + reset_parser.add_argument('-p', '--profile', action='store_true', required=False, help=reset_profile_help) + reset_parser.add_argument('-x', '--xgmierr', action='store_true', required=False, help=reset_xgmierr_help) + reset_parser.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help) + reset_parser.add_argument('-C', '--compute-partition', action='store_true', required=False, help=reset_compute_help) + reset_parser.add_argument('-M', '--memory-partition', action='store_true', required=False, help=reset_memory_help) + reset_parser.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help) + + # Add Baremetal and Virtual OS reset arguments + reset_parser.add_argument('-l', '--clear-sram-data', action='store_true', required=False, help=reset_gpu_clear_sram_data_help) def _add_monitor_parser(self, subparsers, func):