SWDEV-335697- Add support for dynamic partitioning

Original updates:
    * Added .gitignore to help with future commits
    * Updated/added copyrights on modified or added files
    * Updated rocm_smi.h/.cc
      - Added 2 new SMI API functions:
          rsmi_dev_compute_partition_set &
          rsmi_dev_compute_partition_get
      - Added helpful maps/enums used in
        new get/set compute_partition API calls
    * Updated rocm_smi.py
      - Added --showcomputepartition
      - Added --setcomputepartition
      - Fixed a few mistypes
    * Updated rsmiBindings.py - added helpful class/dict/list
    * Updated rocm_smi_example.cc
      - Added helpful MACRO to detect if api is not supported.
      - Added current_compute_partition set/get rocm lib calls
      - Added helpful macro to discover future RSMI errors
      - Commented out test_set_freq, was having permission issues
        on a Navi21
    * Updated rocm_smi_main.cc
      - Added helpful map to debug API calls, left in for future use
      - Added comment to better understand a non-class function returns
    * Added computepartition_read_write.cc/.h
      - Added get/set compute partition API test calls
      - Confirmed on devices that do not support the API calls, tests pass
    * Updated rocm_smi_test/main.cc
      - Calls new compute partition gtests

Added following updates from review feedback:
   * Updated rocm_smi.h/cc
       - Removed C++ API calls, adding support for both C/C++
         API calls could cause confusion and adds extra work for us
       - rsmi_dev_compute_partition_get -> Fixed an edge case where
         user gives a small buffer length size (smaller than data
         received), but does not receive the partial buffer back.
         google Tests are updated to reflect this find.
   * Updated rocm_smi_example.cc
       - Fixed test_set_freq, issue was that file was not writable.
         We now indicate this warning, so prior errors make sense.
       - General test code cleanup. Removed extra code,
         by creating loops for tests.
   * Updated rocm_smi_main.cc
     - Moved and got rid of an external reference to a map used
       for debugging RSMI enums, now is a const public reference.
   * Updated rocm_smi.py
     - Updated python code to identify NOT_SUPPORTED, because
       (currently) only a few GPUs support the feature

Change-Id: I4a567acbb59d6771fb64df08d19175fe3604fd1b


[ROCm/rocm_smi_lib commit: 4d7f3f2bc7]
Tento commit je obsažen v:
Charis Poag
2023-01-06 11:01:18 -06:00
rodič 621a2c76da
revize 1b8d3f507a
14 změnil soubory, kde provedl 1111 přidání a 54 odebrání
+124
Zobrazit soubor
@@ -0,0 +1,124 @@
#
# NOTE! Don't add files that are generated in specific
# subdirectories here. Add them in the ".gitignore" file
# in that subdirectory instead.
#
# NOTE! Please use 'git ls-files -i --exclude-standard'
# command after changing this file, to see if there are
# any tracked files which get ignored after the change.
#
# Normal rules
#
.*
*.o
*.o.*
*.a
*.s
*.ko
*.so
*.so.dbg
*.mod.c
*.i
*.lst
*.symtypes
*.order
modules.builtin
*.elf
*.bin
*.gz
*.bz2
*.lzma
*.xz
*.lzo
#*.patch
*.gcno
*.pyc
*current_compute_partition
#
# Top-level generic files/folders
#
/[Bb][Ui][Ll][Dd]
*/[Bb][Ui][Ll][Dd]
/build
*/build
/[Gg][Tt][Ee][Ss][Tt][Ss]
*/[Gg][Tt][Ee][Ss][Tt][Ss]
/tags
/TAGS
/linux
/vmlinux
/vmlinuz
/System.map
/Module.markers
Module.symvers
#
# Debian directory (make deb-pkg)
#
/debian/
#
# git files that we don't want to ignore even if they are dot-files
#
!.gitignore
!.mailmap
### VisualStudioCode ###
!.vscode/settings.json
#
# Generated include files
#
include/config
include/linux/version.h
include/generated
arch/*/include/generated
# git generated dirs
patches-*
# quilt's files
patches
series
# cscope files
cscope.*
ncscope.*
# gnu global files
GPATH
GRTAGS
GSYMS
GTAGS
*.orig
*~
\#*#
#
# Leavings from module signing
#
extra_certificates
signing_key.priv
signing_key.x509
x509.genkey
#cmake files
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
#
# ROCm files
# Removes generated config headers like rocmsmi64Config.h & oamConfig.h
#
*Config.h
+85 -1
Zobrazit soubor
@@ -3,7 +3,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -352,6 +352,26 @@ typedef enum {
typedef rsmi_clk_type_t rsmi_clk_type;
/// \endcond
/**
* @brief Compute Partition types
*
* Values accepted by rsmi_dev_compute_partition_set() to select how a
* device's XCCs are grouped. ::RSMI_COMPUTE_PARTITION_INVALID is not a
* selectable partition.
*/
typedef enum {
RSMI_COMPUTE_PARTITION_INVALID = 0, //!< Not a valid compute partition
RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with
//!< shared memory
RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs
//!< work together with shared memory
RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs
//!< work together with shared memory
} rsmi_compute_partition_type_t;
/// \cond Ignore in docs.
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type;
/// \endcond
/**
* @brief Temperature Metrics. This enum is used to identify various
* temperature metrics. Corresponding values will be in millidegress
@@ -3470,6 +3490,70 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
/** @} */ // end of HWTopo
/*****************************************************************************/
/** @defgroup ComputePartition Compute Partition Functions
* These functions are used to configure and query the device's
* compute partition setting.
* @{
*/
/**
* @brief Retrieves the current compute partitioning for a desired device
*
* @details
* Given a device index @p dv_ind , a caller provided buffer
* @p compute_partition , and the buffer length @p len , this function will
* attempt to obtain the device's current compute partition setting string.
* Upon successful retrieval, the device's compute partition setting is
* stored as a null-terminated string in @p compute_partition .
*
* @param[in] dv_ind a device index
*
* @param[out] compute_partition a pointer to a caller provided char buffer,
* which the device's current compute partition will be written to.
*
* @param[in] len the length of the caller provided buffer @p compute_partition
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_UNEXPECTED_DATA the value read from the device is not
* a recognized compute partition
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not
* large enough to hold the entire compute partition value. In this case,
* only @p len bytes will be written.
*
*/
rsmi_status_t
rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
uint32_t len);
/**
* @brief Modifies a selected device's compute partition setting.
*
* @details Given a device index @p dv_ind and a type of compute partition
* @p compute_partition, this function will attempt to update the selected
* device's compute partition setting.
*
* @param[in] dv_ind a device index
*
* @param[in] compute_partition using enum ::rsmi_compute_partition_type_t,
* define what the selected device's compute partition setting should be
* updated to.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_PERMISSION function requires root access
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
*
*/
rsmi_status_t
rsmi_dev_compute_partition_set(uint32_t dv_ind,
rsmi_compute_partition_type_t compute_partition);
/** @} */ // end of ComputePartition
/*****************************************************************************/
/** @defgroup APISupport Supported Functions
* API function support varies by both GPU type and the version of the
+3 -2
Zobrazit soubor
@@ -3,7 +3,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -161,7 +161,8 @@ enum DevInfoTypes {
kDevMemPageBad,
kDevNumaNode,
kDevGpuMetrics,
kDevGpuReset
kDevGpuReset,
kDevComputePartition
};
typedef struct {
+1
Zobrazit soubor
@@ -113,6 +113,7 @@ class RocmSMI {
uint64_t *weight);
int get_node_index(uint32_t dv_ind, uint32_t *node_ind);
const RocmSMI_env_vars& getEnv(void);
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
private:
std::vector<std::shared_ptr<Device>> devices_;
+63 -3
Zobrazit soubor
@@ -760,7 +760,7 @@ def resetPerfDeterminism(deviceList):
if rsmi_ret_ok(ret, device, 'disable performance determinism'):
printLog(device, 'Successfully disabled performance determinism', None)
else:
logging.error('GPU[%s]\t\t: Unable to diable performance determinism', device)
logging.error('GPU[%s]\t\t: Unable to disable performance determinism', device)
printLogSpacer()
@@ -1305,6 +1305,37 @@ def setProfile(deviceList, profile):
printLogSpacer()
def setComputePartition(deviceList, computePartitionType):
    """ Sets compute partitioning for a list of devices

    @param deviceList: List of DRM devices (can be a single-item list)
    @param computePartitionType: Compute Partition type to set as
    (case-insensitive name; must be one of compute_partition_type_l)
    """
    # The requested type is the same for every device, so normalize it once
    computePartitionType = str(computePartitionType).upper()
    printLogSpacer(' Set compute partition to %s ' % (computePartitionType))
    for device in deviceList:
        # Validation stays inside the loop because the error log is per-device;
        # nothing is reported when the device list is empty.
        if computePartitionType not in compute_partition_type_l:
            printErrLog(device, 'Invalid compute partition type %s'
                        '\nValid compute partition types are %s'
                        % (computePartitionType,
                           ', '.join(map(str, compute_partition_type_l))))
            return (None, None)
        ret = rocmsmi.rsmi_dev_compute_partition_set(
            device, rsmi_compute_partition_type_dict[computePartitionType])
        if rsmi_ret_ok(ret, device, silent=True):
            printLog(device,
                     'Successfully set compute partition to %s' % (computePartitionType),
                     None)
        elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION:
            printLog(device, 'Permission denied', None)
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
        else:
            # Non-silent call so the underlying RSMI error gets reported
            rsmi_ret_ok(ret, device)
            printErrLog(device, 'Failed to set compute partition, even though device supports it.')
    printLogSpacer()
def showAllConcise(deviceList):
""" Display critical info for all devices in a concise format
@@ -2732,6 +2763,24 @@ def showNodesBw(deviceList):
if nonXgmi:
printLog(None,"Non-xGMI links detected and is currently not supported", None)
def showComputePartition(deviceList):
    """ Prints the current compute partitioning for a list of devices

    @param deviceList: List of DRM devices (can be a single-item list)
    """
    # Single source of truth for the buffer size passed to the library
    bufferLen = 256
    currentComputePartition = create_string_buffer(bufferLen)
    printLogSpacer(' Current Compute Partition ')
    for device in deviceList:
        ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, bufferLen)
        if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode():
            printLog(device, 'Compute Partition', currentComputePartition.value.decode())
        elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
            printLog(device, 'Not supported on the given system', None)
        else:
            # Non-silent call so the underlying RSMI error gets reported
            rsmi_ret_ok(ret, device)
            # printErrLog is called with (device, message), matching its other
            # call sites in setComputePartition
            printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
    printLogSpacer()
def checkAmdGpus(deviceList):
""" Check if there are any AMD GPUs being queried,
return False if there are none
@@ -2905,6 +2954,8 @@ def relaunchAsSudo():
"""
if os.geteuid() != 0:
os.execvp('sudo', ['sudo'] + sys.argv)
#keeping below, if we want to run sudo with user's env variables
#os.execvp('sudo', ['sudo', '-E'] + sys.argv)
def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
@@ -2936,7 +2987,6 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
return False
return True
def save(deviceList, savefilepath):
""" Save clock frequencies and fan speeds for a list of devices to a specified file path.
@@ -3077,6 +3127,7 @@ if __name__ == '__main__':
groupDisplay.add_argument('--showenergycounter', help='Energy accumulator that stores amount of energy consumed',
action='store_true')
groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true')
groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true')
groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default',
action='store_true')
@@ -3121,6 +3172,10 @@ if __name__ == '__main__':
groupAction.add_argument('--setperfdeterminism',
help='Set clock frequency limit to get minimal performance variation', type=int,
metavar='SCLK', nargs=1)
groupAction.add_argument('--setcomputepartition', help='Set compute partition',
choices=compute_partition_type_l + [x.lower() for x in compute_partition_type_l],
type=str, nargs=1
)
groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error type', type=str, nargs=2,
metavar=('BLOCK', 'ERRTYPE'))
groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2,
@@ -3158,7 +3213,7 @@ if __name__ == '__main__':
or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \
or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \
args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \
args.setvc or args.setsrange or args.setmrange or args.setclock:
args.setvc or args.setsrange or args.setmrange or args.setclock or args.setcomputepartition:
relaunchAsSudo()
# If there is one or more device specified, use that for all commands, otherwise use a
@@ -3220,6 +3275,7 @@ if __name__ == '__main__':
args.showpidgpus = []
args.showreplaycount = True
args.showvc = True
args.showcomputepartition = True
if not PRINT_JSON:
args.showprofile = True
@@ -3348,6 +3404,8 @@ if __name__ == '__main__':
showVoltageCurve(deviceList)
if args.showenergycounter:
showEnergy(deviceList)
if args.showcomputepartition:
showComputePartition(deviceList)
if args.setclock:
setClocks(deviceList, args.setclock[0], [int(args.setclock[1])])
if args.setsclk:
@@ -3386,6 +3444,8 @@ if __name__ == '__main__':
setClockRange(deviceList, 'mclk', args.setmrange[0], args.setmrange[1], args.autorespond)
if args.setperfdeterminism:
setPerfDeterminism(deviceList, args.setperfdeterminism[0])
if args.setcomputepartition:
setComputePartition(deviceList, args.setcomputepartition[0])
if args.resetprofile:
resetProfile(deviceList)
if args.resetxgmierr:
+25
Zobrazit soubor
@@ -582,3 +582,28 @@ class rsmi_func_id_value_t(Union):
_fields_ = [('id', c_uint64),
('name', c_char_p),
('submodule', submodule_union)]
class rsmi_compute_partition_type_t(c_int):
    """ ctypes counterpart of the C enum rsmi_compute_partition_type_t.

    The constants are consecutive integers starting at 0, in the same order
    as the C enum declaration.
    """
    (RSMI_COMPUTE_PARTITION_INVALID,
     RSMI_COMPUTE_PARTITION_CPX,
     RSMI_COMPUTE_PARTITION_SPX,
     RSMI_COMPUTE_PARTITION_DPX,
     RSMI_COMPUTE_PARTITION_TPX,
     RSMI_COMPUTE_PARTITION_QPX) = range(6)
# Maps a compute partition name to the numeric value of the matching
# rsmi_compute_partition_type_t constant.
rsmi_compute_partition_type_dict = {
# The INVALID (0) entry is commented out, so it cannot be looked up by name:
#'RSMI_COMPUTE_PARTITION_INVALID': 0,
'CPX': 1,
'SPX': 2,
'DPX': 3,
'TPX': 4,
'QPX': 5
}
rsmi_compute_partition_type = rsmi_compute_partition_type_t
# compute_partition_type_l holds the string names of the selectable
# rsmi_compute_partition_type_t values, in enum order. It has NO entry for
# RSMI_COMPUTE_PARTITION_INVALID (0), so the list index is the enum value
# minus one. Usage example to get the corresponding name:
# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX - 1]
# will return string 'CPX'
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
+192 -36
Zobrazit soubor
@@ -5,7 +5,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -50,13 +50,14 @@
#include <vector>
#include <iostream>
#include <bitset>
#include <map>
#include "rocm_smi/rocm_smi.h"
#define PRINT_RSMI_ERR(RET) { \
if (RET != RSMI_STATUS_SUCCESS) { \
const char *err_str; \
std::cout << "RSMI call returned " << (RET) \
std::cout << "[ERROR] RSMI call returned " << (RET) \
<< " at line " << __LINE__ << std::endl; \
rsmi_status_string((RET), &err_str); \
std::cout << err_str << std::endl; \
@@ -70,6 +71,11 @@
} \
}
// Expands PRINT_RSMI_ERR(RET) followed by CHK_RSMI_RET(RET): the error is
// printed when RET is not RSMI_STATUS_SUCCESS, then RET goes through the
// regular status check.
// RET is expanded more than once, so pass a plain variable rather than an
// expression with side effects.
// NOTE(review): CHK_RSMI_RET presumably returns RET from the enclosing
// function on failure -- confirm against its definition.
#define CHK_AND_PRINT_RSMI_ERR_RET(RET) { \
PRINT_RSMI_ERR(RET) \
CHK_RSMI_RET(RET) \
}
#define CHK_RSMI_RET_I(RET) { \
PRINT_RSMI_ERR(RET) \
if (RET != RSMI_STATUS_SUCCESS) { \
@@ -85,6 +91,15 @@
} \
}
// Status check that tolerates RSMI_STATUS_NOT_SUPPORTED: for that status only
// an informational message is printed; any other value of RET is passed to
// CHK_RSMI_RET.
// RET is expanded more than once, so pass a plain variable rather than an
// expression with side effects.
#define CHK_RSMI_NOT_SUPPORTED_RET(RET) { \
if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \
std::cout << "This function is not supported in the current environment." \
<< std::endl; \
} else { \
CHK_RSMI_RET(RET) \
} \
}
static void print_test_header(const char *str, uint32_t dv_ind) {
std::cout << "********************************" << std::endl;
std::cout << "*** " << str << std::endl;
@@ -92,6 +107,10 @@ static void print_test_header(const char *str, uint32_t dv_ind) {
std::cout << "Device index: " << dv_ind << std::endl;
}
// Prints a sub-section banner of the form ">> <str> <<" on its own line,
// preceded by a blank line.
static void print_mini_header(const char *str) {
  std::cout << "\n>> ";
  std::cout << str;
  std::cout << " <<" << std::endl;
}
static const char *
power_profile_string(rsmi_power_profile_preset_masks_t profile) {
switch (profile) {
@@ -112,6 +131,33 @@ power_profile_string(rsmi_power_profile_preset_masks_t profile) {
}
}
// Returns the short name ("CPX", "SPX", "DPX", "TPX" or "QPX") of a compute
// partition type, or "UNKNOWN" for any other value.
static const std::string
compute_partition_string(rsmi_compute_partition_type partition) {
  const char *label = "UNKNOWN";
  if (partition == RSMI_COMPUTE_PARTITION_CPX) {
    label = "CPX";
  } else if (partition == RSMI_COMPUTE_PARTITION_SPX) {
    label = "SPX";
  } else if (partition == RSMI_COMPUTE_PARTITION_DPX) {
    label = "DPX";
  } else if (partition == RSMI_COMPUTE_PARTITION_TPX) {
    label = "TPX";
  } else if (partition == RSMI_COMPUTE_PARTITION_QPX) {
    label = "QPX";
  }
  return label;
}
// Maps a compute partition name to its enum value. Used by
// test_set_compute_partitioning() to turn the name reported by
// rsmi_dev_compute_partition_get() back into an enum for restoring it.
// There is no entry for RSMI_COMPUTE_PARTITION_INVALID.
static std::map<std::string, rsmi_compute_partition_type_t>
mapStringToRSMIComputePartitionTypes {
{"CPX", RSMI_COMPUTE_PARTITION_CPX},
{"SPX", RSMI_COMPUTE_PARTITION_SPX},
{"DPX", RSMI_COMPUTE_PARTITION_DPX},
{"TPX", RSMI_COMPUTE_PARTITION_TPX},
{"QPX", RSMI_COMPUTE_PARTITION_QPX}
};
static const char *
perf_level_string(rsmi_dev_perf_level_t perf_lvl) {
switch (perf_lvl) {
@@ -128,6 +174,34 @@ perf_level_string(rsmi_dev_perf_level_t perf_lvl) {
}
}
// Reports whether the process is running with root privileges (for example
// when launched through sudo).
//
// The effective user ID is compared against 0 (root). Comparing the real and
// effective IDs with each other is not sufficient: they are also equal for an
// ordinary unprivileged run, which would be reported as "sudo" too.
static bool isUserRunningAsSudo() {
  return geteuid() == 0;
}
// Interprets the status of a "set" call to decide whether the underlying
// file could be written.
//
// Clock files may not be writable, in which case sets return
// RSMI_STATUS_PERMISSION. When that status shows up although
// isUserRunningAsSudo() is true, the error and a warning are printed and
// false is returned. Every other status goes through
// CHK_AND_PRINT_RSMI_ERR_RET.
// NOTE(review): that macro presumably returns from this function on a
// failing status -- confirm what value callers then receive.
bool isFileWritable(rsmi_status_t response) {
  const bool deniedDespiteSudo =
      isUserRunningAsSudo() && (response == RSMI_STATUS_PERMISSION);

  if (!deniedDespiteSudo) {
    CHK_AND_PRINT_RSMI_ERR_RET(response)
    return true;
  }

  PRINT_RSMI_ERR(response)
  std::cout << "[WARN] User is running with sudo "
            << "permissions, file is not writable." << std::endl;
  return false;
}
static rsmi_status_t test_power_profile(uint32_t dv_ind) {
rsmi_status_t ret;
rsmi_power_profile_status_t status;
@@ -355,13 +429,19 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) {
uint32_t freq_bitmask;
rsmi_clk_type rsmi_clk;
// Clock files may not be writable, causing sets to
// return RSMI_STATUS_PERMISSION even if running with
// sudo. See isFileWritable() for more info.
print_test_header("Clock Frequency Control", dv_ind);
for (uint32_t clk = (uint32_t)RSMI_CLK_TYPE_FIRST;
clk <= RSMI_CLK_TYPE_LAST; ++clk) {
std::string miniHeader = "Testing clock" + std::to_string(clk);
print_mini_header(miniHeader.c_str());
rsmi_clk = (rsmi_clk_type)clk;
ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f);
CHK_RSMI_RET(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "Initial frequency for clock" << rsmi_clk << " is " <<
f.current << std::endl;
@@ -380,19 +460,20 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) {
" to 0b" << freq_bm_str << " ..." << std::endl;
ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask);
CHK_RSMI_RET(ret)
isFileWritable(ret);
ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f);
CHK_RSMI_RET(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "Frequency is now index " << f.current << std::endl;
std::cout << "Resetting mask to all frequencies." << std::endl;
ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF);
CHK_RSMI_RET(ret)
isFileWritable(ret);
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO);
CHK_RSMI_RET(ret)
isFileWritable(ret);
}
std::cout << std::endl;
return RSMI_STATUS_SUCCESS;
}
@@ -406,13 +487,75 @@ static void print_frequencies(rsmi_frequencies_t *f) {
std::cout << std::endl;
}
}
// Exercises the compute partition API on device dv_ind:
//   1. reads and prints the current partition,
//   2. tries to set every partition type from CPX through QPX
//      (see rsmi_compute_partition_type_t in rocm_smi.h),
//   3. restores the partition that was read in step 1.
// Returns RSMI_STATUS_SUCCESS when the device does not support compute
// partitioning, since there is nothing to test in that case.
static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) {
  rsmi_status_t ret;
  // Constant length: keeps the buffer a standard array instead of a VLA
  const uint32_t buffer_len = 10;
  // Zero-filled so the buffer always holds a valid (possibly empty) string
  char originalComputePartition[buffer_len] = {0};

  print_test_header("Compute Partitioning Control", dv_ind);

  ret = rsmi_dev_compute_partition_get(dv_ind, originalComputePartition,
                                       buffer_len);
  CHK_RSMI_NOT_SUPPORTED_RET(ret)

  if (ret == RSMI_STATUS_NOT_SUPPORTED) {
    std::cout << "Device does not support the compute partition feature."
              << std::endl;
    std::cout << "*********************************************" << std::endl;
    return RSMI_STATUS_SUCCESS;
  } else {
    CHK_AND_PRINT_RSMI_ERR_RET(ret)
    std::cout << "Original compute partition is " << originalComputePartition
              << "." << std::endl;
  }

  for (int newComputePartition = RSMI_COMPUTE_PARTITION_CPX;
       newComputePartition <= RSMI_COMPUTE_PARTITION_QPX;
       newComputePartition++) {
    rsmi_compute_partition_type newPartition
      = static_cast<rsmi_compute_partition_type>(newComputePartition);
    std::cout << "Attempting to set compute partition to "
              << compute_partition_string(newPartition) << "..."
              << std::endl;
    ret = rsmi_dev_compute_partition_set(dv_ind, newPartition);
    CHK_RSMI_NOT_SUPPORTED_RET(ret)
    std::cout << "Done setting compute partition to "
              << compute_partition_string(newPartition)
              << "." << std::endl;
    std::cout << std::endl << std::endl;
  }

  std::string myComputePartition = originalComputePartition;
  if (!myComputePartition.empty()) {
    std::cout << "Resetting compute partition to " << originalComputePartition
              << "... " << std::endl;
    rsmi_compute_partition_type origComputePartitionType
      = mapStringToRSMIComputePartitionTypes[myComputePartition];
    // Perform the reset first, then check ITS status; only report "Done"
    // once the call has actually been made and checked.
    ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType);
    CHK_RSMI_NOT_SUPPORTED_RET(ret)
    std::cout << "Done" << std::endl;
  }
  return RSMI_STATUS_SUCCESS;
}
int main() {
rsmi_status_t ret;
ret = rsmi_init(0);
CHK_RSMI_RET_I(ret)
std::string val_str;
std::vector<std::string> val_vec;
uint64_t val_ui64, val2_ui64;
int64_t val_i64;
@@ -424,98 +567,111 @@ int main() {
rsmi_gpu_metrics_t p;
rsmi_num_monitor_devices(&num_monitor_devs);
for (uint32_t i = 0; i< num_monitor_devs; ++i) {
for (uint32_t i = 0; i < num_monitor_devs; ++i) {
ret = rsmi_dev_id_get(i, &val_ui16);
CHK_RSMI_RET_I(ret)
std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl;
std::cout << std::endl << std::endl;
std::cout << "Starting to call "
<< "rsmi_dev_compute_partition_get()..."
<< std::endl;
char current_compute_partition[256];
ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256);
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "\t**Current Compute Partition setting: "
<< current_compute_partition << std::endl;
ret = rsmi_dev_gpu_metrics_info_get(i, &p);
CHK_RSMI_RET(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**GPU METRICS" << std::endl;
ret = rsmi_dev_perf_level_get(i, &pfl);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Performance Level:" <<
perf_level_string(pfl) << std::endl;
ret = rsmi_dev_overdrive_level_get(i, &val_ui32);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**OverDrive Level:" << val_ui32 << std::endl;
ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_MEM, &f);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Supported GPU Memory clock frequencies: ";
std::cout << f.num_supported << std::endl;
print_frequencies(&f);
ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &f);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Supported GPU clock frequencies: ";
std::cout << f.num_supported << std::endl;
print_frequencies(&f);
char name[20];
ret = rsmi_dev_name_get(i, name, 20);
CHK_RSMI_RET_I(ret)
char name[128];
ret = rsmi_dev_name_get(i, name, 128);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Monitor name: " << name << std::endl;
ret = rsmi_dev_temp_metric_get(i, 0, RSMI_TEMP_CURRENT, &val_i64);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Temperature: " << val_i64/1000 << "C" << std::endl;
ret = rsmi_dev_volt_metric_get(i, RSMI_VOLT_TYPE_VDDGFX,
RSMI_VOLT_CURRENT, &val_i64);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Voltage: " << val_i64 << "mV" << std::endl;
ret = rsmi_dev_fan_speed_get(i, 0, &val_i64);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = rsmi_dev_fan_speed_max_get(i, 0, &val_ui64);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Current Fan Speed: ";
std::cout << val_i64/static_cast<int64_t>(val_ui64)*100;
std::cout << "% ("<< val_i64 << "/" << val_ui64 << ")" << std::endl;
ret = rsmi_dev_fan_rpms_get(i, 0, &val_i64);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Current fan RPMs: " << val_i64 << std::endl;
ret = rsmi_dev_power_cap_get(i, 0, &val_ui64);
CHK_RSMI_PERM_RET(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Current Power Cap: " << val_ui64 << "uW" <<std::endl;
ret = rsmi_dev_power_cap_range_get(i, 0, &val_ui64, &val2_ui64);
CHK_RSMI_PERM_RET(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Power Cap Range: " << val2_ui64 << " to " <<
val_ui64 << " uW" << std::endl;
ret = rsmi_dev_power_ave_get(i, 0, &val_ui64);
CHK_RSMI_PERM_RET(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t**Averge Power Usage: ";
std::cout << static_cast<float>(val_ui64)/1000 << " W" <<
std::endl;
std::cout << static_cast<float>(val_ui64)/1000 << " W" << std::endl;
ret = rsmi_dev_power_ave_get(i, 0, &val_ui64);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
std::cout << "\t=======" << std::endl;
}
std::cout << "***** Testing write api's" << std::endl;
for (uint32_t i = 0; i< num_monitor_devs; ++i) {
ret = test_set_overdrive(i);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = test_set_perf_level(i);
CHK_RSMI_RET_I(ret)
ret = test_set_freq(i);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = test_set_fan_speed(i);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = test_power_cap(i);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = test_power_profile(i);
CHK_RSMI_RET_I(ret)
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = test_set_compute_partitioning(i);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
ret = test_set_freq(i);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
}
return 0;
+131 -3
Zobrazit soubor
@@ -3,7 +3,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -45,7 +45,7 @@
#include <errno.h>
#include <sys/utsname.h>
#include <pthread.h>
#include <string.h>
#include <string>
#include <unistd.h>
#include <poll.h>
#include <fcntl.h>
@@ -1678,13 +1678,30 @@ static std::vector<std::string> pci_name_files = {
"/var/lib/pciutils/pci.ids"
};
enum eNameStrType {
NAME_STR_VENDOR = 0,
NAME_STR_DEVICE,
NAME_STR_SUBSYS
};
// Translates a compute partition name, as read from the device's
// compute partition file, to the matching enum value.
// There is no entry for RSMI_COMPUTE_PARTITION_INVALID.
std::map<std::string, rsmi_compute_partition_type_t>
mapStringToRSMIComputePartitionTypes {
{"CPX", RSMI_COMPUTE_PARTITION_CPX},
{"SPX", RSMI_COMPUTE_PARTITION_SPX},
{"DPX", RSMI_COMPUTE_PARTITION_DPX},
{"TPX", RSMI_COMPUTE_PARTITION_TPX},
{"QPX", RSMI_COMPUTE_PARTITION_QPX}
};
// Inverse table: translates an enum value to the name that is written to the
// device when the compute partition is changed.
// There is no entry for RSMI_COMPUTE_PARTITION_INVALID.
std::map<rsmi_compute_partition_type_t, std::string>
mapRSMIToStringComputePartitionTypes {
{RSMI_COMPUTE_PARTITION_CPX, "CPX"},
{RSMI_COMPUTE_PARTITION_SPX, "SPX"},
{RSMI_COMPUTE_PARTITION_DPX, "DPX"},
{RSMI_COMPUTE_PARTITION_TPX, "TPX"},
{RSMI_COMPUTE_PARTITION_QPX, "QPX"}
};
static std::string
get_id_name_str_from_line(uint64_t id, std::string ln,
std::istringstream *ln_str) {
@@ -3697,6 +3714,117 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
CATCH
}
// Reads the current compute partition name of device dv_ind into
// compute_partition.
//
// Returns the status of the underlying read when it fails,
// RSMI_STATUS_UNEXPECTED_DATA when the value read is not one of the known
// partition names, and RSMI_STATUS_SUCCESS otherwise. compute_partition is
// only modified on success.
static rsmi_status_t
get_compute_partition(uint32_t dv_ind, std::string &compute_partition) {
  TRY
  std::string val_str;

  // std::string::c_str() never returns nullptr, so no separate null check
  // is needed before handing the pointer to the support check.
  CHK_SUPPORT_NAME_ONLY(compute_partition.c_str())
  DEVICE_MUTEX

  rsmi_status_t ret = get_dev_value_str(amd::smi::kDevComputePartition,
                                        dv_ind, &val_str);
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  // find() rather than operator[]: operator[] would insert a new entry into
  // the global table for every unknown string read from the device.
  const auto known = mapStringToRSMIComputePartitionTypes.find(val_str);
  if (known == mapStringToRSMIComputePartitionTypes.end() ||
      known->second == RSMI_COMPUTE_PARTITION_INVALID) {
    // Retrieved an unknown compute partition
    return RSMI_STATUS_UNEXPECTED_DATA;
  }

  compute_partition = val_str;
  return RSMI_STATUS_SUCCESS;
  CATCH
}
// Copies the current compute partition name of device dv_ind into the caller
// provided buffer compute_partition of len bytes, always null-terminated.
//
// Returns RSMI_STATUS_INVALID_ARGS for a null buffer or len == 0, the status
// of get_compute_partition() when it fails, RSMI_STATUS_INSUFFICIENT_SIZE
// when the name (plus terminator) does not fit (the buffer then holds the
// truncated name), and RSMI_STATUS_SUCCESS otherwise.
rsmi_status_t
rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
                               uint32_t len) {
  CHK_SUPPORT_NAME_ONLY(compute_partition)
  if ((len == 0) || (compute_partition == nullptr)) {
    return RSMI_STATUS_INVALID_ARGS;
  }
  TRY
  std::string returning_compute_partition;
  rsmi_status_t ret = get_compute_partition(dv_ind,
                                            returning_compute_partition);
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  // Copy at most len - 1 characters so the terminator always lands inside
  // the caller's buffer (len > 0 was verified above). std::string::copy does
  // not append a terminator itself.
  const std::size_t length = returning_compute_partition.copy(
      compute_partition, static_cast<std::size_t>(len) - 1);
  compute_partition[length] = '\0';

  if (len < (returning_compute_partition.size() + 1)) {
    return RSMI_STATUS_INSUFFICIENT_SIZE;
  }
  return ret;
  CATCH
}
// Sets the compute partition of device dv_ind to compute_partition.
//
// Returns RSMI_STATUS_INVALID_ARGS when compute_partition is not one of
// CPX/SPX/DPX/TPX/QPX, RSMI_STATUS_SUCCESS without writing anything when the
// device already reports the requested partition, and otherwise the status
// derived from writing the new partition name to the device.
rsmi_status_t
rsmi_dev_compute_partition_set(uint32_t dv_ind,
                  rsmi_compute_partition_type_t compute_partition) {
  TRY
  REQUIRE_ROOT_ACCESS
  DEVICE_MUTEX

  // Validate before touching any lookup table
  switch (compute_partition) {
    case RSMI_COMPUTE_PARTITION_CPX:
    case RSMI_COMPUTE_PARTITION_SPX:
    case RSMI_COMPUTE_PARTITION_DPX:
    case RSMI_COMPUTE_PARTITION_TPX:
    case RSMI_COMPUTE_PARTITION_QPX:
      break;
    case RSMI_COMPUTE_PARTITION_INVALID:
    default:
      return RSMI_STATUS_INVALID_ARGS;
  }

  // find() rather than operator[] so the global tables are never modified
  const auto requested =
      mapRSMIToStringComputePartitionTypes.find(compute_partition);
  if (requested == mapRSMIToStringComputePartitionTypes.end()) {
    return RSMI_STATUS_INVALID_ARGS;
  }
  const std::string newComputePartitionStr = requested->second;

  // Do nothing if compute_partition is the current compute partition.
  // The read is best effort: when it fails the string stays empty, matches
  // no table entry, and the write below is still attempted.
  std::string currentComputePartition;
  (void)get_compute_partition(dv_ind, currentComputePartition);
  const auto current =
      mapStringToRSMIComputePartitionTypes.find(currentComputePartition);
  if (current != mapStringToRSMIComputePartitionTypes.end() &&
      current->second == compute_partition) {
    return RSMI_STATUS_SUCCESS;
  }

  GET_DEV_FROM_INDX
  int ret = dev->writeDevInfo(amd::smi::kDevComputePartition,
                              newComputePartitionStr);
  return amd::smi::ErrnoToRsmiStatus(ret);
  CATCH
}
enum iterator_handle_type {
FUNC_ITER = 0,
VARIANT_ITER,
+8 -5
Zobrazit soubor
@@ -3,7 +3,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -121,6 +121,7 @@ static const char *kDevXGMIErrorFName = "xgmi_error";
static const char *kDevSerialNumberFName = "serial_number";
static const char *kDevNumaNodeFName = "numa_node";
static const char *kDevGpuMetricsFName = "gpu_metrics";
static const char *kDevComputePartitionFName = "current_compute_partition";
// Firmware version files
static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version";
@@ -290,6 +291,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevNumaNode, kDevNumaNodeFName},
{kDevGpuMetrics, kDevGpuMetricsFName},
{kDevGpuReset, kDevGpuResetFName},
{kDevComputePartition, kDevComputePartitionFName},
};
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
@@ -413,6 +415,8 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
{"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
{"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}},
{"rsmi_dev_compute_partition_set", {{kDevComputePartitionFName}, {}}},
// These functions with variants, but no sensors/units. (May or may not
// have mandatory dependencies.)
@@ -563,9 +567,6 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
if (env_->path_DRM_root_override && type == env_->enum_override) {
sysfs_path = env_->path_DRM_root_override;
if (str) {
sysfs_path += ".write";
}
}
#endif
@@ -587,7 +588,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
fs->open(sysfs_path);
if (!fs->is_open()) {
return errno;
return errno;
}
return 0;
@@ -696,6 +697,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) {
case kDevPCIEClk:
case kDevPowerODVoltage:
case kDevSOCClk:
case kDevComputePartition:
return writeDevInfoStr(type, val);
default:
@@ -922,6 +924,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
case kDevVBiosVer:
case kDevPCIEThruPut:
case kDevSerialNumber:
case kDevComputePartition:
return readDevInfoStr(type, val);
break;
+2 -2
Zobrazit soubor
@@ -3,7 +3,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2021, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -280,7 +280,7 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
return RSMI_STATUS_NOT_SUPPORTED;
}
// Initialize the smu fiedls to zero as some of them only valid in
// Initialize the smu fields to zero as some of them only valid in
// a specific version.
*smu = {};
+74 -1
Zobrazit soubor
@@ -2,7 +2,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -73,6 +73,77 @@ static const char *kDeviceNamePrefix = "card";
static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""};
static const std::string amdSMI = "amd::smi::";
// Lookup table from each DevInfoTypes enumerator to its namespace-qualified
// name as text (the amdSMI prefix followed by the enumerator identifier),
// so a DevInfoTypes value can be printed in readable form.
// Each value is built by concatenating onto amdSMI, so amdSMI must be
// defined earlier in this translation unit than this table.
const std::map<amd::smi::DevInfoTypes, std::string> amd::smi::RocmSMI::devInfoTypesStrings = {
// Performance, overdrive and identification attributes
{amd::smi::kDevPerfLevel, amdSMI + "kDevPerfLevel"},
{amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"},
{amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"},
{amd::smi::kDevDevID, amdSMI + "kDevDevID"},
{amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"},
{amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"},
{amd::smi::kDevVendorID, amdSMI + "kDevVendorID"},
{amd::smi::kDevSubSysDevID, amdSMI + "kDevSubSysDevID"},
{amd::smi::kDevSubSysVendorID, amdSMI + "kDevSubSysVendorID"},
// Clock attributes
{amd::smi::kDevGPUMClk, amdSMI + "kDevGPUMClk"},
{amd::smi::kDevGPUSClk, amdSMI + "kDevGPUSClk"},
{amd::smi::kDevDCEFClk, amdSMI + "kDevDCEFClk"},
{amd::smi::kDevFClk, amdSMI + "kDevFClk"},
{amd::smi::kDevSOCClk, amdSMI + "kDevSOCClk"},
{amd::smi::kDevPCIEClk, amdSMI + "kDevPCIEClk"},
{amd::smi::kDevPowerProfileMode, amdSMI + "kDevPowerProfileMode"},
{amd::smi::kDevUsage, amdSMI + "kDevUsage"},
{amd::smi::kDevPowerODVoltage, amdSMI + "kDevPowerODVoltage"},
{amd::smi::kDevVBiosVer, amdSMI + "kDevVBiosVer"},
{amd::smi::kDevPCIEThruPut, amdSMI + "kDevPCIEThruPut"},
// Error counters
{amd::smi::kDevErrCntSDMA, amdSMI + "kDevErrCntSDMA"},
{amd::smi::kDevErrCntUMC, amdSMI + "kDevErrCntUMC"},
{amd::smi::kDevErrCntGFX, amdSMI + "kDevErrCntGFX"},
{amd::smi::kDevErrCntMMHUB, amdSMI + "kDevErrCntMMHUB"},
{amd::smi::kDevErrCntPCIEBIF, amdSMI + "kDevErrCntPCIEBIF"},
{amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"},
{amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"},
{amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"},
// Memory totals and usage
{amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"},
{amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"},
{amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"},
{amd::smi::kDevMemUsedGTT, amdSMI + "kDevMemUsedGTT"},
{amd::smi::kDevMemUsedVisVRAM, amdSMI + "kDevMemUsedVisVRAM"},
{amd::smi::kDevMemUsedVRAM, amdSMI + "kDevMemUsedVRAM"},
{amd::smi::kDevVramVendor, amdSMI + "kDevVramVendor"},
{amd::smi::kDevPCIEReplayCount, amdSMI + "kDevPCIEReplayCount"},
{amd::smi::kDevUniqueId, amdSMI + "kDevUniqueId"},
{amd::smi::kDevDFCountersAvailable, amdSMI + "kDevDFCountersAvailable"},
{amd::smi::kDevMemBusyPercent, amdSMI + "kDevMemBusyPercent"},
{amd::smi::kDevXGMIError, amdSMI + "kDevXGMIError"},
// Firmware versions
{amd::smi::kDevFwVersionAsd, amdSMI + "kDevFwVersionAsd"},
{amd::smi::kDevFwVersionCe, amdSMI + "kDevFwVersionCe"},
{amd::smi::kDevFwVersionDmcu, amdSMI + "kDevFwVersionDmcu"},
{amd::smi::kDevFwVersionMc, amdSMI + "kDevFwVersionMc"},
{amd::smi::kDevFwVersionMe, amdSMI + "kDevFwVersionMe"},
{amd::smi::kDevFwVersionMec, amdSMI + "kDevFwVersionMec"},
{amd::smi::kDevFwVersionMec2, amdSMI + "kDevFwVersionMec2"},
{amd::smi::kDevFwVersionPfp, amdSMI + "kDevFwVersionPfp"},
{amd::smi::kDevFwVersionRlc, amdSMI + "kDevFwVersionRlc"},
{amd::smi::kDevFwVersionRlcSrlc, amdSMI + "kDevFwVersionRlcSrlc"},
{amd::smi::kDevFwVersionRlcSrlg, amdSMI + "kDevFwVersionRlcSrlg"},
{amd::smi::kDevFwVersionRlcSrls, amdSMI + "kDevFwVersionRlcSrls"},
{amd::smi::kDevFwVersionSdma, amdSMI + "kDevFwVersionSdma"},
{amd::smi::kDevFwVersionSdma2, amdSMI + "kDevFwVersionSdma2"},
{amd::smi::kDevFwVersionSmc, amdSMI + "kDevFwVersionSmc"},
{amd::smi::kDevFwVersionSos, amdSMI + "kDevFwVersionSos"},
{amd::smi::kDevFwVersionTaRas, amdSMI + "kDevFwVersionTaRas"},
{amd::smi::kDevFwVersionTaXgmi, amdSMI + "kDevFwVersionTaXgmi"},
{amd::smi::kDevFwVersionUvd, amdSMI + "kDevFwVersionUvd"},
{amd::smi::kDevFwVersionVce, amdSMI + "kDevFwVersionVce"},
{amd::smi::kDevFwVersionVcn, amdSMI + "kDevFwVersionVcn"},
// Miscellaneous attributes, including the compute partition entry
{amd::smi::kDevSerialNumber, amdSMI + "kDevSerialNumber"},
{amd::smi::kDevMemPageBad, amdSMI + "kDevMemPageBad"},
{amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"},
{amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"},
{amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"},
{amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"}
};
namespace amd {
namespace smi {
@@ -179,6 +250,8 @@ static bool bdfid_from_path(const std::string in_name, uint64_t *bdfid) {
return true;
}
// 0 = successful bdfid found
// 1 = not a good bdfid found
static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
assert(bdfid != nullptr);
char tpath[256] = {'\0'};
+324
Zobrazit soubor
@@ -0,0 +1,324 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/computepartition_read_write.h"
#include "rocm_smi_test/test_common.h"
// Sets the title and description that TestBase reports for this test.
TestComputePartitionReadWrite::TestComputePartitionReadWrite() : TestBase() {
  set_title("RSMI Compute Partition Read/Write Test");
  // Description text is shown to the user, so spelling matters here:
  // "Parition" -> "Partition", "tests verifies" -> "tests verify".
  set_description("The Compute Partition tests verify that the compute "
                  "partition can be read and updated properly.");
}
// Nothing to release here: the destructor body is intentionally empty.
TestComputePartitionReadWrite::~TestComputePartitionReadWrite() = default;
// Delegates entirely to the base-class setup; no test-specific preparation.
void TestComputePartitionReadWrite::SetUp() {
  TestBase::SetUp();
}
// Forwards to the base-class implementation without adding anything.
void TestComputePartitionReadWrite::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
// Forwards to the base-class implementation; this test adds no extra output.
void TestComputePartitionReadWrite::DisplayResults() const {
  TestBase::DisplayResults();
}
// Hands teardown to TestBase::Close().
void TestComputePartitionReadWrite::Close() {
  // TestBase::Close() closes the handles opened by the rsmitst utility
  // calls and then calls rsmi_shut_down(), so any other hsa cleanup has to
  // happen before this point.
  TestBase::Close();
}
// Converts a compute partition enumerator into its three-letter name.
//
// Recognised values are CPX, SPX, DPX, TPX and QPX; every other value,
// including RSMI_COMPUTE_PARTITION_INVALID, yields "UNKNOWN".
static const std::string
computePartitionString(rsmi_compute_partition_type computeParitionType) {
  // Start from the fallback and overwrite it only for a known enumerator.
  const char *partitionName = "UNKNOWN";
  switch (computeParitionType) {
    case RSMI_COMPUTE_PARTITION_CPX:
      partitionName = "CPX";
      break;
    case RSMI_COMPUTE_PARTITION_SPX:
      partitionName = "SPX";
      break;
    case RSMI_COMPUTE_PARTITION_DPX:
      partitionName = "DPX";
      break;
    case RSMI_COMPUTE_PARTITION_TPX:
      partitionName = "TPX";
      break;
    case RSMI_COMPUTE_PARTITION_QPX:
      partitionName = "QPX";
      break;
    default:
      break;
  }
  return partitionName;
}
// Reverse of computePartitionString(): maps a three-letter partition name
// to its enumerator. Run() uses it to turn the originally reported
// partition string back into a value it can pass to the set call.
// There is deliberately no entry for RSMI_COMPUTE_PARTITION_INVALID, so a
// lookup with .at() throws for any name not listed here.
static const std::map<std::string, rsmi_compute_partition_type_t>
mapStringToRSMIComputePartitionTypes {
{"CPX", RSMI_COMPUTE_PARTITION_CPX},
{"SPX", RSMI_COMPUTE_PARTITION_SPX},
{"DPX", RSMI_COMPUTE_PARTITION_DPX},
{"TPX", RSMI_COMPUTE_PARTITION_TPX},
{"QPX", RSMI_COMPUTE_PARTITION_QPX}
};
void TestComputePartitionReadWrite::Run(void) {
rsmi_status_t ret, err;
char orig_char_computePartition[255];
char current_char_computePartition[255];
rsmi_compute_partition_type new_computePartition;
TestBase::Run();
if (setup_failed_) {
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
return;
}
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
PrintDeviceHeader(dv_ind);
//Standard checks to see if API is supported, before running full tests
ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition,
255);
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout << "\t**" << ": "
<< "Not supported on this machine" << std::endl;
}
return;
} else {
CHK_ERR_ASRT(ret)
}
IF_VERB(STANDARD) {
std::cout << std::endl << "\t**"
<< "Original compute partition: "
<< orig_char_computePartition << std::endl;
}
if ((orig_char_computePartition == NULL) ||
(orig_char_computePartition[0] == '\0')) {
std::cout << "***System compute partition value is not defined. "
"Skip compute partition test." << std::endl;
return;
}
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
// Verify api support checking functionality is working
uint32_t length = 2;
char smallBuffer[length];
err = rsmi_dev_compute_partition_get(dv_ind, smallBuffer, length);
size_t size = sizeof(smallBuffer)/sizeof(*smallBuffer);
ASSERT_EQ(err, RSMI_STATUS_INSUFFICIENT_SIZE);
ASSERT_EQ((size_t)length, size);
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_INSUFFICIENT_SIZE) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned "
<< "and size matches length requested." << std::endl;
}
}
// Verify api support checking functionality is working
err = rsmi_dev_compute_partition_get(dv_ind, nullptr, 255);
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_NOT_SUPPORTED was returned."
<< std::endl;
}
}
// Verify api support checking functionality is working
err = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, 0);
ASSERT_EQ(err, (RSMI_STATUS_INVALID_ARGS || RSMI_STATUS_NOT_SUPPORTED));
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_INVALID_ARGS) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl;
}
}
// Verify api support checking functionality is working
err = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
// Note: new_computePartition is not set
// DISPLAY_RSMI_ERR(err)
EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
(err == RSMI_STATUS_NOT_SUPPORTED));
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_INVALID_ARGS) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl;
} else {
DISPLAY_RSMI_ERR(err)
}
}
ASSERT_FALSE(err == RSMI_STATUS_PERMISSION);
// Verify api support checking functionality is working
new_computePartition
= rsmi_compute_partition_type::RSMI_COMPUTE_PARTITION_INVALID;
err = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
// DISPLAY_RSMI_ERR(err)
EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
(err == RSMI_STATUS_NOT_SUPPORTED) ||
(err == RSMI_STATUS_PERMISSION));
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_INVALID_ARGS) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl;
} else if (err == RSMI_STATUS_PERMISSION) {
DISPLAY_RSMI_ERR(err)
// tests should not continue if err is a permission issue
ASSERT_FALSE(err == RSMI_STATUS_PERMISSION);
} else {
DISPLAY_RSMI_ERR(err)
}
}
// Re-run original get, so we can reset to later
ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition,
255);
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
/**
* RSMI_COMPUTE_PARTITION_INVALID = 0,
* RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with
* //!< shared memory
* RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work
* //!< together with shared memory
* RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work
* //!< together with shared memory
* RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs
* //!< work together with shared memory
* RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs
* //!< work together with shared memory
*/
for (int partition = RSMI_COMPUTE_PARTITION_CPX;
partition <= RSMI_COMPUTE_PARTITION_QPX;
partition++) {
new_computePartition
= static_cast<rsmi_compute_partition_type>(partition);
IF_VERB(STANDARD) {
std::cout << std::endl;
std::cout << "\t**"
<< "======== TEST RSMI_COMPUTE_PARTITION_"
<< computePartitionString(new_computePartition)
<< " ===============" << std::endl;
}
ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Attempting to set compute partition to: "
<< computePartitionString(new_computePartition) << std::endl;
}
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "Current compute partition: "
<< current_char_computePartition
<< std::endl;
}
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
EXPECT_STREQ(computePartitionString(new_computePartition).c_str(),
current_char_computePartition);
}
/* TEST RETURN TO ORIGINAL COMPUTE PARTITIONING SETTING */
IF_VERB(STANDARD) {
std::cout << std::endl;
std::cout << "\t**"
<< "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITIONING "
<< "SETTING ========" << std::endl;
}
new_computePartition
= mapStringToRSMIComputePartitionTypes.at(orig_char_computePartition);
ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**" << "Returning compute partition to: "
<< computePartitionString(new_computePartition) << std::endl;
}
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
255);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**" << "Attempted to set compute partition: "
<< computePartitionString(new_computePartition) << std::endl
<< "\t**"
<< "Current compute partition: " << current_char_computePartition
<< std::endl;
}
EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
EXPECT_STREQ(computePartitionString(new_computePartition).c_str(),
current_char_computePartition);
}
}
+73
Zobrazit soubor
@@ -0,0 +1,73 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_
#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_
#include "rocm_smi_test/test_base.h"
// Functional test for the compute partition get/set API, built on TestBase.
class TestComputePartitionReadWrite : public TestBase {
 public:
  // @Brief: Sets the title and description of this test
  TestComputePartitionReadWrite();

  // @Brief: Destructor
  virtual ~TestComputePartitionReadWrite();

  // @Brief: Prepare the test; forwards to TestBase::SetUp()
  virtual void SetUp();

  // @Brief: Run the compute partition get/set checks on each device
  virtual void Run();

  // @Brief: Clean up and release resources; forwards to TestBase::Close()
  virtual void Close();

  // @Brief: Display results
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo();
};
#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_
+6 -1
Zobrazit soubor
@@ -5,7 +5,7 @@
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2018, Advanced Micro Devices, Inc.
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
@@ -81,6 +81,7 @@
#include "functional/mutual_exclusion.h"
#include "functional/evt_notif_read_write.h"
#include "functional/init_shutdown_refcount.h"
#include "functional/computepartition_read_write.h"
#include "rocm_smi_test/functional/hw_topology_read.h"
#include "rocm_smi_test/functional/gpu_metrics_read.h"
#include "rocm_smi_test/functional/metrics_counter_read.h"
@@ -267,6 +268,10 @@ TEST(rsmitstReadWrite, TestEvtNotifReadWrite) {
TestEvtNotifReadWrite tst;
RunGenericTest(&tst);
}
// Runs the compute partition get/set functional test as part of the
// rsmitstReadWrite suite.
TEST(rsmitstReadWrite, TestComputePartitionReadWrite) {
  TestComputePartitionReadWrite computePartitionTest;
  RunGenericTest(&computePartitionTest);
}
TEST(rsmitstReadOnly, TestConcurrentInit) {
TestConcurrentInit tst;
SetFlags(&tst);