From 1b8d3f507a2c2244f23e86868200eb0b8de59ac5 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Fri, 6 Jan 2023 11:01:18 -0600 Subject: [PATCH] SWDEV-335697- Add support for dynamic partitioning Original updates: * Added .gitignore to help with future commits * Updated/added copyrights on modified or added files * Updated rocm_smi.h/.cc - Added 3 new SMI API functions: rsmi_dev_compute_partition_set & rsmi_dev_compute_partition_get - Added helpful maps/enums used in new get/set compute_partition API calls * Updated rocm_smi.py - Added --showcomputepartition - Added --setcomputepartition - Fixed a few typos * Updated rsmiBindings.py - Added helpful class/dict/list * Updated rocm_smi_example.cc - Added helpful MACRO to detect if API is not supported. - Added current_compute_partition set/get rocm lib calls - Added helpful macro to discover future RSMI errors - Commented out test_set_freq, was having permission issues on a Navi21 * Updated rocm_smi_main.cc - Added helpful map to debug API calls, left in for future use - Added comment to better explain what a non-class function returns * Added computepartition_read_write.cc/.h - Added get/set compute partition API test calls - Confirmed on devices that do not support the API calls, tests pass * Updated rocm_smi_test/main.cc - Calls new compute partition gtests Added following updates from review feedback: * Updated rocm_smi.h/cc - Removed C++ API calls; adding support for both C/C++ API calls could cause confusion and adds extra work for us - rsmi_dev_compute_partition_get -> Fixed an edge case where user gives a small buffer length size (smaller than data received), but does not receive the partial buffer back. Google Tests are updated to reflect this finding. * Updated rocm_smi_example.cc - Fixed test_set_freq, issue was that file was not writable. We now indicate this warning, so prior errors make sense. - General test code cleanup. Removed extra code by creating loops for tests. 
* Updated rocm_smi_main.cc - Moved and got rid of an external reference to a map used for debugging RSMI enums, now is a const public reference. * Updated rocm_smi.py - Updated python code to identify NOT_SUPPORTED due to (currently) only a few GPU support the feature Change-Id: I4a567acbb59d6771fb64df08d19175fe3604fd1b [ROCm/rocm_smi_lib commit: 4d7f3f2bc759926d4a25441840006e42eae4176e] --- projects/rocm-smi-lib/.gitignore | 124 +++++++ .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 86 ++++- .../include/rocm_smi/rocm_smi_device.h | 5 +- .../include/rocm_smi/rocm_smi_main.h | 1 + .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 66 +++- .../python_smi_tools/rsmiBindings.py | 25 ++ .../rocm_smi/example/rocm_smi_example.cc | 228 ++++++++++-- projects/rocm-smi-lib/src/rocm_smi.cc | 134 +++++++- projects/rocm-smi-lib/src/rocm_smi_device.cc | 13 +- .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 4 +- projects/rocm-smi-lib/src/rocm_smi_main.cc | 75 +++- .../functional/computepartition_read_write.cc | 324 ++++++++++++++++++ .../functional/computepartition_read_write.h | 73 ++++ .../rocm-smi-lib/tests/rocm_smi_test/main.cc | 7 +- 14 files changed, 1111 insertions(+), 54 deletions(-) create mode 100644 projects/rocm-smi-lib/.gitignore create mode 100755 projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc create mode 100755 projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.h diff --git a/projects/rocm-smi-lib/.gitignore b/projects/rocm-smi-lib/.gitignore new file mode 100644 index 0000000000..dc0fa0928c --- /dev/null +++ b/projects/rocm-smi-lib/.gitignore @@ -0,0 +1,124 @@ +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. 
+# +# Normal rules +# +.* +*.o +*.o.* +*.a +*.s +*.ko +*.so +*.so.dbg +*.mod.c +*.i +*.lst +*.symtypes +*.order +modules.builtin +*.elf +*.bin +*.gz +*.bz2 +*.lzma +*.xz +*.lzo +#*.patch +*.gcno +*.pyc +*current_compute_partition + +# +# Top-level generic files/folders +# +/[Bb][Ui][Ll][Dd] +*/[Bb][Ui][Ll][Dd] +/build +*/build +/[Gg][Tt][Ee][Ss][Tt][Ss] +*/[Gg][Tt][Ee][Ss][Tt][Ss] +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers +Module.symvers + +# +# Debian directory (make deb-pkg) +# +/debian/ + +# +# git files that we don't want to ignore even it they are dot-files +# +!.gitignore +!.mailmap + +### VisualStudioCode ### +!.vscode/settings.json + +# +# Generated include files +# +include/config +include/linux/version.h +include/generated +arch/*/include/generated + +# git generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +*.orig +*~ +\#*# + +# +# Leavings from module signing +# +extra_certificates +signing_key.priv +signing_key.x509 +x509.genkey + +#cmake files +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps + +# +# ROCm files +# Removes generated config headers like rocmsmi64Config.h & oamConfig.h +# +*Config.h \ No newline at end of file diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 2317158827..7c8aeb1d48 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -352,6 +352,26 @@ typedef enum { typedef rsmi_clk_type_t rsmi_clk_type; /// \endcond +/** + * Compute Partition types + */ +typedef enum { + RSMI_COMPUTE_PARTITION_INVALID = 0, + RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory +} rsmi_compute_partition_type_t; +/// \cond Ignore in docs. +typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; +/// \endcond + /** * @brief Temperature Metrics. This enum is used to identify various * temperature metrics. Corresponding values will be in millidegress @@ -3470,6 +3490,70 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, /** @} */ // end of HWTopo +/*****************************************************************************/ +/** @defgroup ComputePartition Compute Partition Functions + * These functions are used to configure and query the device's + * compute partition setting. + * @{ + */ + +/** + * @brief Retrieves the current compute partitioning for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p compute_partition , + * and uint32 @p len , this function will attempt to obtain the device's + * current compute partition setting string. Upon successful retrieval, + * the obtained device's compute partition settings string shall be stored in + * the passed @p compute_partition char string variable. 
+ * + * @param[in] dv_ind a device index + * + * @param[inout] compute_partition a pointer to a char string variable, + * which the device's current compute partition will be written to. + * + * @param[in] len the length of the caller provided buffer @p compute_partition + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire compute partition value. In this case, + * only @p len bytes will be written. + * + */ +rsmi_status_t +rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, + uint32_t len); + +/** + * @brief Modifies a selected device's compute partition setting. + * + * @details Given a device index @p dv_ind, a type of compute partition + * @p compute_partition, this function will attempt to update the selected + * device's compute partition setting. + * + * @param[in] dv_ind a device index + * + * @param[in] compute_partition using enum ::rsmi_compute_partition_type_t, + * define what the selected device's compute partition setting should be + * updated to. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_PERMISSION function requires root access + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * + */ +rsmi_status_t +rsmi_dev_compute_partition_set(uint32_t dv_ind, + rsmi_compute_partition_type_t compute_partition); + +/** @} */ // end of ComputePartition + /*****************************************************************************/ /** @defgroup APISupport Supported Functions * API function support varies by both GPU type and the version of the diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index aef25c2804..d76ddfa4de 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -161,7 +161,8 @@ enum DevInfoTypes { kDevMemPageBad, kDevNumaNode, kDevGpuMetrics, - kDevGpuReset + kDevGpuReset, + kDevComputePartition }; typedef struct { diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index 126bbd7436..c6d5f077b9 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -113,6 +113,7 @@ class RocmSMI { uint64_t *weight); int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); + static const std::map devInfoTypesStrings; private: std::vector> devices_; diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index 6c0385a45a..8a5c664bb6 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -760,7 +760,7 @@ def resetPerfDeterminism(deviceList): if rsmi_ret_ok(ret, device, 'disable performance determinism'): printLog(device, 'Successfully disabled performance determinism', None) else: - logging.error('GPU[%s]\t\t: Unable to diable performance determinism', device) + logging.error('GPU[%s]\t\t: Unable to disable performance determinism', device) printLogSpacer() @@ -1305,6 +1305,37 @@ def setProfile(deviceList, profile): printLogSpacer() +def setComputePartition(deviceList, computePartitionType): + """ Sets compute partitioning for a list of device + + @param deviceList: List of DRM devices (can be a single-item list) + @param computePartition: Compute Partition type to set as + """ + printLogSpacer(' Set compute partition to %s ' % (str(computePartitionType).upper())) + for device in deviceList: + computePartitionType = computePartitionType.upper() + if computePartitionType not in compute_partition_type_l: + printErrLog(device, 'Invalid compute partition type %s' + '\nValid compute partition types are %s' + % ( 
computePartitionType.upper(), + (', '.join(map(str, compute_partition_type_l))) )) + return (None, None) + ret = rocmsmi.rsmi_dev_compute_partition_set(device, + rsmi_compute_partition_type_dict[computePartitionType]) + if rsmi_ret_ok(ret, device, silent=True): + printLog(device, + 'Successfully set compute partition to %s' % (computePartitionType), + None) + elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: + printLog(device, 'Permission denied', None) + elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + printLog(device, 'Not supported on the given system', None) + else: + rsmi_ret_ok(ret, device) + printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.') + printLogSpacer() + + def showAllConcise(deviceList): """ Display critical info for all devices in a concise format @@ -2732,6 +2763,24 @@ def showNodesBw(deviceList): if nonXgmi: printLog(None,"Non-xGMI links detected and is currently not supported", None) +def showComputePartition(deviceList): + """ Returns the current compute partitioning for a list of devices + + @param deviceList: List of DRM devices (can be a single-item list) + """ + currentComputePartition = create_string_buffer(256) + printLogSpacer(' Current Compute Partition ') + for device in deviceList: + ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) + if rsmi_ret_ok(ret, device, silent=True) and currentComputePartition.value.decode(): + printLog(device, 'Compute Partition', currentComputePartition.value.decode()) + elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + printLog(device, 'Not supported on the given system', None) + else: + rsmi_ret_ok(ret, device) + printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.', None) + printLogSpacer() + def checkAmdGpus(deviceList): """ Check if there are any AMD GPUs being queried, return False if there are none @@ -2905,6 +2954,8 @@ def relaunchAsSudo(): """ if os.geteuid() != 0: 
os.execvp('sudo', ['sudo'] + sys.argv) + #keeping below, if we want to run sudo with user's env variables + #os.execvp('sudo', ['sudo', '-E'] + sys.argv) def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): @@ -2936,7 +2987,6 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): return False return True - def save(deviceList, savefilepath): """ Save clock frequencies and fan speeds for a list of devices to a specified file path. @@ -3077,6 +3127,7 @@ if __name__ == '__main__': groupDisplay.add_argument('--showenergycounter', help='Energy accumulator that stores amount of energy consumed', action='store_true') groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true') + groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true') groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default', action='store_true') @@ -3121,6 +3172,10 @@ if __name__ == '__main__': groupAction.add_argument('--setperfdeterminism', help='Set clock frequency limit to get minimal performance variation', type=int, metavar='SCLK', nargs=1) + groupAction.add_argument('--setcomputepartition', help='Set compute partition', + choices=compute_partition_type_l + [x.lower() for x in compute_partition_type_l], + type=str, nargs=1 + ) groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error type', type=str, nargs=2, metavar=('BLOCK', 'ERRTYPE')) groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2, @@ -3158,7 +3213,7 @@ if __name__ == '__main__': or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \ or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ - args.setvc or 
args.setsrange or args.setmrange or args.setclock: + args.setvc or args.setsrange or args.setmrange or args.setclock or args.setcomputepartition: relaunchAsSudo() # If there is one or more device specified, use that for all commands, otherwise use a @@ -3220,6 +3275,7 @@ if __name__ == '__main__': args.showpidgpus = [] args.showreplaycount = True args.showvc = True + args.showcomputepartition = True if not PRINT_JSON: args.showprofile = True @@ -3348,6 +3404,8 @@ if __name__ == '__main__': showVoltageCurve(deviceList) if args.showenergycounter: showEnergy(deviceList) + if args.showcomputepartition: + showComputePartition(deviceList) if args.setclock: setClocks(deviceList, args.setclock[0], [int(args.setclock[1])]) if args.setsclk: @@ -3386,6 +3444,8 @@ if __name__ == '__main__': setClockRange(deviceList, 'mclk', args.setmrange[0], args.setmrange[1], args.autorespond) if args.setperfdeterminism: setPerfDeterminism(deviceList, args.setperfdeterminism[0]) + if args.setcomputepartition: + setComputePartition(deviceList, args.setcomputepartition[0]) if args.resetprofile: resetProfile(deviceList) if args.resetxgmierr: diff --git a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py index 90eb6b3fed..4502b1c62b 100644 --- a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py +++ b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py @@ -582,3 +582,28 @@ class rsmi_func_id_value_t(Union): _fields_ = [('id', c_uint64), ('name', c_char_p), ('submodule', submodule_union)] + +class rsmi_compute_partition_type_t(c_int): + RSMI_COMPUTE_PARTITION_INVALID = 0 + RSMI_COMPUTE_PARTITION_CPX = 1 + RSMI_COMPUTE_PARTITION_SPX = 2 + RSMI_COMPUTE_PARTITION_DPX = 3 + RSMI_COMPUTE_PARTITION_TPX = 4 + RSMI_COMPUTE_PARTITION_QPX = 5 + +rsmi_compute_partition_type_dict = { + #'RSMI_COMPUTE_PARTITION_INVALID': 0, + 'CPX': 1, + 'SPX': 2, + 'DPX': 3, + 'TPX': 4, + 'QPX': 5 +} + +rsmi_compute_partition_type = 
rsmi_compute_partition_type_t + +# compute_partition_type_l includes string names for the rsmi_compute_partition_type_t +# Usage example to get corresponding names: +# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX] +# will return string 'CPX' +compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX'] \ No newline at end of file diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index 08f7710451..58623c7c14 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -50,13 +50,14 @@ #include #include #include +#include #include "rocm_smi/rocm_smi.h" #define PRINT_RSMI_ERR(RET) { \ if (RET != RSMI_STATUS_SUCCESS) { \ const char *err_str; \ - std::cout << "RSMI call returned " << (RET) \ + std::cout << "[ERROR] RSMI call returned " << (RET) \ << " at line " << __LINE__ << std::endl; \ rsmi_status_string((RET), &err_str); \ std::cout << err_str << std::endl; \ @@ -70,6 +71,11 @@ } \ } +#define CHK_AND_PRINT_RSMI_ERR_RET(RET) { \ + PRINT_RSMI_ERR(RET) \ + CHK_RSMI_RET(RET) \ +} + #define CHK_RSMI_RET_I(RET) { \ PRINT_RSMI_ERR(RET) \ if (RET != RSMI_STATUS_SUCCESS) { \ @@ -85,6 +91,15 @@ } \ } +#define CHK_RSMI_NOT_SUPPORTED_RET(RET) { \ + if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ + std::cout << "This function is not supported in the current environment." 
\ + << std::endl; \ + } else { \ + CHK_RSMI_RET(RET) \ + } \ +} + static void print_test_header(const char *str, uint32_t dv_ind) { std::cout << "********************************" << std::endl; std::cout << "*** " << str << std::endl; @@ -92,6 +107,10 @@ static void print_test_header(const char *str, uint32_t dv_ind) { std::cout << "Device index: " << dv_ind << std::endl; } +static void print_mini_header(const char *str) { + std::cout << "\n>> " << str << " <<" << std::endl; +} + static const char * power_profile_string(rsmi_power_profile_preset_masks_t profile) { switch (profile) { @@ -112,6 +131,33 @@ power_profile_string(rsmi_power_profile_preset_masks_t profile) { } } +static const std::string +compute_partition_string(rsmi_compute_partition_type partition) { + switch (partition) { + case RSMI_COMPUTE_PARTITION_CPX: + return "CPX"; + case RSMI_COMPUTE_PARTITION_SPX: + return "SPX"; + case RSMI_COMPUTE_PARTITION_DPX: + return "DPX"; + case RSMI_COMPUTE_PARTITION_TPX: + return "TPX"; + case RSMI_COMPUTE_PARTITION_QPX: + return "QPX"; + default: + return "UNKNOWN"; + } +} + +static std::map +mapStringToRSMIComputePartitionTypes { + {"CPX", RSMI_COMPUTE_PARTITION_CPX}, + {"SPX", RSMI_COMPUTE_PARTITION_SPX}, + {"DPX", RSMI_COMPUTE_PARTITION_DPX}, + {"TPX", RSMI_COMPUTE_PARTITION_TPX}, + {"QPX", RSMI_COMPUTE_PARTITION_QPX} +}; + static const char * perf_level_string(rsmi_dev_perf_level_t perf_lvl) { switch (perf_lvl) { @@ -128,6 +174,34 @@ perf_level_string(rsmi_dev_perf_level_t perf_lvl) { } } +static bool isUserRunningAsSudo() { + bool isRunningWithSudo = false; + auto myUID = getuid(); + auto myPrivledges = geteuid(); + if (myUID == myPrivledges) { + isRunningWithSudo = true; + } + return isRunningWithSudo; +} + +bool isFileWritable(rsmi_status_t response) { + // Clock files may not be writable, causing sets to + // return RSMI_STATUS_PERMISSION. If running as sudo, + // this means file is not writable. 
+ // isFileWritable(ret) - intends to capture this + // response situation. + bool fileWritable = true; + if (isUserRunningAsSudo() && (response == RSMI_STATUS_PERMISSION)) { + PRINT_RSMI_ERR(response) + std::cout << "[WARN] User is running with sudo " + << "permissions, file is not writable." << std::endl; + fileWritable = false; + } else { + CHK_AND_PRINT_RSMI_ERR_RET(response) + } + return fileWritable; +} + static rsmi_status_t test_power_profile(uint32_t dv_ind) { rsmi_status_t ret; rsmi_power_profile_status_t status; @@ -355,13 +429,19 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) { uint32_t freq_bitmask; rsmi_clk_type rsmi_clk; + // Clock files may not be writable, causing sets to + // return RSMI_STATUS_PERMISSION even if running with + // sudo. See isFileWritable() for more info. + print_test_header("Clock Frequency Control", dv_ind); for (uint32_t clk = (uint32_t)RSMI_CLK_TYPE_FIRST; clk <= RSMI_CLK_TYPE_LAST; ++clk) { + std::string miniHeader = "Testing clock" + std::to_string(clk); + print_mini_header(miniHeader.c_str()); rsmi_clk = (rsmi_clk_type)clk; ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_RSMI_RET(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "Initial frequency for clock" << rsmi_clk << " is " << f.current << std::endl; @@ -380,19 +460,20 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) { " to 0b" << freq_bm_str << " ..." << std::endl; ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask); - CHK_RSMI_RET(ret) + isFileWritable(ret); ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_RSMI_RET(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "Frequency is now index " << f.current << std::endl; std::cout << "Resetting mask to all frequencies." 
<< std::endl; ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF); - CHK_RSMI_RET(ret) + isFileWritable(ret); ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); - CHK_RSMI_RET(ret) + isFileWritable(ret); } + std::cout << std::endl; return RSMI_STATUS_SUCCESS; } @@ -406,13 +487,75 @@ static void print_frequencies(rsmi_frequencies_t *f) { std::cout << std::endl; } } + +static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { + rsmi_status_t ret; + uint32_t buffer_len = 10; + char originalComputePartition[buffer_len]; + print_test_header("Compute Partitioning Control", dv_ind); + /** + typedef enum { + RSMI_COMPUTE_PARTITION_INVALID = 0, + RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + } rsmi_compute_partition_type_t; + */ + ret = rsmi_dev_compute_partition_get(dv_ind, originalComputePartition, buffer_len); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "Device does not support the compute partition feature." + << std::endl; + std::cout << "*********************************************" << std::endl; + return RSMI_STATUS_SUCCESS; + } else { + CHK_AND_PRINT_RSMI_ERR_RET(ret) + std::cout << "Original compute partition is " << originalComputePartition + << "." 
<< std::endl; + } + + for (int newComputePartition = RSMI_COMPUTE_PARTITION_CPX; + newComputePartition <= RSMI_COMPUTE_PARTITION_QPX; + newComputePartition++) { + rsmi_compute_partition_type newPartition + = static_cast(newComputePartition); + std::cout << "Attempting to set compute partition to " + << compute_partition_string(newPartition) << "..." + << std::endl; + ret = rsmi_dev_compute_partition_set(dv_ind, newPartition); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "Done setting compute partition to " + << compute_partition_string(newPartition) + << "." << std::endl; + std::cout << std::endl << std::endl; + } + + std::string myComputePartition = originalComputePartition; + if (myComputePartition.empty() == false) { + std::cout << "Resetting compute partition to " << originalComputePartition + << "... " << std::endl; + rsmi_compute_partition_type origComputePartitionType + = mapStringToRSMIComputePartitionTypes[originalComputePartition]; + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "Done" << std::endl; + ret = rsmi_dev_compute_partition_set(dv_ind, origComputePartitionType); + } + return RSMI_STATUS_SUCCESS; +} + int main() { rsmi_status_t ret; ret = rsmi_init(0); CHK_RSMI_RET_I(ret) - std::string val_str; std::vector val_vec; uint64_t val_ui64, val2_ui64; int64_t val_i64; @@ -424,98 +567,111 @@ int main() { rsmi_gpu_metrics_t p; rsmi_num_monitor_devices(&num_monitor_devs); - for (uint32_t i = 0; i< num_monitor_devs; ++i) { + for (uint32_t i = 0; i < num_monitor_devs; ++i) { ret = rsmi_dev_id_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl; + std::cout << std::endl << std::endl; + std::cout << "Starting to call " + << "rsmi_dev_compute_partition_get()..." 
+ << std::endl; + char current_compute_partition[256]; + ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "\t**Current Compute Partition setting: " + << current_compute_partition << std::endl; + ret = rsmi_dev_gpu_metrics_info_get(i, &p); - CHK_RSMI_RET(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**GPU METRICS" << std::endl; ret = rsmi_dev_perf_level_get(i, &pfl); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Performance Level:" << perf_level_string(pfl) << std::endl; - ret = rsmi_dev_overdrive_level_get(i, &val_ui32); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**OverDrive Level:" << val_ui32 << std::endl; ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_MEM, &f); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Supported GPU Memory clock frequencies: "; std::cout << f.num_supported << std::endl; print_frequencies(&f); ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &f); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Supported GPU clock frequencies: "; std::cout << f.num_supported << std::endl; print_frequencies(&f); - char name[20]; - ret = rsmi_dev_name_get(i, name, 20); - CHK_RSMI_RET_I(ret) + char name[128]; + ret = rsmi_dev_name_get(i, name, 128); + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Monitor name: " << name << std::endl; ret = rsmi_dev_temp_metric_get(i, 0, RSMI_TEMP_CURRENT, &val_i64); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Temperature: " << val_i64/1000 << "C" << std::endl; ret = rsmi_dev_volt_metric_get(i, RSMI_VOLT_TYPE_VDDGFX, RSMI_VOLT_CURRENT, &val_i64); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Voltage: " << val_i64 << "mV" << std::endl; ret = rsmi_dev_fan_speed_get(i, 0, &val_i64); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) ret = rsmi_dev_fan_speed_max_get(i, 0, 
&val_ui64); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Current Fan Speed: "; std::cout << val_i64/static_cast(val_ui64)*100; std::cout << "% ("<< val_i64 << "/" << val_ui64 << ")" << std::endl; ret = rsmi_dev_fan_rpms_get(i, 0, &val_i64); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Current fan RPMs: " << val_i64 << std::endl; ret = rsmi_dev_power_cap_get(i, 0, &val_ui64); - CHK_RSMI_PERM_RET(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Current Power Cap: " << val_ui64 << "uW" <(val_ui64)/1000 << " W" << - std::endl; + std::cout << static_cast(val_ui64)/1000 << " W" << std::endl; + ret = rsmi_dev_power_ave_get(i, 0, &val_ui64); + CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t=======" << std::endl; } std::cout << "***** Testing write api's" << std::endl; for (uint32_t i = 0; i< num_monitor_devs; ++i) { ret = test_set_overdrive(i); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) ret = test_set_perf_level(i); - CHK_RSMI_RET_I(ret) - - ret = test_set_freq(i); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) ret = test_set_fan_speed(i); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) ret = test_power_cap(i); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) ret = test_power_profile(i); - CHK_RSMI_RET_I(ret) + CHK_AND_PRINT_RSMI_ERR_RET(ret) + + ret = test_set_compute_partitioning(i); + CHK_AND_PRINT_RSMI_ERR_RET(ret) + + ret = test_set_freq(i); + CHK_AND_PRINT_RSMI_ERR_RET(ret) } return 0; diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index c2cbb238c1..694015f13d 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include @@ -1678,13 +1678,30 @@ static std::vector pci_name_files = { "/var/lib/pciutils/pci.ids" }; - enum eNameStrType { NAME_STR_VENDOR = 0, NAME_STR_DEVICE, NAME_STR_SUBSYS }; +std::map +mapStringToRSMIComputePartitionTypes { + {"CPX", RSMI_COMPUTE_PARTITION_CPX}, + {"SPX", RSMI_COMPUTE_PARTITION_SPX}, + {"DPX", RSMI_COMPUTE_PARTITION_DPX}, + {"TPX", RSMI_COMPUTE_PARTITION_TPX}, + {"QPX", RSMI_COMPUTE_PARTITION_QPX} +}; + +std::map +mapRSMIToStringComputePartitionTypes { + {RSMI_COMPUTE_PARTITION_CPX, "CPX"}, + {RSMI_COMPUTE_PARTITION_SPX, "SPX"}, + {RSMI_COMPUTE_PARTITION_DPX, "DPX"}, + {RSMI_COMPUTE_PARTITION_TPX, "TPX"}, + {RSMI_COMPUTE_PARTITION_QPX, "QPX"} +}; + static std::string get_id_name_str_from_line(uint64_t id, std::string ln, std::istringstream *ln_str) { @@ -3697,6 +3714,117 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, CATCH } +static rsmi_status_t +get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { + TRY + std::string val_str; + + if (compute_partition.c_str() == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + CHK_SUPPORT_NAME_ONLY(compute_partition.c_str()) + + DEVICE_MUTEX + rsmi_status_t ret = get_dev_value_str(amd::smi::kDevComputePartition, + dv_ind, &val_str); + + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + switch (mapStringToRSMIComputePartitionTypes[val_str]) { + case RSMI_COMPUTE_PARTITION_INVALID: + // Retrieved an unknown compute partition + return RSMI_STATUS_UNEXPECTED_DATA; + case RSMI_COMPUTE_PARTITION_CPX: + break; + case RSMI_COMPUTE_PARTITION_SPX: + break; + case RSMI_COMPUTE_PARTITION_DPX: + break; + case RSMI_COMPUTE_PARTITION_TPX: + break; + case RSMI_COMPUTE_PARTITION_QPX: + break; + default: + // Retrieved an unknown compute partition + return RSMI_STATUS_UNEXPECTED_DATA; + } + compute_partition = val_str; + return RSMI_STATUS_SUCCESS; + CATCH +} + +rsmi_status_t 
+rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
+                               uint32_t len) {
+  // Copies the device's current compute partition name (one of the keys of
+  // mapStringToRSMIComputePartitionTypes, e.g. "SPX") into the caller's
+  // buffer as a NUL-terminated string.
+  //   dv_ind            - index of the device to query
+  //   compute_partition - destination buffer, at least `len` bytes long
+  //   len               - size of the destination buffer in bytes
+  // Returns RSMI_STATUS_INVALID_ARGS for a zero length (or a null buffer
+  // that gets past CHK_SUPPORT_NAME_ONLY), the status reported by
+  // get_compute_partition() when the read fails, and
+  // RSMI_STATUS_INSUFFICIENT_SIZE when the name did not fit; in that case
+  // the buffer still holds a NUL-terminated prefix of the name.
+  CHK_SUPPORT_NAME_ONLY(compute_partition)
+  if ((len == 0) || (compute_partition == nullptr)) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  TRY
+  std::string returning_compute_partition;
+  rsmi_status_t ret = get_compute_partition(dv_ind,
+                                            returning_compute_partition);
+
+  if (ret != RSMI_STATUS_SUCCESS) { return ret; }
+
+  // std::string::copy() does not append a terminator, so keep the last byte
+  // of the buffer for it. Copying up to `len` characters and then writing
+  // compute_partition[len] would land one byte past the caller's buffer
+  // whenever the name is at least `len` characters long. `len` is >= 1 here.
+  std::size_t length =
+      returning_compute_partition.copy(compute_partition, len - 1);
+  compute_partition[length] = '\0';
+
+  if (len < (returning_compute_partition.size() + 1)) {
+    return RSMI_STATUS_INSUFFICIENT_SIZE;
+  }
+  return ret;
+  CATCH
+}
+
+rsmi_status_t
+rsmi_dev_compute_partition_set(uint32_t dv_ind,
+                  rsmi_compute_partition_type_t compute_partition) {
+  // Switches the device to the requested compute partition mode.
+  //   dv_ind            - index of the device to update
+  //   compute_partition - requested mode; must be one of CPX/SPX/DPX/TPX/QPX
+  // Returns RSMI_STATUS_INVALID_ARGS for any other enum value,
+  // RSMI_STATUS_SUCCESS without writing when the device already reports the
+  // requested mode, and otherwise the writeDevInfo() errno translated by
+  // amd::smi::ErrnoToRsmiStatus().
+  TRY
+  REQUIRE_ROOT_ACCESS
+  DEVICE_MUTEX
+
+  // Validate before touching the lookup maps: std::map::operator[] inserts a
+  // default entry for an unknown key, which would grow the shared global
+  // maps with junk on every bad request.
+  switch (compute_partition) {
+    case RSMI_COMPUTE_PARTITION_CPX:
+    case RSMI_COMPUTE_PARTITION_SPX:
+    case RSMI_COMPUTE_PARTITION_DPX:
+    case RSMI_COMPUTE_PARTITION_TPX:
+    case RSMI_COMPUTE_PARTITION_QPX:
+      break;
+    case RSMI_COMPUTE_PARTITION_INVALID:
+    default:
+      // Caller asked for an unknown compute partition
+      return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  // Do nothing if compute_partition is already the current compute
+  // partition. The comparison is only made when the read succeeded, because
+  // only then does get_compute_partition() hand back a name that is a key of
+  // the map; a failed read falls through to the write, as before.
+  // NOTE(review): get_compute_partition() expands DEVICE_MUTEX as well;
+  // confirm the device mutex tolerates re-acquisition by the same thread.
+  std::string currentComputePartition;
+  rsmi_status_t readStatus =
+      get_compute_partition(dv_ind, currentComputePartition);
+  if (readStatus == RSMI_STATUS_SUCCESS) {
+    auto currentEntry =
+        mapStringToRSMIComputePartitionTypes.find(currentComputePartition);
+    if (currentEntry != mapStringToRSMIComputePartitionTypes.end() &&
+        currentEntry->second == compute_partition) {
+      return RSMI_STATUS_SUCCESS;
+    }
+  }
+
+  // The key is known to exist after the validation above, so this lookup
+  // cannot insert.
+  std::string newComputePartitionStr
+      = mapRSMIToStringComputePartitionTypes[compute_partition];
+  GET_DEV_FROM_INDX
+  int ret = dev->writeDevInfo(amd::smi::kDevComputePartition,
+                              newComputePartitionStr);
+  return amd::smi::ErrnoToRsmiStatus(ret);
+  CATCH
+}
+
 enum 
iterator_handle_type { FUNC_ITER = 0, VARIANT_ITER, diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index 0eade39ce5..4421599c13 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -121,6 +121,7 @@ static const char *kDevXGMIErrorFName = "xgmi_error"; static const char *kDevSerialNumberFName = "serial_number"; static const char *kDevNumaNodeFName = "numa_node"; static const char *kDevGpuMetricsFName = "gpu_metrics"; +static const char *kDevComputePartitionFName = "current_compute_partition"; // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -290,6 +291,7 @@ static const std::map kDevAttribNameMap = { {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevGpuReset, kDevGpuResetFName}, + {kDevComputePartition, kDevComputePartitionFName}, }; static const std::map kDevPerfLvlMap = { @@ -413,6 +415,8 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, + {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, + {"rsmi_dev_compute_partition_set", {{kDevComputePartitionFName}, {}}}, // These functions with variants, but no sensors/units. (May or may not // have mandatory dependencies.) 
@@ -563,9 +567,6 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (env_->path_DRM_root_override && type == env_->enum_override) { sysfs_path = env_->path_DRM_root_override; - if (str) { - sysfs_path += ".write"; - } } #endif @@ -587,7 +588,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { fs->open(sysfs_path); if (!fs->is_open()) { - return errno; + return errno; } return 0; @@ -696,6 +697,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { case kDevPCIEClk: case kDevPowerODVoltage: case kDevSOCClk: + case kDevComputePartition: return writeDevInfoStr(type, val); default: @@ -922,6 +924,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevVBiosVer: case kDevPCIEThruPut: case kDevSerialNumber: + case kDevComputePartition: return readDevInfoStr(type, val); break; diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 1f9fad479c..3f540bc850 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2021, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -280,7 +280,7 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { return RSMI_STATUS_NOT_SUPPORTED; } - // Initialize the smu fiedls to zero as some of them only valid in + // Initialize the smu fields to zero as some of them only valid in // a specific version. 
*smu = {}; diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 40541d9280..ca992d51ce 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -2,7 +2,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -73,6 +73,77 @@ static const char *kDeviceNamePrefix = "card"; static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""}; +static const std::string amdSMI = "amd::smi::"; +const std::map amd::smi::RocmSMI::devInfoTypesStrings = { + {amd::smi::kDevPerfLevel, amdSMI + "kDevPerfLevel"}, + {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, + {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, + {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, + {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, + {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, + {amd::smi::kDevVendorID, amdSMI + "kDevVendorID"}, + {amd::smi::kDevSubSysDevID, amdSMI + "kDevSubSysDevID"}, + {amd::smi::kDevSubSysVendorID, amdSMI + "kDevSubSysVendorID"}, + {amd::smi::kDevGPUMClk, amdSMI + "kDevGPUMClk"}, + {amd::smi::kDevGPUSClk, amdSMI + "kDevGPUSClk"}, + {amd::smi::kDevDCEFClk, amdSMI + "kDevDCEFClk"}, + {amd::smi::kDevFClk, amdSMI + "kDevFClk"}, + {amd::smi::kDevSOCClk, amdSMI + "kDevSOCClk"}, + {amd::smi::kDevPCIEClk, amdSMI + "kDevPCIEClk"}, + {amd::smi::kDevPowerProfileMode, amdSMI + "kDevPowerProfileMode"}, + {amd::smi::kDevUsage, amdSMI + "kDevUsage"}, + {amd::smi::kDevPowerODVoltage, amdSMI + "kDevPowerODVoltage"}, + {amd::smi::kDevVBiosVer, amdSMI + "kDevVBiosVer"}, + {amd::smi::kDevPCIEThruPut, amdSMI + "kDevPCIEThruPut"}, + {amd::smi::kDevErrCntSDMA, amdSMI + "kDevErrCntSDMA"}, + {amd::smi::kDevErrCntUMC, amdSMI + "kDevErrCntUMC"}, + {amd::smi::kDevErrCntGFX, amdSMI + 
"kDevErrCntGFX"}, + {amd::smi::kDevErrCntMMHUB, amdSMI + "kDevErrCntMMHUB"}, + {amd::smi::kDevErrCntPCIEBIF, amdSMI + "kDevErrCntPCIEBIF"}, + {amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"}, + {amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"}, + {amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"}, + {amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"}, + {amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"}, + {amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"}, + {amd::smi::kDevMemUsedGTT, amdSMI + "kDevMemUsedGTT"}, + {amd::smi::kDevMemUsedVisVRAM, amdSMI + "kDevMemUsedVisVRAM"}, + {amd::smi::kDevMemUsedVRAM, amdSMI + "kDevMemUsedVRAM"}, + {amd::smi::kDevVramVendor, amdSMI + "kDevVramVendor"}, + {amd::smi::kDevPCIEReplayCount, amdSMI + "kDevPCIEReplayCount"}, + {amd::smi::kDevUniqueId, amdSMI + "kDevUniqueId"}, + {amd::smi::kDevDFCountersAvailable, amdSMI + "kDevDFCountersAvailable"}, + {amd::smi::kDevMemBusyPercent, amdSMI + "kDevMemBusyPercent"}, + {amd::smi::kDevXGMIError, amdSMI + "kDevXGMIError"}, + {amd::smi::kDevFwVersionAsd, amdSMI + "kDevFwVersionAsd"}, + {amd::smi::kDevFwVersionCe, amdSMI + "kDevFwVersionCe"}, + {amd::smi::kDevFwVersionDmcu, amdSMI + "kDevFwVersionDmcu"}, + {amd::smi::kDevFwVersionMc, amdSMI + "kDevFwVersionMc"}, + {amd::smi::kDevFwVersionMe, amdSMI + "kDevFwVersionMe"}, + {amd::smi::kDevFwVersionMec, amdSMI + "kDevFwVersionMec"}, + {amd::smi::kDevFwVersionMec2, amdSMI + "kDevFwVersionMec2"}, + {amd::smi::kDevFwVersionPfp, amdSMI + "kDevFwVersionPfp"}, + {amd::smi::kDevFwVersionRlc, amdSMI + "kDevFwVersionRlc"}, + {amd::smi::kDevFwVersionRlcSrlc, amdSMI + "kDevFwVersionRlcSrlc"}, + {amd::smi::kDevFwVersionRlcSrlg, amdSMI + "kDevFwVersionRlcSrlg"}, + {amd::smi::kDevFwVersionRlcSrls, amdSMI + "kDevFwVersionRlcSrls"}, + {amd::smi::kDevFwVersionSdma, amdSMI + "kDevFwVersionSdma"}, + {amd::smi::kDevFwVersionSdma2, amdSMI + "kDevFwVersionSdma2"}, + {amd::smi::kDevFwVersionSmc, amdSMI + "kDevFwVersionSmc"}, + 
{amd::smi::kDevFwVersionSos, amdSMI + "kDevFwVersionSos"}, + {amd::smi::kDevFwVersionTaRas, amdSMI + "kDevFwVersionTaRas"}, + {amd::smi::kDevFwVersionTaXgmi, amdSMI + "kDevFwVersionTaXgmi"}, + {amd::smi::kDevFwVersionUvd, amdSMI + "kDevFwVersionUvd"}, + {amd::smi::kDevFwVersionVce, amdSMI + "kDevFwVersionVce"}, + {amd::smi::kDevFwVersionVcn, amdSMI + "kDevFwVersionVcn"}, + {amd::smi::kDevSerialNumber, amdSMI + "kDevSerialNumber"}, + {amd::smi::kDevMemPageBad, amdSMI + "kDevMemPageBad"}, + {amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"}, + {amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"}, + {amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"}, + {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"} +}; + namespace amd { namespace smi { @@ -179,6 +250,8 @@ static bool bdfid_from_path(const std::string in_name, uint64_t *bdfid) { return true; } +// 0 = successful bdfid found +// 1 = not a good bdfid found static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { assert(bdfid != nullptr); char tpath[256] = {'\0'}; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc new file mode 100755 index 0000000000..8e22022716 --- /dev/null +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -0,0 +1,324 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. 
+ * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. 
+ * + */ + +#include +#include + +#include + +#include "gtest/gtest.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi_test/functional/computepartition_read_write.h" +#include "rocm_smi_test/test_common.h" + +TestComputePartitionReadWrite::TestComputePartitionReadWrite() : TestBase() { + set_title("RSMI Compute Partition Read/Write Test"); + set_description("The Compute Parition tests verifies that the compute " + "parition can be read and updated properly."); +} + +TestComputePartitionReadWrite::~TestComputePartitionReadWrite(void) { +} + +void TestComputePartitionReadWrite::SetUp(void) { + TestBase::SetUp(); + + return; +} + +void TestComputePartitionReadWrite::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestComputePartitionReadWrite::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestComputePartitionReadWrite::Close() { + // This will close handles opened within rsmitst utility calls and call + // rsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + +static const std::string +computePartitionString(rsmi_compute_partition_type computeParitionType) { + /** + * RSMI_COMPUTE_PARTITION_INVALID = 0, + * RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + * //!< shared memory + * RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + * //!< together with shared memory + * RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + * //!< together with shared memory + * RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + * //!< work together with shared memory + * RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + * //!< work together with shared memory + */ + switch (computeParitionType) { + case RSMI_COMPUTE_PARTITION_CPX: + return "CPX"; + case RSMI_COMPUTE_PARTITION_SPX: + return "SPX"; + case RSMI_COMPUTE_PARTITION_DPX: + return "DPX"; + case RSMI_COMPUTE_PARTITION_TPX: + return "TPX"; + 
case RSMI_COMPUTE_PARTITION_QPX: + return "QPX"; + default: + return "UNKNOWN"; + } +} + +static const std::map +mapStringToRSMIComputePartitionTypes { + {"CPX", RSMI_COMPUTE_PARTITION_CPX}, + {"SPX", RSMI_COMPUTE_PARTITION_SPX}, + {"DPX", RSMI_COMPUTE_PARTITION_DPX}, + {"TPX", RSMI_COMPUTE_PARTITION_TPX}, + {"QPX", RSMI_COMPUTE_PARTITION_QPX} +}; + +void TestComputePartitionReadWrite::Run(void) { + rsmi_status_t ret, err; + char orig_char_computePartition[255]; + char current_char_computePartition[255]; + rsmi_compute_partition_type new_computePartition; + + TestBase::Run(); + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; + return; + } + + for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { + PrintDeviceHeader(dv_ind); + + //Standard checks to see if API is supported, before running full tests + ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, + 255); + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << ": " + << "Not supported on this machine" << std::endl; + } + return; + } else { + CHK_ERR_ASRT(ret) + } + IF_VERB(STANDARD) { + std::cout << std::endl << "\t**" + << "Original compute partition: " + << orig_char_computePartition << std::endl; + } + + if ((orig_char_computePartition == NULL) || + (orig_char_computePartition[0] == '\0')) { + std::cout << "***System compute partition value is not defined. " + "Skip compute partition test." 
<< std::endl; + return; + } + EXPECT_EQ(RSMI_STATUS_SUCCESS, ret); + + // Verify api support checking functionality is working + uint32_t length = 2; + char smallBuffer[length]; + err = rsmi_dev_compute_partition_get(dv_ind, smallBuffer, length); + size_t size = sizeof(smallBuffer)/sizeof(*smallBuffer); + ASSERT_EQ(err, RSMI_STATUS_INSUFFICIENT_SIZE); + ASSERT_EQ((size_t)length, size); + IF_VERB(STANDARD) { + if (err == RSMI_STATUS_INSUFFICIENT_SIZE) { + std::cout << "\t**" + << "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned " + << "and size matches length requested." << std::endl; + } + } + + // Verify api support checking functionality is working + err = rsmi_dev_compute_partition_get(dv_ind, nullptr, 255); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + IF_VERB(STANDARD) { + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**" + << "Confirmed RSMI_STATUS_NOT_SUPPORTED was returned." + << std::endl; + } + } + + // Verify api support checking functionality is working + err = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, 0); + ASSERT_EQ(err, (RSMI_STATUS_INVALID_ARGS || RSMI_STATUS_NOT_SUPPORTED)); + IF_VERB(STANDARD) { + if (err == RSMI_STATUS_INVALID_ARGS) { + std::cout << "\t**" + << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." + << std::endl; + } + } + + // Verify api support checking functionality is working + err = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); + // Note: new_computePartition is not set + // DISPLAY_RSMI_ERR(err) + EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || + (err == RSMI_STATUS_NOT_SUPPORTED)); + IF_VERB(STANDARD) { + if (err == RSMI_STATUS_INVALID_ARGS) { + std::cout << "\t**" + << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." 
+ << std::endl; + } else { + DISPLAY_RSMI_ERR(err) + } + } + ASSERT_FALSE(err == RSMI_STATUS_PERMISSION); + + // Verify api support checking functionality is working + new_computePartition + = rsmi_compute_partition_type::RSMI_COMPUTE_PARTITION_INVALID; + err = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); + // DISPLAY_RSMI_ERR(err) + EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || + (err == RSMI_STATUS_NOT_SUPPORTED) || + (err == RSMI_STATUS_PERMISSION)); + IF_VERB(STANDARD) { + if (err == RSMI_STATUS_INVALID_ARGS) { + std::cout << "\t**" + << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." + << std::endl; + } else if (err == RSMI_STATUS_PERMISSION) { + DISPLAY_RSMI_ERR(err) + // tests should not continue if err is a permission issue + ASSERT_FALSE(err == RSMI_STATUS_PERMISSION); + } else { + DISPLAY_RSMI_ERR(err) + } + } + + // Re-run original get, so we can reset to later + ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, + 255); + EXPECT_EQ(RSMI_STATUS_SUCCESS, ret); + + /** + * RSMI_COMPUTE_PARTITION_INVALID = 0, + * RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + * //!< shared memory + * RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + * //!< together with shared memory + * RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + * //!< together with shared memory + * RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + * //!< work together with shared memory + * RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + * //!< work together with shared memory + */ + + for (int partition = RSMI_COMPUTE_PARTITION_CPX; + partition <= RSMI_COMPUTE_PARTITION_QPX; + partition++) { + new_computePartition + = static_cast(partition); + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "======== TEST RSMI_COMPUTE_PARTITION_" + << computePartitionString(new_computePartition) + << " ===============" << std::endl; + } 
+ ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Attempting to set compute partition to: " + << computePartitionString(new_computePartition) << std::endl; + } + ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, + 255); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" + << "Current compute partition: " + << current_char_computePartition + << std::endl; + } + EXPECT_EQ(RSMI_STATUS_SUCCESS, ret); + EXPECT_STREQ(computePartitionString(new_computePartition).c_str(), + current_char_computePartition); + } + + /* TEST RETURN TO ORIGINAL COMPUTE PARTITIONING SETTING */ + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITIONING " + << "SETTING ========" << std::endl; + } + new_computePartition + = mapStringToRSMIComputePartitionTypes.at(orig_char_computePartition); + ret = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" << "Returning compute partition to: " + << computePartitionString(new_computePartition) << std::endl; + } + ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, + 255); + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\t**" << "Attempted to set compute partition: " + << computePartitionString(new_computePartition) << std::endl + << "\t**" + << "Current compute partition: " << current_char_computePartition + << std::endl; + } + EXPECT_EQ(RSMI_STATUS_SUCCESS, ret); + EXPECT_STREQ(computePartitionString(new_computePartition).c_str(), + current_char_computePartition); + } +} diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.h b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.h new file mode 100755 index 0000000000..a2e3627fc2 --- /dev/null +++ 
b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.h @@ -0,0 +1,73 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ +#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_ +#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_ + +#include "rocm_smi_test/test_base.h" + +class TestComputePartitionReadWrite : public TestBase { + public: + TestComputePartitionReadWrite(); + + // @Brief: Destructor for test case of TestComputePartitionReadWrite + virtual ~TestComputePartitionReadWrite(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_COMPUTEPARTITION_READ_WRITE_H_ diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc index 8dc7b51c9b..369c049042 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -81,6 +81,7 @@ #include "functional/mutual_exclusion.h" #include "functional/evt_notif_read_write.h" #include "functional/init_shutdown_refcount.h" +#include "functional/computepartition_read_write.h" #include "rocm_smi_test/functional/hw_topology_read.h" #include "rocm_smi_test/functional/gpu_metrics_read.h" #include "rocm_smi_test/functional/metrics_counter_read.h" @@ -267,6 +268,10 @@ TEST(rsmitstReadWrite, TestEvtNotifReadWrite) { TestEvtNotifReadWrite tst; RunGenericTest(&tst); } +TEST(rsmitstReadWrite, TestComputePartitionReadWrite) { + TestComputePartitionReadWrite tst; + RunGenericTest(&tst); +} TEST(rsmitstReadOnly, TestConcurrentInit) { TestConcurrentInit tst; SetFlags(&tst);