diff --git a/projects/amdsmi/.gitignore b/projects/amdsmi/.gitignore index dc0fa0928c..632b6ea6c9 100644 --- a/projects/amdsmi/.gitignore +++ b/projects/amdsmi/.gitignore @@ -30,10 +30,9 @@ modules.builtin *.lzma *.xz *.lzo -#*.patch +*.patch *.gcno *.pyc -*current_compute_partition # # Top-level generic files/folders @@ -121,4 +120,9 @@ _deps # ROCm files # Removes generated config headers like rocmsmi64Config.h & oamConfig.h # -*Config.h \ No newline at end of file +*Config.h + +# +# Fake SYSFS files +# +/device/* \ No newline at end of file diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index 7c8aeb1d48..c478c045b8 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -124,8 +124,10 @@ typedef enum { RSMI_STATUS_BUSY, //!< A resource or mutex could not be //!< acquired because it is already //!< being used - RSMI_STATUS_REFCOUNT_OVERFLOW, //!< An internal reference counter + RSMI_STATUS_REFCOUNT_OVERFLOW, //!< An internal reference counter //!< exceeded INT32_MAX + RSMI_STATUS_AMDGPU_RESTART_ERR, //!< Could not successfully restart + //!< the amdgpu driver RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rsmi_status_t; @@ -353,7 +355,8 @@ typedef rsmi_clk_type_t rsmi_clk_type; /// \endcond /** - * Compute Partition types + * @brief Compute Partition. This enum is used to identify + * various compute partitioning settings. */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, @@ -365,13 +368,37 @@ typedef enum { //!< together with shared memory RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs //!< work together with shared memory - RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + RSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs //!< work together with shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. 
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; /// \endcond +/** + * @brief NPS Modes. This enum is used to identify various + * NPS mode types. + */ +typedef enum { + RSMI_MEMORY_PARTITION_UNKNOWN = 0, + RSMI_MEMORY_PARTITION_NPS1, //!< NPS1 - All CCD & XCD data is interleaved + //!< across all 8 HBM stacks (all stacks/1). + RSMI_MEMORY_PARTITION_NPS2, //!< NPS2 - 2 sets of CCDs or 4 XCD interleaved + //!< across the 4 HBM stacks per AID pair + //!< (8 stacks/2). + RSMI_MEMORY_PARTITION_NPS4, //!< NPS4 - Each XCD data is interleaved + //!< across 2 (or single) HBM stacks + //!< (8 stacks/8 or 8 stacks/4). + RSMI_MEMORY_PARTITION_NPS8, //!< NPS8 - Each XCD uses a single HBM stack + //!< (8 stacks/8). Or each XCD uses a single + //!< HBM stack & CCDs share 2 non-interleaved + //!< HBM stacks on its AID + //!< (AID[1,2,3] = 6 stacks/6). +} rsmi_nps_mode_type_t; +/// \cond Ignore in docs. +typedef rsmi_nps_mode_type_t rsmi_nps_mode_type; +/// \endcond + /** * @brief Temperature Metrics. This enum is used to identify various * temperature metrics. Corresponding values will be in millidegress @@ -3512,7 +3539,7 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, * @param[inout] compute_partition a pointer to a char string variable, * which the device's current compute partition will be written to. 
* - * @param[in] len the length of the caller provided buffer @p compute_partition + * @param[in] len the length of the caller provided buffer @p compute_partition * * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid @@ -3537,7 +3564,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, * * @param[in] dv_ind a device index * - * @param[inout] compute_partition using enum ::rsmi_copmpute_partition_type_t, + * @param[in] compute_partition using enum ::rsmi_compute_partition_type_t, * define what the selected device's compute partition setting should be * updated to. * @@ -3554,6 +3581,69 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, /** @} */ // end of ComputePartition +/*****************************************************************************/ +/** @defgroup NPSMode NPS Mode Functions + * These functions are used to query the device's NPS mode (memory partition). + * @{ + */ + +/** + * @brief Retrieves the NPS mode (memory partition) for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p nps_mode , + * and uint32 @p len , this function will attempt to obtain the device's + * nps mode string. Upon successful retrieval, the obtained device's + * nps mode string shall be stored in the passed @p nps_mode char string + * variable. + * + * @param[in] dv_ind a device index + * + * @param[inout] nps_mode a pointer to a char string variable, + * which the device's nps mode will be written to. + * + * @param[in] len the length of the caller provided buffer @p nps_mode , + * suggested length is 5 or greater. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire nps mode value. In this case, + * only @p len bytes will be written. + * + */ +rsmi_status_t +rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode, uint32_t len); + +/** + * @brief Modifies a selected device's NPS mode (memory partition) setting. + * + * @details Given a device index @p dv_ind and a type of nps mode + * @p nps_mode, this function will attempt to update the selected + * device's nps mode setting. + * + * @param[in] dv_ind a device index + * + * @param[in] nps_mode using enum ::rsmi_nps_mode_type_t, + * define what the selected device's NPS mode setting should be updated to. 
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_PERMISSION function requires root access + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart + * the amdgpu driver + * + */ +rsmi_status_t +rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode); + +/** @} */ // end of NPSMode + /*****************************************************************************/ /** @defgroup APISupport Supported Functions * API function support varies by both GPU type and the version of the diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_common.h b/projects/amdsmi/include/rocm_smi/rocm_smi_common.h index dad39ad13f..fd124d3530 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_common.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_common.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018, Advanced Micro Devices, Inc. + * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -49,6 +49,7 @@ #include #include #include +#include #define CHECK_DV_IND_RANGE \ amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \ @@ -165,7 +166,9 @@ struct RocmSMI_env_vars { // The integer value of sysfs field enum that is to be over-ridden. // Env. variable RSMI_DEBUG_ENUM_OVERRIDE is used to specify this. - uint32_t enum_override; + // A set of enum overrides, RSMI_DEBUG_ENUM_OVERRIDE now supports + // comma delimited values. 
+ std::unordered_set enum_overrides; // Sysfs path overrides diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index d76ddfa4de..6b7b3baeca 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -162,7 +162,8 @@ enum DevInfoTypes { kDevNumaNode, kDevGpuMetrics, kDevGpuReset, - kDevComputePartition + kDevComputePartition, + kDevMemoryPartition }; typedef struct { @@ -215,6 +216,7 @@ class Device { void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, uint64_t sub_variant); + rsmi_status_t restartAMDGpuDriver(void); private: std::shared_ptr monitor_; diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_main.h b/projects/amdsmi/include/rocm_smi/rocm_smi_main.h index c6d5f077b9..74aad3668b 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_main.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_main.h @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2017, Advanced Micro Devices, Inc. + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. * All rights reserved. * * Developed by: @@ -113,6 +113,7 @@ class RocmSMI { uint64_t *weight); int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); + void printEnvVarInfo(void); static const std::map devInfoTypesStrings; private: diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index a8f4cfa848..527342d544 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -3,7 +3,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2018, Advanced Micro Devices, Inc. + * Copyright (c) 2018-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -76,6 +76,8 @@ int WriteSysfsStr(std::string path, std::string val); bool IsInteger(const std::string & n_str); +std::pair executeCommand(std::string command, bool stdOut = true); + rsmi_status_t handleException(); rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index e237c9da60..d235738703 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -18,6 +18,9 @@ import sys import subprocess import _thread import time +import multiprocessing +import trace +from io import StringIO from time import ctime from subprocess import check_output from rsmiBindings import * @@ -509,7 +512,7 @@ def printEventList(device, delay, eventList): data.message.decode('utf8') + '\r']]) -def printLog(device, metricName, value): +def printLog(device, metricName, value, extraSpace=False): """ Print out to the SMI log @param device: DRM device identifier @@ -530,7 +533,13 @@ def printLog(device, metricName, value): if device is None: logstr = logstr[13:] # Force thread safe printing - print(logstr + '\n', end='') + lock = multiprocessing.Lock() + lock.acquire() + if extraSpace: + print('\n' + logstr + '\n', end='', flush=True) + else: + print(logstr + '\n', end='', flush=True) + lock.release() def printListLog(metricName, valuesList): @@ -1336,6 +1345,76 @@ def setComputePartition(deviceList, computePartitionType): printLogSpacer() +def progressbar(it, prefix="", size=60, out=sys.stdout): + count = len(it) + def show(j): + x = int(size*j/count) + lock = multiprocessing.Lock() + lock.acquire() + print("{}[{}{}] {}/{} secs remain".format(prefix, u"█"*x, "."*(size-x), j, count), + end='\r', file=out, flush=True) + lock.release() + show(0) + for i, item in enumerate(it): + yield item + show(i+1) + lock = multiprocessing.Lock() + lock.acquire() + print("\n", flush=True, file=out) + lock.release() + +def 
showProgressbar(title="", timeInSeconds=13): + if title != "": + title += ": " + for i in progressbar(range(timeInSeconds), title, 40): + time.sleep(1) + + +def setNPSMode(deviceList, npsMode): + """ Sets nps mode (memory partition) for a list of devices + + @param deviceList: List of DRM devices (can be a single-item list) + @param npsMode: NPS Mode type to set as + """ + printLogSpacer(' Set nps mode to %s ' % (str(npsMode).upper())) + for device in deviceList: + npsMode = npsMode.upper() + if npsMode not in nps_mode_type_l: + printErrLog(device, 'Invalid nps mode type %s' + '\nValid nps mode types are %s' + % ( npsMode.upper(), + (', '.join(map(str, nps_mode_type_l))) )) + return (None, None) + + t1 = multiprocessing.Process(target=showProgressbar, + args=("Updating NPS mode",13,)) + t1.start() + addExtraLine=True + start=time.time() + ret = rocmsmi.rsmi_dev_nps_mode_set(device, + rsmi_nps_mode_type_dict[npsMode]) + stop=time.time() + duration=stop-start + if t1.is_alive(): + t1.terminate() + t1.join() + if duration < float(0.1): # For longer runs, add extra line before output + addExtraLine=False # This is to prevent overriding progress bar + + if rsmi_ret_ok(ret, device, silent=True): + printLog(device, + 'Successfully set nps mode to %s' % (npsMode), + None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_PERMISSION: + printLog(device, 'Permission denied', None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + printLog(device, 'Not supported on the given system', None, addExtraLine) + else: + rsmi_ret_ok(ret, device) + printErrLog(device, 'Failed to set NPS mode, even though device supports it.') + printLogSpacer() + + def showAllConcise(deviceList): """ Display critical info for all devices in a concise format @@ -2780,9 +2859,28 @@ def showComputePartition(deviceList): printLog(device, 'Not supported on the given system', None) else: rsmi_ret_ok(ret, device) - printErrLog(device, 'Failed to retrieve compute partition, even 
though device supports it.', None) + printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.') printLogSpacer() +def showNPSMode(deviceList): + """ Returns the current NPS mode for a list of devices + + @param deviceList: List of DRM devices (can be a single-item list) + """ + npsMode = create_string_buffer(256) + printLogSpacer(' Current NPS Mode ') + for device in deviceList: + ret = rocmsmi.rsmi_dev_nps_mode_get(device, npsMode, 256) + if rsmi_ret_ok(ret, device, silent=True) and npsMode.value.decode(): + printLog(device, 'NPS Mode', npsMode.value.decode()) + elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + printLog(device, 'Not supported on the given system', None) + else: + rsmi_ret_ok(ret, device) + printErrLog(device, 'Failed to retrieve NPS mode, even though device supports it.') + printLogSpacer() + + def checkAmdGpus(deviceList): """ Check if there are any AMD GPUs being queried, return False if there are none @@ -3130,6 +3228,7 @@ if __name__ == '__main__': action='store_true') groupDisplay.add_argument('--shownodesbw', help='Shows the numa nodes ', action='store_true') groupDisplay.add_argument('--showcomputepartition', help='Shows current compute partitioning ', action='store_true') + groupDisplay.add_argument('--shownpsmode', help='Shows current nps mode ', action='store_true') groupActionReset.add_argument('-r', '--resetclocks', help='Reset clocks and OverDrive to default', action='store_true') @@ -3176,8 +3275,10 @@ if __name__ == '__main__': metavar='SCLK', nargs=1) groupAction.add_argument('--setcomputepartition', help='Set compute partition', choices=compute_partition_type_l + [x.lower() for x in compute_partition_type_l], - type=str, nargs=1 - ) + type=str, nargs=1) + groupAction.add_argument('--setnpsmode', help='Set nps mode', + choices=nps_mode_type_l + [x.lower() for x in nps_mode_type_l], + type=str, nargs=1) groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error 
type', type=str, nargs=2, metavar=('BLOCK', 'ERRTYPE')) groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2, @@ -3215,7 +3316,8 @@ if __name__ == '__main__': or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \ or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ - args.setvc or args.setsrange or args.setmrange or args.setclock or args.setcomputepartition: + args.setvc or args.setsrange or args.setmrange or args.setclock or \ + args.setcomputepartition or args.setnpsmode: relaunchAsSudo() # If there is one or more device specified, use that for all commands, otherwise use a @@ -3278,6 +3380,7 @@ if __name__ == '__main__': args.showreplaycount = True args.showvc = True args.showcomputepartition = True + args.shownpsmode = True if not PRINT_JSON: args.showprofile = True @@ -3408,6 +3511,8 @@ if __name__ == '__main__': showEnergy(deviceList) if args.showcomputepartition: showComputePartition(deviceList) + if args.shownpsmode: + showNPSMode(deviceList) if args.setclock: setClocks(deviceList, args.setclock[0], [int(args.setclock[1])]) if args.setsclk: @@ -3448,6 +3553,8 @@ if __name__ == '__main__': setPerfDeterminism(deviceList, args.setperfdeterminism[0]) if args.setcomputepartition: setComputePartition(deviceList, args.setcomputepartition[0]) + if args.setnpsmode: + setNPSMode(deviceList, args.setnpsmode[0]) if args.resetprofile: resetProfile(deviceList) if args.resetxgmierr: diff --git a/projects/amdsmi/python_smi_tools/rsmiBindings.py b/projects/amdsmi/python_smi_tools/rsmiBindings.py index 4502b1c62b..8f70b63d90 100644 --- a/projects/amdsmi/python_smi_tools/rsmiBindings.py +++ b/projects/amdsmi/python_smi_tools/rsmiBindings.py @@ -66,6 +66,10 @@ class rsmi_status_t(c_int): RSMI_STATUS_INTERRUPT = 0xC 
RSMI_STATUS_UNEXPECTED_SIZE = 0xD RSMI_STATUS_NO_DATA = 0xE + RSMI_STATUS_UNEXPECTED_DATA = 0xF + RSMI_STATUS_BUSY = 0x10 + RSMI_STATUS_REFCOUNT_OVERFLOW = 0x11 + RSMI_STATUS_AMDGPU_RESTART_ERR = 0x12 RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF @@ -86,6 +90,10 @@ rsmi_status_verbose_err_out = { rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution', rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read', rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input', + rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: 'Unexpected data received', + rsmi_status_t.RSMI_STATUS_BUSY: 'Busy - resources are preventing call the ability to execute', + rsmi_status_t.RSMI_STATUS_REFCOUNT_OVERFLOW: 'Data overflow - data exceeded INT32_MAX', + rsmi_status_t.RSMI_STATUS_AMDGPU_RESTART_ERR: 'Could not successfully restart the amdgpu driver', rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured' } @@ -606,4 +614,26 @@ rsmi_compute_partition_type = rsmi_compute_partition_type_t # Usage example to get corresponding names: # compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX] # will return string 'CPX' -compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX'] \ No newline at end of file +compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX'] + +class rsmi_nps_mode_type_t(c_int): + RSMI_MEMORY_PARTITION_UNKNOWN = 0 + RSMI_MEMORY_PARTITION_NPS1 = 1 + RSMI_MEMORY_PARTITION_NPS2 = 2 + RSMI_MEMORY_PARTITION_NPS4 = 3 + RSMI_MEMORY_PARTITION_NPS8 = 4 + +rsmi_nps_mode_type_dict = { + 'NPS1': 1, + 'NPS2': 2, + 'NPS4': 3, + 'NPS8': 4 +} + +rsmi_nps_mode_type = rsmi_nps_mode_type_t + +# nps_mode_type_l includes string names for the rsmi_nps_mode_type_t +# Usage example to get corresponding names (list is 0-indexed, enum starts at 1): +# nps_mode_type_l[rsmi_nps_mode_type_t.RSMI_MEMORY_PARTITION_NPS2 - 1] +# will return string 'NPS2' +nps_mode_type_l = ['NPS1', 'NPS2', 'NPS4', 'NPS8'] \ No newline at end of file diff --git 
a/projects/amdsmi/rocm_smi/docs/ROCm_SMI_Manual.pdf b/projects/amdsmi/rocm_smi/docs/ROCm_SMI_Manual.pdf index ac05a506e8..05cf23122e 100644 Binary files a/projects/amdsmi/rocm_smi/docs/ROCm_SMI_Manual.pdf and b/projects/amdsmi/rocm_smi/docs/ROCm_SMI_Manual.pdf differ diff --git a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc index 58623c7c14..a5001a47ef 100755 --- a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc +++ b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc @@ -100,6 +100,33 @@ } \ } +#define CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(RET) { \ + if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ + std::cout << "This function is not supported in the current environment." \ + << std::endl; \ + } else if ((RET) == RSMI_STATUS_UNEXPECTED_DATA) { \ + std::cout << "[ERROR] RSMI_STATUS_UNEXPECTED_DATA retrieved." \ + << std::endl; \ + } else { \ + CHK_RSMI_RET(RET) \ + } \ +} + +#define CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(RET) { \ + if ((RET) == RSMI_STATUS_NOT_SUPPORTED) { \ + std::cout << "This function is not supported in the current environment." \ + << std::endl; \ + } else if ((RET) == RSMI_STATUS_UNEXPECTED_DATA) { \ + std::cout << "[WARN] RSMI_STATUS_UNEXPECTED_DATA retrieved." \ + << std::endl; \ + } else if ((RET) == RSMI_STATUS_INSUFFICIENT_SIZE) { \ + std::cout << "[WARN] RSMI_STATUS_INSUFFICIENT_SIZE retrieved." 
\ + << std::endl; \ + } else { \ + CHK_RSMI_RET(RET) \ + } \ +} + static void print_test_header(const char *str, uint32_t dv_ind) { std::cout << "********************************" << std::endl; std::cout << "*** " << str << std::endl; @@ -158,6 +185,30 @@ mapStringToRSMIComputePartitionTypes { {"QPX", RSMI_COMPUTE_PARTITION_QPX} }; +static const std::string +nps_mode_string(rsmi_nps_mode_type_t partition) { + switch (partition) { + case RSMI_MEMORY_PARTITION_NPS1: + return "NPS1"; + case RSMI_MEMORY_PARTITION_NPS2: + return "NPS2"; + case RSMI_MEMORY_PARTITION_NPS4: + return "NPS4"; + case RSMI_MEMORY_PARTITION_NPS8: + return "NPS8"; + default: + return "UNKNOWN"; + } +} + +static std::map +mapStringToRSMINpsModeTypes { + {"NPS1", RSMI_MEMORY_PARTITION_NPS1}, + {"NPS2", RSMI_MEMORY_PARTITION_NPS2}, + {"NPS4", RSMI_MEMORY_PARTITION_NPS4}, + {"NPS8", RSMI_MEMORY_PARTITION_NPS8} +}; + static const char * perf_level_string(rsmi_dev_perf_level_t perf_lvl) { switch (perf_lvl) { @@ -184,7 +235,7 @@ static bool isUserRunningAsSudo() { return isRunningWithSudo; } -bool isFileWritable(rsmi_status_t response) { +static bool isFileWritable(rsmi_status_t response) { // Clock files may not be writable, causing sets to // return RSMI_STATUS_PERMISSION. If running as sudo, // this means file is not writable. 
@@ -492,35 +543,23 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { rsmi_status_t ret; uint32_t buffer_len = 10; char originalComputePartition[buffer_len]; + originalComputePartition[0] = '\0'; print_test_header("Compute Partitioning Control", dv_ind); - /** - typedef enum { - RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory - } rsmi_compute_partition_type_t; - */ - ret = rsmi_dev_compute_partition_get(dv_ind, originalComputePartition, buffer_len); - CHK_RSMI_NOT_SUPPORTED_RET(ret) + + ret = rsmi_dev_compute_partition_get(dv_ind, originalComputePartition, + buffer_len); + CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) if (ret == RSMI_STATUS_NOT_SUPPORTED) { - std::cout << "Device does not support the compute partition feature." - << std::endl; - std::cout << "*********************************************" << std::endl; return RSMI_STATUS_SUCCESS; - } else { - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "Original compute partition is " << originalComputePartition - << "." << std::endl; } + std::cout << "Original Compute Partition: " + << (((originalComputePartition == nullptr) + || ((originalComputePartition != nullptr) + && (originalComputePartition[0] == '\0'))) + ? 
"UNKNOWN" : originalComputePartition) + << std::endl << std::endl; + for (int newComputePartition = RSMI_COMPUTE_PARTITION_CPX; newComputePartition <= RSMI_COMPUTE_PARTITION_QPX; newComputePartition++) { @@ -550,6 +589,59 @@ static rsmi_status_t test_set_compute_partitioning(uint32_t dv_ind) { return RSMI_STATUS_SUCCESS; } +static rsmi_status_t test_set_nps_mode(uint32_t dv_ind) { + rsmi_status_t ret; + uint32_t buffer_len = 10; + char originalNpsMode[buffer_len]; + originalNpsMode[0] = '\0'; + print_test_header("NPS Mode Control", dv_ind); + + ret = rsmi_dev_nps_mode_get(dv_ind, originalNpsMode, buffer_len); + CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + return RSMI_STATUS_SUCCESS; + } + + std::cout << "Original NPS Mode: " + << (((originalNpsMode == nullptr) + || ((originalNpsMode != nullptr) + && (originalNpsMode[0] == '\0'))) + ? "UNKNOWN" : originalNpsMode) + << std::endl << std::endl; + + for (int newNpsMode = RSMI_MEMORY_PARTITION_NPS1; + newNpsMode <= RSMI_MEMORY_PARTITION_NPS8; + newNpsMode++) { + rsmi_nps_mode_type_t newMemoryPartition + = static_cast(newNpsMode); + std::cout << "Attempting to set NPS mode to " + << nps_mode_string(newMemoryPartition) << "..." + << std::endl; + ret = rsmi_dev_nps_mode_set(dv_ind, newMemoryPartition); + CHK_RSMI_NOT_SUPPORTED_RET(ret) + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + // do not continue attempting to set, device does not support setting + return RSMI_STATUS_SUCCESS; + } + std::cout << "Done setting NPS mode to " + << nps_mode_string(newMemoryPartition) + << "." << std::endl; + std::cout << std::endl << std::endl; + } + + std::string myNpsMode = originalNpsMode; + if (myNpsMode.empty() == false) { + std::cout << "Resetting NPS mode to " << originalNpsMode + << "... 
" << std::endl; + rsmi_nps_mode_type_t origNpsModeType + = mapStringToRSMINpsModeTypes[originalNpsMode]; + ret = rsmi_dev_nps_mode_set(dv_ind, origNpsModeType); // restore, then check + CHK_RSMI_NOT_SUPPORTED_RET(ret) + std::cout << "Done" << std::endl; + } + return RSMI_STATUS_SUCCESS; +} + int main() { rsmi_status_t ret; @@ -577,10 +669,31 @@ int main() { << "rsmi_dev_compute_partition_get()..." << std::endl; char current_compute_partition[256]; + current_compute_partition[0] = '\0'; ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256); - CHK_RSMI_NOT_SUPPORTED_RET(ret) - std::cout << "\t**Current Compute Partition setting: " - << current_compute_partition << std::endl; + CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) + std::cout << "\t**Current Compute Partition: " + << (((current_compute_partition == nullptr) + || ((current_compute_partition != nullptr) + && (current_compute_partition[0] == '\0'))) + ? "UNKNOWN" : current_compute_partition) + << std::endl; + + std::cout << std::endl << std::endl; + std::cout << "Starting to call " + << "rsmi_dev_nps_mode_get()..." + << std::endl; + uint32_t len = 5; + char nps_mode[len]; + nps_mode[0] = '\0'; + ret = rsmi_dev_nps_mode_get(i, nps_mode, len); + CHK_NOT_SUPPORTED_OR_UNEXPECTED_DATA_OR_INSUFFICIENT_SIZE_RET(ret) + std::cout << "\t**NPS Mode: " + << (((nps_mode == nullptr) + || ((nps_mode != nullptr) + && (nps_mode[0] == '\0'))) + ? 
"UNKNOWN" : nps_mode) + << std::endl; ret = rsmi_dev_gpu_metrics_info_get(i, &p); CHK_AND_PRINT_RSMI_ERR_RET(ret) @@ -672,6 +785,9 @@ int main() { ret = test_set_freq(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) + + ret = test_set_nps_mode(i); + CHK_AND_PRINT_RSMI_ERR_RET(ret) } return 0; diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index e0f102b4a6..645e1f6e6d 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -62,6 +62,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi.h" @@ -1696,6 +1697,7 @@ mapStringToRSMIComputePartitionTypes { std::map mapRSMIToStringComputePartitionTypes { + {RSMI_COMPUTE_PARTITION_INVALID, "UNKNOWN"}, {RSMI_COMPUTE_PARTITION_CPX, "CPX"}, {RSMI_COMPUTE_PARTITION_SPX, "SPX"}, {RSMI_COMPUTE_PARTITION_DPX, "DPX"}, @@ -1703,6 +1705,23 @@ mapRSMIToStringComputePartitionTypes { {RSMI_COMPUTE_PARTITION_QPX, "QPX"} }; +std::map +mapRSMIToStringNPSModeTypes { + {RSMI_MEMORY_PARTITION_UNKNOWN, "UNKNOWN"}, + {RSMI_MEMORY_PARTITION_NPS1, "NPS1"}, + {RSMI_MEMORY_PARTITION_NPS2, "NPS2"}, + {RSMI_MEMORY_PARTITION_NPS4, "NPS4"}, + {RSMI_MEMORY_PARTITION_NPS8, "NPS8"} +}; + +std::map +mapStringToNPSModeTypes { + {"NPS1", RSMI_MEMORY_PARTITION_NPS1}, + {"NPS2", RSMI_MEMORY_PARTITION_NPS2}, + {"NPS4", RSMI_MEMORY_PARTITION_NPS4}, + {"NPS8", RSMI_MEMORY_PARTITION_NPS8} +}; + static std::string get_id_name_str_from_line(uint64_t id, std::string ln, std::istringstream *ln_str) { @@ -2780,71 +2799,84 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) { break; case RSMI_STATUS_OUT_OF_RESOURCES: - *status_string = "Unable to acquire memory or other resource"; + *status_string = "RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire memory " + "or other resource"; break; case RSMI_STATUS_INTERNAL_EXCEPTION: - *status_string = "An internal exception was caught"; + *status_string = 
"RSMI_STATUS_INTERNAL_EXCEPTION: An internal exception " + "was caught"; break; case RSMI_STATUS_INPUT_OUT_OF_BOUNDS: - *status_string = "The provided input is out of allowable or safe range"; + *status_string = "RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided input is " + "out of allowable or safe range"; break; case RSMI_STATUS_INIT_ERROR: - *status_string = "An error occurred during initialization, during " - "monitor discovery or when when initializing internal data structures"; + *status_string = "RSMI_STATUS_INIT_ERROR: An error occurred during " + "initialization, during monitor discovery or when when " + "initializing internal data structures"; break; case RSMI_STATUS_NOT_YET_IMPLEMENTED: - *status_string = "The called function has not been implemented in this " - "system for this device type"; + *status_string = "RSMI_STATUS_NOT_YET_IMPLEMENTED: The called function " + "has not been implemented in this system for this " + "device type"; break; case RSMI_STATUS_NOT_FOUND: - *status_string = "An item required to complete the call was not found"; + *status_string = "RSMI_STATUS_NOT_FOUND: An item required to complete " + "the call was not found"; break; case RSMI_STATUS_INSUFFICIENT_SIZE: - *status_string = "Not enough resources were available to fully execute" - " the call"; + *status_string = "RSMI_STATUS_INSUFFICIENT_SIZE: Not enough resources " + "were available to fully execute the call"; break; case RSMI_STATUS_INTERRUPT: - *status_string = "An interrupt occurred while executing the function"; + *status_string = "RSMI_STATUS_INTERRUPT: An interrupt occurred while " + "executing the function"; break; case RSMI_STATUS_UNEXPECTED_SIZE: - *status_string = "Data (usually from reading a file) was out of" - " range from what was expected"; + *status_string = "RSMI_STATUS_UNEXPECTED_SIZE: Data (usually from reading" + " a file) was out of range from what was expected"; break; case RSMI_STATUS_NO_DATA: - *status_string = "No data was found (usually from reading a 
file) " - "where data was expected"; + *status_string = "RSMI_STATUS_NO_DATA: No data was found (usually from " + "reading a file) where data was expected"; break; case RSMI_STATUS_UNEXPECTED_DATA: - *status_string = "Data (usually from reading a file) was not of the " - "type that was expected"; + *status_string = "RSMI_STATUS_UNEXPECTED_DATA: Data (usually from reading" + " a file) was not of the type that was expected"; break; case RSMI_STATUS_BUSY: - *status_string = "A resource or mutex could not be acquired " - "because it is already being used"; + *status_string = "RSMI_STATUS_BUSY: A resource or mutex could not be " + "acquired because it is already being used"; break; case RSMI_STATUS_REFCOUNT_OVERFLOW: - *status_string = "An internal reference counter exceeded INT32_MAX"; + *status_string = "RSMI_STATUS_REFCOUNT_OVERFLOW: An internal reference " + "counter exceeded INT32_MAX"; + break; + + case RSMI_STATUS_AMDGPU_RESTART_ERR: + *status_string = "RSMI_STATUS_AMDGPU_RESTART_ERR: Could not successfully " + "restart the amdgpu driver"; break; case RSMI_STATUS_UNKNOWN_ERROR: - *status_string = "An unknown error prevented the call from completing" - " successfully"; + *status_string = "RSMI_STATUS_UNKNOWN_ERROR: An unknown error prevented " + "the call from completing successfully"; break; default: - *status_string = "An unknown error occurred"; + *status_string = "RSMI_STATUS_UNKNOWN_ERROR: An unknown error occurred"; return RSMI_STATUS_UNKNOWN_ERROR; } return RSMI_STATUS_SUCCESS; @@ -3718,12 +3750,8 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, static rsmi_status_t get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { TRY - std::string val_str; - - if (compute_partition.c_str() == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } CHK_SUPPORT_NAME_ONLY(compute_partition.c_str()) + std::string val_str; DEVICE_MUTEX rsmi_status_t ret = get_dev_value_str(amd::smi::kDevComputePartition, @@ -3811,14 +3839,18 @@ 
rsmi_dev_compute_partition_set(uint32_t dv_ind,
   }
 
   // do nothing if compute_partition is the current compute partition
-  get_compute_partition(dv_ind, currentComputePartition);
+  rsmi_status_t ret_get = get_compute_partition(dv_ind, currentComputePartition);
+  // we can try to set, even if we get unexpected data
+  if (ret_get != RSMI_STATUS_SUCCESS
+      && ret_get != RSMI_STATUS_UNEXPECTED_DATA) {
+    return ret_get;
+  }
   rsmi_compute_partition_type_t currRSMIComputePartition
     = mapStringToRSMIComputePartitionTypes[currentComputePartition];
   if (currRSMIComputePartition == compute_partition) {
     return RSMI_STATUS_SUCCESS;
   }
 
-
   newComputePartitionStr = mapRSMIToStringComputePartitionTypes[compute_partition];
   GET_DEV_FROM_INDX
   int ret = dev->writeDevInfo(amd::smi::kDevComputePartition, newComputePartitionStr);
@@ -3826,6 +3858,143 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
   CATCH
 }
 
+// Reads the device's memory partition (NPS mode) string into nps_mode.
+// Returns RSMI_STATUS_UNEXPECTED_DATA when the value read is not NPS1/2/4/8.
+static rsmi_status_t get_nps_mode(uint32_t dv_ind, std::string &nps_mode) {
+  TRY
+  CHK_SUPPORT_NAME_ONLY(nps_mode.c_str())
+  std::string val_str;
+
+  DEVICE_MUTEX
+  rsmi_status_t ret = get_dev_value_str(amd::smi::kDevMemoryPartition,
+                                        dv_ind, &val_str);
+
+  if (ret != RSMI_STATUS_SUCCESS) {
+    return ret;
+  }
+
+  switch (mapStringToNPSModeTypes[val_str]) {
+    case RSMI_MEMORY_PARTITION_UNKNOWN:
+      // Retrieved an unknown NPS mode
+      return RSMI_STATUS_UNEXPECTED_DATA;
+    case RSMI_MEMORY_PARTITION_NPS1:
+      break;
+    case RSMI_MEMORY_PARTITION_NPS2:
+      break;
+    case RSMI_MEMORY_PARTITION_NPS4:
+      break;
+    case RSMI_MEMORY_PARTITION_NPS8:
+      break;
+    default:
+      // Retrieved an unknown NPS mode
+      return RSMI_STATUS_UNEXPECTED_DATA;
+  }
+  nps_mode = val_str;
+  return RSMI_STATUS_SUCCESS;
+  CATCH
+}
+
+// Writes the requested NPS mode and then restarts the amdgpu driver.
+// Returns RSMI_STATUS_NOT_SUPPORTED unless the board name contains both "mi"
+// and "00x"; returns RSMI_STATUS_SUCCESS early if the mode is already current.
+rsmi_status_t
+rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) {
+  TRY
+  REQUIRE_ROOT_ACCESS
+  DEVICE_MUTEX
+  bool isCorrectDevice = false;
+  char boardName[128];
+  boardName[0] = '\0';
+  // rsmi_dev_nps_mode_set is only available for discrete variant,
+  // others are required to update through bios settings
+  rsmi_dev_name_get(dv_ind, boardName, 128);
+  std::string myBoardName = boardName;
+  if (!myBoardName.empty()) {
+    std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(),
+                   ::tolower);
+    if (myBoardName.find("mi") != std::string::npos &&
+        myBoardName.find("00x") != std::string::npos) {
+      isCorrectDevice = true;
+    }
+  }
+
+  if (isCorrectDevice == false) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  std::string newNPSMode
+    = mapRSMIToStringNPSModeTypes[nps_mode];
+  std::string currentNPSMode;
+
+  switch (nps_mode) {
+    case RSMI_MEMORY_PARTITION_UNKNOWN:
+      // Retrieved an unknown NPS mode
+      return RSMI_STATUS_INVALID_ARGS;
+    case RSMI_MEMORY_PARTITION_NPS1:
+      break;
+    case RSMI_MEMORY_PARTITION_NPS2:
+      break;
+    case RSMI_MEMORY_PARTITION_NPS4:
+      break;
+    case RSMI_MEMORY_PARTITION_NPS8:
+      break;
+    default:
+      return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  // do nothing if nps_mode is the current NPS mode
+  rsmi_status_t ret_get = get_nps_mode(dv_ind, currentNPSMode);
+  // we can try to set, even if we get unexpected data
+  if (ret_get != RSMI_STATUS_SUCCESS
+      && ret_get != RSMI_STATUS_UNEXPECTED_DATA) {
+    return ret_get;
+  }
+  rsmi_nps_mode_type_t currRSMINpsMode
+    = mapStringToNPSModeTypes[currentNPSMode];
+  if (currRSMINpsMode == nps_mode) {
+    return RSMI_STATUS_SUCCESS;
+  }
+
+  GET_DEV_FROM_INDX
+  int ret = dev->writeDevInfo(amd::smi::kDevMemoryPartition, newNPSMode);
+
+  if (amd::smi::ErrnoToRsmiStatus(ret) != RSMI_STATUS_SUCCESS) {
+    return amd::smi::ErrnoToRsmiStatus(ret);
+  }
+
+  return dev->restartAMDGpuDriver();
+  CATCH
+}
+
+// Copies the current NPS mode into the caller's buffer as a C string.
+// Output is truncated to len - 1 characters; RSMI_STATUS_INSUFFICIENT_SIZE is
+// returned when the buffer cannot hold the whole string plus its terminator.
+rsmi_status_t
+rsmi_dev_nps_mode_get(uint32_t dv_ind, char *nps_mode,
+                      uint32_t len) {
+  CHK_SUPPORT_NAME_ONLY(nps_mode)
+  if ((len == 0) || (nps_mode == nullptr)) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  TRY
+  std::string returning_nps_mode;
+  rsmi_status_t ret = get_nps_mode(dv_ind,
+                                   returning_nps_mode);
+
+  if (ret != RSMI_STATUS_SUCCESS) { return ret; }
+
+  // copy at most len - 1 characters so the terminator always fits in nps_mode
+  std::size_t length = returning_nps_mode.copy(nps_mode, len - 1);
+  nps_mode[length] = '\0';
+
+  if (len < (returning_nps_mode.size() + 1)) {
+    return RSMI_STATUS_INSUFFICIENT_SIZE;
+  }
+  return ret;
+  CATCH
+}
+
 enum iterator_handle_type {
   FUNC_ITER = 0,
   VARIANT_ITER,
diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc
index 4421599c13..3c9fba6287 100755
--- a/projects/amdsmi/src/rocm_smi_device.cc
+++ b/projects/amdsmi/src/rocm_smi_device.cc
@@ -122,6 +122,7 @@ static const char *kDevSerialNumberFName = "serial_number";
 static const char *kDevNumaNodeFName = "numa_node";
 static const char *kDevGpuMetricsFName = "gpu_metrics";
 static const char *kDevComputePartitionFName = "current_compute_partition";
+static const char *kDevMemoryPartitionFName = "current_memory_partition";
 
 // Firmware version files
 static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version";
@@ -292,6 +293,7 @@ static const std::map kDevAttribNameMap = {
     {kDevGpuMetrics, kDevGpuMetricsFName},
     {kDevGpuReset, kDevGpuResetFName},
     {kDevComputePartition, kDevComputePartitionFName},
+    {kDevMemoryPartition, kDevMemoryPartitionFName},
 };
 
 static const std::map kDevPerfLvlMap = {
@@ -417,6 +419,8 @@ static const std::map kDevFuncDependsMap = {
   {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}},
   {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}},
   {"rsmi_dev_compute_partition_set", {{kDevComputePartitionFName}, {}}},
+  {"rsmi_dev_memory_partition_get", {{kDevMemoryPartitionFName}, {}}},
+  {"rsmi_dev_memory_partition_set", {{kDevMemoryPartitionFName}, {}}},
 
   // These functions with variants, but no sensors/units. (May or may not
   // have mandatory dependencies.)
@@ -564,9 +568,9 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
   auto sysfs_path = path_;
 
 #ifdef DEBUG
-  if (env_->path_DRM_root_override && type == env_->enum_override) {
+  if (env_->path_DRM_root_override
+      && (env_->enum_overrides.find(type) != env_->enum_overrides.end())) {
     sysfs_path = env_->path_DRM_root_override;
-
   }
 #endif
 
@@ -698,6 +702,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) {
     case kDevPowerODVoltage:
     case kDevSOCClk:
     case kDevComputePartition:
+    case kDevMemoryPartition:
       return writeDevInfoStr(type, val);
 
     default:
@@ -925,6 +930,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
     case kDevPCIEThruPut:
     case kDevSerialNumber:
     case kDevComputePartition:
+    case kDevMemoryPartition:
       return readDevInfoStr(type, val);
       break;
 
@@ -1102,6 +1108,48 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant,
   return false;
 }
 
+// Restarts the amdgpu kernel module (modprobe -r / modprobe), stopping gdm
+// first and starting it again afterwards when it was active.
+rsmi_status_t Device::restartAMDGpuDriver(void) {
+  REQUIRE_ROOT_ACCESS
+  bool restartSuccessful = true;
+  bool success = false;
+  std::string out = "";
+  bool wasGdmServiceActive = false;
+
+  // sudo systemctl is-active gdm
+  // we do not care about the success of checking if gdm is active
+  std::tie(success, out) = executeCommand("systemctl is-active gdm");
+  // only a probe that reported "active" can affect the overall result
+  if (out == "active") { restartSuccessful &= success; }
+
+  // if gdm is active -> sudo systemctl stop gdm
+  // TODO: are there other display managers we need to take into account?
+  // see https://en.wikipedia.org/wiki/GNOME_Display_Manager
+  if (success && (out == "active")) {
+    wasGdmServiceActive = true;
+    std::tie(success, out) = executeCommand("systemctl stop gdm&", false);
+    restartSuccessful &= success;
+  }
+
+  // NOTE(review): these commands end in '&', so the shell backgrounds them;
+  // the status seen by executeCommand() is presumably the shell's - confirm.
+  // sudo modprobe -r amdgpu
+  // sudo modprobe amdgpu
+  std::tie(success, out) =
+    executeCommand("modprobe -r amdgpu && modprobe amdgpu&", false);
+  restartSuccessful &= success;
+
+  // if gdm was active -> sudo systemctl start gdm
+  if (wasGdmServiceActive) {
+    std::tie(success, out) = executeCommand("systemctl start gdm&", false);
+    restartSuccessful &= success;
+  }
+
+  return (restartSuccessful ? RSMI_STATUS_SUCCESS :
+                              RSMI_STATUS_AMDGPU_RESTART_ERR);
+}
+
 #undef RET_IF_NONZERO
 }  // namespace smi
 }  // namespace amd
diff --git a/projects/amdsmi/src/rocm_smi_main.cc b/projects/amdsmi/src/rocm_smi_main.cc
index f95ad528d6..ba3649f66b 100755
--- a/projects/amdsmi/src/rocm_smi_main.cc
+++ b/projects/amdsmi/src/rocm_smi_main.cc
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 
 #include "rocm_smi/rocm_smi.h"
 #include "rocm_smi/rocm_smi_device.h"
@@ -141,7 +142,8 @@ const std::map amd::smi::RocmSMI::devInfoTy
   {amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"},
   {amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"},
   {amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"},
-  {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"}
+  {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"},
+  {amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"}
 };
 
 namespace amd {
@@ -305,6 +307,8 @@ RocmSMI::Initialize(uint64_t flags) {
   euid_ = geteuid();
 
   GetEnvVariables();
+  // To help debug env variable issues
+  // printEnvVarInfo();
 
   while (env_vars_.debug_inf_loop) {}
 
@@ -429,6 +433,31 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) {
   return 0;
 }
 
+static std::unordered_set<uint32_t> GetEnvVarUIntegerSets(const char *ev_str) {
+  std::unordered_set<uint32_t> returnSet;
+#ifndef DEBUG
+  (void)ev_str;
+#else
+  ev_str = getenv(ev_str);
+  
if(ev_str == nullptr) { return returnSet; }
+  std::string stringEnv = ev_str;
+
+  if (stringEnv.empty() == false) {
+    // parse out values by commas
+    std::string parsedVal;
+    std::istringstream ev_str_ss(stringEnv);
+
+    while (std::getline(ev_str_ss, parsedVal, ',')) {
+      int parsedInt = std::stoi(parsedVal);
+      assert(parsedInt >= 0);
+      uint32_t parsedUInt = static_cast<uint32_t>(parsedInt);
+      returnSet.insert(parsedUInt);
+    }
+  }
+#endif
+  return returnSet;
+}
+
 // Get and store env. variables in this method
 void RocmSMI::GetEnvVariables(void) {
 #ifndef DEBUG
@@ -437,15 +466,15 @@ void RocmSMI::GetEnvVariables(void) {
   env_vars_.path_DRM_root_override = nullptr;
   env_vars_.path_HWMon_root_override = nullptr;
   env_vars_.path_power_root_override = nullptr;
-  env_vars_.enum_override = 0;
   env_vars_.debug_inf_loop = 0;
+  env_vars_.enum_overrides.clear();
 #else
   env_vars_.debug_output_bitfield = GetEnvVarUInteger("RSMI_DEBUG_BITFIELD");
   env_vars_.path_DRM_root_override = getenv("RSMI_DEBUG_DRM_ROOT_OVERRIDE");
   env_vars_.path_HWMon_root_override = getenv("RSMI_DEBUG_HWMON_ROOT_OVERRIDE");
   env_vars_.path_power_root_override = getenv("RSMI_DEBUG_PP_ROOT_OVERRIDE");
-  env_vars_.enum_override = GetEnvVarUInteger("RSMI_DEBUG_ENUM_OVERRIDE");
   env_vars_.debug_inf_loop = GetEnvVarUInteger("RSMI_DEBUG_INFINITE_LOOP");
+  env_vars_.enum_overrides = GetEnvVarUIntegerSets("RSMI_DEBUG_ENUM_OVERRIDE");
 #endif
 }
 
@@ -453,6 +482,44 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) {
   return env_vars_;
 }
 
+// Debug helper: prints every cached env_vars_ field to stdout, one per line.
+void RocmSMI::printEnvVarInfo(void) {
+  std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = "
+            << ((env_vars_.debug_output_bitfield == 0) ? ""
+                : std::to_string(env_vars_.debug_output_bitfield))
+            << std::endl;
+  std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = "
+            << ((env_vars_.path_DRM_root_override == nullptr)
+                ? "" : env_vars_.path_DRM_root_override)
+            << std::endl;
+  std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = "
+            << ((env_vars_.path_HWMon_root_override == nullptr)
+                ? "" : env_vars_.path_HWMon_root_override)
+            << std::endl;
+  std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = "
+            << ((env_vars_.path_power_root_override == nullptr)
+                ? "" : env_vars_.path_power_root_override)
+            << std::endl;
+  std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = "
+            << ((env_vars_.debug_inf_loop == 0) ? ""
+                : std::to_string(env_vars_.debug_inf_loop))
+            << std::endl;
+  std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
+  if (env_vars_.enum_overrides.empty()) {
+    std::cout << "}" << std::endl;
+    return;
+  }
+  for (auto it=env_vars_.enum_overrides.begin();
+       it != env_vars_.enum_overrides.end(); ++it) {
+    std::cout << *it;
+    auto temp_it = it;
+    if(++temp_it != env_vars_.enum_overrides.end()) {
+      std::cout << ",";
+    }
+  }
+  std::cout << "}" << std::endl;
+}
+
 std::shared_ptr
 RocmSMI::FindMonitor(std::string monitor_path) {
   std::string tmp;
diff --git a/projects/amdsmi/src/rocm_smi_utils.cc b/projects/amdsmi/src/rocm_smi_utils.cc
index 473d1e2b67..8c9fe39567 100755
--- a/projects/amdsmi/src/rocm_smi_utils.cc
+++ b/projects/amdsmi/src/rocm_smi_utils.cc
@@ -3,7 +3,7 @@
  * The University of Illinois/NCSA
  * Open Source License (NCSA)
  *
- * Copyright (c) 2018, Advanced Micro Devices, Inc.
+ * Copyright (c) 2018-2023, Advanced Micro Devices, Inc.
  * All rights reserved.
*
 * Developed by:
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <regex>
 
 #include "rocm_smi/rocm_smi.h"
 #include "rocm_smi/rocm_smi_utils.h"
@@ -234,5 +235,73 @@ rsmi_status_t ErrnoToRsmiStatus(int err) {
   }
 }
 
+// Strips leading whitespace.
+std::string leftTrim(const std::string &s) {
+  if (!s.empty()) {
+    return std::regex_replace(s, std::regex("^\\s+"), "");
+  }
+  return s;
+}
+
+// Strips trailing whitespace.
+std::string rightTrim(const std::string &s) {
+  if (!s.empty()) {
+    return std::regex_replace(s, std::regex("\\s+$"), "");
+  }
+  return s;
+}
+
+// Removes every newline character.
+std::string removeNewLines(const std::string &s) {
+  if (!s.empty()) {
+    return std::regex_replace(s, std::regex("\n+"), "");
+  }
+  return s;
+}
+
+// Removes newlines, then strips whitespace from both ends.
+std::string trim(const std::string &s) {
+  if (!s.empty()) {
+    // remove new lines -> trim white space at ends
+    std::string noNewLines = removeNewLines(s);
+    return leftTrim(rightTrim(noNewLines));
+  }
+  return s;
+}
+
+// Runs `command` through popen(), prefixed with stdbuf to disable buffering,
+// and returns {success, captured output}. success is false when popen()
+// fails or pclose() reports a non-zero status. When stdOut is true the
+// captured output is passed through trim().
+// defaults to trim stdOut
+std::pair<bool, std::string> executeCommand(std::string command, bool stdOut) {
+  char buffer[128];
+  std::string stdoutAndErr = "";
+  bool successfulRun = true;
+  command = "stdbuf -i0 -o0 -e0 " + command;  // remove stdOut and err buffering
+
+  FILE *pipe = popen(command.c_str(), "r");
+  if (!pipe) {
+    stdoutAndErr = "[ERROR] popen failed to call " + command;
+    successfulRun = false;
+  } else {
+    // read until end of process; fgets() returns nullptr at EOF or on error
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+      stdoutAndErr += buffer;
+    }
+    // any return code other than 0, is a failed execution; pclose() is only
+    // called on a stream popen() actually opened
+    if (pclose(pipe) != 0) {
+      successfulRun = false;
+    }
+  }
+
+  if (stdOut) {
+    // remove leading and trailing spaces of output and new lines
+    stdoutAndErr = trim(stdoutAndErr);
+  }
+  return std::make_pair(successfulRun, stdoutAndErr);
+}
+
 }  // namespace smi
 }  // namespace amd
diff --git a/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt b/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt
index e7321d0b97..8aaacb9e91 100755
--- 
a/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt +++ b/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt @@ -49,6 +49,24 @@ endif() set(RSMI_INC_DIR ${ROCM_DIR}/include) set(RSMI_LIB_DIR ${ROCM_DIR}/lib) + + +message("") +message("Google Test Configuration init:") +message("-----------ROCM_DIR: " ${ROCM_DIR}) +message("-----------GOOGLE_TEST_FRWK_NAME: " ${GOOGLE_TEST_FRWK_NAME}) +message("-----------RSMITST: " ${RSMITST}) +message("-----------RSMITST_ROOT: " ${RSMITST_ROOT}) +message("-----------RSMITST_LIBS: " ${RSMITST_LIBS}) +message("-----------PROJECT_BINARY_DIR: " ${PROJECT_BINARY_DIR}) +message("-----------RSMI_LIB_DIR: " ${RSMI_LIB_DIR}) +message("-----------GTEST_LIB_DIR: " ${GTEST_LIB_DIR}) +message("-----------RSMI_INC_DIR: " ${RSMI_INC_DIR}) +message("-----------rsmitstSources: " ${rsmitstSources}) +message("-----------functionalSources: " ${functionalSources}) +message("") + + # # Determine RSMI Header files are present # (no external source dependencies) @@ -76,6 +94,21 @@ else() endif() endif() +message("") +message("Google Test Configuration (after lib check):") +message("-----------ROCM_DIR: " ${ROCM_DIR}) +message("-----------GOOGLE_TEST_FRWK_NAME: " ${GOOGLE_TEST_FRWK_NAME}) +message("-----------RSMITST: " ${RSMITST}) +message("-----------RSMITST_ROOT: " ${RSMITST_ROOT}) +message("-----------RSMITST_LIBS: " ${RSMITST_LIBS}) +message("-----------PROJECT_BINARY_DIR: " ${PROJECT_BINARY_DIR}) +message("-----------RSMI_LIB_DIR: " ${RSMI_LIB_DIR}) +message("-----------GTEST_LIB_DIR: " ${GTEST_LIB_DIR}) +message("-----------RSMI_INC_DIR: " ${RSMI_INC_DIR}) +message("-----------rsmitstSources: " ${rsmitstSources}) +message("-----------functionalSources: " ${functionalSources}) +message("") + string(TOLOWER "${RSMITST_BLD_TYPE}" tmp) if("${tmp}" STREQUAL release) set(BUILD_TYPE "Release") @@ -224,3 +257,18 @@ install(TARGETS ${RSMITST} LIBRARY DESTINATION ${PROJECT_BINARY_DIR}/lib RUNTIME DESTINATION ${PROJECT_BINARY_DIR}/bin) +message("") 
+message("Google Test Configuration:") +message("-----------ROCM_DIR: " ${ROCM_DIR}) +message("-----------GOOGLE_TEST_FRWK_NAME: " ${GOOGLE_TEST_FRWK_NAME}) +message("-----------RSMITST: " ${RSMITST}) +message("-----------RSMITST_ROOT: " ${RSMITST_ROOT}) +message("-----------RSMITST_LIBS: " ${RSMITST_LIBS}) +message("-----------PROJECT_BINARY_DIR: " ${PROJECT_BINARY_DIR}) +message("-----------RSMI_LIB_DIR: " ${RSMI_LIB_DIR}) +message("-----------GTEST_LIB_DIR: " ${GTEST_LIB_DIR}) +message("-----------RSMI_INC_DIR: " ${RSMI_INC_DIR}) +message("-----------rsmitstSources: " ${rsmitstSources}) +message("-----------functionalSources: " ${functionalSources}) +message("") + diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/computepartition_read_write.cc b/projects/amdsmi/tests/rocm_smi_test/functional/computepartition_read_write.cc index 8e22022716..2e71322270 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/computepartition_read_write.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -204,7 +204,6 @@ void TestComputePartitionReadWrite::Run(void) { // Verify api support checking functionality is working err = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); // Note: new_computePartition is not set - // DISPLAY_RSMI_ERR(err) EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || (err == RSMI_STATUS_NOT_SUPPORTED)); IF_VERB(STANDARD) { @@ -222,7 +221,6 @@ void TestComputePartitionReadWrite::Run(void) { new_computePartition = rsmi_compute_partition_type::RSMI_COMPUTE_PARTITION_INVALID; err = rsmi_dev_compute_partition_set(dv_ind, new_computePartition); - // DISPLAY_RSMI_ERR(err) EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || (err == RSMI_STATUS_NOT_SUPPORTED) || (err == RSMI_STATUS_PERMISSION)); diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/npsmode_read_write.cc b/projects/amdsmi/tests/rocm_smi_test/functional/npsmode_read_write.cc new file mode 100755 index 0000000000..e2356ea5f4 --- 
/dev/null +++ b/projects/amdsmi/tests/rocm_smi_test/functional/npsmode_read_write.cc @@ -0,0 +1,298 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+#include
+#include
+
+#include
+
+#include "gtest/gtest.h"
+#include "rocm_smi/rocm_smi.h"
+#include "rocm_smi_test/functional/npsmode_read_write.h"
+#include "rocm_smi_test/test_common.h"
+
+TestNPSModeReadWrite::TestNPSModeReadWrite() : TestBase() {
+  set_title("RSMI NPS Mode Read Test");
+  set_description("The NPS Mode tests verifies that the memory "
+                  "parition setting can be read and updated properly.");
+}
+
+TestNPSModeReadWrite::~TestNPSModeReadWrite(void) {
+}
+
+void TestNPSModeReadWrite::SetUp(void) {
+  TestBase::SetUp();
+
+  return;
+}
+
+void TestNPSModeReadWrite::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void TestNPSModeReadWrite::DisplayResults(void) const {
+  TestBase::DisplayResults();
+  return;
+}
+
+void TestNPSModeReadWrite::Close() {
+  // This will close handles opened within rsmitst utility calls and call
+  // rsmi_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+static const std::string
+npsModeString(rsmi_nps_mode_type npsModeType) {
+  switch (npsModeType) {
+    case RSMI_MEMORY_PARTITION_NPS1:
+      return "NPS1";
+    case RSMI_MEMORY_PARTITION_NPS2:
+      return "NPS2";
+    case RSMI_MEMORY_PARTITION_NPS4:
+      return "NPS4";
+    case RSMI_MEMORY_PARTITION_NPS8:
+      return "NPS8";
+    default:
+      return "UNKNOWN";
+  }
+}
+
+static const std::map<std::string, rsmi_nps_mode_type>
+mapStringToRSMINpsModeTypes {
+  {"NPS1", RSMI_MEMORY_PARTITION_NPS1},
+  {"NPS2", RSMI_MEMORY_PARTITION_NPS2},
+  {"NPS4", RSMI_MEMORY_PARTITION_NPS4},
+  {"NPS8", RSMI_MEMORY_PARTITION_NPS8}
+};
+
+void TestNPSModeReadWrite::Run(void) {
+  rsmi_status_t ret, err;
+  char orig_nps_mode[255];
+  char current_nps_mode[255];
+  orig_nps_mode[0] = '\0';
+  current_nps_mode[0] = '\0';
+  rsmi_nps_mode_type new_nps_mode = RSMI_MEMORY_PARTITION_UNKNOWN;
+
+  TestBase::Run();
+  if (setup_failed_) {
+    std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
+    return;
+  }
+
+  for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
+    PrintDeviceHeader(dv_ind);
+
+    //Standard checks to see if API is supported, before running full tests
+    ret = rsmi_dev_nps_mode_get(dv_ind, orig_nps_mode, 255);
+    if (ret == RSMI_STATUS_NOT_SUPPORTED) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**" <<  ": "
+                  << "Not supported on this machine" << std::endl;
+      }
+      return;
+    } else {
+      CHK_ERR_ASRT(ret)
+    }
+    IF_VERB(STANDARD) {
+      std::cout << std::endl << "\t**"
+                << "NPS Mode: "
+                << orig_nps_mode << std::endl;
+    }
+
+    if (orig_nps_mode[0] == '\0') {
+      std::cout << "***System nps mode value is not defined or received unexpected data. "
+                   "Skip nps mode test." << std::endl;
+      return;
+    }
+    EXPECT_TRUE(ret == RSMI_STATUS_SUCCESS);
+
+    // Verify api support checking functionality is working
+    uint32_t length = 2;
+    char smallBuffer[length];
+    err = rsmi_dev_nps_mode_get(dv_ind, smallBuffer, length);
+    size_t size = sizeof(smallBuffer)/sizeof(*smallBuffer);
+    ASSERT_EQ(err, RSMI_STATUS_INSUFFICIENT_SIZE);
+    ASSERT_EQ((size_t)length, size);
+    if (err == RSMI_STATUS_INSUFFICIENT_SIZE) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Confirmed RSMI_STATUS_INSUFFICIENT_SIZE was returned "
+                  << "and size matches length requested." << std::endl;
+      }
+    }
+
+    // Verify api support checking functionality is working
+    err = rsmi_dev_nps_mode_get(dv_ind, nullptr, 255);
+    ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
+
+    if (err == RSMI_STATUS_NOT_SUPPORTED) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Confirmed RSMI_STATUS_NOT_SUPPORTED was returned."
+                  << std::endl;
+      }
+    }
+
+    // Verify api support checking functionality is working
+    err = rsmi_dev_nps_mode_get(dv_ind, orig_nps_mode, 0);
+    // compare against each status explicitly; (A || B) would collapse to true
+    ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
+                (err == RSMI_STATUS_NOT_SUPPORTED));
+    if (err == RSMI_STATUS_INVALID_ARGS) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
+                  << std::endl;
+      }
+    }
+
+    /******************************/
+    /* rsmi_dev_nps_mode_set(...) */
+    /******************************/
+    // Verify an unset (unknown) mode is rejected; reset it for every device
+    new_nps_mode = rsmi_nps_mode_type::RSMI_MEMORY_PARTITION_UNKNOWN;
+    err = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
+    EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
+                (err == RSMI_STATUS_NOT_SUPPORTED));
+    if (err == RSMI_STATUS_INVALID_ARGS) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
+                  << std::endl;
+      }
+    } else if (err == RSMI_STATUS_NOT_SUPPORTED) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**" <<  ": "
+                  << "rsmi_dev_nps_mode_set not supported on this machine"
+                  << "\n\t    (if rsmi_dev_nps_mode_get work, then likely "
+                  << "need to set in bios)"
+                  << std::endl;
+      }
+      return;
+    } else {
+      DISPLAY_RSMI_ERR(err)
+    }
+    ASSERT_FALSE(err == RSMI_STATUS_PERMISSION);
+
+    // Verify api support checking functionality is working
+    new_nps_mode = rsmi_nps_mode_type::RSMI_MEMORY_PARTITION_UNKNOWN;
+    err = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
+    EXPECT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
+                (err == RSMI_STATUS_NOT_SUPPORTED) ||
+                (err == RSMI_STATUS_PERMISSION));
+    if (err == RSMI_STATUS_INVALID_ARGS) {
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
+                  << std::endl;
+      }
+    } else if (err == RSMI_STATUS_PERMISSION) {
+      DISPLAY_RSMI_ERR(err)
+      // tests should not continue if err is a permission issue
+      ASSERT_FALSE(err == RSMI_STATUS_PERMISSION);
+    } else {
+      DISPLAY_RSMI_ERR(err)
+    }
+
+    // Re-run original get, so we can reset to later
+    ret = rsmi_dev_nps_mode_get(dv_ind, orig_nps_mode, 255);
+    EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
+
+    for (int partition = RSMI_MEMORY_PARTITION_NPS1;
+         partition <= RSMI_MEMORY_PARTITION_NPS8;
+         partition++) {
+      new_nps_mode
+        = static_cast<rsmi_nps_mode_type>(partition);
+      IF_VERB(STANDARD) {
+        std::cout << std::endl;
+        std::cout << "\t**"
+                  << "======== TEST RSMI_MEMORY_PARTITION_"
+                  << npsModeString(new_nps_mode)
+                  << " ===============" << std::endl;
+      }
+      ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
+      CHK_ERR_ASRT(ret)
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Attempting to set nps mode to: "
+                  << npsModeString(new_nps_mode) << std::endl;
+      }
+      ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255);
+      CHK_ERR_ASRT(ret)
+      IF_VERB(STANDARD) {
+        std::cout << "\t**"
+                  << "Current nps mode: " << current_nps_mode << std::endl;
+      }
+      EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
+      EXPECT_STREQ(npsModeString(new_nps_mode).c_str(), current_nps_mode);
+    }
+
+    /* TEST RETURN TO ORIGINAL NPS MODE SETTING */
+    IF_VERB(STANDARD) {
+      std::cout << std::endl;
+      std::cout << "\t**"
+                << "=========== TEST RETURN TO ORIGINAL NPS MODE "
+                << "SETTING ========" << std::endl;
+    }
+    new_nps_mode
+      = mapStringToRSMINpsModeTypes.at(orig_nps_mode);
+    ret = rsmi_dev_nps_mode_set(dv_ind, new_nps_mode);
+    CHK_ERR_ASRT(ret)
+    IF_VERB(STANDARD) {
+      std::cout << "\t**" << "Returning nps mode to: "
+                << npsModeString(new_nps_mode) << std::endl;
+    }
+    ret = rsmi_dev_nps_mode_get(dv_ind, current_nps_mode, 255);
+    CHK_ERR_ASRT(ret)
+    IF_VERB(STANDARD) {
+      std::cout << "\t**" << "Attempted to set nps mode: "
+                << npsModeString(new_nps_mode) << std::endl
+                << "\t**" << "Current nps mode: " << current_nps_mode
+                << std::endl;
+    }
+    EXPECT_EQ(RSMI_STATUS_SUCCESS, ret);
+    EXPECT_STREQ(npsModeString(new_nps_mode).c_str(), current_nps_mode);
+
+  }
+}
diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/npsmode_read_write.h b/projects/amdsmi/tests/rocm_smi_test/functional/npsmode_read_write.h
new file mode 100755
index 0000000000..54eed75e01
--- /dev/null
+++ b/projects/amdsmi/tests/rocm_smi_test/functional/npsmode_read_write.h
@@ -0,0 +1,73 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of ,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_NPSMODE_READ_WRITE_H_
+#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_NPSMODE_READ_WRITE_H_
+
+#include "rocm_smi_test/test_base.h"
+
+class TestNPSModeReadWrite : public TestBase {
+ public:
+    TestNPSModeReadWrite();
+
+  // @Brief: Destructor for test case of TestNPSModeReadWrite
+  virtual ~TestNPSModeReadWrite();
+
+  // @Brief: Setup the environment for measurement
+  virtual void SetUp();
+
+  // @Brief: Core measurement execution
+  virtual void Run();
+
+  // @Brief: Clean up and retrieve the resource
+  virtual void Close();
+
+  // @Brief: Display results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+};
+
+#endif  // TESTS_ROCM_SMI_TEST_FUNCTIONAL_NPSMODE_READ_WRITE_H_
diff --git a/projects/amdsmi/tests/rocm_smi_test/main.cc b/projects/amdsmi/tests/rocm_smi_test/main.cc
index e707566599..e761ac4fd0 100755
--- a/projects/amdsmi/tests/rocm_smi_test/main.cc
+++ b/projects/amdsmi/tests/rocm_smi_test/main.cc
@@ -87,6 +87,7 @@
 #include "rocm_smi_test/functional/gpu_metrics_read.h"
 #include "rocm_smi_test/functional/metrics_counter_read.h"
 #include "rocm_smi_test/functional/perf_determinism.h"
+#include "rocm_smi_test/functional/npsmode_read_write.h"
 
 static RSMITstGlobals *sRSMIGlvalues = nullptr;
 
@@ -277,7 +278,11 @@ TEST(rsmitstReadWrite, TestComputePartitionReadWrite) {
   TestComputePartitionReadWrite tst;
   RunGenericTest(&tst);
 }
+TEST(rsmitstReadWrite, TestNPSModeReadWrite) {
+  TestNPSModeReadWrite tst;
+  RunGenericTest(&tst);
+}
 TEST(rsmitstReadOnly, TestConcurrentInit) {
   TestConcurrentInit tst;
   SetFlags(&tst);
   tst.DisplayTestInfo();