2
0

Adding Performance Determinism Mode to rocm_smi lib, CLI & gtest.

A special mode of operation to achieve minimal performance variation by letting
the user have the ability to provide the desired frequency to be set as the soft limit.
The user controls entry to and exit from this mode through rocm-smi, which
performs the steps listed below.

Enter performance determinism mode:
- hold a lock
- write performance_determinism to power_dpm_force_performance_level
- write input clk_freq to pp_dpm_sclk
- release lock

Exit performance determinism mode:
- hold a lock
- write auto to power_dpm_force_performance_level
- release lock

Signed-off-by: Divya Shikre <DivyaUday.Shikre@amd.com>
Change-Id: Ia24e27954cdf1c4337ffc83d8948fbdfaf4552d2


[ROCm/rocm_smi_lib commit: 60d0f3052f]
Este cometimento está contido em:
Divya Shikre
2020-10-30 13:40:26 -04:00
ascendente 3c7607d7f0
cometimento 6d4fb11c6e
8 ficheiros modificados com 346 adições e 3 eliminações
+34 -1
Ver ficheiro
@@ -165,8 +165,9 @@ typedef enum {
//!< memory clock
RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK, //!< Stable power state with minimum
//!< system clock
RSMI_DEV_PERF_LEVEL_DETERMINISM, //!< Performance determinism state
RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK,
RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_DETERMINISM,
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 //!< Unknown performance level
} rsmi_dev_perf_level_t;
@@ -2062,6 +2063,37 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent);
rsmi_status_t rsmi_dev_perf_level_get(uint32_t dv_ind,
rsmi_dev_perf_level_t *perf);
/**
* @brief Enter performance determinism mode with provided device index.
*
* @details Given a device index @p dv_ind and @p freq_bitmask, this function
* will enable performance determinism mode, which enforces a GFXCLK frequency
* SoftMax limit per GPU set by the user. This prevents the GFXCLK FLL from
* stretching when running the same workload on different GPUs, making
* performance variation minimal. This call will result in the performance
* level ::rsmi_dev_perf_level_t of the device being
* ::RSMI_DEV_PERF_LEVEL_DETERMINISM. If a bit in @p freq_bitmask has a value
* of 1, then the frequency (as ordered in an ::rsmi_frequencies_t returned
* by rsmi_dev_gpu_clk_freq_get()) corresponding to that bit index will be
* allowed.
* The performance level is changed first and the frequency mask is applied
* to the system clock (::RSMI_CLK_TYPE_SYS) afterwards; if changing the
* performance level fails, that status is returned and the frequency mask
* is not applied.
* ::rsmi_dev_perf_level_set() should be called with ::RSMI_DEV_PERF_LEVEL_AUTO
* to restore the performance level to the default value.
*
* @param[in] dv_ind a device index
*
* @param[in] freq_bitmask A bitmask indicating the indices of the
* frequencies that are to be enabled (1) and disabled (0). Only the lowest
* ::rsmi_frequencies_t.num_supported bits of this mask are relevant.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t rsmi_perf_determinism_mode_set(uint32_t dv_ind,
uint64_t freq_bitmask);
/**
* @brief Get the overdrive percent associated with the device with provided
* device index.
@@ -2085,6 +2117,7 @@ rsmi_status_t rsmi_dev_perf_level_get(uint32_t dv_ind,
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od);
/**
+47 -1
Ver ficheiro
@@ -626,6 +626,21 @@ def resetXgmiErr(deviceList):
printLogSpacer()
def resetPerfDeterminism(deviceList):
    """ Reset Performance Determinism

    Asks the library to set performance level 0 on every device in the list
    and logs the outcome per device.

    @param deviceList: Disable Performance Determinism for these devices
    """
    printLogSpacer('Disable Performance Determinism')
    for device in deviceList:
        # NOTE(review): level 0 is presumably RSMI_DEV_PERF_LEVEL_AUTO, the
        # level used to leave determinism mode -- confirm against the enum.
        ret = rocmsmi.rsmi_dev_perf_level_set(device, rsmi_dev_perf_level_t(0))
        if rsmi_ret_ok(ret, device, 'disable performance determinism'):
            printLog(device, 'Successfully disabled performance determinism', None)
        else:
            logging.error('GPU[%s]\t\t: Unable to disable performance determinism', device)
    printLogSpacer()
def setClockRange(deviceList, clkType, level, value, autoRespond):
""" Set the range for the specified clktype in the PowerPlay table for a list of devices.
@@ -870,6 +885,30 @@ def setClocks(deviceList, clktype, clk):
printLogSpacer()
def setPerfDeterminism(deviceList, value):
    """ Set clock frequency level for a list of devices to enable performance
    determinism.

    Sets the global RETCODE to 1 when the value is not an integer or when the
    library call fails for any device.

    @param deviceList: List of DRM devices (can be a single-item list)
    @param value: Clock frequency level to set
    """
    global RETCODE
    try:
        level = int(value)
    except (TypeError, ValueError):
        # No device is bound yet at this point, so report the failure once per
        # requested device instead of referencing an undefined name.
        for device in deviceList:
            printErrLog(device, 'Unable to set Performance Determinism')
        logging.error('%s is not an integer', value)
        RETCODE = 1
        return
    for device in deviceList:
        # NOTE(review): the C API documents its second argument as a frequency
        # bitmask, while this CLI describes it as a level -- confirm which is
        # intended.
        ret = rocmsmi.rsmi_perf_determinism_mode_set(device, level)
        if rsmi_ret_ok(ret, device):
            printLog(device, 'Successfully set clock frequency', str(value))
        else:
            # NOTE(review): the other printErrLog call in this function passes
            # two arguments; verify that printErrLog accepts three.
            printErrLog(device, 'Unable to set clock frequency', str(value))
            RETCODE = 1
def resetGpu(device):
""" Perform a GPU reset on the specified device
@@ -2493,6 +2532,7 @@ if __name__ == '__main__':
help='Set the maximum GPU power back to the device deafult state',
action='store_true')
groupActionReset.add_argument('--resetxgmierr', help='Reset XGMI error count', action='store_true')
groupAction.add_argument('--resetperfdeterminism', help='Disable performance determinism', action='store_true')
groupAction.add_argument('--setclock', help='Set Clock Frequency Level(s) for specified clock (requires manual Perf level)',
type=str, metavar='LEVEL', nargs=2)
groupAction.add_argument('--setsclk', help='Set GPU Clock Frequency Level(s) (requires manual Perf level)',
@@ -2523,6 +2563,8 @@ if __name__ == '__main__':
groupAction.add_argument('--setprofile',
help='Specify Power Profile level (#) or a quoted string of CUSTOM Profile attributes "# '
'# # #..." (requires manual Perf level)')
groupAction.add_argument('--setperfdeterminism', help='Set clock frequency limit to get minimal performance variation',
type=int, metavar='LEVEL', nargs='+')
groupAction.add_argument('--rasenable', help='Enable RAS for specified block and error type', type=str, nargs=2,
metavar=('BLOCK', 'ERRTYPE'))
groupAction.add_argument('--rasdisable', help='Disable RAS for specified block and error type', type=str, nargs=2,
@@ -2560,7 +2602,7 @@ if __name__ == '__main__':
if args.setsclk or args.setmclk or args.setpcie or args.resetfans or args.setfan or args.setperflevel or \
args.load or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or \
args.setmemoverdrive or args.setpoweroverdrive or args.resetpoweroverdrive or \
args.rasenable or args.rasdisable or args.rasinject or args.gpureset or \
args.rasenable or args.rasdisable or args.rasinject or args.gpureset or args.setperfdeterminism or\
args.setslevel or args.setmlevel or args.setvc or args.setsrange or args.setmrange or args.setclock:
relaunchAsSudo()
@@ -2773,10 +2815,14 @@ if __name__ == '__main__':
setClockRange(deviceList, 'sclk', args.setsrange[0], args.setsrange[1], args.autorespond)
if args.setmrange:
setClockRange(deviceList, 'mclk', args.setmrange[0], args.setmrange[1], args.autorespond)
if args.setperfdeterminism:
setPerfDeterminism(deviceList, args.setperfdeterminism[0])
if args.resetprofile:
resetProfile(deviceList)
if args.resetxgmierr:
resetXgmiErr(deviceList)
if args.resetperfdeterminism:
resetPerfDeterminism(deviceList)
if args.rasenable:
setRas(deviceList, 'enable', args.rasenable[0], args.rasenable[1])
if args.rasdisable:
+2 -1
Ver ficheiro
@@ -71,7 +71,8 @@ class rsmi_dev_perf_level_t(c_int):
RSMI_DEV_PERF_LEVEL_STABLE_PEAK = 5
RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK = 6
RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK = 7
RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK
RSMI_DEV_PERF_LEVEL_DETERMINISM = 8
RSMI_DEV_PERF_LEVEL_LAST = RSMI_DEV_PERF_LEVEL_DETERMINISM
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100
+17
Ver ficheiro
@@ -768,6 +768,23 @@ rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) {
CATCH
}
// Switches device dv_ind to the performance determinism level and, when that
// succeeds, applies freq_bitmask to the system clock (RSMI_CLK_TYPE_SYS).
// Returns the status of the first step that fails, otherwise the status of
// the clock-frequency update.
// NOTE(review): the performance level is not rolled back when the frequency
// update fails -- confirm this is intended.
rsmi_status_t
rsmi_perf_determinism_mode_set(uint32_t dv_ind, uint64_t freq_bitmask) {
  TRY
  DEVICE_MUTEX

  const rsmi_status_t level_status =
      rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_DETERMINISM);
  if (level_status == RSMI_STATUS_SUCCESS) {
    return rsmi_dev_gpu_clk_freq_set(dv_ind, RSMI_CLK_TYPE_SYS, freq_bitmask);
  }
  return level_status;
  CATCH
}
rsmi_status_t
rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od) {
TRY
+5
Ver ficheiro
@@ -209,6 +209,7 @@ static const char *kDevPerfLevelStandardStr = "profile_standard";
static const char *kDevPerfLevelMinMClkStr = "profile_min_mclk";
static const char *kDevPerfLevelMinSClkStr = "profile_min_sclk";
static const char *kDevPerfLevelPeakStr = "profile_peak";
static const char *kDevPerfLevelDeterminismStr = "performance_determinism";
static const char *kDevPerfLevelUnknownStr = "unknown";
static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
@@ -283,6 +284,7 @@ static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
{RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK, kDevPerfLevelMinMClkStr},
{RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK, kDevPerfLevelMinSClkStr},
{RSMI_DEV_PERF_LEVEL_STABLE_PEAK, kDevPerfLevelPeakStr},
{RSMI_DEV_PERF_LEVEL_DETERMINISM, kDevPerfLevelDeterminismStr},
{RSMI_DEV_PERF_LEVEL_UNKNOWN, kDevPerfLevelUnknownStr},
};
@@ -368,7 +370,10 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_dev_overdrive_level_get", {{kDevOverDriveLevelFName}, {}}},
{"rsmi_dev_power_profile_presets_get", {{kDevPowerProfileModeFName}, {}}},
{"rsmi_dev_perf_level_set", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_perf_level_set_v1", {{kDevPerfLevelFName}, {}}},
{"rsmi_dev_perf_level_get", {{kDevPerfLevelFName}, {}}},
{"rsmi_perf_determinism_mode_set", {{kDevPerfLevelFName,
kDevGPUSClkFName}, {}}},
{"rsmi_dev_overdrive_level_set", {{kDevOverDriveLevelFName}, {}}},
{"rsmi_dev_vbios_version_get", {{kDevVBiosVerFName}, {}}},
{"rsmi_dev_od_volt_info_get", {{kDevPowerODVoltageFName}, {}}},
@@ -0,0 +1,163 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2020, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include <string>
#include <map>
#include <bitset>
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/perf_determinism.h"
#include "rocm_smi_test/test_common.h"
static const std::map<rsmi_dev_perf_level_t, const char *>
kDevPerfLvlNameMap = {
{RSMI_DEV_PERF_LEVEL_AUTO, "RSMI_DEV_PERF_LEVEL_AUTO"},
{RSMI_DEV_PERF_LEVEL_LOW, "RSMI_DEV_PERF_LEVEL_LOW"},
{RSMI_DEV_PERF_LEVEL_HIGH, "RSMI_DEV_PERF_LEVEL_HIGH"},
{RSMI_DEV_PERF_LEVEL_MANUAL, "RSMI_DEV_PERF_LEVEL_MANUAL"},
{RSMI_DEV_PERF_LEVEL_STABLE_STD, "RSMI_DEV_PERF_LEVEL_STABLE_STD"},
{RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK,
"RSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK"},
{RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK,
"RSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK"},
{RSMI_DEV_PERF_LEVEL_STABLE_PEAK, "RSMI_DEV_PERF_LEVEL_STABLE_PEAK"},
{RSMI_DEV_PERF_LEVEL_UNKNOWN, "RSMI_DEV_PERF_LEVEL_UNKNOWN"},
};
// Registers the title and description that identify this test.
TestPerfDeterminism::TestPerfDeterminism() : TestBase() {
  static const char *const kTitle = "RSMI Performance Determinism Test";
  static const char *const kDescription =
      "The Performance Determinism tests verifies Enabling/Disabling "
      "performance determinism mode.";

  set_title(kTitle);
  set_description(kDescription);
}
// Nothing to release here; the body is intentionally empty.
TestPerfDeterminism::~TestPerfDeterminism() {}
// Forwards to the base-class setup; this test adds no setup steps of its own.
void TestPerfDeterminism::SetUp() {
  TestBase::SetUp();
}
// Forwards to the base-class implementation without adding anything.
void TestPerfDeterminism::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
  return;
}
// Forwards to the base-class implementation; this test has no extra results.
void TestPerfDeterminism::DisplayResults() const {
  TestBase::DisplayResults();
}
// Forwards to the base-class teardown.
void TestPerfDeterminism::Close(void) {
  // The base-class Close() closes handles opened within rsmitst utility calls
  // and calls rsmi_shut_down(), so run it only after any other hsa cleanup.
  TestBase::Close();
  return;
}
void TestPerfDeterminism::Run(void) {
rsmi_status_t err;
rsmi_dev_perf_level_t pfl;
rsmi_frequencies_t f;
rsmi_status_t ret;
TestBase::Run();
if (setup_failed_) {
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
return;
}
// Set clocks to something other than the usual default of the lowest
// frequency.
uint64_t freq_bitmask = 0b01100; // Try the 3rd and 4th clocks
std::string freq_bm_str =
std::bitset<RSMI_MAX_NUM_FREQUENCIES>(freq_bitmask).to_string();
freq_bm_str.erase(0, std::min(freq_bm_str.find_first_not_of('0'),
freq_bm_str.size()-1));
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
PrintDeviceHeader(i);
err = rsmi_perf_determinism_mode_set(i, freq_bitmask);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout << "\t** Not supported on this machine" << std::endl;
}
return;
} else {
ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &f);
if (ret != RSMI_STATUS_SUCCESS) {
return;
}
IF_VERB(STANDARD) {
std::cout << "\tFrequency is now index " << f.current << std::endl;
}
ret = rsmi_dev_perf_level_get(i, &pfl);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**New Perf Level:" << kDevPerfLvlNameMap.at(pfl) <<
std::endl;
}
std::cout << "\tResetting performance determinism" << std::endl;
err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO);;
CHK_ERR_ASRT(err)
ret = rsmi_dev_perf_level_get(i, &pfl);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**New Perf Level:" << kDevPerfLvlNameMap.at(pfl) <<
std::endl;
}
return;
}
}
}
@@ -0,0 +1,73 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2020, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_PERF_DETERMINISM_H_
#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_PERF_DETERMINISM_H_
#include "rocm_smi_test/test_base.h"
// Functional test that enables and then disables performance determinism
// mode. Every member below overrides or extends a TestBase hook.
class TestPerfDeterminism : public TestBase {
public:
// @Brief: Constructor; sets the test title and description
TestPerfDeterminism();
// @Brief: Destructor for test case of TestPerfDeterminism
virtual ~TestPerfDeterminism();
// @Brief: Setup the environment for measurement
virtual void SetUp();
// @Brief: Core measurement execution
virtual void Run();
// @Brief: Clean up and retrieve the resource
virtual void Close();
// @Brief: Display results
virtual void DisplayResults() const;
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
};
#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_PERF_DETERMINISM_H_
+5
Ver ficheiro
@@ -83,6 +83,7 @@
#include "functional/init_shutdown_refcount.h"
#include "rocm_smi_test/functional/hw_topology_read.h"
#include "rocm_smi_test/functional/gpu_metrics_read.h"
#include "rocm_smi_test/functional/perf_determinism.h"
static RSMITstGlobals *sRSMIGlvalues = nullptr;
@@ -233,6 +234,10 @@ TEST(rsmitstReadOnly, TestGpuMetricsRead) {
TestGpuMetricsRead tst;
RunGenericTest(&tst);
}
// This test changes device state (performance level and clock mask), so it
// belongs in the read-write suite rather than the read-only one.
TEST(rsmitstReadWrite, TestPerfDeterminism) {
  TestPerfDeterminism tst;
  RunGenericTest(&tst);
}
TEST(rsmitstReadWrite, TestXGMIReadWrite) {
TestXGMIReadWrite tst;
RunGenericTest(&tst);