RVS - Use config files and make GPU aware

Change-Id: I7a5c80ed4e6122d102e494d1ae38b4b7d40c42cd
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: f5a4402ce5]
This commit is contained in:
Galantsev, Dmitrii
2025-02-28 11:17:40 +00:00
committed by Galantsev, Dmitrii
orang tua 122ab5c053
melakukan 68c02bda78
63 mengubah file dengan 4673 tambahan dan 148 penghapusan
+15 -5
Melihat File
@@ -160,9 +160,13 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
typedef enum {
RDC_FI_INVALID = 0, //!< Invalid field value
//!< @brief Identifier fields
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
RDC_FI_DEV_NAME, //!< Name of the device
RDC_FI_OAM_ID, //!< OAM ID of the device
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
RDC_FI_DEV_NAME, //!< Name of the device
RDC_FI_OAM_ID, //!< OAM ID of the device
RDC_FI_DEV_ID, //!< Device ID
RDC_FI_REV_ID, //!<
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
/**
* @brief Frequency related fields
@@ -388,6 +392,9 @@ typedef uint32_t rdc_field_grp_t; //!< Field group ID type
*/
typedef struct {
char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device.
uint64_t device_id; //!< The device id of a GPU
uint32_t num_of_compute_units;
uint64_t target_graphics_version;
} rdc_device_attributes_t;
/**
@@ -533,7 +540,8 @@ typedef enum {
RDC_DIAG_RVS_MEMBW_TEST, //!< RVS bandwidth test
RDC_DIAG_RVS_H2DD2H_TEST, //!< RVS Host<->Device transfer speed test
RDC_DIAG_RVS_IET_TEST, //!< RVS IET test
RDC_DIAG_TEST_LAST = RDC_DIAG_RVS_IET_TEST
RDC_DIAG_RVS_CUSTOM, //!< RVS custom test
RDC_DIAG_TEST_LAST,
} rdc_diag_test_cases_t;
/**
@@ -547,7 +555,7 @@ typedef enum {
/**
* @brief The maximum test cases to run
*/
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1)
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST)
/**
* @brief The maximum length of the diagnostic messages
@@ -1607,6 +1615,8 @@ rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
*/
rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
const char* get_rocm_path(const char* search_string);
#ifdef __cplusplus
}
#endif // __cplusplus
@@ -22,61 +22,96 @@ THE SOFTWARE.
#ifndef RDC_MODULES_RDC_RVS_RVSBASE_H_
#define RDC_MODULES_RDC_RVS_RVSBASE_H_
#include <amd_smi/amdsmi.h>
#include <cstddef>
#include <cstdio>
#include <filesystem>
#include <map>
#include <string>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rvs/rvs.h"
static constexpr size_t MAX_CONFIG_LENGTH = 1024;
// NOTE: There MUST be a space after :
static const std::map<rdc_diag_test_cases_t, std::string> test_to_conf = {
// derived from conf/gst_single.conf
{RDC_DIAG_RVS_GST_TEST,
"{actions: [{name: gpustress-9000-sgemm-false, device: all, "
"device_index: '0', module: gst, parallel: false, count: 1, duration: "
"10000, copy_matrix: false, target_stress: 9000, matrix_size_a: 8640, "
"matrix_size_b: 8640, matrix_size_c: 8640, ops_type: sgemm, lda: 8640, "
"ldb: 8640, ldc: 8640}]}"},
// derived from conf/MI300X/babel.conf
{RDC_DIAG_RVS_MEMBW_TEST,
"{actions: [{name: babel-float-256MiB,"
"device: all, module: babel, "
"parallel: false, count: 1, num_iter: 5000, array_size: 268435456, "
"test_type: 1, mibibytes: true, o/p_csv: false, subtest: 5}]}"},
// derived from conf/MI300X/pebb_single.conf
{RDC_DIAG_RVS_H2DD2H_TEST,
"{actions: [{name: h2d-d2h-sequential-64MB,"
"device: all, module: pebb, duration: 120000, device_to_host: true, "
"host_to_device: true, parallel: false, block_size: 67108864, "
"link_type: 2, warm_calls: 10, hot_calls: 100, b2b: true}]}"},
// derived from conf/MI300X/iet_single.conf
{RDC_DIAG_RVS_IET_TEST,
"{actions: [{name: iet-400W-1K-rand-dgemm,"
"device: all, module: iet, parallel: true, duration: 60000, "
"sample_interval: 3000, target_power: 400, matrix_size: 1024, "
"matrix_init: rand, ops_type: dgemm}]}"},
// Maps a GPU target graphics version to the name of the GPU-specific RVS
// config directory.
// this map only makes sense in context of test config locations as originally
// designed in RVS
// NOTE(review): the entries marked "?" look unverified - confirm the
// version-to-name pairing before relying on them.
static const std::map<uint64_t, std::string> gfx_to_rvs_conf = {
{0x90a, "MI210"}, // ?
{0x940, "MI300A"}, // ?
{0x941, "MI300A"}, // ?
{0x942, "MI300X"}, // ?
{0x94a, "MI308X"}, // ?
{0x1030, "nv21"}, //
{0x1031, "nv21"}, // ?
{0x1032, "nv21"}, // ?
{0x1033, "nv21"}, // ?
{0x1034, "nv21"}, // ?
{0x1035, "nv21"}, // ?
{0x1100, "nv31"}, // ?
{0x1101, "nv31"}, // ?
{0x1102, "nv31"}, // ?
{0x1103, "nv31"}, // ?
};
static const std::map<rdc_diag_test_cases_t, std::string> test_to_name = {
{RDC_DIAG_RVS_GST_TEST, "RVS_GST_TEST"},
{RDC_DIAG_RVS_MEMBW_TEST, "RVS_MEMBW_TEST"},
{RDC_DIAG_RVS_H2DD2H_TEST, "RVS_H2DD2H_TEST"},
{RDC_DIAG_RVS_IET_TEST, "RVS_IET_TEST"},
{RDC_DIAG_RVS_GST_TEST, "gst_single.conf"}, {RDC_DIAG_RVS_MEMBW_TEST, "babel.conf"},
{RDC_DIAG_RVS_H2DD2H_TEST, "pebb_single.conf"}, {RDC_DIAG_RVS_IET_TEST, "iet_stress.conf"},
{RDC_DIAG_RVS_CUSTOM, "CUSTOM_CONFIG"},
};
namespace amd {
namespace rdc {
/**
 * @brief Resolve a flat GPU index to its amdsmi processor handle.
 *
 * Enumerates every socket and every processor on each socket, in the order
 * amdsmi reports them, and uses @p gpu_id as an index into that flat list.
 *
 * @param[in]  gpu_id            Zero-based index into the flattened processor list.
 * @param[out] processor_handle  Receives the handle on success; untouched on failure.
 *
 * @retval AMDSMI_STATUS_SUCCESS             The handle was written.
 * @retval AMDSMI_STATUS_NOT_SUPPORTED       A processor that is not an AMD GPU was found.
 * @retval AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS @p gpu_id is past the end of the list.
 * @return Any other value is the status of the amdsmi call that failed.
 */
inline amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
                                                    amdsmi_processor_handle* processor_handle) {
  // First call only asks for the number of sockets.
  uint32_t socket_count = 0;
  auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Do not walk a socket list that was never filled in.
    return ret;
  }

  std::vector<amdsmi_processor_handle> all_processors{};
  for (auto& socket : sockets) {
    // Same two-step pattern: query the count, then fetch the handles.
    uint32_t processor_count = 0;
    ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    std::vector<amdsmi_processor_handle> processors(processor_count);
    ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data());
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    for (auto& processor : processors) {
      processor_type_t processor_type = {};
      ret = amdsmi_get_processor_type(processor, &processor_type);
      if (ret != AMDSMI_STATUS_SUCCESS) {
        // Report the real failure instead of misreporting it as a type mismatch.
        return ret;
      }
      if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
        RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!");
        return AMDSMI_STATUS_NOT_SUPPORTED;
      }
      all_processors.push_back(processor);
    }
  }

  if (gpu_id >= all_processors.size()) {
    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
  }

  // Get processor handle from GPU id
  *processor_handle = all_processors[gpu_id];
  return AMDSMI_STATUS_SUCCESS;
}
class RdcRVSBase {
public:
RdcRVSBase() { s_instance = this; };
~RdcRVSBase() {
if (s_instance == this) {
s_instance = nullptr;
}
};
RdcRVSBase();
~RdcRVSBase();
// only one instance allowed
RdcRVSBase(const RdcRVSBase&) = delete;
@@ -87,12 +122,16 @@ class RdcRVSBase {
RdcRVSBase& operator=(RdcRVSBase&&) = delete;
rvs_status_t run_rvs_app(const char* config, size_t config_size, rdc_diag_callback_t* callback);
std::vector<std::string> get_rvs_configs();
std::map<rdc_diag_test_cases_t, std::string> get_test_to_conf();
private:
static RdcRVSBase* s_instance;
volatile rvs_session_state_t _state = RVS_SESSION_STATE_IDLE;
rdc_diag_callback_t* _callback = nullptr;
rvs_session_callback _rvs_callback = nullptr;
std::vector<std::string> _rvs_config_list = {};
std::map<rdc_diag_test_cases_t, std::string> _test_to_conf = {};
// Static callback function that the C API will call
static void static_callback(rvs_session_id_t session_id, const rvs_results_t* results) {
@@ -101,7 +140,7 @@ class RdcRVSBase {
s_instance->session_callback(session_id, results);
}
}
void session_callback(rvs_session_id_t session_id, const rvs_results_t* results) {
void session_callback(rvs_session_id_t /*session_id*/, const rvs_results_t* results) {
_state = results->state;
// std::string output = "\n";
// output += "session id -> " + std::to_string(session_id) + "\n";
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include <dlfcn.h>
#include <string.h>
#include <fstream>
#include <map>
@@ -476,6 +477,7 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) {
return dest;
}
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_t policy) {
if (!p_rdc_handle) {
@@ -532,4 +534,43 @@ rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* r
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_link_status_get(results);
}
}
/**
 * @brief Best-effort lookup of the ROCm installation path.
 *
 * Scans /proc/self/maps for the first line containing @p search_string
 * (typically a shared library name such as "librdc.so"). On a match the
 * directory part of that mapping is returned with ".." appended, e.g.
 * "/opt/rocm/lib/..". Without a match, the value of the ROCM_PATH environment
 * variable is returned if set, otherwise "/opt/rocm".
 *
 * @param search_string  Text to look for in the mapped file paths. A null or
 *                       empty string skips the scan and yields the fallback.
 * @return Pointer to a NUL-terminated path held in a function-local static
 *         string. The pointer is only valid until the next call, and
 *         concurrent calls share that one buffer.
 */
const char* get_rocm_path(const char* search_string) {
  // A raw pointer is returned, so the storage has to outlive this call.
  static std::string rocm_path;

  // Recompute the fallback on every call so that the result of an earlier
  // successful lookup cannot leak into a later, unsuccessful one.
  rocm_path = "/opt/rocm";
  const char* rocm_path_env = getenv("ROCM_PATH");
  if (rocm_path_env != nullptr) {
    rocm_path = rocm_path_env;
  }

  // std::string::find(nullptr) is undefined, and an empty needle would match
  // at offset 0 of the first line and produce the bogus path "..".
  if (search_string == nullptr || search_string[0] == '\0') {
    return rocm_path.c_str();
  }

  std::ifstream file("/proc/self/maps");
  if (!file.is_open()) {
    RDC_LOG(RDC_DEBUG, "CANT OPEN FILE");
    return rocm_path.c_str();
  }

  std::string line;
  while (getline(file, line)) {
    const size_t index_end = line.find(search_string);
    if (index_end == std::string::npos) {
      // no library on this line
      continue;
    }

    // walk index backwards until it reaches a space
    size_t index_start = index_end;
    while ((index_start > 0) && (line[index_start - 1] != ' ')) {
      index_start--;
    }

    // extract library path, drop library name
    rocm_path = line.substr(index_start, index_end - index_start);
    // appending ".." should result in "/opt/rocm/lib/.." or similar
    rocm_path += "..";
    RDC_LOG(RDC_DEBUG, "FOUND SOMETHING!");
    return rocm_path.c_str();
  }

  return rocm_path.c_str();
}
@@ -49,9 +49,12 @@ rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
return RDC_ST_OK;
}
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(
rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, const char* config,
size_t config_size,
rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -74,24 +77,41 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
size_t config_size,
rdc_diag_response_t* response,
rdc_diag_callback_t* callback) {
const bool is_custom = config != nullptr && config_size != 0;
if (response == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
std::vector<rdc_diag_test_cases_t> rdc_runs;
std::vector<rdc_diag_test_cases_t> tests_to_search_for;
if (level >= RDC_DIAG_LVL_SHORT) { // Short run and above
rdc_runs.push_back(RDC_DIAG_COMPUTE_PROCESS);
rdc_runs.push_back(RDC_DIAG_NODE_TOPOLOGY);
rdc_runs.push_back(RDC_DIAG_GPU_PARAMETERS);
rdc_runs.push_back(RDC_DIAG_COMPUTE_QUEUE);
rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK);
tests_to_search_for.push_back(RDC_DIAG_COMPUTE_PROCESS);
tests_to_search_for.push_back(RDC_DIAG_NODE_TOPOLOGY);
tests_to_search_for.push_back(RDC_DIAG_GPU_PARAMETERS);
tests_to_search_for.push_back(RDC_DIAG_COMPUTE_QUEUE);
tests_to_search_for.push_back(RDC_DIAG_SYS_MEM_CHECK);
}
if (level >= RDC_DIAG_LVL_MED) { // Medium run and above
rdc_runs.push_back(RDC_DIAG_RVS_GST_TEST);
rdc_runs.push_back(RDC_DIAG_RVS_MEMBW_TEST);
rdc_runs.push_back(RDC_DIAG_RVS_H2DD2H_TEST);
rdc_runs.push_back(RDC_DIAG_RVS_IET_TEST);
tests_to_search_for.push_back(RDC_DIAG_RVS_GST_TEST);
tests_to_search_for.push_back(RDC_DIAG_RVS_MEMBW_TEST);
tests_to_search_for.push_back(RDC_DIAG_RVS_H2DD2H_TEST);
tests_to_search_for.push_back(RDC_DIAG_RVS_IET_TEST);
}
std::vector<rdc_diag_test_cases_t> tests_to_run;
if (is_custom) {
// respect custom config
tests_to_run.push_back(RDC_DIAG_RVS_CUSTOM);
} else {
// respect level
for (auto& test : tests_to_search_for) {
if (testcases_to_module_.find(test) != testcases_to_module_.end()) {
tests_to_run.push_back(test);
} else {
RDC_LOG(RDC_DEBUG, "test not found: " << test);
}
}
}
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
@@ -99,15 +119,17 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
callback->callback(callback->cookie, log.data());
}
unsigned int i = 0;
response->results_count = 0;
for (unsigned int i = 0; i < rdc_runs.size(); i++) {
for (i = 0; i < tests_to_run.size(); i++) {
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
std::string log = "Test " + std::to_string(i) + " / " + std::to_string(rdc_runs.size());
std::string log =
"Test " + std::to_string(i + 1) + " / " + std::to_string(tests_to_run.size());
callback->callback(callback->cookie, log.data());
}
response->diag_info[i].test_case = rdc_runs[i];
response->diag_info[i].test_case = tests_to_run[i];
// NOTE: rdc_test_case_run reuses the diagnostic_run callback
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
rdc_test_case_run(tests_to_run[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
config_size, &(response->diag_info[i]), callback);
response->results_count++;
}
@@ -432,33 +432,33 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
constexpr double kGig = 1000000000.0;
static uint64_t sum_xgmi_read(const amdsmi_gpu_metrics_t& gpu_metrics) {
uint64_t total = 0;
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
if (gpu_metrics.xgmi_read_data_acc[i] == not_supported_metrics_data){
continue;
}
total += gpu_metrics.xgmi_read_data_acc[i];
uint64_t total = 0;
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
if (gpu_metrics.xgmi_read_data_acc[i] == not_supported_metrics_data) {
continue;
}
if (total == 0){
return not_supported_metrics_data;
}
return total;
total += gpu_metrics.xgmi_read_data_acc[i];
}
if (total == 0) {
return not_supported_metrics_data;
}
return total;
}
static uint64_t sum_xgmi_write(const amdsmi_gpu_metrics_t& gpu_metrics) {
uint64_t total = 0;
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
if (gpu_metrics.xgmi_write_data_acc[i] == not_supported_metrics_data){
continue;
}
total += gpu_metrics.xgmi_write_data_acc[i];
uint64_t total = 0;
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
if (gpu_metrics.xgmi_write_data_acc[i] == not_supported_metrics_data) {
continue;
}
if (total == 0){
return not_supported_metrics_data;
}
return total;
total += gpu_metrics.xgmi_write_data_acc[i];
}
if (total == 0) {
return not_supported_metrics_data;
}
return total;
}
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
@@ -659,6 +659,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
break;
}
case RDC_FI_DEV_NAME: {
// source values from asic_info
amdsmi_asic_info_t asic_info;
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
value->type = STRING;
@@ -700,17 +701,44 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
value->value.l_int = num_pages;
}
break;
case RDC_FI_OAM_ID: {
case RDC_FI_OAM_ID:
case RDC_FI_DEV_ID:
case RDC_FI_REV_ID:
case RDC_FI_TARGET_GRAPHICS_VERSION:
case RDC_FI_NUM_OF_COMPUTE_UNITS: {
amdsmi_asic_info_t asic_info;
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
if (value->status != AMDSMI_STATUS_SUCCESS) {
break;
}
if (field_id == RDC_FI_OAM_ID) {
// 0xFFFF means not supported for OAM ID
if (asic_info.oam_id == 0xFFFF) {
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
} else {
value->value.l_int = asic_info.oam_id;
}
} else if (field_id == RDC_FI_DEV_ID) {
value->value.l_int = asic_info.device_id;
} else if (field_id == RDC_FI_REV_ID) {
value->value.l_int = asic_info.rev_id;
} else if (field_id == RDC_FI_TARGET_GRAPHICS_VERSION) {
if (asic_info.target_graphics_version == 0xFFFFFFFFFFFFFFFF) {
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
} else {
value->value.l_int = asic_info.target_graphics_version;
}
} else if (field_id == RDC_FI_NUM_OF_COMPUTE_UNITS) {
if (asic_info.num_of_compute_units == 0xFFFFFFFF) {
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
} else {
value->value.l_int = asic_info.num_of_compute_units;
}
} else {
// this should never happen as all fields are handled above
RDC_LOG(RDC_ERROR, "Unexpected field id: " << field_id);
value->status = AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
}
break;
}
@@ -726,7 +754,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
uint64_t timestamp;
value->status = amdsmi_get_utilization_count(processor_handle, utilization_counters,
kUTILIZATION_COUNTERS, &timestamp);
kUTILIZATION_COUNTERS, &timestamp);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(utilization_counters[0].value);
@@ -858,32 +886,29 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
uint32_t num_pages = 0;
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr);
if (AMDSMI_STATUS_SUCCESS == ret) {
if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
value->status = Smi2RdcError(ret);
value->type = INTEGER;
value->value.l_int = static_cast<int64_t>(num_pages);
break;
}
if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
value->status = Smi2RdcError(ret);
value->type = INTEGER;
value->value.l_int = static_cast<int64_t>(num_pages);
break;
}
if ((0 < num_pages) &&
(RDC_HEALTH_PENDING_PAGE_NUM == field_id)) {
std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages,
bad_page_info.data());
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (AMDSMI_STATUS_SUCCESS == ret) {
uint64_t pending_page_num = 0;
for (uint32_t i=0; i < num_pages; i++) {
if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status)
pending_page_num++;
}
value->value.l_int = static_cast<int64_t>(pending_page_num);
if ((0 < num_pages) && (RDC_HEALTH_PENDING_PAGE_NUM == field_id)) {
std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, bad_page_info.data());
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (AMDSMI_STATUS_SUCCESS == ret) {
uint64_t pending_page_num = 0;
for (uint32_t i = 0; i < num_pages; i++) {
if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status) pending_page_num++;
}
value->value.l_int = static_cast<int64_t>(pending_page_num);
}
}
} else
value->status = Smi2RdcError(ret);
value->status = Smi2RdcError(ret);
break;
}
@@ -61,4 +61,13 @@ if(BUILD_RVS)
TARGET ${RDC_RVS_LIB}
POST_BUILD COMMAND ${CMAKE_STRIP} ${RDC_RVS_LIB_COMPONENT}.so)
endif()
# Install RVS config files into /opt/rocm/share/rdc/conf/rvs/
#file(GLOB RDC_RVS_CONFIG_FILES "${SRC_DIR}/conf/*")
install(DIRECTORY "${SRC_DIR}/conf/"
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${RDC}/conf/rvs/
COMPONENT ${SERVER_COMPONENT})
#install(FILES ${RDC_RVS_CONFIG_FILES}
# DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${RDC}/conf
# COMPONENT ${RDC_RVS_LIB_COMPONENT})
endif()
@@ -21,15 +21,43 @@ THE SOFTWARE.
*/
#include <string.h>
#include <algorithm>
#include <filesystem>
#include "rdc/rdc.h"
#include "rdc_lib/RdcDiagnosticLibInterface.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rvs/RvsBase.h"
rdc_status_t rdc_diag_init(uint64_t) { return RDC_ST_OK; }
std::unique_ptr<amd::rdc::RdcRVSBase> rvs_p;
rdc_status_t rdc_diag_destroy() { return RDC_ST_OK; }
// Reports whether the RDC_DISABLE_RVS environment variable holds a "true"
// value. The comparison is case-insensitive and accepts exactly:
// yes, true, 1, on, y, t. An unset variable or any other value means "no".
bool is_rvs_disabled() {
  const char* raw_setting = std::getenv("RDC_DISABLE_RVS");
  if (raw_setting == nullptr) {
    return false;
  }

  // Lower-case a private copy so the environment string itself is untouched.
  std::string normalized(raw_setting);
  for (char& ch : normalized) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }

  const char* const truthy_values[] = {"yes", "true", "1", "on", "y", "t"};
  for (const char* candidate : truthy_values) {
    if (normalized == candidate) {
      return true;
    }
  }
  return false;
}
/**
 * @brief Create the module-wide RVS wrapper unless RVS is disabled.
 *
 * @retval RDC_ST_DISABLED_MODULE   RDC_DISABLE_RVS is set to a "true" value.
 * @retval RDC_ST_FAIL_LOAD_MODULE  Constructing the RVS wrapper threw.
 * @retval RDC_ST_OK                The wrapper was created and stored in rvs_p.
 */
rdc_status_t rdc_diag_init(uint64_t) {
  if (is_rvs_disabled()) {
    return RDC_ST_DISABLED_MODULE;
  }

  // Keep exceptions from construction (allocation, filesystem access, ...)
  // inside this function and report them as a status code instead.
  try {
    rvs_p = std::make_unique<amd::rdc::RdcRVSBase>();
  } catch (const std::exception& e) {
    RDC_LOG(RDC_ERROR, "failed to create the RVS module: " << e.what());
    return RDC_ST_FAIL_LOAD_MODULE;
  }
  return RDC_ST_OK;
}
// Drops the module-wide RVS wrapper held in rvs_p (a no-op when it is already
// empty) and always reports success.
rdc_status_t rdc_diag_destroy() {
  rvs_p = nullptr;
  return RDC_ST_OK;
}
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) {
@@ -37,12 +65,11 @@ rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST
return RDC_ST_BAD_PARAMETER;
}
*test_case_count = 3;
test_cases[0] = RDC_DIAG_RVS_GST_TEST;
test_cases[1] = RDC_DIAG_RVS_MEMBW_TEST;
test_cases[2] = RDC_DIAG_RVS_H2DD2H_TEST;
// Temporarily disabled due to configuration issues
// test_cases[3] = RDC_DIAG_RVS_IET_TEST;
auto test_to_conf = rvs_p->get_test_to_conf();
*test_case_count = test_to_conf.size();
for (auto& [key, value] : test_to_conf) {
*test_cases++ = key;
}
return RDC_ST_OK;
}
@@ -52,12 +79,20 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
const bool is_custom = config != nullptr && config_size != 0;
rvs_status_t rvs_status = RVS_STATUS_SUCCESS;
if (result == nullptr || gpu_count == 0) {
return RDC_ST_BAD_PARAMETER;
}
amd::rdc::RdcRVSBase rvs_base;
if (rvs_p == nullptr) {
RDC_LOG(RDC_ERROR, "rvs_p is not set!");
return RDC_ST_FAIL_LOAD_MODULE;
}
// get test_to_conf
auto test_to_conf = rvs_p->get_test_to_conf();
// init the return data
*result = {};
@@ -69,23 +104,39 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
std::string str = "RVS test [" + test_to_name.at(test_case) + "]";
callback->callback(callback->cookie, str.data());
}
// if config is given - only run one test and return
// do not care about test_case
if (is_custom) {
rvs_status = rvs_p->run_rvs_app(config, config_size + 1, callback);
if (rvs_status != RVS_STATUS_SUCCESS) {
result->status = RDC_DIAG_RESULT_FAIL;
}
return RDC_ST_OK;
}
switch (test_case) {
case RDC_DIAG_RVS_GST_TEST:
case RDC_DIAG_RVS_MEMBW_TEST:
case RDC_DIAG_RVS_H2DD2H_TEST:
case RDC_DIAG_RVS_IET_TEST: {
const std::string test_name = "Finished running " + test_to_name.at(test_case);
const std::string predefined_config = test_to_conf.at(test_case);
// +1 to copy null
strncpy_with_null(result->info, test_name.c_str(), test_name.length() + 1);
if (config == nullptr || config_size == 0) {
rvs_status = rvs_base.run_rvs_app(predefined_config.c_str(), predefined_config.length() + 1,
callback);
} else {
rvs_status = rvs_base.run_rvs_app(config, config_size, callback);
if (test_to_conf.find(test_case) == test_to_conf.end()) {
RDC_LOG(RDC_ERROR, "cannot find test " << test_to_name.at(test_case));
return RDC_ST_NOT_FOUND;
}
const std::string predefined_config = test_to_conf.at(test_case);
// +1 to copy null
strncpy_with_null(result->info, test_name.c_str(), test_name.length() + 1);
rvs_status =
rvs_p->run_rvs_app(predefined_config.c_str(), predefined_config.length() + 1, callback);
break;
}
case RDC_DIAG_RVS_CUSTOM:
RDC_LOG(RDC_ERROR, "custom config cannot be bundled with other tests!");
result->status = RDC_DIAG_RESULT_SKIP;
return RDC_ST_BAD_PARAMETER;
break;
default:
result->status = RDC_DIAG_RESULT_SKIP;
strncpy_with_null(result->info, "Not supported yet", MAX_DIAG_MSG_LENGTH);
@@ -23,19 +23,113 @@ THE SOFTWARE.
#include <string.h>
#include <string>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rvs/RvsBase.h"
#include "rvs/rvs.h"
// TODO: Make generic test
// TODO: Allow for user to override defaults with a custom string
/*
 * CHECK_RVS(STATUS, SESSION)
 *
 * Bail-out helper for functions that return rvs_status_t. When STATUS is not
 * RVS_STATUS_SUCCESS it logs the file, line and status, destroys SESSION and
 * returns the status from the enclosing function. On success it does nothing.
 *
 * STATUS is evaluated exactly once, so passing a call expression is safe.
 * The static_asserts reject arguments of the wrong type at compile time.
 */
#define CHECK_RVS(STATUS, SESSION) \
  do { \
    static_assert(std::is_same<decltype(STATUS), rvs_status_t>::value || \
                      std::is_same<decltype(STATUS), rvs_status_t&>::value || \
                      std::is_convertible<decltype(STATUS), rvs_status_t>::value, \
                  "STATUS must be of type rvs_status_t"); \
    static_assert(std::is_same<decltype(SESSION), rvs_session_id_t>::value || \
                      std::is_same<decltype(SESSION), rvs_session_id_t&>::value, \
                  "SESSION must be of type rvs_session_id_t"); \
    const rvs_status_t check_rvs_status_ = (STATUS); \
    if (check_rvs_status_ != RVS_STATUS_SUCCESS) { \
      RDC_LOG(RDC_ERROR, "RVS failed at[" << __FILE__ << ":" << __LINE__ \
                                          << "] with status: " << check_rvs_status_); \
      rvs_session_destroy((SESSION)); \
      return check_rvs_status_; \
    } \
  } while (0)
amd::rdc::RdcRVSBase* amd::rdc::RdcRVSBase::s_instance = nullptr;
namespace amd::rdc {
rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t config_size,
rdc_diag_callback_t* callback) {
RdcRVSBase* RdcRVSBase::s_instance = nullptr;
/**
 * @brief Discover the RVS config files for the first GPU and initialize RVS.
 *
 * Steps:
 *  1. Register this object as the target of the static session callback.
 *  2. Build "<rocm path>/share/rdc/conf/rvs/<name>", where <name> comes from
 *     gfx_to_rvs_conf for the target graphics version of GPU 0, or is
 *     "default" when that version is not in the map.
 *  3. Record every regular file in that directory in _rvs_config_list.
 *  4. For each entry of test_to_name, store the path of the config file whose
 *     filename equals the entry's name in _test_to_conf.
 *  5. Register the (empty) custom config entry and call rvs_initialize().
 *
 * Nothing is thrown for a missing or unreadable config directory; the problem
 * is logged and the config list simply stays empty.
 *
 * NOTE(review): when GPU 0 cannot be queried the constructor returns early,
 * so steps 3-5 (including rvs_initialize()) are skipped - confirm that this is
 * the intended behaviour.
 */
RdcRVSBase::RdcRVSBase() {
  std::string config_path(get_rocm_path("librdc.so"));
  s_instance = this;

  // these configs are installed with RDC and are mostly stripped down
  // versions of RVS configs
  config_path.append("/share/rdc/conf/rvs/");

  amdsmi_processor_handle processor_handle = nullptr;
  auto err = get_processor_handle_from_id(0, &processor_handle);
  if (err != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "get_processor_handle_from_id failed! " << err);
    return;
  }

  amdsmi_asic_info_t asic_info;
  err = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
  if (err != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "amdsmi_get_gpu_asic_info failed! " << err);
    return;
  }

  auto found_gpu = gfx_to_rvs_conf.find(asic_info.target_graphics_version);
  if (found_gpu == gfx_to_rvs_conf.end()) {
    // gpu name is not found
    RDC_LOG(RDC_INFO, "RVS couldn't match GFX version to name. Using \"default\"");
    config_path.append("default");
  } else {
    // gpu name is found - look up the name
    config_path.append(found_gpu->second);
  }

  RDC_LOG(RDC_DEBUG, "RVS CONFIG PATH: " << config_path);

  // populate configs
  // directory_iterator throws when the directory is missing or unreadable;
  // an exception leaving a constructor would abort module initialization, so
  // it is contained here and reported through the log.
  try {
    for (const auto& ent : std::filesystem::directory_iterator(config_path)) {
      if (ent.is_regular_file()) {
        _rvs_config_list.push_back(ent.path().string());
      }
    }
  } catch (const std::filesystem::filesystem_error& e) {
    RDC_LOG(RDC_ERROR, "cannot read RVS config directory: " << e.what());
  }

  // map test enums to config paths
  for (const auto& name_entry : test_to_name) {
    const rdc_diag_test_cases_t test_case = name_entry.first;
    const std::string& conf_name = name_entry.second;

    for (const auto& config_file : _rvs_config_list) {
      const std::filesystem::path file_path(config_file);
      // error handling for path
      if (!file_path.has_filename()) {
        RDC_LOG(RDC_ERROR, "RVS config path has no filename: " << config_file);
        continue;
      }

      // strip path, only keep filename
      if (conf_name == file_path.filename().string()) {
        _test_to_conf[test_case] = file_path.string();
        RDC_LOG(RDC_DEBUG, "TEST_ADDED " << conf_name << " = " << _test_to_conf[test_case]);
      }
    }
  }

  // manually add custom config
  _test_to_conf[RDC_DIAG_RVS_CUSTOM] = "";

  auto status = rvs_initialize();
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "rvs initialization failed");
  }
}
// Unregisters this object as the static-callback target, but only when it is
// still the registered instance.
RdcRVSBase::~RdcRVSBase() {
  const bool is_registered_instance = (this == s_instance);
  if (!is_registered_instance) {
    return;
  }
  s_instance = nullptr;
}
// Returns a copy of the list of RVS config file paths held by this object.
std::vector<std::string> RdcRVSBase::get_rvs_configs() {
  std::vector<std::string> configs_copy(_rvs_config_list);
  return configs_copy;
}
rvs_status_t RdcRVSBase::run_rvs_app(const char* config, const size_t config_size,
rdc_diag_callback_t* callback) {
char active_config[MAX_CONFIG_LENGTH];
rvs_session_property_t session_property = {RVS_SESSION_TYPE_DEFAULT_CONF, {{RVS_MODULE_GST}}};
rvs_session_id_t session_id;
@@ -44,9 +138,8 @@ rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t
// Meaning RDC index has no impact on RVS index.
if ((config == nullptr) || (config_size == 0)) {
RDC_LOG(RDC_INFO, "given config is NULL! Using predefined gst_config");
strncpy_with_null(active_config, test_to_conf.at(RDC_DIAG_RVS_GST_TEST).c_str(),
test_to_conf.at(RDC_DIAG_RVS_GST_TEST).length()+1);
RDC_LOG(RDC_ERROR, "given config is NULL! Cannot run tests!");
return RVS_STATUS_INVALID_ARGUMENT;
} else if (config_size > MAX_CONFIG_LENGTH) {
RDC_LOG(RDC_ERROR, "given config size is too large! Expected at most "
<< MAX_CONFIG_LENGTH << ", got " << config_size << " instead.");
@@ -56,12 +149,6 @@ rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t
strncpy_with_null(active_config, config, config_size);
}
status = rvs_initialize();
if (status == RVS_STATUS_FAILED) {
RDC_LOG(RDC_ERROR, "rvs initialization failed");
return status;
}
/*******************************/
_state = RVS_SESSION_STATE_IDLE;
@@ -71,28 +158,34 @@ rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t
_callback = callback;
status = rvs_session_create(&session_id, &RdcRVSBase::static_callback);
session_property.type = RVS_SESSION_TYPE_CUSTOM_ACTION;
CHECK_RVS(status, session_id);
session_property.type = RVS_SESSION_TYPE_CUSTOM_CONF;
session_property.custom_action.config = active_config;
status = rvs_session_set_property(session_id, &session_property);
CHECK_RVS(status, session_id);
status = rvs_session_execute(session_id);
if (status != RVS_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "RVS session execute failed with status: " << status);
rvs_session_destroy(session_id);
return status;
}
CHECK_RVS(status, session_id);
// TODO: remove?
while (_state != RVS_SESSION_STATE_COMPLETED) {
};
_callback = nullptr;
status = rvs_session_destroy(session_id);
if (status != RVS_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "RVS session destroy failed with status: " << status);
}
// this will try to destroy the session again, but it shouldn't matter
// I don't want to define a second macro.
CHECK_RVS(status, session_id);
return status;
}
// Returns a copy of the test-case -> config mapping held by this object.
std::map<rdc_diag_test_cases_t, std::string> RdcRVSBase::get_test_to_conf() {
  std::map<rdc_diag_test_cases_t, std::string> mapping_copy(_test_to_conf);
  return mapping_copy;
}
} // namespace amd::rdc
@@ -0,0 +1,51 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# BABEL test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
# Set parallel execution to false
# Set buffer size to reflect the buffer you want to test
# Set run count to 1 (test will run once)
#
actions:
- name: babel-256MiB
device: all
module: babel # Name of the module
parallel: true # Parallel true or false
count: 1 # Number of times to repeat the test from the beginning (a clean start every time)
num_iter: 5000 # Number of iterations; this many kernels are launched simultaneously to stress the system
array_size: 268435456 # Buffer size the test operates, this is 256 MiB
test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double
mibibytes: true # mibibytes (MiB) or megabytes (MB), true for MiB
o/p_csv: false # o/p as csv file
subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot
dwords_per_lane: 4 # Number of dwords per lane
chunks_per_block: 4 # Number of chunks per block
@@ -0,0 +1,174 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# Run test with testscript or binary:
#
# Using Testscript -
# cd /opt/rocm/share/rocm-validation-suite/testscripts
# sudo ./gpup.new.sh
#
# Using Binary -
# cd /opt/rocm/share/rocm-validation-suite/conf
# cd /opt/rocm/bin
# sudo ./rvs -c /opt/rocm/share/rocm-validation-suite/conf/gpup_single.conf
#
# Note: Paths may vary with the ROCm version or ROCm installation path.
# GPUP test #1
#
# Preconditions:
# all AMD compatible GPUs
# all types of devices
# all gpu properties, all io_links properties
#
# Expected result:
# Test passes with displaying all properties values for any GPUs
actions:
- name: RVS-GPUP-TC1
device: all
module: gpup
properties:
all:
io_links-properties:
all:
# GPUP test #2
#
# Preconditions:
# all AMD compatible GPUs
# all types of devices
# no regular expressions
# only a subset of gpu properties, only a subset of io_link properties
#
# Expected result:
# Test passes with displaying subsets of properties and io_link properties values for any GPUs
- name: RVS-GPUP-TC2
device: all
module: gpup
properties:
simd_count:
mem_banks_count:
io_links_count:
vendor_id:
location_id:
max_engine_clk_ccompute:
io_links-properties:
version_major:
type:
version_major:
version_minor:
node_from:
node_to:
recommended_transfer_size:
flags:
# GPUP test #3
#
# Preconditions:
# only a subset of AMD compatible GPUs (device filtering)
# all types of devices
# all gpu properties, all io_link properties
#
# Expected result:
# Test passes with displaying all properties and io_link properties values for subset of GPUs
#
# Note:
# Testing specific device, if device numbers are changed in system it should be changed in the test
- name: RVS-GPUP-TC3
device: all
module: gpup
properties:
all:
io_links-properties:
all:
# GPUP test #4
#
# Preconditions:
# all AMD compatible GPUs
# a given device type (deviceid filtering), this must be filled based on deviceid in sysfs/ ./rvs -g.
# Default is 0=> no filtering
# all gpu properties, all io_link properties
#
# Expected result:
# Test passes with displaying all properties and io_link properties values for all GPUs and given deviceid
- name: RVS-GPUP-TC4
device: all
module: gpup
deviceid: 0
properties:
all:
io_links-properties:
all:
# GPUP test #5
#
# Preconditions:
# only a subset of AMD compatible GPUs (device filtering)
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
# Default is 0=> no filtering
# all gpu properties, all io_link properties
#
# Expected result:
# Test passes with displaying all properties and io_link properties values for subset of GPUs and given deviceid
#
# Note:
# Testing specific device, if device numbers are changed in system it should be changed in the test
- name: RVS-GPUP-TC5
device: all
module: gpup
deviceid: 0
properties:
all:
io_links-properties:
all:
# GPUP test #6
#
# Preconditions:
# only a subset of AMD compatible GPUs (device filtering)
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
# Default is 0=> no filtering
# only a subset of gpu properties, only a subset of io_link properties
#
# Expected result:
# Test passes with displaying subset of properties and io_link properties values for subset of GPUs and given deviceid
#
# Note:
# Testing specific device, if device numbers are changed in system it should be changed in the test
- name: RVS-GPUP-TC6
device: all
module: gpup
deviceid: 0
properties:
mem_banks_count:
io_links-properties:
version_major:
@@ -0,0 +1,132 @@
# ################################################################################
# #
# # Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# GST test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to false
# Set matrix_size to 8640 (for Vega 10 cards). For Vega 20, the recommended matrix_size is 8640
# Set run count to 2 (each test will run twice)
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
#
# Run test with:
# cd bin
# sudo ./rvs -c conf/gst_1.conf -d 3
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves 5000 gflops
# in maximum 7 seconds and then the GPU sustains the gflops
# for the rest of the test duration (total duration is 18 seconds).
# A single Gflops violation (with a 7% tolerance) is allowed.
# FALSE otherwise
actions:
- name: gpustress-41000-fp32-false
device: all
module: gst
parallel: true
count: 1
duration: 10000
copy_matrix: false
target_stress: 41000
matrix_size_a: 28000
matrix_size_b: 28000
matrix_size_c: 28000
data_type: fp32_r
lda: 28000
ldb: 28000
ldc: 28000
blas_source: hipblaslt
- name: gpustress-30000-dgemm-false
device: all
module: gst
parallel: true
count: 1
#hot_calls: 1000
duration: 15000
copy_matrix: false
target_stress: 30000
matrix_size_a: 8192
matrix_size_b: 8192
matrix_size_c: 8192
matrix_init: trig
ops_type: dgemm
lda: 8192
ldb: 8192
ldc: 8192
- name: gst-8096-150000-fp16
device: all
module: gst
parallel: true
log_interval: 3000
ramp_interval: 5000
duration: 15000
copy_matrix: false
target_stress: 150000
matrix_size_a: 8096
matrix_size_b: 8096
matrix_size_c: 8096
data_type: fp16_r
lda: 8096
ldb: 8096
ldc: 8096
ldd: 8096
transa: 1
transb: 0
alpha: 1
beta: 0
blas_source: hipblaslt
- name: gst-160Tflops-8K8K8K-rand-i8
device: all
module: gst
parallel: true
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 500
copy_matrix: false
target_stress: 160000
matrix_size_a: 8192
matrix_size_b: 8192
matrix_size_c: 8192
matrix_init: rand
data_type: i8_r
lda: 8192
ldb: 8192
ldc: 8192
transa: 1
transb: 0
alpha: 1
beta: 0
blas_source: hipblaslt
@@ -0,0 +1,146 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: action_1
device: all
module: iet
parallel: true
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 700
log_interval: 700
max_violations: 1
target_power: 300
tolerance: 0.06
matrix_size: 8640
ops_type: dgemm
- name: action_2
device: all
module: iet
parallel: true
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 1500
log_interval: 2000
max_violations: 1
target_power: 300
tolerance: 0.2
matrix_size: 8640
ops_type: dgemm
- name: action_3
device: all
module: iet
parallel: false
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 500
log_interval: 500
max_violations: 1
target_power: 300
tolerance: 0.1
matrix_size: 8640
ops_type: dgemm
# IET test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPU IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify them as the device value
# Set parallel execution to true
# Set matrix_size to 8640 (for Vega 10 cards). For Vega 20, the recommended matrix_size is 8640
# Set run count to 2 (each test will run twice)
#
# Run test with:
# cd bin
# sudo ./rvs -c conf/iet4.conf -d 3
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches 150W
# in maximum 5 seconds and then the GPU sustains the same power
# for the rest of the test duration (total duration is 10 seconds).
# A single power violation (with a 10% tolerance) is allowed.
# FALSE otherwise
- name: action_4
device: all
module: iet
parallel: true
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 500
log_interval: 500
max_violations: 1
target_power: 300
tolerance: 0.1
matrix_size: 8640
ops_type: sgemm
# IET test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPU IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify them as the device value
# Set parallel execution to false
# Set matrix_size to 8640 (for Vega 10 cards). For Vega 20, the recommended matrix_size is 8640
# Set run count to 2 (each test will run twice)
#
# Run test with:
# cd bin
# sudo ./rvs -c conf/iet5.conf -d 3
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches 50W
# in maximum 5 seconds and then the GPU sustains the same power
# for the rest of the test duration (total duration is 10 seconds).
# A single power violation (with a 10% tolerance) is allowed.
# FALSE otherwise
- name: action_5
device: all
module: iet
parallel: false
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 1500
log_interval: 2000
max_violations: 1
target_power: 300
tolerance: 0.1
matrix_size: 8640
ops_type: sgemm
@@ -0,0 +1,182 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: action_1
device: all
module: pbqt
log_interval: 800
duration: 5000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
block_size: 1000000 2000000 10000000
device_id: all
- name: action_2
device: all
module: pbqt
log_interval: 1000
count: 3
duration: 10000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_3
device: all
module: pbqt
log_interval: 800
duration: 4000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_4
device: all
module: pbqt
log_interval: 1000
duration: 5000
count: 1
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_5
device: all
module: pbqt
log_interval: 800
duration: 4000
count: 1
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_6
device: all
module: pbqt
log_interval: 800
duration: 8000
count: 1
peers: all
test_bandwidth: true
bidirectional: false
parallel: false
device_id: all
- name: action_7
device: all
module: pbqt
peers: all
count: 1
test_bandwidth: false
device_id: all
- name: action_8
device: all
module: pbqt
peers: all
test_bandwidth: true
bidirectional: true
parallel : true
device_id: all
- name: action_9
device: all
module: pbqt
log_interval: 500
duration: 1000
peers: all
test_bandwidth: true
bidirectional: false
parallel: true
device_id: all
- name: action_10
device: all
module: pbqt
log_interval: 500
duration: 1000
peers: all
peer_device_id: all
test_bandwidth: true
bidirectional: false
parallel: true
- name: action_11
device: all
module: pbqt
log_interval: 0
duration: 10000
peers: all
peer_device_id: all
test_bandwidth: true
bidirectional: true
parallel: false
device_id: all
- name: action_12
device: all
module: pbqt
log_interval: 0
duration: 1000
count: 3
wait: 1000
peers: all
peer_device_id: all
test_bandwidth: true
bidirectional: true
parallel: true
- name: action_13
device: all
module: pbqt
log_interval: 1000
duration: 10000
peers: all
device_id: all
peer_device_id: all
test_bandwidth: true
bidirectional: true
parallel: true
- name: action_14
device: all
module: pbqt
log_interval: 500
duration: 10000
peers: all
test_bandwidth: true
bidirectional: true
device_id: all
@@ -0,0 +1,236 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# PEBB test #1
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
#
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test1.conf -d 3
#
actions:
- name: h2d-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 50000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 2 # PCIe
# PEBB test #2
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. device to host
#
# Run test with :
# cd bin
# ./rvs -c conf/pebb_test2.conf -d 3
#
- name: d2h-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 2 # PCIe
# PEBB test #3
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
#
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test3.conf -d 3
#
- name: h2d-d2h-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 2 # PCIe
# PEBB test #4
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
# 4. parallel transfers
# 5. random block sizes
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test4.conf -d 3
- name: h2d-parallel-xMB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
link_type: 2 # PCIe
# PEBB test #5
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. device to host
# 4. parallel transfers
# 5. random block sizes
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test5.conf -d 3
- name: d2h-parallel-xMB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
link_type: 2 # PCIe
# PEBB test #6
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
# 4. parallel transfers
# 5. random block sizes
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test6.conf -d 3
- name: h2d-d2h-xMB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
link_type: 2 # PCIe
# PEBB test #7
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
# 4. parallel transfers
# 5. back-to-back 51MB
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test7.conf -d 3
- name: h2d-b2b-51MB
device: all
module: pebb
log_interval: 800
duration: 34000
device_to_host: false
host_to_device: true
b2b_block_size: 51200
parallel: false
link_type: 2 # PCIe
# PEBB test #8
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host-to-device and device-to-host
# 4. parallel back-to-back transfers
# 5. back-to-back 51MB
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test8.conf -d 3
- name: d2h-b2b-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
b2b_block_size: 51200
parallel: true
link_type: 2 # PCIe
# PEBB test #9
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
# 4. PCIe only
# 5. parallel back-to-back transfers
# Run test with:
# cd bin
# ./rvs -c conf/pebb_test9.conf -d 3
- name: h2d-d2h-b2b-51MB
device: all
module: pebb
log_interval: 800
duration: 34000
device_to_host: true
host_to_device: true
b2b_block_size: 51200
parallel: false
link_type: 2 # PCIe
@@ -0,0 +1,91 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# TST test
#
# Preconditions:
# Set device to all and execution as sequential.
# Workload set as dgemm operations with matrix size as 8640.
# Throttle temperature set as 100 degree celsius.
#
# Run test with:
# ./rvs -c conf/tst.conf -d 3
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU junction temperature
# reaches the target temperature. Whether it reaches the throttle temperature
# during the test duration is also monitored.
#
actions:
- name: action_1
device: all
device_index: all
module: tst
parallel: false
count: 1
wait: 100
duration: 30000
ramp_interval: 10000
sample_interval: 2000
log_interval: 2000
max_violations: 1
throttle_temp: 100
target_temp: 50
tolerance: 0.06
matrix_size: 8640
ops_type: dgemm
# TST test
#
# Preconditions:
# Set device to all and execution in parallel.
# Workload set as dgemm operations with matrix size as 8640.
# Throttle temperature set as 100 degree celsius.
#
# Run test with:
# ./rvs -c conf/tst.conf -d 3
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU junction temperature
# reaches the target temperature. Whether it reaches the throttle temperature
# during the test duration is also monitored.
#
- name: action_2
device: all
device_index: all
module: tst
parallel: true
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 700
log_interval: 700
target_temp: 50
throttle_temp: 100
tolerance: 0.06
matrix_size: 8640
ops_type: sgemm
@@ -0,0 +1,63 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# IET stress test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by comma.
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
# Set gemm operation type as dgemm.
# Set matrix_size to 28000.
# Test duration set to 10 mins.
# Target power set to 550W for each GPU.
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/iet_stress.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves power target of 550W.
#
actions:
- name: iet-stress-550W-dgemm-true
device: all
module: iet
parallel: true
duration: 60000
ramp_interval: 10000
sample_interval: 3000
log_interval: 3000
target_power: 550
matrix_size: 28000
ops_type: dgemm
lda: 28000
ldb: 28000
ldc: 28000
alpha: 1
beta: 1
@@ -0,0 +1,229 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# PEBB test #1
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
actions:
- name: h2d-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 50000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 4 # XGMI
# PEBB test #2
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. device to host
#
# Run test with :
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: d2h-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 4 # XGMI
# PEBB test #3
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: h2d-d2h-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 4 # XGMI
# PEBB test #4
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
# 4. parallel transfers
# 5. random block sizes
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: h2d-parallel-xMB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
link_type: 4 # XGMI
# PEBB test #5
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. device to host
# 4. parallel transfers
# 5. random block sizes
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: d2h-parallel-xMB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
link_type: 4 # XGMI
# PEBB test #6
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
# 4. parallel transfers
# 5. random block sizes
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: h2d-d2h-xMB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
link_type: 4 # XGMI
# PEBB test #7
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
# 4. parallel transfers
# 5. back-to-back 51MB
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: h2d-b2b-51MB
device: all
module: pebb
log_interval: 800
duration: 34000
device_to_host: false
host_to_device: true
b2b_block_size: 51200
parallel: false
link_type: 4 # XGMI
# PEBB test #8
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host-to-device and device-to-host
# 4. parallel back-to-back transfers
# 5. back-to-back 51MB
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: d2h-b2b-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
b2b_block_size: 51200
parallel: true
link_type: 4 # XGMI
# PEBB test #9
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
# 4. XGMI only
# 5. parallel back-to-back transfers
# Run test with:
# cd bin
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
#
- name: h2d-d2h-b2b-51MB
device: all
module: pebb
log_interval: 800
duration: 34000
device_to_host: true
host_to_device: true
b2b_block_size: 51200
parallel: false
link_type: 4 # XGMI
@@ -0,0 +1,49 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# BABEL test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
# Set parallel execution to false
# Set buffer size to reflect the buffer you want to test
# Set run count to 1 (test will run once)
#
actions:
- name: babel-float-256MiB
device: all
module: babel # Name of the module
parallel: false # Parallel true or false
count: 1 # Number of times to repeat the test from the beginning (a clean start every time)
num_iter: 5000 # Number of iterations; this many kernels are launched simultaneously to stress the system
array_size: 268435456 # Buffer size the test operates on; this is 256 MiB
test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double
mibibytes: true # mebibytes (MiB) or megabytes (MB), true for MiB
o/p_csv: false # output as CSV file
subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot
@@ -0,0 +1,94 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: gst-1000Tflops-8KB-fp8_r-false
device: all
module: gst
parallel: false
count: 1
duration: 30000
copy_matrix: false
target_stress: 1000000
matrix_size_a: 8192
matrix_size_b: 8192
matrix_size_c: 8192
data_type: fp8_r
transa: 1
transb: 0
alpha: 1
beta: 0
- name: gst-1000Tflops-8KB-fp8_r-true
device: all
module: gst
parallel: true
count: 1
duration: 60000
copy_matrix: false
target_stress: 1000000
matrix_size_a: 8192
matrix_size_b: 8192
matrix_size_c: 8192
data_type: fp8_r
transa: 1
transb: 0
alpha: 1
beta: 0
- name: gst-500Tflops-4KB-bf16_r-false
device: all
module: gst
parallel: false
count: 1
duration: 30000
copy_matrix: false
target_stress: 500000
matrix_size_a: 4096
matrix_size_b: 4096
matrix_size_c: 8192
data_type: bf16_r
transa: 1
transb: 0
alpha: 1
beta: 0
- name: gst-500Tflops-4KB-bf16_r-true
device: all
module: gst
parallel: true
count: 1
duration: 60000
copy_matrix: false
target_stress: 500000
matrix_size_a: 4096
matrix_size_b: 4096
matrix_size_c: 8192
data_type: bf16_r
transa: 1
transb: 0
alpha: 1
beta: 0
@@ -0,0 +1,181 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# GST self-check & accuracy-check test - gst-3K-sgemm-check
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set matrices sizes to 3072 * 3072 * 3072
# Set gemm operation as sgemm
# Set matrix data initialization method as random integer
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
# Set target stress GFLOPS as 100000 (100 TFLOPS)
# Set self-check gemm self checking as enabled
# Set accu-check gemm accuracy checking as enabled (applicable for sgemm & dgemm only)
# Set error-inject gemm error injection as enabled (For TEST purpose only)
# Set error-freq error injection frequency as 2 (For TEST purpose only)
# Set error-count error injection count as 1 (For TEST purpose only)
#
# Expected result:
# Report self-check and accu-error at regular intervals as per set error parameters.
# The test on each GPU passes (TRUE) if the GPU achieves 100 TFLOPS or more
# within the test duration of 2 mins after ramp-up duration of 5 seconds.
# Else test on the GPU fails (FALSE).
actions:
- name: gst-3K-sgemm-check
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 120000
hot_calls: 1
copy_matrix: false
target_stress: 100000
matrix_size_a: 3072
matrix_size_b: 3072
matrix_size_c: 3072
matrix_init: rand
ops_type: sgemm
lda: 3072
ldb: 3072
ldc: 3072
transa: 1
transb: 0
alpha: 1
beta: 0
self_check: true
accuracy_check: true
error_inject: true
error_freq: 2
error_count: 1
- name: gst-3K-dgemm-check
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 120000
hot_calls: 1
copy_matrix: false
target_stress: 85000
matrix_size_a: 3072
matrix_size_b: 3072
matrix_size_c: 3072
matrix_init: rand
ops_type: dgemm
lda: 3072
ldb: 3072
ldc: 3072
transa: 1
transb: 0
alpha: 1
beta: 0
self_check: true
accuracy_check: true
error_inject: true
error_freq: 2
error_count: 1
- name: gst-3K-fp16-check
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 30000
hot_calls: 1
copy_matrix: false
target_stress: 150000
matrix_size_a: 3072
matrix_size_b: 3072
matrix_size_c: 3072
matrix_init: rand
data_type: fp16_r
lda: 3072
ldb: 3072
ldc: 3072
transa: 1
transb: 0
alpha: 1
beta: 0
self_check: true
error_inject: true
error_freq: 2
error_count: 1
- name: gst-3K-bf16-check
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 30000
hot_calls: 1
copy_matrix: false
target_stress: 250000
matrix_size_a: 3072
matrix_size_b: 3072
matrix_size_c: 3072
matrix_init: rand
data_type: bf16_r
lda: 3072
ldb: 3072
ldc: 3072
transa: 1
transb: 0
alpha: 1
beta: 0
self_check: true
error_inject: true
error_freq: 2
error_count: 1
- name: gst-3K-fp8-check
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 30000
hot_calls: 1
copy_matrix: false
target_stress: 300000
matrix_size_a: 3072
matrix_size_b: 3072
matrix_size_c: 3072
matrix_init: rand
data_type: fp8_r
lda: 3072
ldb: 3072
ldc: 3072
transa: 1
transb: 0
alpha: 1
beta: 0
self_check: true
error_inject: true
error_freq: 2
error_count: 1
@@ -0,0 +1,186 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# GST test - gst-1215Tflops-4K4K8K-rand-fp8
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set matrices sizes to 4864 * 4096 * 8192
# Set matrix data type as fp8 real number
# Set matrix data initialization method as random integer
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
# Set target stress GFLOPS as 1215000 (1215 TFLOPS)
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves 1215 TFLOPS or more
# within the test duration of 15 seconds after ramp-up duration of 5 seconds.
# Else test on the GPU fails (FALSE).
actions:
- name: gst-1215Tflops-4K4K8K-rand-fp8
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 1000
copy_matrix: false
target_stress: 1215000
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: rand
data_type: fp8_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-981Tflops-4K4K8K-trig-fp8
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 1000
copy_matrix: false
target_stress: 981000
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: trig
data_type: fp8_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-639Tflops-4K4K8K-rand-fp16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 1000
copy_matrix: false
target_stress: 639000
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: rand
data_type: fp16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-523Tflops-4K4K8K-trig-fp16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 1000
copy_matrix: false
target_stress: 523000
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: trig
data_type: fp16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-581Tflops-4K4K8K-rand-bf16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 1000
copy_matrix: false
target_stress: 581000
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: rand
data_type: bf16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-552Tflops-4K4K8K-trig-bf16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 1000
copy_matrix: false
target_stress: 552000
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: trig
data_type: bf16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
@@ -0,0 +1,63 @@
# ################################################################################
# #
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# GST test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to true (workload execution on all GPUs in parallel)
# Set matrix_size to 28000.
# Set run count to 1 (each test will run once)
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
#
# Run test with:
# cd bin
# ./rvs -c conf/gst_stress.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves 50000 gflops
actions:
- name: gpustress-50000-dgemm-true
device: all
module: gst
parallel: true
count: 1
duration: 60000
copy_matrix: false
target_stress: 50000
matrix_size_a: 28000
matrix_size_b: 28000
matrix_size_c: 28000
ops_type: dgemm
lda: 28000
ldb: 28000
ldc: 28000
alpha: 1
beta: 1
matrix_init: hiprand
@@ -0,0 +1,118 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# Test #1 - iet-400W-1K-rand-dgemm
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to true
# Set matrix_size to 1024 for dgemm operations
# Set target power to 400 Watts
# Set test duration to 1 min
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/iet_single.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches at least 400 Watts,
# FALSE otherwise
actions:
- name: iet-400W-1K-rand-dgemm
device: all
module: iet
parallel: true
duration: 60000
sample_interval: 3000
target_power: 400
matrix_size: 1024
matrix_init: rand
ops_type: dgemm
# Test #2 - iet-wait-750W-28K-rand-dgemm
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to true
# Set matrix_size to 28000 for dgemm operations
# Set target power to 750 Watts
# Set wait duration to 30 seconds (GPU idle period)
# Set test duration to 1 min
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/iet_single.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches at least 750 Watts,
# FALSE otherwise
- name: iet-wait-750W-28K-rand-dgemm
device: all
module: iet
parallel: true
wait: 30000 # Wait for 30 secs before the test starts
duration: 60000
sample_interval: 3000
target_power: 750
matrix_size: 28000
matrix_init: hiprand
ops_type: dgemm
# Test #3 - iet-wait-400W-1K-rand-dgemm
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to true
# Set matrix_size to 1024 for dgemm operations
# Set target power to 400 Watts
# Set wait duration to 30 seconds (GPU idle period)
# Set test duration to 1 min
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/iet_single.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches at least 400 Watts,
# FALSE otherwise
- name: iet-wait-400W-1K-rand-dgemm
device: all
module: iet
parallel: true
wait: 30000 # Wait for 30 secs before the test starts
duration: 60000
sample_interval: 3000
log_interval: 3000
target_power: 400
matrix_size: 1024
matrix_init: rand
ops_type: dgemm
@@ -0,0 +1,64 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# IET stress test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space.
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
# Set gemm operation type as dgemm.
# Set matrix_size to 28000.
# Test duration set to 1 min.
# Target power set to 750W for each GPU.
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/iet_stress.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves power target of 750W.
#
actions:
- name: iet-stress-750W-dgemm-true
device: all
module: iet
parallel: true
duration: 60000
ramp_interval: 10000
sample_interval: 5000
log_interval: 5000
target_power: 750
matrix_size: 28000
ops_type: dgemm
lda: 28000
ldb: 28000
ldc: 28000
alpha: 1
beta: 1
matrix_init: hiprand
@@ -0,0 +1,98 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: p2p-unidir-sequential-64MB
device: all
module: pbqt
log_interval: 5000
duration: 60000
peers: all
test_bandwidth: true
bidirectional: false
parallel: false
block_size: 67108864
device_id: all
- name: p2p-unidir-parallel-64MB
device: all
module: pbqt
log_interval: 5000
duration: 60000
peers: all
test_bandwidth: true
bidirectional: false
parallel: true
block_size: 67108864
device_id: all
- name: p2p-bidir-sequential-64MB
device: all
module: pbqt
log_interval: 5000
duration: 60000
peers: all
test_bandwidth: true
bidirectional: true
parallel: false
block_size: 67108864
device_id: all
- name: p2p-bidir-parallel-64MB
device: all
module: pbqt
log_interval: 5000
duration: 60000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
block_size: 67108864
device_id: all
- name: p2p-bidir-sequential-64-128-256MB
device: all
module: pbqt
log_interval: 5000
duration: 60000
peers: all
test_bandwidth: true
bidirectional: true
parallel: false
block_size: 67108864 134217728 268435456
device_id: all
- name: p2p-bidir-parallel-64-128-256MB
device: all
module: pbqt
log_interval: 5000
duration: 60000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
block_size: 67108864 134217728 268435456
device_id: all
@@ -0,0 +1,95 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# PEBB test #1
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. device to host
# 4. Transfer block size 64MB
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/pebb_single.conf
#
actions:
- name: d2h-sequential-64MB
device: all
module: pebb
duration: 60000
device_to_host: true
host_to_device: false
parallel: false
block_size: 67108864
link_type: 2 # PCIe
# PEBB test #2
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device
# 4. Transfer block size 64MB
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/pebb_single.conf
#
- name: h2d-sequential-64MB
device: all
module: pebb
duration: 60000
device_to_host: false
host_to_device: true
parallel: false
block_size: 67108864
link_type: 2 # PCIe
# PEBB test #3
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. host to device & device to host
# 4. Transfer block size 64MB
#
# Run test with:
# cd bin
# ./rvs -c conf/MI300X/pebb_single.conf
#
- name: h2d-d2h-sequential-64MB
device: all
module: pebb
duration: 60000
device_to_host: true
host_to_device: true
parallel: false
block_size: 67108864
link_type: 2 # PCIe
warm_calls: 10
hot_calls: 100
b2b: true
@@ -0,0 +1,51 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# BABEL test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
# Set parallel execution to false
# Set buffer size to reflect the buffer you want to test
# Set run count to 1 (test will run once)
#
actions:
- name: babel-float-256MiB
device: all
module: babel # Name of the module
parallel: false # Parallel true or false
count: 1 # Number of times to repeat the test from the beginning (a clean start every time)
num_iter: 5000 # Number of iterations; this many kernels are launched simultaneously to stress the system
array_size: 268435456 # Buffer size the test operates on; this is 256 MiB
test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double
mibibytes: true # mebibytes (MiB) or megabytes (MB), true for MiB
o/p_csv: false # output as CSV file
subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot
dwords_per_lane: 4 # Number of dwords per lane
chunks_per_block: 4 # Number of chunks per block
@@ -0,0 +1,256 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# GST test - gst-96Tflops-8K12K4K-trig-tf32
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set matrices sizes to 8192 * 12288 * 4096
# Set matrix data type as fp32 real number
# Set compute type as tf32 (xf32)
# Set matrix data initialization method as trigonometric float
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
# Set target stress as 96000 GFLOPS (96 TFLOPS)
# Set blas source (backend) as hipblaslt
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves 96 TFLOPS or more
# within the test duration of 15 seconds after ramp-up duration of 5 seconds.
# Else test on the GPU fails (FALSE).
actions:
- name: gst-96Tflops-8K12K4K-trig-tf32
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 50
copy_matrix: false
target_stress: 96000
matrix_size_a: 8192
matrix_size_b: 12288
matrix_size_c: 4096
matrix_init: trig
data_type: fp32_r
compute_type: xf32_r
transa: 0
transb: 0
alpha: 1
beta: 1
blas_source: hipblaslt
parallel: true
- name: gst-406Tflops-8K13K17K-trig-i8
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 500
copy_matrix: false
target_stress: 406000
matrix_size_a: 8192
matrix_size_b: 13312
matrix_size_c: 17792
matrix_init: trig
data_type: i8_r
compute_type: i32_r
transa: 1
transb: 0
alpha: 1
beta: 0
blas_source: hipblaslt
parallel: true
- name: gst-26Tflops-8K8K8K-trig-fp32
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 100
copy_matrix: false
target_stress: 26000
matrix_size_a: 8192
matrix_size_b: 8960
matrix_size_c: 8192
matrix_init: trig
data_type: fp32_r
compute_type: fp32_r
transa: 0
transb: 0
alpha: 1
beta: 1
blas_source: hipblaslt
parallel: true
- name: gst-343Tflops-4K4K8K-rand-fp8
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 3000
copy_matrix: false
target_stress: 343415
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: rand
data_type: fp8_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-336Tflops-4K4K8K-trig-fp8
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 170000
copy_matrix: false
target_stress: 336441
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: trig
data_type: fp8_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-176Tflops-4K4K8K-rand-fp16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 3000
copy_matrix: false
target_stress: 176191
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: rand
data_type: fp16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-172Tflops-4K4K8K-trig-fp16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 90000
copy_matrix: false
target_stress: 172333
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: trig
data_type: fp16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-174Tflops-4K4K8K-rand-bf16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 3000
copy_matrix: false
target_stress: 174364
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: rand
data_type: bf16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
- name: gst-172Tflops-4K4K8K-trig-bf16
device: all
module: gst
log_interval: 3000
ramp_interval: 5000
duration: 15000
hot_calls: 90000
copy_matrix: false
target_stress: 172333
matrix_size_a: 4864
matrix_size_b: 4096
matrix_size_c: 8192
matrix_init: trig
data_type: bf16_r
lda: 8320
ldb: 8320
ldc: 4992
ldd: 4992
transa: 1
transb: 0
alpha: 1
beta: 0
parallel: true
@@ -0,0 +1,70 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# GST thermal test - gst-thermal-dgemm-true
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set matrices sizes to 8640 * 8640 * 8640
# Set matrices batch size to 96
# Set gemm operation type as dgemm real
# Set gemm operation mode as batched strided gemm
# Set matrix data initialization method as hip random integer
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
# Set target stress GFLOPS as 24700 GFLOPS (~24.7 TFLOPS)
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves 24.7 TFLOPS or more
# within the test duration of 1 min after ramp-up duration of 10 seconds.
# Else test on the GPU fails (FALSE).
actions:
- name: gst-thermal-dgemm-true
device: all
module: gst
parallel: true
log_interval: 5000
ramp_interval: 10000
duration: 60000
copy_matrix: false
target_stress: 24700
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
lda: 8640
ldb: 8640
ldc: 8640
ldd: 8640
gemm_mode: strided_batched
batch_size: 96
matrix_init: hiprand
ops_type: dgemm
transa: 0
transb: 1
alpha: 2.71828
beta: 3.14159
@@ -0,0 +1,110 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# Test #1 - iet-260W-1K-rand-dgemm
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to true
# Set matrix_size to 1024 for dgemm operations
# Set target power to 260 Watts
# Set test duration to 1 min
#
# Run test with:
# cd bin
# ./rvs -c conf/MI308X/iet_single.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches at least 260 Watts,
# FALSE otherwise
actions:
- name: iet-260W-1K-rand-dgemm
device: all
module: iet
parallel: true
duration: 60000
sample_interval: 1000
target_power: 260
matrix_size: 1024
matrix_init: rand
ops_type: dgemm
# Test #2 - iet-wait-350W-8K-rand-dgemm
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
# Set parallel execution to true
# Set matrix_size to 8096 for dgemm operations
# Set target power to 350 Watts
# Set wait duration to 30 seconds (GPU idle period)
# Set test duration to 1 min
#
# Run test with:
# cd bin
# ./rvs -c conf/MI308X/iet_single.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU power reaches at least 350 Watts,
# FALSE otherwise
- name: iet-wait-350W-8K-rand-dgemm
device: all
module: iet
parallel: true
wait: 30000 # Wait for 30 secs before the test starts
duration: 60000
sample_interval: 1000
target_power: 350
matrix_size: 8096
matrix_init: rand
ops_type: dgemm
- name: iet-wait-420W-16K-rand-dgemm
device: all
module: iet
parallel: true
wait: 30000 # Wait for 30 secs before the test starts
duration: 60000
sample_interval: 1000
target_power: 420
matrix_size: 16182
matrix_init: rand
ops_type: dgemm
- name: iet-wait-stress-650W-bw
device: all
module: iet
parallel: true
wait: 30000 # Wait for 30 secs before the test starts
duration: 60000
sample_interval: 1000
target_power: 650
bw_workload: true
cp_workload: false
tolerance: 0.05
@@ -0,0 +1,58 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# IET stress test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space.
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
# Test duration set to 1 min.
# Target power set to 650W for each GPU.
# Tolerance set to 5% of target power.
#
# Run test with:
# cd bin
# ./rvs -c conf/MI308X/iet_stress.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves power target of 650W.
#
actions:
- name: iet-stress-650W-true
device: all
module: iet
parallel: true
duration: 60000
ramp_interval: 1000
sample_interval: 5000
log_interval: 5000
target_power: 650
tolerance: 0.05
bw_workload: true
cp_workload: false
@@ -0,0 +1,69 @@
# ################################################################################
# #
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# IET thermal test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space.
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
# Test duration set to 1 min.
# Target power set to 600W for each GPU.
#
# Run test with:
# cd bin
# ./rvs -c conf/MI308X/iet_thermal.conf
#
# Expected result:
# The test on each GPU passes (TRUE) if the GPU achieves power target of 600W.
#
actions:
- name: iet-thermal-dgemm-true
device: all
module: iet
parallel: true
sample_interval: 5000
ramp_interval: 20000
duration: 60000
copy_matrix: false
target_power: 600
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
lda: 8640
ldb: 8640
ldc: 8640
ldd: 8640
gemm_mode: strided_batched
batch_size: 96
matrix_init: hiprand
ops_type: dgemm
transa: 0
transb: 1
alpha: 2.71828
beta: 3.14159
@@ -0,0 +1 @@
nv21
@@ -0,0 +1 @@
../nv31/gpup_single.conf
@@ -0,0 +1,41 @@
# ################################################################################
# #
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: gpustress-9000-sgemm-false
device: all
module: gst
parallel: true
count: 1
duration: 10000
copy_matrix: false
target_stress: 6000
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
ops_type: sgemm
lda: 8640
ldb: 8640
ldc: 8640
@@ -0,0 +1 @@
../nv31/gst_stress_3_hrs.conf
@@ -0,0 +1 @@
../nv31/iet_stress.conf
@@ -0,0 +1 @@
../nv31/mem.conf
@@ -0,0 +1 @@
../nv31/pbqt_single.conf
@@ -0,0 +1 @@
../nv31/pebb_single.conf
@@ -0,0 +1 @@
../nv31/peqt_single.conf
@@ -0,0 +1 @@
../nv31/pesm_1.conf
@@ -0,0 +1 @@
../nv31/rcqt_single.conf
@@ -0,0 +1,174 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# Run test with testscript or binary:
#
# Using Testscript -
# cd /opt/rocm/share/rocm-validation-suite/testscripts
# sudo ./gpup.new.sh
#
# Using Binary -
# cd /opt/rocm/share/rocm-validation-suite/conf
# cd /opt/rocm/bin
# sudo ./rvs -c /opt/rocm/share/rocm-validation-suite/conf/gpup_single.conf
#
# Note: Paths may vary with the ROCm version or ROCm installation path.
# GPUP test #1
#
# Preconditions:
# all AMD compatible GPUs
# all types of devices
# all gpu properties, all io_links properties
#
# Expected result:
# Test passes with displaying all properties values for any GPUs
actions:
- name: RVS-GPUP-TC1
device: all
module: gpup
properties:
all:
io_links-properties:
all:
# GPUP test #2
#
# Preconditions:
# all AMD compatible GPUs
# all types of devices
# no regular expressions
# only a subset of gpu properties, only a subset of io_link properties
#
# Expected result:
# Test passes with displaying subsets of properties and io_link properties values for any GPUs
- name: RVS-GPUP-TC2
device: all
module: gpup
properties:
simd_count:
mem_banks_count:
io_links_count:
vendor_id:
location_id:
max_engine_clk_ccompute:
io_links-properties:
version_major:
type:
version_major:
version_minor:
node_from:
node_to:
recommended_transfer_size:
flags:
# GPUP test #3
#
# Preconditions:
# only a subset of AMD compatible GPUs (device filtering)
# all types of devices
# all gpu properties, all io_link properties
#
# Expected result:
# Test passes with displaying all properties and io_link properties values for subset of GPUs
#
# Note:
# Testing specific device, if device numbers are changed in system it should be changed in the test
- name: RVS-GPUP-TC3
device: all
module: gpup
properties:
all:
io_links-properties:
all:
# GPUP test #4
#
# Preconditions:
# all AMD compatible GPUs
# a given device type (deviceid filtering), this must be filled based on deviceid in sysfs/ ./rvs -g.
# Default is 0=> no filtering
# all gpu properties, all io_link properties
#
# Expected result:
# Test passes with displaying all properties and io_link properties values for all GPUs and given deviceid
- name: RVS-GPUP-TC4
device: all
module: gpup
deviceid: 0
properties:
all:
io_links-properties:
all:
# GPUP test #5
#
# Preconditions:
# only a subset of AMD compatible GPUs (device filtering)
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
# Default is 0=> no filtering
# all gpu properties, all io_link properties
#
# Expected result:
# Test passes with displaying all properties and io_link properties values for subset of GPUs and given deviceid
#
# Note:
# Testing specific device, if device numbers are changed in system it should be changed in the test
- name: RVS-GPUP-TC5
device: all
module: gpup
deviceid: 0
properties:
all:
io_links-properties:
all:
# GPUP test #6
#
# Preconditions:
# only a subset of AMD compatible GPUs (device filtering)
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
# Default is 0=> no filtering
# only a subset of gpu properties, only a subset of io_link properties
#
# Expected result:
# Test passes with displaying subset of properties and io_link properties values for subset of GPUs and given deviceid
#
# Note:
# Testing specific device, if device numbers are changed in system it should be changed in the test
- name: RVS-GPUP-TC6
device: all
module: gpup
deviceid: 0
properties:
mem_banks_count:
io_links-properties:
version_major:
@@ -0,0 +1,41 @@
# ################################################################################
# #
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: gpustress-10000-sgemm-false
device: all
module: gst
parallel: true
count: 1
duration: 10000
copy_matrix: false
target_stress: 10000
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
ops_type: sgemm
lda: 8640
ldb: 8640
ldc: 8640
@@ -0,0 +1,43 @@
# ################################################################################
# #
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: gpustress-3hrs
device: all
module: gst
parallel: true
count: 1
duration: 10800000
ramp_interval: 300000
log_interval: 6000
target_stress: 5000
max_violations: 1
copy_matrix: false
tolerance: 0.01
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
ops_type: sgemm
@@ -0,0 +1,41 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: action_1
device: all
module: iet
parallel: true
count: 1
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 700
log_interval: 700
max_violations: 1
target_power: 127
tolerance: 0.06
matrix_size: 8640
ops_type: dgemm
@@ -0,0 +1,68 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# Memory test
#
# Preconditions:
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
# Set run count to how many times we want each test to run
#
# Run test with:
# ./rvs -c conf/mem.conf -d 3
#
# Expected result:
# The test on each GPU passes (TRUE) if no memory errors are seen
# FALSE otherwise
#
# To omit individual tests, list their numbers (as given below) in the value of the exclude tag
# 0: Walking 1 bit
# 1: Own address test
# 2: Moving inversions, ones&zeros
# 3: Moving inversions, 8 bit pattern
# 4: Moving inversions, random pattern
# 5: Block move, 64 moves
# 6: Moving inversions, 32 bit pattern
# 7: Random number sequence
# 8: Modulo 20, random pattern
# 9: Bit fade test
# 10: Memory stress test
#
actions:
- name: action_1
device: all
module: mem
parallel: true
count: 1
wait: 100
mapped_memory: false
mem_blocks: 128
num_passes: 500
thrds_per_blk: 64
stress: true
num_iter: 50000
exclude : 9 10
@@ -0,0 +1,182 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: action_1
device: all
module: pbqt
log_interval: 800
duration: 5000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
block_size: 1000000 2000000 10000000
device_id: all
- name: action_2
device: all
module: pbqt
log_interval: 1000
count: 3
duration: 10000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_3
device: all
module: pbqt
log_interval: 800
duration: 4000
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_4
device: all
module: pbqt
log_interval: 1000
duration: 5000
count: 1
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_5
device: all
module: pbqt
log_interval: 800
duration: 4000
count: 1
peers: all
test_bandwidth: true
bidirectional: true
parallel: true
device_id: all
- name: action_6
device: all
module: pbqt
log_interval: 800
duration: 8000
count: 1
peers: all
test_bandwidth: true
bidirectional: false
parallel: false
device_id: all
- name: action_7
device: all
module: pbqt
peers: all
count: 1
test_bandwidth: false
device_id: all
- name: action_8
device: all
module: pbqt
peers: all
test_bandwidth: true
bidirectional: true
parallel : true
device_id: all
- name: action_9
device: all
module: pbqt
log_interval: 500
duration: 1000
peers: all
test_bandwidth: true
bidirectional: false
parallel: true
device_id: all
- name: action_10
device: all
module: pbqt
log_interval: 500
duration: 1000
peers: all
peer_device_id: all
test_bandwidth: true
bidirectional: false
parallel: true
- name: action_11
device: all
module: pbqt
log_interval: 0
duration: 10000
peers: all
peer_device_id: all
test_bandwidth: true
bidirectional: true
parallel: false
device_id: all
- name: action_12
device: all
module: pbqt
log_interval: 0
duration: 1000
count: 3
wait: 1000
peers: all
peer_device_id: all
test_bandwidth: true
bidirectional: true
parallel: true
- name: action_13
device: all
module: pbqt
log_interval: 1000
duration: 10000
peers: all
device_id: all
peer_device_id: all
test_bandwidth: true
bidirectional: true
parallel: true
- name: action_14
device: all
module: pbqt
log_interval: 500
duration: 10000
peers: all
test_bandwidth: true
bidirectional: true
device_id: all
@@ -0,0 +1,43 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# PEBB test #3
#
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. bidirectional
actions:
- name: h2d-d2h-sequential-51MB
device: all
module: pebb
log_interval: 800
duration: 5000
device_to_host: true
host_to_device: true
parallel: true
block_size: 51200000
link_type: 2 # PCIe
@@ -0,0 +1,593 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# Run test with testscript or binary:
#
# Using Testscript -
# cd /opt/rocm/share/rocm-validation-suite/testscripts
# sudo ./peqt.new.sh
#
# Using Binary -
# cd /opt/rocm/share/rocm-validation-suite/conf
# cd /opt/rocm/bin
# ./rvs -c /opt/rocm/share/rocm-validation-suite/conf/peqt_single.conf
#
# Note: Paths may vary with the ROCm version or ROCm installation path.
# PEQT test #1
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. no regular expressions
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if at least one AMD compatible GPU is registered within the system, FALSE otherwise
actions:
- name: pcie_act_1
device: all
module: peqt
capability:
link_cap_max_speed:
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #2
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 2. all types of devices
# 3. no regular expressions
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list, FALSE otherwise
- name: pcie_act_2
module: peqt
device: all
capability:
link_cap_max_speed:
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
atomic_op_128_CAS_completer:
# PEQT test #3
# testing conditions:
# 1. all AMD compatible GPUs
# 3. no regular expressions
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if at least one AMD compatible GPU is registered within the system, FALSE otherwise
- name: pcie_act_3
module: peqt
device: all
capability:
link_cap_max_speed:
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #4
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 3. no regular expressions
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU IDs in the <device> list,
# FALSE otherwise
- name: pcie_act_4
module: peqt
device: all
capability:
link_cap_max_speed:
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #5
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 2. a given device type (deviceid filtering; replace 0 with the appropriate deviceid)
# 3. no regular expressions
# 4. only a subset of PCIe capabilities
# Expected PCIe check RESULT = TRUE if at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list
# and also matches the <deviceid>, FALSE otherwise
- name: pcie_act_5
module: peqt
device: all
deviceid: 0
capability:
link_cap_max_speed:
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
dev_serial_num:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #6
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. a simple regular expression for <link_cap_max_speed> capability
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if all <link_cap_max_speed> values match the given regular expression
# and at least one AMD compatible GPU is registered within the system
# FALSE otherwise
- name: pcie_act_6
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #7
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. 2 simple regular expressions, as follows: one for <link_cap_max_speed> capability and another one for the <link_stat_cur_speed>
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - at least one AMD compatible GPU is registered within the system and
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression
# FALSE otherwise
- name: pcie_act_7
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #8
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and an erroneous one for <slot_pwr_limit_value>
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - at least one AMD compatible GPU is registered within the system and
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression
# FALSE otherwise
# Notice: <slot_pwr_limit_value> regular expression is not valid and will be skipped
# without affecting the PEQT modules' check RESULT (however, an error will be logged out)
- name: pcie_act_8
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width:
slot_pwr_limit_value: '[a-b][d-'
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #9
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 2. all types of devices
# 3. 2 simple regular expressions, as follows: one for <link_cap_max_speed> capability and another one for the <link_stat_cur_speed>
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list and
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression
# FALSE otherwise
- name: pcie_act_9
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #10
# testing conditions:
# 1. all AMD compatible GPUs
# 2. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and one for <kernel_driver>
# 3. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# FALSE otherwise
- name: pcie_act_10
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver: ^amdgpu$
dev_serial_num:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #11
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 3. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and one for <kernel_driver>
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU IDs in the <device> list and
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# FALSE otherwise
- name: pcie_act_11
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver: ^amdgpu$
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #12
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 3. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and one for <kernel_driver>
# 4. only a subset of PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU IDs in the <device> list and
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# FALSE otherwise
- name: pcie_act_12
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width:
link_stat_cur_speed: '^(\d+ GT\/s)$'
vendor_id:
kernel_driver: ^amdgpu$
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #13
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 3. 5 simple regular expressions, as follows:
# - one for <link_cap_max_speed> PCIe capability
# - one for the <link_stat_cur_speed> PCIe capability
# - one for <kernel_driver>
# - one for <link_cap_max_width> PCIe capability
# - one for <link_stat_neg_width> PCIe capability
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# - all <link_cap_max_width> values match the given regular expression
# - all <link_stat_neg_width> values match the given regular expression
# FALSE otherwise
- name: pcie_act_13
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width: ^(x8|x16)$
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width: ^(x8|x16)$
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver: ^amdgpu$
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:
# PEQT test #14
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 3. 6 simple regular expressions, as follows:
# - one for <link_cap_max_speed> PCIe capability
# - one for the <link_stat_cur_speed> PCIe capability
# - one for <kernel_driver>
# - one for <link_cap_max_width> PCIe capability
# - one for <link_stat_neg_width> PCIe capability
# - one for <atomic_op_completer> PCIe capability
# 4. all PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# - all <link_cap_max_width> values match the given regular expression
# - all <link_stat_neg_width> values match the given regular expression
# - all <atomic_op_completer> values match the given regular expression (4 TRUE/FALSE values with whitespace between them)
# FALSE otherwise
- name: pcie_act_14
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width: ^(x8|x16)$
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width: ^(x8|x16)$
slot_pwr_limit_value:
slot_physical_num:
deviceid:
vendor_id:
kernel_driver: ^amdgpu$
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing: ^((TRUE|FALSE){1})$
atomic_op_32_completer: ^((TRUE|FALSE){1})$
atomic_op_64_completer: ^((TRUE|FALSE){1})$
atomic_op_128_CAS_completer: ^((TRUE|FALSE){1})$
# PEQT test #15
# testing conditions:
# 1. only a subset of AMD compatible GPUs (device filtering)
# 3. 6 simple regular expressions, as follows:
# - one for <link_cap_max_speed> PCIe capability
# - one for the <link_stat_cur_speed> PCIe capability
# - one for <kernel_driver>
# - one for <link_cap_max_width> PCIe capability
# - one for <link_stat_neg_width> PCIe capability
# - one for <atomic_op_completer> PCIe capability
# 4. only a subset of PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# - all <link_cap_max_width> values match the given regular expression
# - all <link_stat_neg_width> values match the given regular expression
# - all <atomic_op_completer> values match the given regular expression (4 TRUE/FALSE values with whitespace between them)
# FALSE otherwise
- name: pcie_act_15
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width: ^(x8|x16)$
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width: ^(x8|x16)$
kernel_driver: ^amdgpu$
atomic_op_routing: ^((TRUE|FALSE){1})$
atomic_op_32_completer: ^((TRUE|FALSE){1})$
# PEQT test #16
# testing conditions:
# 1. all AMD compatible GPUs
# 3. 6 simple regular expressions, as follows:
# - one for <link_cap_max_speed> PCIe capability
# - one for the <link_stat_cur_speed> PCIe capability
# - one for <kernel_driver>
# - one for <link_cap_max_width> PCIe capability
# - one for <link_stat_neg_width> PCIe capability
# - one for <atomic_op_completer> PCIe capability
# 4. only a subset of PCIe capabilities
# Expected PCIe check RESULT = TRUE if
# - all <link_cap_max_speed> values match the given regular expression and
# - all <link_stat_cur_speed> values match the given regular expression and
# - all <kernel_driver> values match the given regular expression
# - all <link_cap_max_width> values match the given regular expression
# - all <link_stat_neg_width> values match the given regular expression
# - all <atomic_op_completer> values match the given regular expression (4 TRUE/FALSE values with whitespace between them)
# FALSE otherwise
- name: pcie_act_16
module: peqt
device: all
capability:
link_cap_max_speed: '^(\d+ GT\/s)$'
link_cap_max_width: ^(x8|x16)$
link_stat_cur_speed: '^(\d+ GT\/s)$'
link_stat_neg_width: ^(x8|x16)$
kernel_driver: ^amdgpu$
atomic_op_routing: ^((TRUE|FALSE){1})$
atomic_op_32_completer: ^((TRUE|FALSE){1})$
atomic_op_64_completer: ^((TRUE|FALSE){1})$
atomic_op_128_CAS_completer: ^((TRUE|FALSE){1})$
# PEQT test #17
# testing conditions:
# 1. all AMD compatible GPUs
# 2. all types of devices
# 3. no regular expressions
# 4. bus and slot number
# Expected PCIe check RESULT = TRUE if at least one AMD compatible GPU is registered within the system, FALSE otherwise
- name: pcie_act_17
module: peqt
device: all
capability:
bus_id:
slot_physical_num:
@@ -0,0 +1,47 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
# PESM test #1
#
# Preconditions:
# Set deviceid to an existing AMD device ID value
#
# Run test with:
# cd bin
# sudo ./rvs -c conf/pesm2.conf
#
# Expected result:
# Test passes without displaying data for any GPUs
actions:
- name: act1
device: all
deviceid: 26720
module: pesm
monitor: true
- name: act2
device: all
debugwait: 3000
module: pesm
monitor: false
@@ -0,0 +1,36 @@
# ################################################################################
# #
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: action_1
device: all
module: rcqt
package: rocm-hip-sdk
- name: action_2
device: all
module: rcqt
packagelist: rocm-hip-libraries rocm-core rocm-dev rocm-hip-runtime-devel rocm-language-runtime rocm-hip-runtime rocm-hip-sdk rocm-utils rocm-smi-lib rocalution rocm-debug-agent rocm-clang-ocl rocm-device-libs hsa-rocr-devel hipcub-devel rocm-ocl-icd rocsolver rocsparse rocsolver-devel rocminfo hipfft-devel rocm-gdb rocm-dbgapi rocfft hipblas-devel rocthrust-devel openmp-extras comgr rccl rocblas hipblas roctracer-dev hip-doc amdgpu-install rocrand hsa-rocr hipfft hipsparse-devel rocsparse-devel rocrand-devel rocm-opencl hip-devel rocprim-devel hipsolver-devel rocfft-devel hsa-amd-aqlprofile hipify-clang miopen-hip-devel rocm-llvm hip-runtime-amd hip-samples rocalution-devel rccl-devel hipsolver rocprofiler-dev miopen-hip rocm-cmake hipsparse rocblas-devel rocm-opencl-devel
@@ -0,0 +1 @@
../nv31/gpup_single.conf
@@ -0,0 +1,41 @@
# ################################################################################
# #
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
# #
# # MIT LICENSE:
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
# # this software and associated documentation files (the "Software"), to deal in
# # the Software without restriction, including without limitation the rights to
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# # of the Software, and to permit persons to whom the Software is furnished to do
# # so, subject to the following conditions:
# #
# # The above copyright notice and this permission notice shall be included in all
# # copies or substantial portions of the Software.
# #
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# # SOFTWARE.
# #
# ###############################################################################
actions:
- name: gpustress-9000-sgemm-false
device: all
module: gst
parallel: true
count: 1
duration: 10000
copy_matrix: false
target_stress: 6000
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
ops_type: sgemm
lda: 8640
ldb: 8640
ldc: 8640
@@ -0,0 +1 @@
../nv31/gst_stress_3_hrs.conf
@@ -0,0 +1 @@
../nv31/iet_stress.conf
@@ -0,0 +1 @@
../nv31/mem.conf
@@ -0,0 +1 @@
../nv31/pbqt_single.conf
@@ -0,0 +1 @@
../nv31/pebb_single.conf
@@ -0,0 +1 @@
../nv31/peqt_single.conf
@@ -0,0 +1 @@
../nv31/pesm_1.conf
@@ -0,0 +1 @@
../nv31/rcqt_single.conf