RVS - Use config files and make GPU aware
Change-Id: I7a5c80ed4e6122d102e494d1ae38b4b7d40c42cd Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
Tá an tiomantas seo le fáil i:
tiomanta ag
Galantsev, Dmitrii
tuismitheoir
247c8c7d5e
tiomantas
f5a4402ce5
@@ -160,9 +160,13 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
|
||||
typedef enum {
|
||||
RDC_FI_INVALID = 0, //!< Invalid field value
|
||||
//!< @brief Identifier fields
|
||||
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
|
||||
RDC_FI_DEV_NAME, //!< Name of the device
|
||||
RDC_FI_OAM_ID, //!< OAM ID of the device
|
||||
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
|
||||
RDC_FI_DEV_NAME, //!< Name of the device
|
||||
RDC_FI_OAM_ID, //!< OAM ID of the device
|
||||
RDC_FI_DEV_ID, //!< Device ID
|
||||
RDC_FI_REV_ID, //!<
|
||||
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
|
||||
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
|
||||
|
||||
/**
|
||||
* @brief Frequency related fields
|
||||
@@ -388,6 +392,9 @@ typedef uint32_t rdc_field_grp_t; //!< Field group ID type
|
||||
*/
|
||||
typedef struct {
|
||||
char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device.
|
||||
uint64_t device_id; //!< The device id of a GPU
|
||||
uint32_t num_of_compute_units; //!< Number of compute units of the GPU
|
||||
uint64_t target_graphics_version; //!< Target graphics version of the GPU
|
||||
} rdc_device_attributes_t;
|
||||
|
||||
/**
|
||||
@@ -533,7 +540,8 @@ typedef enum {
|
||||
RDC_DIAG_RVS_MEMBW_TEST, //!< RVS bandwidth test
|
||||
RDC_DIAG_RVS_H2DD2H_TEST, //!< RVS Host<->Device transfer speed test
|
||||
RDC_DIAG_RVS_IET_TEST, //!< RVS IET test
|
||||
RDC_DIAG_TEST_LAST = RDC_DIAG_RVS_IET_TEST
|
||||
RDC_DIAG_RVS_CUSTOM, //!< RVS custom test
|
||||
RDC_DIAG_TEST_LAST,
|
||||
} rdc_diag_test_cases_t;
|
||||
|
||||
/**
|
||||
@@ -547,7 +555,7 @@ typedef enum {
|
||||
/**
|
||||
* @brief The maximum test cases to run
|
||||
*/
|
||||
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1)
|
||||
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST)
|
||||
|
||||
/**
|
||||
* @brief The maximum length of the diagnostic messages
|
||||
@@ -1607,6 +1615,8 @@ rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
*/
|
||||
rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
||||
|
||||
/**
 * @brief Determine the ROCm installation path.
 *
 * The result starts as "/opt/rocm" and is replaced by the ROCM_PATH
 * environment variable when that is set. /proc/self/maps is then scanned for
 * the first line containing @p search_string; when one is found, the result
 * becomes the text between the preceding space and the match, followed by
 * "..".
 *
 * @param[in] search_string Text to look for in /proc/self/maps
 * (the implementation in this change is called with "librdc.so").
 *
 * @retval Pointer to a NUL-terminated path. The storage is a function-local
 * static string that later calls overwrite, so the caller must not free it.
 */
const char* get_rocm_path(const char* search_string);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -22,61 +22,96 @@ THE SOFTWARE.
|
||||
#ifndef RDC_MODULES_RDC_RVS_RVSBASE_H_
|
||||
#define RDC_MODULES_RDC_RVS_RVSBASE_H_
|
||||
|
||||
#include <amd_smi/amdsmi.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <filesystem>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rvs/rvs.h"
|
||||
|
||||
static constexpr size_t MAX_CONFIG_LENGTH = 1024;
|
||||
// NOTE: There MUST be a space after :
|
||||
static const std::map<rdc_diag_test_cases_t, std::string> test_to_conf = {
|
||||
// derived from conf/gst_single.conf
|
||||
{RDC_DIAG_RVS_GST_TEST,
|
||||
"{actions: [{name: gpustress-9000-sgemm-false, device: all, "
|
||||
"device_index: '0', module: gst, parallel: false, count: 1, duration: "
|
||||
"10000, copy_matrix: false, target_stress: 9000, matrix_size_a: 8640, "
|
||||
"matrix_size_b: 8640, matrix_size_c: 8640, ops_type: sgemm, lda: 8640, "
|
||||
"ldb: 8640, ldc: 8640}]}"},
|
||||
// derived from conf/MI300X/babel.conf
|
||||
{RDC_DIAG_RVS_MEMBW_TEST,
|
||||
"{actions: [{name: babel-float-256MiB,"
|
||||
"device: all, module: babel, "
|
||||
"parallel: false, count: 1, num_iter: 5000, array_size: 268435456, "
|
||||
"test_type: 1, mibibytes: true, o/p_csv: false, subtest: 5}]}"},
|
||||
// derived from conf/MI300X/pebb_single.conf
|
||||
{RDC_DIAG_RVS_H2DD2H_TEST,
|
||||
"{actions: [{name: h2d-d2h-sequential-64MB,"
|
||||
"device: all, module: pebb, duration: 120000, device_to_host: true, "
|
||||
"host_to_device: true, parallel: false, block_size: 67108864, "
|
||||
"link_type: 2, warm_calls: 10, hot_calls: 100, b2b: true}]}"},
|
||||
// derived from conf/MI300X/iet_single.conf
|
||||
{RDC_DIAG_RVS_IET_TEST,
|
||||
"{actions: [{name: iet-400W-1K-rand-dgemm,"
|
||||
"device: all, module: iet, parallel: true, duration: 60000, "
|
||||
"sample_interval: 3000, target_power: 400, matrix_size: 1024, "
|
||||
"matrix_init: rand, ops_type: dgemm}]}"},
|
||||
// Maps a GPU's target graphics version to the name of the RVS config
// sub-directory used for that GPU. This map only makes sense in the context of
// the test config locations as originally designed in RVS.
//
// Declared `inline` so every translation unit including this header shares one
// map instead of constructing its own copy at start-up.
//
// Entries tagged "unverified" carried a "?" marker in the original table.
inline const std::map<uint64_t, std::string> gfx_to_rvs_conf = {
    {0x90a, "MI210"},   // unverified
    {0x940, "MI300A"},  // unverified
    {0x941, "MI300A"},  // unverified
    {0x942, "MI300X"},  // unverified
    {0x94a, "MI308X"},  // unverified
    {0x1030, "nv21"},
    {0x1031, "nv21"},  // unverified
    {0x1032, "nv21"},  // unverified
    {0x1033, "nv21"},  // unverified
    {0x1034, "nv21"},  // unverified
    {0x1035, "nv21"},  // unverified
    {0x1100, "nv31"},  // unverified
    {0x1101, "nv31"},  // unverified
    {0x1102, "nv31"},  // unverified
    {0x1103, "nv31"},  // unverified
};
|
||||
static const std::map<rdc_diag_test_cases_t, std::string> test_to_name = {
|
||||
{RDC_DIAG_RVS_GST_TEST, "RVS_GST_TEST"},
|
||||
{RDC_DIAG_RVS_MEMBW_TEST, "RVS_MEMBW_TEST"},
|
||||
{RDC_DIAG_RVS_H2DD2H_TEST, "RVS_H2DD2H_TEST"},
|
||||
{RDC_DIAG_RVS_IET_TEST, "RVS_IET_TEST"},
|
||||
{RDC_DIAG_RVS_GST_TEST, "gst_single.conf"}, {RDC_DIAG_RVS_MEMBW_TEST, "babel.conf"},
|
||||
{RDC_DIAG_RVS_H2DD2H_TEST, "pebb_single.conf"}, {RDC_DIAG_RVS_IET_TEST, "iet_stress.conf"},
|
||||
{RDC_DIAG_RVS_CUSTOM, "CUSTOM_CONFIG"},
|
||||
};
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
/**
 * Look up the amdsmi processor handle for a flat GPU index.
 *
 * All sockets are enumerated, then all processors of each socket, in the order
 * amdsmi reports them; gpu_id indexes into that flattened list.
 *
 * @param gpu_id            Zero-based index into the flattened processor list.
 * @param processor_handle  Receives the handle on success; must not be null.
 *
 * @return AMDSMI_STATUS_SUCCESS on success,
 *         AMDSMI_STATUS_INVAL when processor_handle is null,
 *         AMDSMI_STATUS_NOT_SUPPORTED when any enumerated processor is not an
 *         AMD GPU,
 *         AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS when gpu_id is past the end of the
 *         list, or the status of the first amdsmi call that failed.
 */
inline amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
                                                    amdsmi_processor_handle* processor_handle) {
  if (processor_handle == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // First call only asks for the number of sockets.
  uint32_t socket_count = 0;
  auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Do not walk a socket list that was never filled in.
    return ret;
  }

  std::vector<amdsmi_processor_handle> all_processors{};
  for (auto& socket : sockets) {
    // Same two-step pattern: query the count, then fetch the handles.
    uint32_t processor_count = 0;
    ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    std::vector<amdsmi_processor_handle> processors(processor_count);
    ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data());
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    for (auto& processor : processors) {
      processor_type_t processor_type = {};
      ret = amdsmi_get_processor_type(processor, &processor_type);
      if (ret != AMDSMI_STATUS_SUCCESS) {
        // processor_type is meaningless when the query itself failed.
        return ret;
      }
      if (processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
        RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!");
        return AMDSMI_STATUS_NOT_SUPPORTED;
      }
      all_processors.push_back(processor);
    }
  }

  if (gpu_id >= all_processors.size()) {
    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
  }

  // Get processor handle from GPU id
  *processor_handle = all_processors[gpu_id];

  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
class RdcRVSBase {
|
||||
public:
|
||||
RdcRVSBase() { s_instance = this; };
|
||||
~RdcRVSBase() {
|
||||
if (s_instance == this) {
|
||||
s_instance = nullptr;
|
||||
}
|
||||
};
|
||||
RdcRVSBase();
|
||||
|
||||
~RdcRVSBase();
|
||||
|
||||
// only one instance allowed
|
||||
RdcRVSBase(const RdcRVSBase&) = delete;
|
||||
@@ -87,12 +122,16 @@ class RdcRVSBase {
|
||||
RdcRVSBase& operator=(RdcRVSBase&&) = delete;
|
||||
|
||||
rvs_status_t run_rvs_app(const char* config, size_t config_size, rdc_diag_callback_t* callback);
|
||||
std::vector<std::string> get_rvs_configs();
|
||||
std::map<rdc_diag_test_cases_t, std::string> get_test_to_conf();
|
||||
|
||||
private:
|
||||
static RdcRVSBase* s_instance;
|
||||
volatile rvs_session_state_t _state = RVS_SESSION_STATE_IDLE;
|
||||
rdc_diag_callback_t* _callback = nullptr;
|
||||
rvs_session_callback _rvs_callback = nullptr;
|
||||
std::vector<std::string> _rvs_config_list = {};
|
||||
std::map<rdc_diag_test_cases_t, std::string> _test_to_conf = {};
|
||||
|
||||
// Static callback function that the C API will call
|
||||
static void static_callback(rvs_session_id_t session_id, const rvs_results_t* results) {
|
||||
@@ -101,7 +140,7 @@ class RdcRVSBase {
|
||||
s_instance->session_callback(session_id, results);
|
||||
}
|
||||
}
|
||||
void session_callback(rvs_session_id_t session_id, const rvs_results_t* results) {
|
||||
void session_callback(rvs_session_id_t /*session_id*/, const rvs_results_t* results) {
|
||||
_state = results->state;
|
||||
// std::string output = "\n";
|
||||
// output += "session id -> " + std::to_string(session_id) + "\n";
|
||||
|
||||
@@ -21,6 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include <dlfcn.h>
|
||||
#include <string.h>
|
||||
#include <fstream>
|
||||
|
||||
#include <map>
|
||||
|
||||
@@ -476,6 +477,7 @@ char* strncpy_with_null(char* dest, const char* src, size_t n) {
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_policy_t policy) {
|
||||
if (!p_rdc_handle) {
|
||||
@@ -532,4 +534,43 @@ rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* r
|
||||
}
|
||||
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
|
||||
->rdc_link_status_get(results);
|
||||
}
|
||||
}
|
||||
|
||||
const char * get_rocm_path(const char * search_string) {
|
||||
// set default rocm path in case lookup fails
|
||||
static std::string rocm_path("/opt/rocm");
|
||||
const char* rocm_path_env = getenv("ROCM_PATH");
|
||||
if (rocm_path_env != nullptr) {
|
||||
rocm_path = rocm_path_env;
|
||||
}
|
||||
|
||||
std::ifstream file("/proc/self/maps");
|
||||
|
||||
if (!file.is_open()) {
|
||||
RDC_LOG(RDC_DEBUG, "CANT OPEN FILE");
|
||||
return rocm_path.c_str();
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (getline(file, line)) {
|
||||
size_t index_end = line.find(search_string);
|
||||
size_t index_start = index_end;
|
||||
if (index_end == std::string::npos) {
|
||||
// no library on this line
|
||||
continue;
|
||||
}
|
||||
// walk index backwards until it reaches a space
|
||||
while ((index_start > 0) && (line[index_start - 1] != ' ')) {
|
||||
index_start--;
|
||||
}
|
||||
// extract library path, drop library name
|
||||
rocm_path = line.substr(index_start, index_end - index_start);
|
||||
// appending "../" should result in "/opt/rocm/lib/.." or similar
|
||||
rocm_path += "..";
|
||||
RDC_LOG(RDC_DEBUG, "FOUND SOMETHING!");
|
||||
return rocm_path.c_str();
|
||||
}
|
||||
|
||||
return rocm_path.c_str();
|
||||
}
|
||||
|
||||
|
||||
@@ -49,9 +49,12 @@ rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
|
||||
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count, const char* config,
|
||||
size_t config_size,
|
||||
rdc_diag_test_result_t* result,
|
||||
rdc_diag_callback_t* callback) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -74,24 +77,41 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
|
||||
size_t config_size,
|
||||
rdc_diag_response_t* response,
|
||||
rdc_diag_callback_t* callback) {
|
||||
const bool is_custom = config != nullptr && config_size != 0;
|
||||
|
||||
if (response == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
std::vector<rdc_diag_test_cases_t> rdc_runs;
|
||||
std::vector<rdc_diag_test_cases_t> tests_to_search_for;
|
||||
if (level >= RDC_DIAG_LVL_SHORT) { // Short run and above
|
||||
rdc_runs.push_back(RDC_DIAG_COMPUTE_PROCESS);
|
||||
rdc_runs.push_back(RDC_DIAG_NODE_TOPOLOGY);
|
||||
rdc_runs.push_back(RDC_DIAG_GPU_PARAMETERS);
|
||||
rdc_runs.push_back(RDC_DIAG_COMPUTE_QUEUE);
|
||||
rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK);
|
||||
tests_to_search_for.push_back(RDC_DIAG_COMPUTE_PROCESS);
|
||||
tests_to_search_for.push_back(RDC_DIAG_NODE_TOPOLOGY);
|
||||
tests_to_search_for.push_back(RDC_DIAG_GPU_PARAMETERS);
|
||||
tests_to_search_for.push_back(RDC_DIAG_COMPUTE_QUEUE);
|
||||
tests_to_search_for.push_back(RDC_DIAG_SYS_MEM_CHECK);
|
||||
}
|
||||
|
||||
if (level >= RDC_DIAG_LVL_MED) { // Medium run and above
|
||||
rdc_runs.push_back(RDC_DIAG_RVS_GST_TEST);
|
||||
rdc_runs.push_back(RDC_DIAG_RVS_MEMBW_TEST);
|
||||
rdc_runs.push_back(RDC_DIAG_RVS_H2DD2H_TEST);
|
||||
rdc_runs.push_back(RDC_DIAG_RVS_IET_TEST);
|
||||
tests_to_search_for.push_back(RDC_DIAG_RVS_GST_TEST);
|
||||
tests_to_search_for.push_back(RDC_DIAG_RVS_MEMBW_TEST);
|
||||
tests_to_search_for.push_back(RDC_DIAG_RVS_H2DD2H_TEST);
|
||||
tests_to_search_for.push_back(RDC_DIAG_RVS_IET_TEST);
|
||||
}
|
||||
|
||||
std::vector<rdc_diag_test_cases_t> tests_to_run;
|
||||
if (is_custom) {
|
||||
// respect custom config
|
||||
tests_to_run.push_back(RDC_DIAG_RVS_CUSTOM);
|
||||
} else {
|
||||
// respect level
|
||||
for (auto& test : tests_to_search_for) {
|
||||
if (testcases_to_module_.find(test) != testcases_to_module_.end()) {
|
||||
tests_to_run.push_back(test);
|
||||
} else {
|
||||
RDC_LOG(RDC_DEBUG, "test not found: " << test);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
|
||||
@@ -99,15 +119,17 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
|
||||
callback->callback(callback->cookie, log.data());
|
||||
}
|
||||
|
||||
unsigned int i = 0;
|
||||
response->results_count = 0;
|
||||
for (unsigned int i = 0; i < rdc_runs.size(); i++) {
|
||||
for (i = 0; i < tests_to_run.size(); i++) {
|
||||
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
|
||||
std::string log = "Test " + std::to_string(i) + " / " + std::to_string(rdc_runs.size());
|
||||
std::string log =
|
||||
"Test " + std::to_string(i + 1) + " / " + std::to_string(tests_to_run.size());
|
||||
callback->callback(callback->cookie, log.data());
|
||||
}
|
||||
response->diag_info[i].test_case = rdc_runs[i];
|
||||
response->diag_info[i].test_case = tests_to_run[i];
|
||||
// NOTE: rdc_test_case_run reuses the diagnostic_run callback
|
||||
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
|
||||
rdc_test_case_run(tests_to_run[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
|
||||
config_size, &(response->diag_info[i]), callback);
|
||||
response->results_count++;
|
||||
}
|
||||
|
||||
@@ -432,33 +432,33 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
|
||||
constexpr double kGig = 1000000000.0;
|
||||
|
||||
static uint64_t sum_xgmi_read(const amdsmi_gpu_metrics_t& gpu_metrics) {
|
||||
uint64_t total = 0;
|
||||
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
|
||||
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
|
||||
if (gpu_metrics.xgmi_read_data_acc[i] == not_supported_metrics_data){
|
||||
continue;
|
||||
}
|
||||
total += gpu_metrics.xgmi_read_data_acc[i];
|
||||
uint64_t total = 0;
|
||||
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
|
||||
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
|
||||
if (gpu_metrics.xgmi_read_data_acc[i] == not_supported_metrics_data) {
|
||||
continue;
|
||||
}
|
||||
if (total == 0){
|
||||
return not_supported_metrics_data;
|
||||
}
|
||||
return total;
|
||||
total += gpu_metrics.xgmi_read_data_acc[i];
|
||||
}
|
||||
if (total == 0) {
|
||||
return not_supported_metrics_data;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
static uint64_t sum_xgmi_write(const amdsmi_gpu_metrics_t& gpu_metrics) {
|
||||
uint64_t total = 0;
|
||||
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
|
||||
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
|
||||
if (gpu_metrics.xgmi_write_data_acc[i] == not_supported_metrics_data){
|
||||
continue;
|
||||
}
|
||||
total += gpu_metrics.xgmi_write_data_acc[i];
|
||||
uint64_t total = 0;
|
||||
const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max();
|
||||
for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
|
||||
if (gpu_metrics.xgmi_write_data_acc[i] == not_supported_metrics_data) {
|
||||
continue;
|
||||
}
|
||||
if (total == 0){
|
||||
return not_supported_metrics_data;
|
||||
}
|
||||
return total;
|
||||
total += gpu_metrics.xgmi_write_data_acc[i];
|
||||
}
|
||||
if (total == 0) {
|
||||
return not_supported_metrics_data;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
@@ -659,6 +659,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
break;
|
||||
}
|
||||
case RDC_FI_DEV_NAME: {
|
||||
// source values from asic_info
|
||||
amdsmi_asic_info_t asic_info;
|
||||
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
value->type = STRING;
|
||||
@@ -700,17 +701,44 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
value->value.l_int = num_pages;
|
||||
}
|
||||
break;
|
||||
case RDC_FI_OAM_ID: {
|
||||
case RDC_FI_OAM_ID:
|
||||
case RDC_FI_DEV_ID:
|
||||
case RDC_FI_REV_ID:
|
||||
case RDC_FI_TARGET_GRAPHICS_VERSION:
|
||||
case RDC_FI_NUM_OF_COMPUTE_UNITS: {
|
||||
amdsmi_asic_info_t asic_info;
|
||||
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
if (value->status != AMDSMI_STATUS_SUCCESS) {
|
||||
break;
|
||||
}
|
||||
if (field_id == RDC_FI_OAM_ID) {
|
||||
// 0xFFFF means not supported for OAM ID
|
||||
if (asic_info.oam_id == 0xFFFF) {
|
||||
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
} else {
|
||||
value->value.l_int = asic_info.oam_id;
|
||||
}
|
||||
} else if (field_id == RDC_FI_DEV_ID) {
|
||||
value->value.l_int = asic_info.device_id;
|
||||
} else if (field_id == RDC_FI_REV_ID) {
|
||||
value->value.l_int = asic_info.rev_id;
|
||||
} else if (field_id == RDC_FI_TARGET_GRAPHICS_VERSION) {
|
||||
if (asic_info.target_graphics_version == 0xFFFFFFFFFFFFFFFF) {
|
||||
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
} else {
|
||||
value->value.l_int = asic_info.target_graphics_version;
|
||||
}
|
||||
} else if (field_id == RDC_FI_NUM_OF_COMPUTE_UNITS) {
|
||||
if (asic_info.num_of_compute_units == 0xFFFFFFFF) {
|
||||
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
} else {
|
||||
value->value.l_int = asic_info.num_of_compute_units;
|
||||
}
|
||||
} else {
|
||||
// this should never happen as all fields are handled above
|
||||
RDC_LOG(RDC_ERROR, "Unexpected field id: " << field_id);
|
||||
value->status = AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -726,7 +754,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
uint64_t timestamp;
|
||||
|
||||
value->status = amdsmi_get_utilization_count(processor_handle, utilization_counters,
|
||||
kUTILIZATION_COUNTERS, &timestamp);
|
||||
kUTILIZATION_COUNTERS, &timestamp);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(utilization_counters[0].value);
|
||||
@@ -858,32 +886,29 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
uint32_t num_pages = 0;
|
||||
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr);
|
||||
if (AMDSMI_STATUS_SUCCESS == ret) {
|
||||
if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
value->value.l_int = static_cast<int64_t>(num_pages);
|
||||
break;
|
||||
}
|
||||
if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
value->value.l_int = static_cast<int64_t>(num_pages);
|
||||
break;
|
||||
}
|
||||
|
||||
if ((0 < num_pages) &&
|
||||
(RDC_HEALTH_PENDING_PAGE_NUM == field_id)) {
|
||||
std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
|
||||
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages,
|
||||
bad_page_info.data());
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
if (AMDSMI_STATUS_SUCCESS == ret) {
|
||||
uint64_t pending_page_num = 0;
|
||||
for (uint32_t i=0; i < num_pages; i++) {
|
||||
if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status)
|
||||
pending_page_num++;
|
||||
}
|
||||
|
||||
value->value.l_int = static_cast<int64_t>(pending_page_num);
|
||||
if ((0 < num_pages) && (RDC_HEALTH_PENDING_PAGE_NUM == field_id)) {
|
||||
std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
|
||||
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, bad_page_info.data());
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
if (AMDSMI_STATUS_SUCCESS == ret) {
|
||||
uint64_t pending_page_num = 0;
|
||||
for (uint32_t i = 0; i < num_pages; i++) {
|
||||
if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status) pending_page_num++;
|
||||
}
|
||||
|
||||
value->value.l_int = static_cast<int64_t>(pending_page_num);
|
||||
}
|
||||
}
|
||||
} else
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->status = Smi2RdcError(ret);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@@ -61,4 +61,13 @@ if(BUILD_RVS)
|
||||
TARGET ${RDC_RVS_LIB}
|
||||
POST_BUILD COMMAND ${CMAKE_STRIP} ${RDC_RVS_LIB_COMPONENT}.so)
|
||||
endif()
|
||||
|
||||
# Install RVS config files into /opt/rocm/share/rdc/conf/rvs/
|
||||
#file(GLOB RDC_RVS_CONFIG_FILES "${SRC_DIR}/conf/*")
|
||||
install(DIRECTORY "${SRC_DIR}/conf/"
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${RDC}/conf/rvs/
|
||||
COMPONENT ${SERVER_COMPONENT})
|
||||
#install(FILES ${RDC_RVS_CONFIG_FILES}
|
||||
# DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${RDC}/conf
|
||||
# COMPONENT ${RDC_RVS_LIB_COMPONENT})
|
||||
endif()
|
||||
|
||||
@@ -21,15 +21,43 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcDiagnosticLibInterface.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc_modules/rdc_rvs/RvsBase.h"
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t) { return RDC_ST_OK; }
|
||||
std::unique_ptr<amd::rdc::RdcRVSBase> rvs_p;
|
||||
|
||||
rdc_status_t rdc_diag_destroy() { return RDC_ST_OK; }
|
||||
// Tells whether the RDC_DISABLE_RVS environment variable requests that RVS be
// disabled. The value is compared case-insensitively against a fixed list of
// affirmative spellings; an unset variable or any other value means "enabled".
bool is_rvs_disabled() {
  const char* raw_setting = std::getenv("RDC_DISABLE_RVS");
  if (raw_setting == nullptr) {
    return false;
  }

  // Lower-case a private copy so the comparison ignores letter case.
  std::string lowered(raw_setting);
  for (char& ch : lowered) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }

  const char* const affirmative[] = {"yes", "true", "1", "on", "y", "t"};
  for (const char* candidate : affirmative) {
    if (lowered == candidate) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// Initializes the RVS diagnostic module: refuses with RDC_ST_DISABLED_MODULE
// when is_rvs_disabled() reports true, otherwise installs a freshly
// constructed RdcRVSBase in rvs_p. The uint64_t argument is unused.
rdc_status_t rdc_diag_init(uint64_t) {
  const bool rvs_enabled = !is_rvs_disabled();
  if (!rvs_enabled) {
    return RDC_ST_DISABLED_MODULE;
  }

  // The new object is built first; any previous one is released afterwards.
  rvs_p.reset(new amd::rdc::RdcRVSBase);
  return RDC_ST_OK;
}
|
||||
|
||||
// Tears the module down by releasing the RdcRVSBase held in rvs_p (a no-op
// when none is held). Always reports RDC_ST_OK.
rdc_status_t rdc_diag_destroy() {
  rvs_p = nullptr;
  return RDC_ST_OK;
}
|
||||
|
||||
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) {
|
||||
@@ -37,12 +65,11 @@ rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
*test_case_count = 3;
|
||||
test_cases[0] = RDC_DIAG_RVS_GST_TEST;
|
||||
test_cases[1] = RDC_DIAG_RVS_MEMBW_TEST;
|
||||
test_cases[2] = RDC_DIAG_RVS_H2DD2H_TEST;
|
||||
// Temporarily disabled due to configuration issues
|
||||
// test_cases[3] = RDC_DIAG_RVS_IET_TEST;
|
||||
auto test_to_conf = rvs_p->get_test_to_conf();
|
||||
*test_case_count = test_to_conf.size();
|
||||
for (auto& [key, value] : test_to_conf) {
|
||||
*test_cases++ = key;
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
@@ -52,12 +79,20 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
|
||||
const bool is_custom = config != nullptr && config_size != 0;
|
||||
|
||||
rvs_status_t rvs_status = RVS_STATUS_SUCCESS;
|
||||
if (result == nullptr || gpu_count == 0) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
amd::rdc::RdcRVSBase rvs_base;
|
||||
if (rvs_p == nullptr) {
|
||||
RDC_LOG(RDC_ERROR, "rvs_p is not set!");
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
// get test_to_conf
|
||||
auto test_to_conf = rvs_p->get_test_to_conf();
|
||||
|
||||
// init the return data
|
||||
*result = {};
|
||||
@@ -69,23 +104,39 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
std::string str = "RVS test [" + test_to_name.at(test_case) + "]";
|
||||
callback->callback(callback->cookie, str.data());
|
||||
}
|
||||
|
||||
// if config is given - only run one test and return
|
||||
// do not care about test_case
|
||||
if (is_custom) {
|
||||
rvs_status = rvs_p->run_rvs_app(config, config_size + 1, callback);
|
||||
if (rvs_status != RVS_STATUS_SUCCESS) {
|
||||
result->status = RDC_DIAG_RESULT_FAIL;
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
switch (test_case) {
|
||||
case RDC_DIAG_RVS_GST_TEST:
|
||||
case RDC_DIAG_RVS_MEMBW_TEST:
|
||||
case RDC_DIAG_RVS_H2DD2H_TEST:
|
||||
case RDC_DIAG_RVS_IET_TEST: {
|
||||
const std::string test_name = "Finished running " + test_to_name.at(test_case);
|
||||
const std::string predefined_config = test_to_conf.at(test_case);
|
||||
// +1 to copy null
|
||||
strncpy_with_null(result->info, test_name.c_str(), test_name.length() + 1);
|
||||
if (config == nullptr || config_size == 0) {
|
||||
rvs_status = rvs_base.run_rvs_app(predefined_config.c_str(), predefined_config.length() + 1,
|
||||
callback);
|
||||
} else {
|
||||
rvs_status = rvs_base.run_rvs_app(config, config_size, callback);
|
||||
if (test_to_conf.find(test_case) == test_to_conf.end()) {
|
||||
RDC_LOG(RDC_ERROR, "cannot find test " << test_to_name.at(test_case));
|
||||
return RDC_ST_NOT_FOUND;
|
||||
}
|
||||
const std::string predefined_config = test_to_conf.at(test_case);
|
||||
// +1 to copy null
|
||||
strncpy_with_null(result->info, test_name.c_str(), test_name.length() + 1);
|
||||
rvs_status =
|
||||
rvs_p->run_rvs_app(predefined_config.c_str(), predefined_config.length() + 1, callback);
|
||||
break;
|
||||
}
|
||||
case RDC_DIAG_RVS_CUSTOM:
|
||||
RDC_LOG(RDC_ERROR, "custom config cannot be bundled with other tests!");
|
||||
result->status = RDC_DIAG_RESULT_SKIP;
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
break;
|
||||
default:
|
||||
result->status = RDC_DIAG_RESULT_SKIP;
|
||||
strncpy_with_null(result->info, "Not supported yet", MAX_DIAG_MSG_LENGTH);
|
||||
|
||||
@@ -23,19 +23,113 @@ THE SOFTWARE.
|
||||
|
||||
#include <string.h>

#include <string>
#include <system_error>

#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rvs/RvsBase.h"
#include "rvs/rvs.h"
|
||||
|
||||
// TODO: Make generic test
|
||||
// TODO: Allow for user to override defaults with a custom string
|
||||
// CHECK_RVS(STATUS, SESSION): bail out of the enclosing function when an RVS
// call failed.
//   STATUS  - expression that is, or converts to, rvs_status_t (enforced by a
//             static_assert); it is evaluated more than once.
//   SESSION - an rvs_session_id_t lvalue or value (enforced by a
//             static_assert).
// When STATUS is not RVS_STATUS_SUCCESS the macro logs the file, line and
// status at RDC_ERROR level, calls rvs_session_destroy(SESSION) and returns
// STATUS from the enclosing function, which must therefore return a type
// STATUS converts to. On success it does nothing.
#define CHECK_RVS(STATUS, SESSION) \
|
||||
do { \
|
||||
static_assert(std::is_same<decltype(STATUS), rvs_status_t>::value || \
|
||||
std::is_same<decltype(STATUS), rvs_status_t&>::value || \
|
||||
std::is_convertible<decltype(STATUS), rvs_status_t>::value, \
|
||||
"STATUS must be of type rvs_status_t"); \
|
||||
static_assert(std::is_same<decltype(SESSION), rvs_session_id_t>::value || \
|
||||
std::is_same<decltype(SESSION), rvs_session_id_t&>::value, \
|
||||
"SESSION must be of type rvs_session_t"); \
|
||||
if ((STATUS) != RVS_STATUS_SUCCESS) { \
|
||||
RDC_LOG(RDC_ERROR, \
|
||||
"RVS failed at[" << __FILE__ << ":" << __LINE__ << "] with status: " << (STATUS)); \
|
||||
rvs_session_destroy((SESSION)); \
|
||||
return (STATUS); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
amd::rdc::RdcRVSBase* amd::rdc::RdcRVSBase::s_instance = nullptr;
|
||||
namespace amd::rdc {
|
||||
|
||||
rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t config_size,
|
||||
rdc_diag_callback_t* callback) {
|
||||
RdcRVSBase* RdcRVSBase::s_instance = nullptr;
|
||||
/**
 * Registers this object as the singleton instance, discovers the RVS config
 * files that match the first GPU, and initializes RVS.
 *
 * Steps:
 *   1. Build "<rocm path>/share/rdc/conf/rvs/" from get_rocm_path("librdc.so").
 *   2. Query the asic info of GPU 0 and pick the sub-directory named by
 *      gfx_to_rvs_conf for its target graphics version, or "default" when the
 *      version is not in the table.
 *   3. Collect every regular file in that directory into _rvs_config_list.
 *   4. For each test case in [RDC_DIAG_TEST_FIRST, RDC_DIAG_TEST_LAST) that
 *      has an entry in test_to_name, map it to the collected file whose
 *      filename equals that entry.
 *   5. Add an empty entry for RDC_DIAG_RVS_CUSTOM and call rvs_initialize().
 *
 * If either amdsmi query in step 2 fails, the constructor logs the error and
 * returns early: no configs are collected and rvs_initialize() is not called.
 *
 * Directory access goes through the std::error_code overloads, so a missing or
 * unreadable config directory is logged instead of throwing
 * std::filesystem::filesystem_error out of the constructor.
 */
RdcRVSBase::RdcRVSBase() {
  std::string config_path(get_rocm_path("librdc.so"));
  s_instance = this;
  // these configs are installed with RDC and are mostly stripped down
  // versions of RVS configs
  config_path.append("/share/rdc/conf/rvs/");

  amdsmi_processor_handle processor_handle = nullptr;
  auto err = get_processor_handle_from_id(0, &processor_handle);
  if (err != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "get_processor_handle_from_id failed! " << err);
    return;
  }

  amdsmi_asic_info_t asic_info;
  err = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
  if (err != AMDSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "amdsmi_get_gpu_asic_info failed! " << err);
    return;
  }

  const auto found_gpu = gfx_to_rvs_conf.find(asic_info.target_graphics_version);
  if (found_gpu == gfx_to_rvs_conf.end()) {
    // gpu name is not found
    RDC_LOG(RDC_INFO, "RVS couldn't match GFX version to name. Using \"default\"");
    config_path.append("default");
  } else {
    // gpu name is found - look up the name
    config_path.append(found_gpu->second);
  }

  RDC_LOG(RDC_DEBUG, "RVS CONFIG PATH: " << config_path);

  // populate configs; every filesystem call reports through an error_code
  std::error_code dir_ec;
  std::filesystem::directory_iterator dir_it(config_path, dir_ec);
  const std::filesystem::directory_iterator dir_end;
  while (!dir_ec && dir_it != dir_end) {
    std::error_code type_ec;
    // is_regular_file(ec) yields false on error, so unreadable entries are skipped
    if (dir_it->is_regular_file(type_ec)) {
      _rvs_config_list.push_back(dir_it->path().string());
    }
    dir_it.increment(dir_ec);
  }
  if (dir_ec) {
    RDC_LOG(RDC_ERROR, "cannot read RVS config directory " << config_path << ": "
                                                           << dir_ec.message());
  }

  // map test enums to config paths
  for (rdc_diag_test_cases_t i = RDC_DIAG_TEST_FIRST; i < RDC_DIAG_TEST_LAST;
       i = static_cast<rdc_diag_test_cases_t>(i + 1)) {
    const auto expected_name = test_to_name.find(i);
    if (expected_name == test_to_name.end()) {
      continue;
    }
    for (const std::string& conf_file : _rvs_config_list) {
      const std::filesystem::path conf_path(conf_file);
      // error handling for path
      if (!conf_path.has_filename()) {
        RDC_LOG(RDC_ERROR, "RVS config path has no filename: " << conf_file);
        continue;
      }
      // strip path, only keep filename
      const std::string file_name = conf_path.filename().string();
      if (expected_name->second == file_name) {
        _test_to_conf[i] = conf_path.string();
        RDC_LOG(RDC_DEBUG, "TEST_ADDED " << expected_name->second << " = " << _test_to_conf[i]);
      }
    }
  }

  // manually add custom config
  _test_to_conf[RDC_DIAG_RVS_CUSTOM] = "";

  auto status = rvs_initialize();
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "rvs initialization failed");
  }
}
|
||||
|
||||
// Destructor: releases the static instance pointer, but only when it still
// refers to the object being destroyed; otherwise s_instance is left untouched.
RdcRVSBase::~RdcRVSBase() {
  if (s_instance != this) {
    return;
  }
  s_instance = nullptr;
}
|
||||
|
||||
// Returns a copy of the RVS config file paths gathered by the constructor
// (one entry per regular file found in the selected config directory).
std::vector<std::string> RdcRVSBase::get_rvs_configs() {
  std::vector<std::string> config_paths = _rvs_config_list;
  return config_paths;
}
|
||||
|
||||
rvs_status_t RdcRVSBase::run_rvs_app(const char* config, const size_t config_size,
|
||||
rdc_diag_callback_t* callback) {
|
||||
char active_config[MAX_CONFIG_LENGTH];
|
||||
rvs_session_property_t session_property = {RVS_SESSION_TYPE_DEFAULT_CONF, {{RVS_MODULE_GST}}};
|
||||
rvs_session_id_t session_id;
|
||||
@@ -44,9 +138,8 @@ rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t
|
||||
// Meaning RDC index has no impact on RVS index.
|
||||
|
||||
if ((config == nullptr) || (config_size == 0)) {
|
||||
RDC_LOG(RDC_INFO, "given config is NULL! Using predefined gst_config");
|
||||
strncpy_with_null(active_config, test_to_conf.at(RDC_DIAG_RVS_GST_TEST).c_str(),
|
||||
test_to_conf.at(RDC_DIAG_RVS_GST_TEST).length()+1);
|
||||
RDC_LOG(RDC_ERROR, "given config is NULL! Cannot run tests!");
|
||||
return RVS_STATUS_INVALID_ARGUMENT;
|
||||
} else if (config_size > MAX_CONFIG_LENGTH) {
|
||||
RDC_LOG(RDC_ERROR, "given config size is too large! Expected at most "
|
||||
<< MAX_CONFIG_LENGTH << ", got " << config_size << " instead.");
|
||||
@@ -56,12 +149,6 @@ rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t
|
||||
strncpy_with_null(active_config, config, config_size);
|
||||
}
|
||||
|
||||
status = rvs_initialize();
|
||||
if (status == RVS_STATUS_FAILED) {
|
||||
RDC_LOG(RDC_ERROR, "rvs initialization failed");
|
||||
return status;
|
||||
}
|
||||
|
||||
/*******************************/
|
||||
|
||||
_state = RVS_SESSION_STATE_IDLE;
|
||||
@@ -71,28 +158,34 @@ rvs_status_t amd::rdc::RdcRVSBase::run_rvs_app(const char* config, const size_t
|
||||
_callback = callback;
|
||||
status = rvs_session_create(&session_id, &RdcRVSBase::static_callback);
|
||||
|
||||
session_property.type = RVS_SESSION_TYPE_CUSTOM_ACTION;
|
||||
CHECK_RVS(status, session_id);
|
||||
|
||||
session_property.type = RVS_SESSION_TYPE_CUSTOM_CONF;
|
||||
session_property.custom_action.config = active_config;
|
||||
|
||||
status = rvs_session_set_property(session_id, &session_property);
|
||||
|
||||
CHECK_RVS(status, session_id);
|
||||
|
||||
status = rvs_session_execute(session_id);
|
||||
|
||||
if (status != RVS_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "RVS session execute failed with status: " << status);
|
||||
rvs_session_destroy(session_id);
|
||||
return status;
|
||||
}
|
||||
CHECK_RVS(status, session_id);
|
||||
|
||||
// TODO: remove?
|
||||
while (_state != RVS_SESSION_STATE_COMPLETED) {
|
||||
};
|
||||
|
||||
_callback = nullptr;
|
||||
|
||||
status = rvs_session_destroy(session_id);
|
||||
if (status != RVS_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "RVS session destroy failed with status: " << status);
|
||||
}
|
||||
// this will try to destroy the session again, but it shouldn't matter
|
||||
// I don't want to define a second macro.
|
||||
CHECK_RVS(status, session_id);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
std::map<rdc_diag_test_cases_t, std::string> RdcRVSBase::get_test_to_conf() {
|
||||
return _test_to_conf;
|
||||
}
|
||||
} // namespace amd::rdc
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# BABEL test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
|
||||
# Set parallel execution to false
|
||||
# Set buffer size to reflect the buffer you want to test
|
||||
# Set run count to 1 (test will run once)
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: babel-256MiB
|
||||
device: all
|
||||
module: babel # Name of the module
|
||||
parallel: true # Parallel true or false
|
||||
count: 1 # Number of times to repeat the test from the beginning (a clean start every time)
|
||||
num_iter: 5000 # Number of iterations; this many kernels are launched simultaneously to stress the system
|
||||
array_size: 268435456 # Buffer size the test operates, this is 256 MiB
|
||||
test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double
|
||||
mibibytes: true # mebibytes (MiB) or megabytes (MB), true for MiB
|
||||
o/p_csv: false # o/p as csv file
|
||||
subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot
|
||||
dwords_per_lane: 4 # Number of dwords per lane
|
||||
chunks_per_block: 4 # Number of chunks per block
|
||||
|
||||
@@ -0,0 +1,174 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# Run test with testscript or binary:
|
||||
#
|
||||
# Using Testscript -
|
||||
# cd /opt/rocm/share/rocm-validation-suite/testscripts
|
||||
# sudo ./gpup.new.sh
|
||||
#
|
||||
# Using Binary -
|
||||
# cd /opt/rocm/share/rocm-validation-suite/conf
|
||||
# cd /opt/rocm/bin
|
||||
# sudo ./rvs -c /opt/rocm/share/rocm-validation-suite/conf/gpup_single.conf
|
||||
#
|
||||
# Note: Paths may vary with the ROCm version or ROCm installation path.
|
||||
|
||||
# GPUP test #1
|
||||
#
|
||||
# Preconditions:
|
||||
# all AMD compatible GPUs
|
||||
# all types of devices
|
||||
# all gpu properties, all io_links properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties values for any GPUs
|
||||
|
||||
actions:
|
||||
- name: RVS-GPUP-TC1
|
||||
device: all
|
||||
module: gpup
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #2
|
||||
#
|
||||
# Preconditions:
|
||||
# all AMD compatible GPUs
|
||||
# all types of devices
|
||||
# no regular expressions
|
||||
# only a subset of gpu properties, only a subset of io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying subsets of properties and io_link properties values for any GPUs
|
||||
|
||||
- name: RVS-GPUP-TC2
|
||||
device: all
|
||||
module: gpup
|
||||
properties:
|
||||
simd_count:
|
||||
mem_banks_count:
|
||||
io_links_count:
|
||||
vendor_id:
|
||||
location_id:
|
||||
max_engine_clk_ccompute:
|
||||
io_links-properties:
|
||||
version_major:
|
||||
type:
|
||||
version_major:
|
||||
version_minor:
|
||||
node_from:
|
||||
node_to:
|
||||
recommended_transfer_size:
|
||||
flags:
|
||||
|
||||
# GPUP test #3
|
||||
#
|
||||
# Preconditions:
|
||||
# only a subset of AMD compatible GPUs (device filtering)
|
||||
# all types of devices
|
||||
# all gpu properties, all io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties and io_link properties values for subset of GPUs
|
||||
#
|
||||
# Note:
|
||||
# Testing specific device, if device numbers are changed in system it should be changed in the test
|
||||
|
||||
- name: RVS-GPUP-TC3
|
||||
device: all
|
||||
module: gpup
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #4
|
||||
#
|
||||
# Preconditions:
|
||||
# all AMD compatible GPUs
|
||||
# a given device type (deviceid filtering), this must be filled based on deviceid in sysfs/ ./rvs -g.
|
||||
# Default is 0=> no filtering
|
||||
# all gpu properties, all io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties and io_link properties values for all GPUs and given deviceid
|
||||
|
||||
- name: RVS-GPUP-TC4
|
||||
device: all
|
||||
module: gpup
|
||||
deviceid: 0
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #5
|
||||
#
|
||||
# Preconditions:
|
||||
# only a subset of AMD compatible GPUs (device filtering)
|
||||
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
|
||||
# Default is 0=> no filtering
|
||||
# all gpu properties, all io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties and io_link properties values for subset of GPUs and given deviceid
|
||||
#
|
||||
# Note:
|
||||
# Testing specific device, if device numbers are changed in system it should be changed in the test
|
||||
|
||||
- name: RVS-GPUP-TC5
|
||||
device: all
|
||||
module: gpup
|
||||
deviceid: 0
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #6
|
||||
#
|
||||
# Preconditions:
|
||||
# only a subset of AMD compatible GPUs (device filtering)
|
||||
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
|
||||
# Default is 0=> no filtering
|
||||
# only a subset of gpu properties, only a subset of io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying subset of properties and io_link properties values for subset of GPUs and given deviceid
|
||||
#
|
||||
# Note:
|
||||
# Testing specific device, if device numbers are changed in system it should be changed in the test
|
||||
|
||||
- name: RVS-GPUP-TC6
|
||||
device: all
|
||||
module: gpup
|
||||
deviceid: 0
|
||||
properties:
|
||||
mem_banks_count:
|
||||
io_links-properties:
|
||||
version_major:
|
||||
@@ -0,0 +1,132 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
|
||||
|
||||
# GST test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space
|
||||
# Set parallel execution to false
|
||||
# Set matrix_size to 8640 (for Vega 10 cards). For Vega 20, the recommended matrix_size is 8640
|
||||
# Set run count to 2 (each test will run twice)
|
||||
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# sudo ./rvs -c conf/gst_1.conf -d 3
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves 5000 gflops
|
||||
# in maximum 7 seconds and then the GPU sustains the gflops
|
||||
# for the rest of the test duration (total duration is 18 seconds).
|
||||
# A single Gflops violation (with a 7% tolerance) is allowed.
|
||||
# FALSE otherwise
|
||||
|
||||
actions:
|
||||
- name: gpustress-41000-fp32-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 10000
|
||||
copy_matrix: false
|
||||
target_stress: 41000
|
||||
matrix_size_a: 28000
|
||||
matrix_size_b: 28000
|
||||
matrix_size_c: 28000
|
||||
data_type: fp32_r
|
||||
lda: 28000
|
||||
ldb: 28000
|
||||
ldc: 28000
|
||||
blas_source: hipblaslt
|
||||
|
||||
- name: gpustress-30000-dgemm-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
#hot_calls: 1000
|
||||
duration: 15000
|
||||
copy_matrix: false
|
||||
target_stress: 30000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 8192
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
ops_type: dgemm
|
||||
lda: 8192
|
||||
ldb: 8192
|
||||
ldc: 8192
|
||||
|
||||
- name: gst-8096-150000-fp16
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
copy_matrix: false
|
||||
target_stress: 150000
|
||||
matrix_size_a: 8096
|
||||
matrix_size_b: 8096
|
||||
matrix_size_c: 8096
|
||||
data_type: fp16_r
|
||||
lda: 8096
|
||||
ldb: 8096
|
||||
ldc: 8096
|
||||
ldd: 8096
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
blas_source: hipblaslt
|
||||
|
||||
- name: gst-160Tflops-8K8K8K-rand-i8
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 500
|
||||
copy_matrix: false
|
||||
target_stress: 160000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 8192
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: i8_r
|
||||
lda: 8192
|
||||
ldb: 8192
|
||||
ldc: 8192
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
blas_source: hipblaslt
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: action_1
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 700
|
||||
log_interval: 700
|
||||
max_violations: 1
|
||||
target_power: 300
|
||||
tolerance: 0.06
|
||||
matrix_size: 8640
|
||||
ops_type: dgemm
|
||||
|
||||
- name: action_2
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 1500
|
||||
log_interval: 2000
|
||||
max_violations: 1
|
||||
target_power: 300
|
||||
tolerance: 0.2
|
||||
matrix_size: 8640
|
||||
ops_type: dgemm
|
||||
|
||||
- name: action_3
|
||||
device: all
|
||||
module: iet
|
||||
parallel: false
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 500
|
||||
log_interval: 500
|
||||
max_violations: 1
|
||||
target_power: 300
|
||||
tolerance: 0.1
|
||||
matrix_size: 8640
|
||||
ops_type: dgemm
|
||||
|
||||
# IET test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
|
||||
# Set parallel execution to true
|
||||
# Set matrix_size to 8640 (for Vega 10 cards). For Vega 20, the recommended matrix_size is 8640
|
||||
# Set run count to 2 (each test will run twice)
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# sudo ./rvs -c conf/iet4.conf -d 3
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches 150W
|
||||
# in maximum 5 seconds and then the GPU sustains the same power
|
||||
# for the rest of the test duration (total duration is 10 seconds).
|
||||
# A single power violation (with a 10% tolerance) is allowed.
|
||||
# FALSE otherwise
|
||||
|
||||
- name: action_4
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 500
|
||||
log_interval: 500
|
||||
max_violations: 1
|
||||
target_power: 300
|
||||
tolerance: 0.1
|
||||
matrix_size: 8640
|
||||
ops_type: sgemm
|
||||
|
||||
# IET test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
# all the GPUs IDs separated by white space
|
||||
# Set parallel execution to false
|
||||
# Set matrix_size to 8640 (for Vega 10 cards). For Vega 20, the recommended matrix_size is 8640
|
||||
# Set run count to 2 (each test will run twice)
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# sudo ./rvs -c conf/iet5.conf -d 3
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches 50W
|
||||
# in maximum 5 seconds and then the GPU sustains the same power
|
||||
# for the rest of the test duration (total duration is 10 seconds).
|
||||
# A single power violation (with a 10% tolerance) is allowed.
|
||||
# FALSE otherwise
|
||||
|
||||
- name: action_5
|
||||
device: all
|
||||
module: iet
|
||||
parallel: false
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 1500
|
||||
log_interval: 2000
|
||||
max_violations: 1
|
||||
target_power: 300
|
||||
tolerance: 0.1
|
||||
matrix_size: 8640
|
||||
ops_type: sgemm
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: action_1
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
block_size: 1000000 2000000 10000000
|
||||
device_id: all
|
||||
|
||||
- name: action_2
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 1000
|
||||
count: 3
|
||||
duration: 10000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_3
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 4000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_4
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 1000
|
||||
duration: 5000
|
||||
count: 1
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_5
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 4000
|
||||
count: 1
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_6
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 8000
|
||||
count: 1
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: false
|
||||
device_id: all
|
||||
|
||||
- name: action_7
|
||||
device: all
|
||||
module: pbqt
|
||||
peers: all
|
||||
count: 1
|
||||
test_bandwidth: false
|
||||
device_id: all
|
||||
|
||||
- name: action_8
|
||||
device: all
|
||||
module: pbqt
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel : true
|
||||
device_id: all
|
||||
|
||||
- name: action_9
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 500
|
||||
duration: 1000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_10
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 500
|
||||
duration: 1000
|
||||
peers: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: true
|
||||
|
||||
- name: action_11
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 0
|
||||
duration: 10000
|
||||
peers: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: false
|
||||
device_id: all
|
||||
|
||||
- name: action_12
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 0
|
||||
duration: 1000
|
||||
count: 3
|
||||
wait: 1000
|
||||
peers: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
|
||||
- name: action_13
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 1000
|
||||
duration: 10000
|
||||
peers: all
|
||||
device_id: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
|
||||
- name: action_14
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 500
|
||||
duration: 10000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
device_id: all
|
||||
@@ -0,0 +1,236 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# PEBB test #1
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test1.conf -d 3
|
||||
#
|
||||
|
||||
|
||||
actions:
|
||||
- name: h2d-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 50000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
# PEBB test #2
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. device to host
|
||||
#
|
||||
# Run test with :
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test2.conf -d 3
|
||||
#
|
||||
|
||||
|
||||
- name: d2h-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
|
||||
# PEBB test #3
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test3.conf -d 3
|
||||
#
|
||||
|
||||
- name: h2d-d2h-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
|
||||
|
||||
# PEBB test #4
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
# 4. parallel transfers
|
||||
# 5. random block sizes
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test4.conf -d 3
|
||||
|
||||
- name: h2d-parallel-xMB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
# PEBB test #5
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. device to host
|
||||
# 4. parallel transfers
|
||||
# 5. random block sizes
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test5.conf -d 3
|
||||
|
||||
- name: d2h-parallel-xMB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
|
||||
# PEBB test #6
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
# 4. parallel transfers
|
||||
# 5. random block sizes
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test6.conf -d 3
|
||||
|
||||
- name: h2d-d2h-xMB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
# PEBB test #7
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
# 4. parallel transfers
|
||||
# 5. back-to-back 51MB
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test7.conf -d 3
|
||||
|
||||
- name: h2d-b2b-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 34000
|
||||
device_to_host: false
|
||||
host_to_device: true
|
||||
b2b_block_size: 51200
|
||||
parallel: false
|
||||
link_type: 2 # PCIe
|
||||
|
||||
|
||||
# PEBB test #8
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host-to-device and device-to-host
|
||||
# 4. parallel back-to-back transfers
|
||||
# 5. back-to-back 51MB
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test8.conf -d 3
|
||||
|
||||
- name: d2h-b2b-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
b2b_block_size: 51200
|
||||
parallel: true
|
||||
link_type: 2 # PCIe
|
||||
|
||||
# PEBB test #9
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
# 4. PCIe only
|
||||
# 5. parallel back-to-back transfers
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/pebb_test9.conf -d 3
|
||||
|
||||
- name: h2d-d2h-b2b-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 34000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
b2b_block_size: 51200
|
||||
parallel: false
|
||||
link_type: 2 # PCIe
|
||||
@@ -0,0 +1,91 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# TST test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all and execution as sequential.
|
||||
# Workload set as dgemm operations with matrix size as 8640.
|
||||
# Throttle temperature set as 100 degree celsius.
|
||||
#
|
||||
# Run test with:
|
||||
# ./rvs -c conf/tst.conf -d 3
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU junction temperature
|
||||
# reaches the target temperature. Whether it reaches the throttle temperature
|
||||
# during test duration is also monitored.
|
||||
#
|
||||
actions:
|
||||
- name: action_1
|
||||
device: all
|
||||
device_index: all
|
||||
module: tst
|
||||
parallel: false
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 30000
|
||||
ramp_interval: 10000
|
||||
sample_interval: 2000
|
||||
log_interval: 2000
|
||||
max_violations: 1
|
||||
throttle_temp: 100
|
||||
target_temp: 50
|
||||
tolerance: 0.06
|
||||
matrix_size: 8640
|
||||
ops_type: dgemm
|
||||
|
||||
# TST test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all and execution in parallel.
|
||||
# Workload set as sgemm operations with matrix size as 8640.
|
||||
# Throttle temperature set as 100 degree celsius.
|
||||
#
|
||||
# Run test with:
|
||||
# ./rvs -c conf/tst.conf -d 3
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU junction temperature
|
||||
# reaches the target temperature. Whether it reaches the throttle temperature
|
||||
# during test duration is also monitored.
|
||||
#
|
||||
- name: action_2
|
||||
device: all
|
||||
device_index: all
|
||||
module: tst
|
||||
parallel: true
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 700
|
||||
log_interval: 700
|
||||
target_temp: 50
|
||||
throttle_temp: 100
|
||||
tolerance: 0.06
|
||||
matrix_size: 8640
|
||||
ops_type: sgemm
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# IET stress test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by comma.
|
||||
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
|
||||
# Set gemm operation type as dgemm.
|
||||
# Set matrix_size to 28000.
|
||||
# Test duration set to 1 min.
|
||||
# Target power set to 550W for each GPU.
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/iet_stress.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves power target of 550W.
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: iet-stress-550W-dgemm-true
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
duration: 60000
|
||||
ramp_interval: 10000
|
||||
sample_interval: 3000
|
||||
log_interval: 3000
|
||||
target_power: 550
|
||||
matrix_size: 28000
|
||||
ops_type: dgemm
|
||||
lda: 28000
|
||||
ldb: 28000
|
||||
ldc: 28000
|
||||
alpha: 1
|
||||
beta: 1
|
||||
|
||||
@@ -0,0 +1,229 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# PEBB test #1
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
actions:
|
||||
- name: h2d-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 50000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #2
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. device to host
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: d2h-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #3
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: h2d-d2h-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #4
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
# 4. parallel transfers
|
||||
# 5. random block sizes
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: h2d-parallel-xMB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #5
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. device to host
|
||||
# 4. parallel transfers
|
||||
# 5. random block sizes
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: d2h-parallel-xMB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #6
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
# 4. parallel transfers
|
||||
# 5. random block sizes
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: h2d-d2h-xMB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #7
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
# 4. parallel transfers
|
||||
# 5. back-to-back 51MB
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: h2d-b2b-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 34000
|
||||
device_to_host: false
|
||||
host_to_device: true
|
||||
b2b_block_size: 51200
|
||||
parallel: false
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #8
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host-to-device and device-to-host
|
||||
# 4. parallel back-to-back transfers
|
||||
# 5. back-to-back 51MB
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: d2h-b2b-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
b2b_block_size: 51200
|
||||
parallel: true
|
||||
link_type: 4 # XGMI
|
||||
|
||||
|
||||
# PEBB test #9
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
# 4. XGMI only
|
||||
# 5. parallel back-to-back transfers
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300A/pebb_single.conf -d 3
|
||||
#
|
||||
- name: h2d-d2h-b2b-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 34000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
b2b_block_size: 51200
|
||||
parallel: false
|
||||
link_type: 4 # XGMI
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# BABEL test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
|
||||
# Set parallel execution to false
|
||||
# Set buffer size to reflect the buffer you want to test
|
||||
# Set run count to 1 (test will run once)
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: babel-float-256MiB
|
||||
device: all
|
||||
module: babel # Name of the module
|
||||
parallel: false # Parallel true or false
|
||||
count: 1 # Number of times to repeat the test from the beginning (a clean start every time)
|
||||
num_iter: 5000 # Number of iterations; this many kernels are launched simultaneously to stress the system
|
||||
array_size: 268435456 # Buffer size the test operates, this is 256 MiB
|
||||
test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double
|
||||
mibibytes: true # mebibytes (MiB) or megabytes (MB), true for MiB
|
||||
o/p_csv: false # write output as a CSV file
|
||||
subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: gst-1000Tflops-8KB-fp8_r-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: false
|
||||
count: 1
|
||||
duration: 30000
|
||||
copy_matrix: false
|
||||
target_stress: 1000000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 8192
|
||||
matrix_size_c: 8192
|
||||
data_type: fp8_r
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
|
||||
- name: gst-1000Tflops-8KB-fp8_r-true
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 1000000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 8192
|
||||
matrix_size_c: 8192
|
||||
data_type: fp8_r
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
|
||||
- name: gst-500Tflops-4KB-bf16_r-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: false
|
||||
count: 1
|
||||
duration: 30000
|
||||
copy_matrix: false
|
||||
target_stress: 500000
|
||||
matrix_size_a: 4096
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
data_type: bf16_r
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
|
||||
- name: gst-500Tflops-4KB-bf16_r-true
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 500000
|
||||
matrix_size_a: 4096
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
data_type: bf16_r
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
|
||||
@@ -0,0 +1,181 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
|
||||
# GST self-check & accuracy-check test - gst-3K-sgemm-check
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space
|
||||
# Set matrices sizes to 3072 * 3072 * 3072
|
||||
# Set gemm operation as sgemm
|
||||
# Set matrix data initialization method as random integer
|
||||
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
|
||||
# Set target stress GFLOPS as 100000 (100 TFLOPS)
|
||||
# Set self_check (gemm self-checking) as enabled
|
||||
# Set accuracy_check (gemm accuracy checking) as enabled (applicable for sgemm & dgemm only)
|
||||
# Set error_inject (gemm error injection) as enabled (for TEST purposes only)
|
||||
# Set error_freq (error injection frequency) as 2 (for TEST purposes only)
|
||||
# Set error_count (error injection count) as 1 (for TEST purposes only)
|
||||
#
|
||||
# Expected result:
|
||||
# Report self-check and accu-error at regular intervals as per set error parameters.
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves 100 TFLOPS or more
|
||||
# within the test duration of 2 mins after ramp-up duration of 5 seconds.
|
||||
# Else test on the GPU fails (FALSE).
|
||||
actions:
|
||||
- name: gst-3K-sgemm-check
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 120000
|
||||
hot_calls: 1
|
||||
copy_matrix: false
|
||||
target_stress: 100000
|
||||
matrix_size_a: 3072
|
||||
matrix_size_b: 3072
|
||||
matrix_size_c: 3072
|
||||
matrix_init: rand
|
||||
ops_type: sgemm
|
||||
lda: 3072
|
||||
ldb: 3072
|
||||
ldc: 3072
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
self_check: true
|
||||
accuracy_check: true
|
||||
error_inject: true
|
||||
error_freq: 2
|
||||
error_count: 1
|
||||
|
||||
- name: gst-3K-dgemm-check
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 120000
|
||||
hot_calls: 1
|
||||
copy_matrix: false
|
||||
target_stress: 85000
|
||||
matrix_size_a: 3072
|
||||
matrix_size_b: 3072
|
||||
matrix_size_c: 3072
|
||||
matrix_init: rand
|
||||
ops_type: dgemm
|
||||
lda: 3072
|
||||
ldb: 3072
|
||||
ldc: 3072
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
self_check: true
|
||||
accuracy_check: true
|
||||
error_inject: true
|
||||
error_freq: 2
|
||||
error_count: 1
|
||||
|
||||
- name: gst-3K-fp16-check
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 30000
|
||||
hot_calls: 1
|
||||
copy_matrix: false
|
||||
target_stress: 150000
|
||||
matrix_size_a: 3072
|
||||
matrix_size_b: 3072
|
||||
matrix_size_c: 3072
|
||||
matrix_init: rand
|
||||
data_type: fp16_r
|
||||
lda: 3072
|
||||
ldb: 3072
|
||||
ldc: 3072
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
self_check: true
|
||||
error_inject: true
|
||||
error_freq: 2
|
||||
error_count: 1
|
||||
|
||||
- name: gst-3K-bf16-check
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 30000
|
||||
hot_calls: 1
|
||||
copy_matrix: false
|
||||
target_stress: 250000
|
||||
matrix_size_a: 3072
|
||||
matrix_size_b: 3072
|
||||
matrix_size_c: 3072
|
||||
matrix_init: rand
|
||||
data_type: bf16_r
|
||||
lda: 3072
|
||||
ldb: 3072
|
||||
ldc: 3072
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
self_check: true
|
||||
error_inject: true
|
||||
error_freq: 2
|
||||
error_count: 1
|
||||
|
||||
- name: gst-3K-fp8-check
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 30000
|
||||
hot_calls: 1
|
||||
copy_matrix: false
|
||||
target_stress: 300000
|
||||
matrix_size_a: 3072
|
||||
matrix_size_b: 3072
|
||||
matrix_size_c: 3072
|
||||
matrix_init: rand
|
||||
data_type: fp8_r
|
||||
lda: 3072
|
||||
ldb: 3072
|
||||
ldc: 3072
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
self_check: true
|
||||
error_inject: true
|
||||
error_freq: 2
|
||||
error_count: 1
|
||||
|
||||
@@ -0,0 +1,186 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# GST test - gst-1215Tflops-4K4K8K-rand-fp8
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space
|
||||
# Set matrices sizes to 4864 * 4096 * 8192
|
||||
# Set matrix data type as fp8 real number
|
||||
# Set matrix data initialization method as random integer
|
||||
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
|
||||
# Set target stress GFLOPS as 1215000 (1215 TFLOPS)
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves 1215 TFLOPS or more
|
||||
# within the test duration of 15 seconds after ramp-up duration of 5 seconds.
|
||||
# Else test on the GPU fails (FALSE).
|
||||
|
||||
actions:
|
||||
- name: gst-1215Tflops-4K4K8K-rand-fp8
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 1000
|
||||
copy_matrix: false
|
||||
target_stress: 1215000
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: fp8_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-981Tflops-4K4K8K-trig-fp8
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 1000
|
||||
copy_matrix: false
|
||||
target_stress: 981000
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: fp8_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-639Tflops-4K4K8K-rand-fp16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 1000
|
||||
copy_matrix: false
|
||||
target_stress: 639000
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: fp16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-523Tflops-4K4K8K-trig-fp16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 1000
|
||||
copy_matrix: false
|
||||
target_stress: 523000
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: fp16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-581Tflops-4K4K8K-rand-bf16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 1000
|
||||
copy_matrix: false
|
||||
target_stress: 581000
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: bf16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-552Tflops-4K4K8K-trig-bf16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 1000
|
||||
copy_matrix: false
|
||||
target_stress: 552000
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: bf16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
@@ -0,0 +1,63 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# GST test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space
|
||||
# Set parallel execution to true (workload execution on all GPUs in parallel)
|
||||
# Set matrix_size to 28000.
|
||||
# Set run count to 1 (each test will run once)
|
||||
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/gst_stress.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves 50000 gflops
|
||||
|
||||
actions:
|
||||
- name: gpustress-50000-dgemm-true
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 50000
|
||||
matrix_size_a: 28000
|
||||
matrix_size_b: 28000
|
||||
matrix_size_c: 28000
|
||||
ops_type: dgemm
|
||||
lda: 28000
|
||||
ldb: 28000
|
||||
ldc: 28000
|
||||
alpha: 1
|
||||
beta: 1
|
||||
matrix_init: hiprand
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# Test #1 - iet-400W-1K-rand-dgemm
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPU IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify all the GPU IDs separated by white space
|
||||
# Set parallel execution to true
|
||||
# Set matrix_size to 1024 for dgemm operations
|
||||
# Set target power to 400 Watts
|
||||
# Set test duration to 1 min
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/iet_single.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches at least 400 Watts,
|
||||
# FALSE otherwise
|
||||
|
||||
actions:
|
||||
- name: iet-400W-1K-rand-dgemm
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
duration: 60000
|
||||
sample_interval: 3000
|
||||
target_power: 400
|
||||
matrix_size: 1024
|
||||
matrix_init: rand
|
||||
ops_type: dgemm
|
||||
|
||||
# Test #2 - iet-wait-750W-28K-rand-dgemm
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPU IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify all the GPU IDs separated by white space
|
||||
# Set parallel execution to true
|
||||
# Set matrix_size to 28000 for dgemm operations
|
||||
# Set target power to 750 Watts
|
||||
# Set wait duration to 30 seconds (GPU idle period)
|
||||
# Set test duration to 1 min
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/iet_single.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches at least 750 Watts,
|
||||
# FALSE otherwise
|
||||
|
||||
- name: iet-wait-750W-28K-rand-dgemm
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
wait: 30000 # Wait for 30 secs before the test starts
|
||||
duration: 60000
|
||||
sample_interval: 3000
|
||||
target_power: 750
|
||||
matrix_size: 28000
|
||||
matrix_init: hiprand
|
||||
ops_type: dgemm
|
||||
|
||||
# Test #3 - iet-wait-400W-1K-rand-dgemm
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPU IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify all the GPU IDs separated by white space
|
||||
# Set parallel execution to true
|
||||
# Set matrix_size to 1024 for dgemm operations
|
||||
# Set target power to 400 Watts
|
||||
# Set wait duration to 30 seconds (GPU idle period)
|
||||
# Set test duration to 1 min
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/iet_single.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches at least 400 Watts,
|
||||
# FALSE otherwise
|
||||
|
||||
- name: iet-wait-400W-1K-rand-dgemm
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
wait: 30000 # Wait for 30 secs before the test starts
|
||||
duration: 60000
|
||||
sample_interval: 3000
|
||||
log_interval: 3000
|
||||
target_power: 400
|
||||
matrix_size: 1024
|
||||
matrix_init: rand
|
||||
ops_type: dgemm
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# IET stress test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPU IDs separated by white space.
|
||||
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
|
||||
# Set gemm operation type as dgemm.
|
||||
# Set matrix_size to 28000.
|
||||
# Test duration set to 1 min.
|
||||
# Target power set to 750W for each GPU.
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/iet_stress.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves power target of 750W.
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: iet-stress-750W-dgemm-true
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
duration: 60000
|
||||
ramp_interval: 10000
|
||||
sample_interval: 5000
|
||||
log_interval: 5000
|
||||
target_power: 750
|
||||
matrix_size: 28000
|
||||
ops_type: dgemm
|
||||
lda: 28000
|
||||
ldb: 28000
|
||||
ldc: 28000
|
||||
alpha: 1
|
||||
beta: 1
|
||||
matrix_init: hiprand
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: p2p-unidir-sequential-64MB
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 5000
|
||||
duration: 60000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: false
|
||||
block_size: 67108864
|
||||
device_id: all
|
||||
|
||||
- name: p2p-unidir-parallel-64MB
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 5000
|
||||
duration: 60000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: true
|
||||
block_size: 67108864
|
||||
device_id: all
|
||||
|
||||
- name: p2p-bidir-sequential-64MB
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 5000
|
||||
duration: 60000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: false
|
||||
block_size: 67108864
|
||||
device_id: all
|
||||
|
||||
- name: p2p-bidir-parallel-64MB
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 5000
|
||||
duration: 60000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
block_size: 67108864
|
||||
device_id: all
|
||||
|
||||
- name: p2p-bidir-sequential-64-128-256MB
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 5000
|
||||
duration: 60000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: false
|
||||
block_size: 67108864 134217728 268435456
|
||||
device_id: all
|
||||
|
||||
- name: p2p-bidir-parallel-64-128-256MB
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 5000
|
||||
duration: 60000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
block_size: 67108864 134217728 268435456
|
||||
device_id: all
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# PEBB test #1
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. device to host
|
||||
# 4. Transfer block size 64MB
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/pebb_single.conf
|
||||
#
|
||||
actions:
|
||||
- name: d2h-sequential-64MB
|
||||
device: all
|
||||
module: pebb
|
||||
duration: 60000
|
||||
device_to_host: true
|
||||
host_to_device: false
|
||||
parallel: false
|
||||
block_size: 67108864
|
||||
link_type: 2 # PCIe
|
||||
|
||||
# PEBB test #2
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device
|
||||
# 4. Transfer block size 64MB
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/pebb_single.conf
|
||||
#
|
||||
- name: h2d-sequential-64MB
|
||||
device: all
|
||||
module: pebb
|
||||
duration: 60000
|
||||
device_to_host: false
|
||||
host_to_device: true
|
||||
parallel: false
|
||||
block_size: 67108864
|
||||
link_type: 2 # PCIe
|
||||
|
||||
# PEBB test #3
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. host to device & device to host
|
||||
# 4. Transfer block size 64MB
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI300X/pebb_single.conf
|
||||
#
|
||||
- name: h2d-d2h-sequential-64MB
|
||||
device: all
|
||||
module: pebb
|
||||
duration: 60000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: false
|
||||
block_size: 67108864
|
||||
link_type: 2 # PCIe
|
||||
warm_calls: 10
|
||||
hot_calls: 100
|
||||
b2b: true
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# BABEL test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
|
||||
# Set parallel execution to false
|
||||
# Set buffer size to reflect the buffer you want to test
|
||||
# Set run count to 1 (test will run once)
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: babel-float-256MiB
|
||||
device: all
|
||||
module: babel # Name of the module
|
||||
parallel: false # Parallel true or false
|
||||
count: 1 # Number of times you want to repeat the test from the beginning (a clean start every time)
|
||||
num_iter: 5000 # Number of iterations; this many kernels are launched simultaneously to stress the system
|
||||
array_size: 268435456 # Buffer size the test operates, this is 256 MiB
|
||||
test_type: 1 # type of test, 1: Float, 2: Double, 3: Triad float, 4: Triad double
|
||||
mibibytes: true # mebibytes (MiB) or megabytes (MB), true for MiB
|
||||
o/p_csv: false # o/p as csv file
|
||||
subtest: 5 # 1: copy 2: copy+mul 3: copy+mul+add 4: copy+mul+add+triad 5: copy+mul+add+triad+dot
|
||||
dwords_per_lane: 4 # Number of dwords per lane
|
||||
chunks_per_block: 4 # Number of chunks per block
|
||||
|
||||
@@ -0,0 +1,256 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# GST test - gst-96Tflops-8K12K4K-trig-tf32
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space
|
||||
# Set matrices sizes to 8192 * 12288 * 4096
|
||||
# Set matrix data type as fp32 real number
|
||||
# Set compute type as tf32 (xf32)
|
||||
# Set matrix data initialization method as trigonometric float
|
||||
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
|
||||
# Set target stress GFLOPS as 96 TFLOPS
|
||||
# Set blas source (backend) as hipblaslt
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves 96 TFLOPS or more
|
||||
# within the test duration of 15 seconds after ramp-up duration of 5 seconds.
|
||||
# Else test on the GPU fails (FALSE).
|
||||
|
||||
actions:
|
||||
- name: gst-96Tflops-8K12K4K-trig-tf32
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 50
|
||||
copy_matrix: false
|
||||
target_stress: 96000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 12288
|
||||
matrix_size_c: 4096
|
||||
matrix_init: trig
|
||||
data_type: fp32_r
|
||||
compute_type: xf32_r
|
||||
transa: 0
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 1
|
||||
blas_source: hipblaslt
|
||||
parallel: true
|
||||
|
||||
- name: gst-406Tflops-8K13K17K-trig-i8
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 500
|
||||
copy_matrix: false
|
||||
target_stress: 406000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 13312
|
||||
matrix_size_c: 17792
|
||||
matrix_init: trig
|
||||
data_type: i8_r
|
||||
compute_type: i32_r
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
blas_source: hipblaslt
|
||||
parallel: true
|
||||
|
||||
- name: gst-26Tflops-8K8K8K-trig-fp32
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 100
|
||||
copy_matrix: false
|
||||
target_stress: 26000
|
||||
matrix_size_a: 8192
|
||||
matrix_size_b: 8960
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: fp32_r
|
||||
compute_type: fp32_r
|
||||
transa: 0
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 1
|
||||
blas_source: hipblaslt
|
||||
parallel: true
|
||||
|
||||
|
||||
- name: gst-343Tflops-4K4K8K-rand-fp8
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 3000
|
||||
copy_matrix: false
|
||||
target_stress: 343415
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: fp8_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-336Tflops-4K4K8K-trig-fp8
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 170000
|
||||
copy_matrix: false
|
||||
target_stress: 336441
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: fp8_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-176Tflops-4K4K8K-rand-fp16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 3000
|
||||
copy_matrix: false
|
||||
target_stress: 176191
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: fp16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-172Tflops-4K4K8K-trig-fp16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 90000
|
||||
copy_matrix: false
|
||||
target_stress: 172333
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: fp16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-174Tflops-4K4K8K-rand-bf16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 3000
|
||||
copy_matrix: false
|
||||
target_stress: 174364
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: rand
|
||||
data_type: bf16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
- name: gst-172Tflops-4K4K8K-trig-bf16
|
||||
device: all
|
||||
module: gst
|
||||
log_interval: 3000
|
||||
ramp_interval: 5000
|
||||
duration: 15000
|
||||
hot_calls: 90000
|
||||
copy_matrix: false
|
||||
target_stress: 172333
|
||||
matrix_size_a: 4864
|
||||
matrix_size_b: 4096
|
||||
matrix_size_c: 8192
|
||||
matrix_init: trig
|
||||
data_type: bf16_r
|
||||
lda: 8320
|
||||
ldb: 8320
|
||||
ldc: 4992
|
||||
ldd: 4992
|
||||
transa: 1
|
||||
transb: 0
|
||||
alpha: 1
|
||||
beta: 0
|
||||
parallel: true
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# GST thermal test - gst-thermal-dgemm-true
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space
|
||||
# Set matrices sizes to 8640 * 8640 * 8640
|
||||
# Set matrices batch size to 96
|
||||
# Set gemm operation type as dgemm real
|
||||
# Set gemm operation mode as batched strided gemm
|
||||
# Set matrix data initialization method as hip random integer
|
||||
# Set copy_matrix to false (the matrices will be copied to GPUs only once)
|
||||
# Set target stress GFLOPS as 24700 GFLOPS (~24.7 TFLOPS)
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves 24.7 TFLOPS or more
|
||||
# within the test duration of 1 min after ramp-up duration of 10 seconds.
|
||||
# Else test on the GPU fails (FALSE).
|
||||
|
||||
actions:
|
||||
- name: gst-thermal-dgemm-true
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
log_interval: 5000
|
||||
ramp_interval: 10000
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 24700
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
lda: 8640
|
||||
ldb: 8640
|
||||
ldc: 8640
|
||||
ldd: 8640
|
||||
gemm_mode: strided_batched
|
||||
batch_size: 96
|
||||
matrix_init: hiprand
|
||||
ops_type: dgemm
|
||||
transa: 0
|
||||
transb: 1
|
||||
alpha: 2.71828
|
||||
beta: 3.14159
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# Test #1 - iet-260W-1K-rand-dgemm
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify all the GPUs IDs separated by white space
|
||||
# Set parallel execution to true
|
||||
# Set matrix_size to 1024 for dgemm operations
|
||||
# Set target power to 260 Watts
|
||||
# Set test duration to 1 min
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI308X/iet_single.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches at least 260 Watts,
|
||||
# FALSE otherwise
|
||||
|
||||
actions:
|
||||
- name: iet-260W-1K-rand-dgemm
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
duration: 60000
|
||||
sample_interval: 1000
|
||||
target_power: 260
|
||||
matrix_size: 1024
|
||||
matrix_init: rand
|
||||
ops_type: dgemm
|
||||
|
||||
# Test #2 - iet-wait-350W-8K-rand-dgemm
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify all the GPUs IDs separated by white space
|
||||
# Set parallel execution to true
|
||||
# Set matrix_size to 8096 for dgemm operations
|
||||
# Set target power to 350 Watts
|
||||
# Set wait duration to 30 seconds (GPU idle period)
|
||||
# Set test duration to 1 min
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI308X/iet_single.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU power reaches at least 350 Watts,
|
||||
# FALSE otherwise
|
||||
|
||||
- name: iet-wait-350W-8K-rand-dgemm
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
wait: 30000 # Wait for 30 secs before the test starts
|
||||
duration: 60000
|
||||
sample_interval: 1000
|
||||
target_power: 350
|
||||
matrix_size: 8096
|
||||
matrix_init: rand
|
||||
ops_type: dgemm
|
||||
|
||||
- name: iet-wait-420W-16K-rand-dgemm
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
wait: 30000 # Wait for 30 secs before the test starts
|
||||
duration: 60000
|
||||
sample_interval: 1000
|
||||
target_power: 420
|
||||
matrix_size: 16182
|
||||
matrix_init: rand
|
||||
ops_type: dgemm
|
||||
|
||||
- name: iet-wait-stress-650W-bw
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
wait: 30000 # Wait for 30 secs before the test starts
|
||||
duration: 60000
|
||||
sample_interval: 1000
|
||||
target_power: 650
|
||||
bw_workload: true
|
||||
cp_workload: false
|
||||
tolerance: 0.05
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# IET stress test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space.
|
||||
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
|
||||
# Test duration set to 1 min.
|
||||
# Target power set to 650W for each GPU.
|
||||
# Tolerance set to 5% of target power.
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI308X/iet_stress.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves power target of 650W.
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: iet-stress-650W-true
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
duration: 60000
|
||||
ramp_interval: 1000
|
||||
sample_interval: 5000
|
||||
log_interval: 5000
|
||||
target_power: 650
|
||||
tolerance: 0.05
|
||||
bw_workload: true
|
||||
cp_workload: false
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# IET thermal test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space.
|
||||
# Set parallel execution to true (gemm workload execution on all GPUs in parallel)
|
||||
# Test duration set to 1 min.
|
||||
# Target power set to 600W for each GPU.
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# ./rvs -c conf/MI308X/iet_thermal.conf
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if the GPU achieves power target of 600W.
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: iet-thermal-dgemm-true
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
sample_interval: 5000
|
||||
ramp_interval: 20000
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_power: 600
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
lda: 8640
|
||||
ldb: 8640
|
||||
ldc: 8640
|
||||
ldd: 8640
|
||||
gemm_mode: strided_batched
|
||||
batch_size: 96
|
||||
matrix_init: hiprand
|
||||
ops_type: dgemm
|
||||
transa: 0
|
||||
transb: 1
|
||||
alpha: 2.71828
|
||||
beta: 3.14159
|
||||
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
nv21
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/gpup_single.conf
|
||||
@@ -0,0 +1,41 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: gpustress-9000-sgemm-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 10000
|
||||
copy_matrix: false
|
||||
target_stress: 6000
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
ops_type: sgemm
|
||||
lda: 8640
|
||||
ldb: 8640
|
||||
ldc: 8640
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/gst_stress_3_hrs.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/iet_stress.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/mem.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/pbqt_single.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/pebb_single.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/peqt_single.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/pesm_1.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/rcqt_single.conf
|
||||
@@ -0,0 +1,174 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# Run test with testscript or binary:
|
||||
#
|
||||
# Using Testscript -
|
||||
# cd /opt/rocm/share/rocm-validation-suite/testscripts
|
||||
# sudo ./gpup.new.sh
|
||||
#
|
||||
# Using Binary -
|
||||
# cd /opt/rocm/share/rocm-validation-suite/conf
|
||||
# cd /opt/rocm/bin
|
||||
# sudo ./rvs -c /opt/rocm/share/rocm-validation-suite/conf/gpup_single.conf
|
||||
#
|
||||
# Note: Paths may vary with the ROCm version or ROCm installation path.
|
||||
|
||||
# GPUP test #1
|
||||
#
|
||||
# Preconditions:
|
||||
# all AMD compatible GPUs
|
||||
# all types of devices
|
||||
# all gpu properties, all io_links properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes, displaying all property values for all GPUs
|
||||
|
||||
actions:
|
||||
- name: RVS-GPUP-TC1
|
||||
device: all
|
||||
module: gpup
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #2
|
||||
#
|
||||
# Preconditions:
|
||||
# all AMD compatible GPUs
|
||||
# all types of devices
|
||||
# no regular expressions
|
||||
# only a subset of gpu properties, only a subset of io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying subsets of properties and io_link properties values for any GPUs
|
||||
|
||||
- name: RVS-GPUP-TC2
|
||||
device: all
|
||||
module: gpup
|
||||
properties:
|
||||
simd_count:
|
||||
mem_banks_count:
|
||||
io_links_count:
|
||||
vendor_id:
|
||||
location_id:
|
||||
max_engine_clk_ccompute:
|
||||
io_links-properties:
|
||||
version_major:
|
||||
type:
|
||||
version_major:
|
||||
version_minor:
|
||||
node_from:
|
||||
node_to:
|
||||
recommended_transfer_size:
|
||||
flags:
|
||||
|
||||
# GPUP test #3
|
||||
#
|
||||
# Preconditions:
|
||||
# only a subset of AMD compatible GPUs (device filtering)
|
||||
# all types of devices
|
||||
# all gpu properties, all io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties and io_link properties values for subset of GPUs
|
||||
#
|
||||
# Note:
|
||||
# Testing specific device, if device numbers are changed in system it should be changed in the test
|
||||
|
||||
- name: RVS-GPUP-TC3
|
||||
device: all
|
||||
module: gpup
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #4
|
||||
#
|
||||
# Preconditions:
|
||||
# all AMD compatible GPUs
|
||||
# a given device type (deviceid filtering), this must be filled based on deviceid in sysfs/ ./rvs -g.
|
||||
# Default is 0=> no filtering
|
||||
# all gpu properties, all io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties and io_link properties values for all GPUs and given deviceid
|
||||
|
||||
- name: RVS-GPUP-TC4
|
||||
device: all
|
||||
module: gpup
|
||||
deviceid: 0
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #5
|
||||
#
|
||||
# Preconditions:
|
||||
# only a subset of AMD compatible GPUs (device filtering)
|
||||
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
|
||||
# Default is 0=> no filtering
|
||||
# all gpu properties, all io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying all properties and io_link properties values for subset of GPUs and given deviceid
|
||||
#
|
||||
# Note:
|
||||
# Testing specific device, if device numbers are changed in system it should be changed in the test
|
||||
|
||||
- name: RVS-GPUP-TC5
|
||||
device: all
|
||||
module: gpup
|
||||
deviceid: 0
|
||||
properties:
|
||||
all:
|
||||
io_links-properties:
|
||||
all:
|
||||
|
||||
# GPUP test #6
|
||||
#
|
||||
# Preconditions:
|
||||
# only a subset of AMD compatible GPUs (device filtering)
|
||||
# a given device type (deviceid filtering) this must be filled based on deviceid in sysfs/ ./rvs -g
|
||||
# Default is 0=> no filtering
|
||||
# only a subset of gpu properties, only a subset of io_link properties
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes with displaying subset of properties and io_link properties values for subset of GPUs and given deviceid
|
||||
#
|
||||
# Note:
|
||||
# Testing specific device, if device numbers are changed in system it should be changed in the test
|
||||
|
||||
- name: RVS-GPUP-TC6
|
||||
device: all
|
||||
module: gpup
|
||||
deviceid: 0
|
||||
properties:
|
||||
mem_banks_count:
|
||||
io_links-properties:
|
||||
version_major:
|
||||
@@ -0,0 +1,41 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: gpustress-10000-sgemm-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 10000
|
||||
copy_matrix: false
|
||||
target_stress: 10000
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
ops_type: sgemm
|
||||
lda: 8640
|
||||
ldb: 8640
|
||||
ldc: 8640
|
||||
@@ -0,0 +1,43 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: gpustress-3hrs
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 10800000
|
||||
ramp_interval: 300000
|
||||
log_interval: 6000
|
||||
target_stress: 5000
|
||||
max_violations: 1
|
||||
copy_matrix: false
|
||||
tolerance: 0.01
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
ops_type: sgemm
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: action_1
|
||||
device: all
|
||||
module: iet
|
||||
parallel: true
|
||||
count: 1
|
||||
wait: 100
|
||||
duration: 50000
|
||||
ramp_interval: 5000
|
||||
sample_interval: 700
|
||||
log_interval: 700
|
||||
max_violations: 1
|
||||
target_power: 127
|
||||
tolerance: 0.06
|
||||
matrix_size: 8640
|
||||
ops_type: dgemm
|
||||
@@ -0,0 +1,68 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# Memory test
|
||||
#
|
||||
# Preconditions:
|
||||
# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g
|
||||
# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify
|
||||
# all the GPUs IDs separated by white space (e.g.: device: 50599 3245)
|
||||
# Set run count to how many times we want each test to run
|
||||
#
|
||||
# Run test with:
|
||||
# ./rvs -c conf/mem.conf -d 3
|
||||
#
|
||||
# Expected result:
|
||||
# The test on each GPU passes (TRUE) if no memory errors are seen
|
||||
# FALSE otherwise
|
||||
#
|
||||
# To omit individual tests, list their numbers (as specified below) in the value of the exclude tag
|
||||
# 0: Walking 1 bit
|
||||
# 1: Own address test
|
||||
# 2: Moving inversions, ones&zeros
|
||||
# 3: Moving inversions, 8 bit pattern
|
||||
# 4: Moving inversions, random pattern
|
||||
# 5: Block move, 64 moves
|
||||
# 6: Moving inversions, 32 bit pattern
|
||||
# 7: Random number sequence
|
||||
# 8: Modulo 20, random pattern
|
||||
# 9: Bit fade test
|
||||
# 10: Memory stress test
|
||||
#
|
||||
|
||||
actions:
|
||||
- name: action_1
|
||||
device: all
|
||||
module: mem
|
||||
parallel: true
|
||||
count: 1
|
||||
wait: 100
|
||||
mapped_memory: false
|
||||
mem_blocks: 128
|
||||
num_passes: 500
|
||||
thrds_per_blk: 64
|
||||
stress: true
|
||||
num_iter: 50000
|
||||
exclude : 9 10
|
||||
@@ -0,0 +1,182 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: action_1
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
block_size: 1000000 2000000 10000000
|
||||
device_id: all
|
||||
|
||||
- name: action_2
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 1000
|
||||
count: 3
|
||||
duration: 10000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_3
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 4000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_4
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 1000
|
||||
duration: 5000
|
||||
count: 1
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_5
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 4000
|
||||
count: 1
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_6
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 800
|
||||
duration: 8000
|
||||
count: 1
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: false
|
||||
device_id: all
|
||||
|
||||
- name: action_7
|
||||
device: all
|
||||
module: pbqt
|
||||
peers: all
|
||||
count: 1
|
||||
test_bandwidth: false
|
||||
device_id: all
|
||||
|
||||
- name: action_8
|
||||
device: all
|
||||
module: pbqt
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel : true
|
||||
device_id: all
|
||||
|
||||
- name: action_9
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 500
|
||||
duration: 1000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: true
|
||||
device_id: all
|
||||
|
||||
- name: action_10
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 500
|
||||
duration: 1000
|
||||
peers: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: false
|
||||
parallel: true
|
||||
|
||||
- name: action_11
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 0
|
||||
duration: 10000
|
||||
peers: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: false
|
||||
device_id: all
|
||||
|
||||
- name: action_12
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 0
|
||||
duration: 1000
|
||||
count: 3
|
||||
wait: 1000
|
||||
peers: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
|
||||
- name: action_13
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 1000
|
||||
duration: 10000
|
||||
peers: all
|
||||
device_id: all
|
||||
peer_device_id: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
parallel: true
|
||||
|
||||
- name: action_14
|
||||
device: all
|
||||
module: pbqt
|
||||
log_interval: 500
|
||||
duration: 10000
|
||||
peers: all
|
||||
test_bandwidth: true
|
||||
bidirectional: true
|
||||
device_id: all
|
||||
@@ -0,0 +1,43 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# PEBB test #3
|
||||
#
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. bidirectional
|
||||
|
||||
actions:
|
||||
- name: h2d-d2h-sequential-51MB
|
||||
device: all
|
||||
module: pebb
|
||||
log_interval: 800
|
||||
duration: 5000
|
||||
device_to_host: true
|
||||
host_to_device: true
|
||||
parallel: true
|
||||
block_size: 51200000
|
||||
link_type: 2 # PCIe
|
||||
@@ -0,0 +1,593 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# Run test with testscript or binary:
|
||||
#
|
||||
# Using Testscript -
|
||||
# cd /opt/rocm/share/rocm-validation-suite/testscripts
|
||||
# sudo ./peqt.new.sh
|
||||
#
|
||||
# Using Binary -
|
||||
# cd /opt/rocm/share/rocm-validation-suite/conf
|
||||
# cd /opt/rocm/bin
|
||||
# ./rvs -c /opt/rocm/share/rocm-validation-suite/conf/peqt_single.conf
|
||||
#
|
||||
# Note: Paths may vary with the ROCm version or ROCm installation path.
|
||||
|
||||
# PEQT test #1
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. no regular expressions
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if at least one AMD compatible GPU is registered within the system, FALSE otherwise
|
||||
|
||||
|
||||
actions:
|
||||
- name: pcie_act_1
|
||||
device: all
|
||||
module: peqt
|
||||
capability:
|
||||
link_cap_max_speed:
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed:
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #2
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 2. all types of devices
|
||||
# 3. no regular expressions
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list, FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_2
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed:
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed:
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #3
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 3. no regular expressions
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if at least one AMD compatible GPU is registered within the system, FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_3
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed:
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed:
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #4
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 3. no regular expressions
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list
|
||||
# , FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_4
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed:
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed:
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #5
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 2. a given device type (deviceid filtering; replace 0 with the appropriate deviceid)
|
||||
# 3. no regular expressions
|
||||
# 4. only a subset of PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list
|
||||
# and also matches the <deviceid>, FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_5
|
||||
module: peqt
|
||||
device: all
|
||||
deviceid: 0
|
||||
capability:
|
||||
link_cap_max_speed:
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed:
|
||||
link_stat_neg_width:
|
||||
dev_serial_num:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #6
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. a simple regular expression for <link_cap_max_speed> capability
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if all <link_cap_max_speed> values match the given regular expression
|
||||
# and at least one AMD compatible GPU is registered within the system
|
||||
# FALSE otherwise
|
||||
|
||||
- name: pcie_act_6
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed:
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #7
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. 2 simple regular expressions, as follows: one for <link_cap_max_speed> capability and another one for the <link_stat_cur_speed>
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - at least one AMD compatible GPU is registered within the system and
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_7
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
# PEQT test #8
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and an erroneous one for <slot_pwr_limit_value>
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - at least one AMD compatible GPU is registered within the system and
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
# Notice: <slot_pwr_limit_value> regular expression is not valid and will be skipped
|
||||
# without affecting the PEQT module's check RESULT (however, an error will be logged)
|
||||
|
||||
|
||||
- name: pcie_act_8
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value: '[a-b][d-'
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #9
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 2. all types of devices
|
||||
# 3. 2 simple regular expressions, as follows: one for <link_cap_max_speed> capability and another one for the <link_stat_cur_speed>
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list and
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_9
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver:
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #10
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and one for <kernel_driver>
|
||||
# 3. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_10
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver: ^amdgpu$
|
||||
dev_serial_num:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #11
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 3. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and one for <kernel_driver>
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list and
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_11
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width:
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver: ^amdgpu$
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #12
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 3. 3 simple regular expressions, as follows: one for <link_cap_max_speed> capability, another one for the <link_stat_cur_speed> and one for <kernel_driver>
|
||||
# 4. only a subset of PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_12
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width:
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
vendor_id:
|
||||
kernel_driver: ^amdgpu$
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #13
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 3. 5 simple regular expressions, as follows:
|
||||
# - one for <link_cap_max_speed> PCIe capability
|
||||
# - one for the <link_stat_cur_speed> PCIe capability
|
||||
# - one for <kernel_driver>
|
||||
# - one for <link_cap_max_width> PCIe capability
|
||||
# - one for <link_stat_neg_width> PCIe capability
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - at least one of the AMD compatible GPUs (registered within the system) matches one of the GPU ID in the <device> list
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# - all <link_cap_max_width> values match the given regular expression
|
||||
# - all <link_stat_neg_width> values match the given regular expression
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_13
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width: ^(x8|x16)$
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width: ^(x8|x16)$
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver: ^amdgpu$
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing:
|
||||
atomic_op_32_completer:
|
||||
atomic_op_64_completer:
|
||||
atomic_op_128_CAS_completer:
|
||||
|
||||
# PEQT test #14
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 3. 6 simple regular expressions, as follows:
|
||||
# - one for <link_cap_max_speed> PCIe capability
|
||||
# - one for the <link_stat_cur_speed> PCIe capability
|
||||
# - one for <kernel_driver>
|
||||
# - one for <link_cap_max_width> PCIe capability
|
||||
# - one for <link_stat_neg_width> PCIe capability
|
||||
# - one for <atomic_op_completer> PCIe capability
|
||||
# 4. all PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# - all <link_cap_max_width> values match the given regular expression
|
||||
# - all <link_stat_neg_width> values match the given regular expression
|
||||
# - all <atomic_op_completer> values match the given regular expression (4 TRUE/FALSE values with whitespace between them)
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_14
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width: ^(x8|x16)$
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width: ^(x8|x16)$
|
||||
slot_pwr_limit_value:
|
||||
slot_physical_num:
|
||||
deviceid:
|
||||
vendor_id:
|
||||
kernel_driver: ^amdgpu$
|
||||
dev_serial_num:
|
||||
D0_Maximum_Power_12V:
|
||||
D0_Maximum_Power_3_3V:
|
||||
D0_Sustained_Power_12V:
|
||||
D0_Sustained_Power_3_3V:
|
||||
atomic_op_routing: ^((TRUE|FALSE){1})$
|
||||
atomic_op_32_completer: ^((TRUE|FALSE){1})$
|
||||
atomic_op_64_completer: ^((TRUE|FALSE){1})$
|
||||
atomic_op_128_CAS_completer: ^((TRUE|FALSE){1})$
|
||||
|
||||
# PEQT test #15
|
||||
# testing conditions:
|
||||
# 1. only a subset of AMD compatible GPUs (device filtering)
|
||||
# 3. 6 simple regular expressions, as follows:
|
||||
# - one for <link_cap_max_speed> PCIe capability
|
||||
# - one for the <link_stat_cur_speed> PCIe capability
|
||||
# - one for <kernel_driver>
|
||||
# - one for <link_cap_max_width> PCIe capability
|
||||
# - one for <link_stat_neg_width> PCIe capability
|
||||
# - one for <atomic_op_completer> PCIe capability
|
||||
# 4. only a subset of PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# - all <link_cap_max_width> values match the given regular expression
|
||||
# - all <link_stat_neg_width> values match the given regular expression
|
||||
# - all <atomic_op_completer> values match the given regular expression (4 TRUE/FALSE values with whitespace between them)
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_15
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width: ^(x8|x16)$
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width: ^(x8|x16)$
|
||||
kernel_driver: ^amdgpu$
|
||||
atomic_op_routing: ^((TRUE|FALSE){1})$
|
||||
atomic_op_32_completer: ^((TRUE|FALSE){1})$
|
||||
|
||||
# PEQT test #16
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 3. 6 simple regular expressions, as follows:
|
||||
# - one for <link_cap_max_speed> PCIe capability
|
||||
# - one for the <link_stat_cur_speed> PCIe capability
|
||||
# - one for <kernel_driver>
|
||||
# - one for <link_cap_max_width> PCIe capability
|
||||
# - one for <link_stat_neg_width> PCIe capability
|
||||
# - one for <atomic_op_completer> PCIe capability
|
||||
# 4. only a subset of PCIe capabilities
|
||||
# Expected PCIe check RESULT = TRUE if
|
||||
# - all <link_cap_max_speed> values match the given regular expression and
|
||||
# - all <link_stat_cur_speed> values match the given regular expression and
|
||||
# - all <kernel_driver> values match the given regular expression
|
||||
# - all <link_cap_max_width> values match the given regular expression
|
||||
# - all <link_stat_neg_width> values match the given regular expression
|
||||
# - all <atomic_op_completer> values match the given regular expression (4 TRUE/FALSE values with whitespace between them)
|
||||
# FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_16
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
link_cap_max_speed: '^(\d+ GT\/s)$'
|
||||
link_cap_max_width: ^(x8|x16)$
|
||||
link_stat_cur_speed: '^(\d+ GT\/s)$'
|
||||
link_stat_neg_width: ^(x8|x16)$
|
||||
kernel_driver: ^amdgpu$
|
||||
atomic_op_routing: ^((TRUE|FALSE){1})$
|
||||
atomic_op_32_completer: ^((TRUE|FALSE){1})$
|
||||
atomic_op_64_completer: ^((TRUE|FALSE){1})$
|
||||
atomic_op_128_CAS_completer: ^((TRUE|FALSE){1})$
|
||||
|
||||
# PEQT test #17
|
||||
# testing conditions:
|
||||
# 1. all AMD compatible GPUs
|
||||
# 2. all types of devices
|
||||
# 3. no regular expressions
|
||||
# 4. bus and slot number
|
||||
# Expected PCIe check RESULT = TRUE if at least one AMD compatible GPU is registered within the system, FALSE otherwise
|
||||
|
||||
|
||||
- name: pcie_act_17
|
||||
module: peqt
|
||||
device: all
|
||||
capability:
|
||||
bus_id:
|
||||
slot_physical_num:
|
||||
@@ -0,0 +1,47 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
# PESM test #1
|
||||
#
|
||||
# Preconditions:
|
||||
# Set deviceid to an existing AMD deviceid value
|
||||
#
|
||||
# Run test with:
|
||||
# cd bin
|
||||
# sudo ./rvs -c conf/pesm2.conf
|
||||
#
|
||||
# Expected result:
|
||||
# Test passes without displaying data for any GPUs
|
||||
actions:
|
||||
- name: act1
|
||||
device: all
|
||||
deviceid: 26720
|
||||
module: pesm
|
||||
monitor: true
|
||||
- name: act2
|
||||
device: all
|
||||
debugwait: 3000
|
||||
module: pesm
|
||||
monitor: false
|
||||
@@ -0,0 +1,36 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
|
||||
- name: action_1
|
||||
device: all
|
||||
module: rcqt
|
||||
package: rocm-hip-sdk
|
||||
|
||||
- name: action_2
|
||||
device: all
|
||||
module: rcqt
|
||||
packagelist: rocm-hip-libraries rocm-core rocm-dev rocm-hip-runtime-devel rocm-language-runtime rocm-hip-runtime rocm-hip-sdk rocm-utils rocm-smi-lib rocalution rocm-debug-agent rocm-clang-ocl rocm-device-libs hsa-rocr-devel hipcub-devel rocm-ocl-icd rocsolver rocsparse rocsolver-devel rocminfo hipfft-devel rocm-gdb rocm-dbgapi rocfft hipblas-devel rocthrust-devel openmp-extras comgr rccl rocblas hipblas roctracer-dev hip-doc amdgpu-install rocrand hsa-rocr hipfft hipsparse-devel rocsparse-devel rocrand-devel rocm-opencl hip-devel rocprim-devel hipsolver-devel rocfft-devel hsa-amd-aqlprofile hipify-clang miopen-hip-devel rocm-llvm hip-runtime-amd hip-samples rocalution-devel rccl-devel hipsolver rocprofiler-dev miopen-hip rocm-cmake hipsparse rocblas-devel rocm-opencl-devel
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/gpup_single.conf
|
||||
@@ -0,0 +1,41 @@
|
||||
# ################################################################################
|
||||
# #
|
||||
# # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# #
|
||||
# # MIT LICENSE:
|
||||
# # Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
# # this software and associated documentation files (the "Software"), to deal in
|
||||
# # the Software without restriction, including without limitation the rights to
|
||||
# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
# # of the Software, and to permit persons to whom the Software is furnished to do
|
||||
# # so, subject to the following conditions:
|
||||
# #
|
||||
# # The above copyright notice and this permission notice shall be included in all
|
||||
# # copies or substantial portions of the Software.
|
||||
# #
|
||||
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# # SOFTWARE.
|
||||
# #
|
||||
# ###############################################################################
|
||||
|
||||
actions:
|
||||
- name: gpustress-9000-sgemm-false
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
count: 1
|
||||
duration: 10000
|
||||
copy_matrix: false
|
||||
target_stress: 6000
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
ops_type: sgemm
|
||||
lda: 8640
|
||||
ldb: 8640
|
||||
ldc: 8640
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/gst_stress_3_hrs.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/iet_stress.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/mem.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/pbqt_single.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/pebb_single.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/peqt_single.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/pesm_1.conf
|
||||
Nasc siombalach
@@ -0,0 +1 @@
|
||||
../nv31/rcqt_single.conf
|
||||
Tagairt in Eagrán Nua
Cuir bac ar úsáideoir