714 líneas
20 KiB
C++
714 líneas
20 KiB
C++
// Copied from https://github.com/benrichard-amd/rocflop/tree/82f197e12314bab694fc70451a2b495b4f51bf90
|
|
|
|
#include <iostream>
|
|
#include <cstring>
|
|
#include <hip/hip_runtime.h>
|
|
#include <hip/hip_fp16.h>
|
|
#include <unistd.h>
|
|
#include <type_traits>
|
|
#include <vector>
|
|
#include <sys/wait.h>
|
|
#include <fcntl.h>
|
|
|
|
using float16 = _Float16;
|
|
|
|
// Vector types. Useful for packed math (where supported) and MFMA inputs.
|
|
template<typename T, uint32_t Rank>
|
|
using vecT = T __attribute__((ext_vector_type(Rank)));
|
|
|
|
template<typename T> using vec4 = vecT<T, 4>;
|
|
template<typename T> using vec8 = vecT<T, 8>;
|
|
|
|
|
|
// Kernels
|
|
|
|
|
|
template<typename T> __global__ void fma_throughput(vec4<T>* buffer, int count)
|
|
{
|
|
const T k = 1.0;
|
|
|
|
const int grid_size = gridDim.x * blockDim.x;
|
|
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
vec4<T>* ptr = buffer;
|
|
|
|
vec4<T> value0 = ptr[0 * grid_size + tid];
|
|
vec4<T> value1 = ptr[1 * grid_size + tid];
|
|
vec4<T> value2 = ptr[2 * grid_size + tid];
|
|
vec4<T> value3 = ptr[3 * grid_size + tid];
|
|
|
|
for(int j = 0; j < count; j++) {
|
|
for(int j = 0; j < 64; j++) {
|
|
|
|
// 16 FMA ops
|
|
value0 = value0 * value0 + k;
|
|
value1 = value1 * value1 + k;
|
|
value2 = value2 * value2 + k;
|
|
value3 = value3 * value3 + k;
|
|
}
|
|
}
|
|
|
|
ptr[tid] = value0 + value1 + value2 + value3;
|
|
}
|
|
|
|
// MFMA instructions are only available on gfx908 and later (not supported on gfx906)
|
|
#if !defined(__gfx906__)
|
|
__global__ void matmul_fp16_throughput(vec4<float16>* inputs, vec4<float>* outputs, int count)
|
|
{
|
|
int grid_size = gridDim.x * blockDim.x;
|
|
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
vec4<float16>* ptr = inputs;
|
|
|
|
vec4<float16> value0 = ptr[0 * grid_size + tid];
|
|
vec4<float16> value1 = ptr[1 * grid_size + tid];
|
|
vec4<float16> value2 = ptr[2 * grid_size + tid];
|
|
vec4<float16> value3 = ptr[3 * grid_size + tid];
|
|
|
|
vec4<float> accum0;
|
|
vec4<float> accum1;
|
|
vec4<float> accum2;
|
|
vec4<float> accum3;
|
|
for(int i = 0; i < count; i++) {
|
|
for(int j = 0; j < 64; j++) {
|
|
// 4 MFMA ops
|
|
accum0 = __builtin_amdgcn_mfma_f32_16x16x16f16(value0, value0, accum0, 0, 0, 0);
|
|
accum1 = __builtin_amdgcn_mfma_f32_16x16x16f16(value1, value1, accum1, 0, 0, 0);
|
|
accum2 = __builtin_amdgcn_mfma_f32_16x16x16f16(value2, value2, accum2, 0, 0, 0);
|
|
accum3 = __builtin_amdgcn_mfma_f32_16x16x16f16(value3, value3, accum3, 0, 0, 0);
|
|
}
|
|
}
|
|
|
|
outputs[tid] = accum0 + accum1 + accum2 + accum3;
|
|
}
|
|
|
|
__global__ void matmul_fp32_throughput(float* inputs, vec4<float>* outputs, int count)
|
|
{
|
|
int grid_size = gridDim.x * blockDim.x;
|
|
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
float* ptr = inputs;
|
|
|
|
float value0 = ptr[0 * grid_size + tid];
|
|
float value1 = ptr[1 * grid_size + tid];
|
|
float value2 = ptr[2 * grid_size + tid];
|
|
float value3 = ptr[2 * grid_size + tid];
|
|
|
|
vec4<float> accum0;
|
|
vec4<float> accum1;
|
|
vec4<float> accum2;
|
|
vec4<float> accum3;
|
|
for(int i = 0; i < count; i++) {
|
|
for(int j = 0; j < 64; j++) {
|
|
// 4 MFMA ops
|
|
accum0 = __builtin_amdgcn_mfma_f32_16x16x4f32(value0, value0, accum0, 0, 0, 0);
|
|
accum1 = __builtin_amdgcn_mfma_f32_16x16x4f32(value1, value1, accum1, 0, 0, 0);
|
|
accum2 = __builtin_amdgcn_mfma_f32_16x16x4f32(value2, value2, accum2, 0, 0, 0);
|
|
accum3 = __builtin_amdgcn_mfma_f32_16x16x4f32(value3, value3, accum3, 0, 0, 0);
|
|
}
|
|
}
|
|
|
|
outputs[tid] = accum0 + accum1 + accum2 + accum3;
|
|
}
|
|
#endif // !defined(__gfx906__)
|
|
|
|
// SMFMAC (Sparse MFMA) instructions are only available on gfx940 and later (not on gfx906, gfx908, or gfx90a)
|
|
#if !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__)
|
|
__global__ void sparse_matmul_fp16_throughput(vec4<float16>* input0, vec8<float16>* input1, vec4<float>* outputs, int count)
|
|
{
|
|
int grid_size = gridDim.x * blockDim.x;
|
|
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
vec4<float16>* x_ptr = input0;
|
|
vec8<float16>* y_ptr = input1;
|
|
|
|
vec4<float16> x0 = x_ptr[0 * grid_size + tid];
|
|
vec4<float16> x1 = x_ptr[1 * grid_size + tid];
|
|
vec4<float16> x2 = x_ptr[2 * grid_size + tid];
|
|
vec4<float16> x3 = x_ptr[3 * grid_size + tid];
|
|
|
|
vec8<float16> y0 = y_ptr[0 * grid_size + tid];
|
|
vec8<float16> y1 = y_ptr[1 * grid_size + tid];
|
|
vec8<float16> y2 = y_ptr[2 * grid_size + tid];
|
|
vec8<float16> y3 = y_ptr[3 * grid_size + tid];
|
|
|
|
vec4<float> accum0;
|
|
vec4<float> accum1;
|
|
vec4<float> accum2;
|
|
vec4<float> accum3;
|
|
|
|
for(int i = 0; i < count; i++) {
|
|
for(int j = 0; j < 64; j++) {
|
|
// 4 SMFMAC ops
|
|
accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0);
|
|
accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0);
|
|
accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0);
|
|
accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0);
|
|
}
|
|
}
|
|
|
|
outputs[tid] = accum0 + accum1 + accum2 + accum3;
|
|
}
|
|
#endif // !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__)
|
|
|
|
void HIP_CALL(hipError_t err)
|
|
{
|
|
if(err != hipSuccess) {
|
|
std::cout << "HIP Error: " << (int)err << " " << hipGetErrorString(err) << std::endl;
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
struct GCNArch {
|
|
int major;
|
|
int minor;
|
|
int rev;
|
|
};
|
|
|
|
GCNArch get_gcn_arch(int device)
|
|
{
|
|
hipDeviceProp_t props;
|
|
|
|
HIP_CALL(hipGetDeviceProperties(&props, device));
|
|
|
|
// Example: gfx908:sramecc+:xnack-
|
|
std::string arch_full(props.gcnArchName);
|
|
|
|
// Extract number e.g. "908"
|
|
std::string gfx_str = arch_full.substr(3, arch_full.find_first_of(':'));
|
|
|
|
int gfx_num = std::stoi(gfx_str, nullptr, 16);
|
|
|
|
GCNArch arch;
|
|
arch.major = (gfx_num & 0xff00) >> 8;
|
|
arch.minor = (gfx_num & 0x00f0) >> 4;
|
|
arch.rev = (gfx_num & 0x000f);
|
|
|
|
return arch;
|
|
}
|
|
|
|
enum : uint32_t {
|
|
VALU_FP32 = 1 << 0,
|
|
VALU_FP16 = 1 << 1,
|
|
VALU_FP64 = 1 << 2,
|
|
MATRIX_FP16 = 1 << 3,
|
|
MATRIX_FP32 = 1 << 4,
|
|
SMATRIX_FP16 = 1 << 5,
|
|
VALU_INT32 = 1 << 6,
|
|
|
|
ALL = (uint32_t)-1
|
|
};
|
|
|
|
// Timer for measuring kernel duration
|
|
class HIPTimer {
|
|
|
|
private:
|
|
hipEvent_t m_start;
|
|
hipEvent_t m_stop;
|
|
|
|
public:
|
|
HIPTimer()
|
|
{
|
|
HIP_CALL(hipEventCreate(&m_start));
|
|
HIP_CALL(hipEventCreate(&m_stop));
|
|
}
|
|
|
|
void start()
|
|
{
|
|
HIP_CALL(hipEventRecord(m_start));
|
|
}
|
|
|
|
void stop()
|
|
{
|
|
HIP_CALL(hipEventRecord(m_stop));
|
|
}
|
|
|
|
double elapsed()
|
|
{
|
|
float ms;
|
|
HIP_CALL(hipEventElapsedTime(&ms, m_start, m_stop));
|
|
|
|
return (double)ms / 1000.0;
|
|
}
|
|
};
|
|
|
|
// Host code
|
|
|
|
template<typename T> double fma_throughput_test(int device, int count, int runs = 1)
|
|
{
|
|
vec4<T>* buffer = nullptr;
|
|
|
|
hipDeviceProp_t props;
|
|
HIP_CALL(hipGetDeviceProperties(&props, device));
|
|
|
|
int blocks = props.multiProcessorCount * 512;
|
|
int threads_per_block = 64;
|
|
int total_threads = blocks * threads_per_block;
|
|
|
|
HIP_CALL(hipMalloc(&buffer, sizeof(vec4<T>) * total_threads * 4));
|
|
|
|
HIPTimer t;
|
|
t.start();
|
|
for(int i = 0; i < runs; i++) {
|
|
fma_throughput<T><<<blocks, threads_per_block>>>(buffer, count);
|
|
}
|
|
t.stop();
|
|
HIP_CALL(hipDeviceSynchronize());
|
|
|
|
double elapsed = t.elapsed();
|
|
double ops = (double)total_threads * count * 64 * 16 * runs;
|
|
double flops = (double)ops * 2.0 / elapsed;
|
|
|
|
HIP_CALL(hipFree(buffer));
|
|
|
|
return flops;
|
|
}
|
|
|
|
#if !defined(__gfx906__)
|
|
template<typename matT, typename accumT> double matmul_throughput_test(int device, int count, int runs = 1)
|
|
{
|
|
const int wave_size = 64;
|
|
int k;
|
|
int m;
|
|
int n;
|
|
|
|
if(std::is_same<matT, float16>::value) {
|
|
m = 16;
|
|
n = 16;
|
|
k = 16;
|
|
} else if(std::is_same<matT, float>::value) {
|
|
m = 16;
|
|
n = 16;
|
|
k = 4;
|
|
} else {
|
|
assert(false);
|
|
}
|
|
|
|
int ops_per_matmul = k * m * n * 2;
|
|
|
|
void* buffer = nullptr;
|
|
void* accum = nullptr;
|
|
|
|
hipDeviceProp_t props;
|
|
HIP_CALL(hipGetDeviceProperties(&props, device));
|
|
|
|
int blocks = props.multiProcessorCount * 512;
|
|
int threads_per_block = wave_size;
|
|
int total_threads = blocks * threads_per_block;
|
|
|
|
HIP_CALL(hipMalloc(&buffer, 4 * sizeof(matT) * m * k * total_threads));
|
|
HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads));
|
|
|
|
HIPTimer t;
|
|
t.start();
|
|
for(int i = 0; i < runs; i++) {
|
|
if(std::is_same<matT, float16>::value && std::is_same<accumT, float>::value) {
|
|
matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer, (vec4<float>*)accum, count);
|
|
} else if(std::is_same<matT,float>::value && std::is_same<accumT, float>::value) {
|
|
matmul_fp32_throughput<<<blocks, threads_per_block>>>((float*)buffer, (vec4<float>*)accum, count);
|
|
}
|
|
}
|
|
t.stop();
|
|
HIP_CALL(hipDeviceSynchronize());
|
|
|
|
double elapsed = t.elapsed();
|
|
double ops = (double)blocks * count * 64 * 4 * runs;
|
|
double flops = (double)ops * ops_per_matmul / elapsed;
|
|
|
|
HIP_CALL(hipFree(buffer));
|
|
HIP_CALL(hipFree(accum));
|
|
|
|
return flops;
|
|
}
|
|
#endif // !defined(__gfx906__)
|
|
|
|
#if !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__)
|
|
template<typename matT, typename accumT> double sparse_matmul_throughput_test(int device, int count, int runs = 1)
|
|
{
|
|
const int wave_size = 64;
|
|
int k;
|
|
int m;
|
|
int n;
|
|
|
|
if(std::is_same<matT, float16>::value) {
|
|
m = 16;
|
|
n = 16;
|
|
k = 32;
|
|
} else {
|
|
assert(false);
|
|
}
|
|
|
|
int ops_per_matmul = k * m * n * 2;
|
|
|
|
void* buffer1 = nullptr;
|
|
void* buffer2 = nullptr;
|
|
void* accum = nullptr;
|
|
|
|
hipDeviceProp_t props;
|
|
HIP_CALL(hipGetDeviceProperties(&props, device));
|
|
|
|
int blocks = props.multiProcessorCount * 512;
|
|
int threads_per_block = wave_size;
|
|
int total_threads = blocks * threads_per_block;
|
|
|
|
HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(matT) * m * k * total_threads));
|
|
HIP_CALL(hipMalloc(&buffer2, 8 * sizeof(matT) * n * k * total_threads));
|
|
HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads));
|
|
|
|
HIPTimer t;
|
|
t.start();
|
|
for(int i = 0; i < runs; i++) {
|
|
if(std::is_same<matT, float16>::value && std::is_same<accumT, float>::value) {
|
|
sparse_matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer1,
|
|
(vec8<float16>*)buffer2, (vec4<float>*)accum, count);
|
|
}
|
|
}
|
|
t.stop();
|
|
HIP_CALL(hipDeviceSynchronize());
|
|
|
|
double elapsed = t.elapsed();
|
|
double ops = (double)blocks * count * 64 * 4 * runs;
|
|
double flops = (double)ops * ops_per_matmul / elapsed;
|
|
|
|
HIP_CALL(hipFree(buffer1));
|
|
HIP_CALL(hipFree(buffer2));
|
|
HIP_CALL(hipFree(accum));
|
|
|
|
return flops;
|
|
}
|
|
#endif // !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__)
|
|
|
|
struct Result {
|
|
int device = -1;
|
|
double valu_fp16 = 0;
|
|
double valu_fp32 = 0;
|
|
double valu_fp64 = 0;
|
|
double valu_int32 = 0;
|
|
double mfma_fp16 = 0;
|
|
double mfma_fp32 = 0;
|
|
double smfmac_fp16 = 0;
|
|
|
|
// Used for sorting
|
|
bool operator<(const Result& other) {
|
|
return device < other.device;
|
|
}
|
|
};
|
|
|
|
void print_result(const Result& res, uint32_t mask)
|
|
{
|
|
if(mask & VALU_FP16) {
|
|
printf("VALU FP16: %8.2f TFLOPS\n", res.valu_fp16 / 1e12);
|
|
}
|
|
if(mask & VALU_FP32) {
|
|
printf("VALU FP32: %8.2f TFLOPS\n", res.valu_fp32 / 1e12);
|
|
}
|
|
if(mask & VALU_FP64) {
|
|
printf("VALU FP64: %8.2f TFLOPS\n", res.valu_fp64 / 1e12);
|
|
}
|
|
if(mask & VALU_INT32) {
|
|
printf("VALU INT32: %8.2f TIOPS\n", res.valu_int32 / 1e12);
|
|
}
|
|
if(mask & MATRIX_FP16) {
|
|
printf("MFMA FP16: %8.2f TFLOPS\n", res.mfma_fp16 / 1e12);
|
|
}
|
|
if(mask & MATRIX_FP32) {
|
|
printf("MFMA FP32: %8.2f TFLOPS\n", res.mfma_fp32 / 1e12);
|
|
}
|
|
if(mask & SMATRIX_FP16) {
|
|
printf("SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16 / 1e12);
|
|
}
|
|
}
|
|
|
|
Result run_tests(int device, int runs, uint32_t mask)
|
|
{
|
|
int device_count;
|
|
|
|
HIP_CALL(hipGetDeviceCount(&device_count));
|
|
|
|
if(device >= device_count) {
|
|
std::cout << "Device " << device << " does not exist. Skipping..." << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
HIP_CALL(hipSetDevice(device));
|
|
GCNArch arch = get_gcn_arch(device);
|
|
|
|
Result res = {.device = device};
|
|
|
|
if(mask & VALU_FP16) {
|
|
res.valu_fp16 = fma_throughput_test<float16>(device, 4096, runs);
|
|
}
|
|
|
|
if(mask & VALU_FP32) {
|
|
res.valu_fp32 = fma_throughput_test<float>(device, 4096, runs);
|
|
}
|
|
|
|
if(mask & VALU_FP64) {
|
|
res.valu_fp64 = fma_throughput_test<double>(device, 4096, runs);
|
|
}
|
|
|
|
if(mask & VALU_INT32) {
|
|
res.valu_int32 = fma_throughput_test<int>(device, 4096, runs);
|
|
}
|
|
|
|
#if !defined(__gfx906__)
|
|
// MFMA available on gfx908+ (excludes gfx906 with rev=6)
|
|
bool has_mfma = arch.major == 0x9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8));
|
|
|
|
if(mask & MATRIX_FP16) {
|
|
if(has_mfma) {
|
|
res.mfma_fp16 = matmul_throughput_test<float16, float>(device, 4096, runs);
|
|
} else {
|
|
res.mfma_fp16 = 0;
|
|
}
|
|
}
|
|
|
|
if(mask & MATRIX_FP32) {
|
|
if(has_mfma) {
|
|
res.mfma_fp32 = matmul_throughput_test<float, float>(device, 4096, runs);
|
|
} else {
|
|
res.mfma_fp32 = 0;
|
|
}
|
|
}
|
|
#else
|
|
// MFMA not available when compiling for gfx906
|
|
if(mask & MATRIX_FP16) {
|
|
res.mfma_fp16 = 0;
|
|
}
|
|
if(mask & MATRIX_FP32) {
|
|
res.mfma_fp32 = 0;
|
|
}
|
|
#endif
|
|
|
|
#if !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__)
|
|
if(mask & SMATRIX_FP16) {
|
|
// SMFMAC only available on gfx940 (MI300) and later, not on gfx906, gfx908, or gfx90a
|
|
if(arch.major == 0x9 && arch.minor >= 0x4) {
|
|
res.smfmac_fp16 = sparse_matmul_throughput_test<float16, float>(device, 4096, runs);
|
|
} else {
|
|
res.smfmac_fp16 = 0;
|
|
}
|
|
}
|
|
#else
|
|
// SMFMAC not available when compiling for gfx906, gfx908, or gfx90a
|
|
if(mask & SMATRIX_FP16) {
|
|
res.smfmac_fp16 = 0;
|
|
}
|
|
#endif
|
|
|
|
return res;
|
|
}
|
|
|
|
// Use fork() followed by exec() to run child process. For some reason
|
|
// rocprof does not pick up the child processes when only fork() is
|
|
// used.
|
|
pid_t fork_process(int device, int runs, uint32_t mask, int fd)
|
|
{
|
|
pid_t pid = fork();
|
|
|
|
if(pid != 0) {
|
|
return pid;
|
|
}
|
|
|
|
std::string str_device = std::to_string(device);
|
|
std::string str_runs = std::to_string(runs);
|
|
std::string str_mask = std::to_string(mask);
|
|
std::string str_fd = std::to_string(fd);
|
|
|
|
char* const args[] = {
|
|
(char*)"CHILD",
|
|
(char*)str_device.c_str(),
|
|
(char*)str_runs.c_str(),
|
|
(char*)str_mask.c_str(),
|
|
(char*)str_fd.c_str(),
|
|
NULL
|
|
};
|
|
|
|
execv("/proc/self/exe", args);
|
|
std::cout << "execv() failed: " << std::strerror(errno) << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
void run(std::vector<int>& devices, int runs, uint32_t mask)
|
|
{
|
|
std::vector<pid_t> pids;
|
|
|
|
// We will receive results from the child processes using a pipe
|
|
int fd[2];
|
|
|
|
if(pipe(fd)) {
|
|
std::cout << std::strerror(errno) << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
// Start a new process for each GPU
|
|
for(auto d : devices) {
|
|
pid_t pid = fork_process(d, runs, mask, fd[1]);
|
|
|
|
pids.push_back(pid);
|
|
}
|
|
|
|
// Wait for all processes to finish
|
|
for(auto pid : pids) {
|
|
int status;
|
|
waitpid(pid, &status, 0);
|
|
}
|
|
|
|
// Set the read to non-blocking
|
|
int flags = fcntl(fd[0], F_GETFL, 0);
|
|
fcntl(fd[0], F_SETFL, flags | O_NONBLOCK);
|
|
|
|
// Read records from pipe
|
|
std::vector<Result> results(pids.size());
|
|
int count = read(fd[0], results.data(), results.size() * sizeof(Result)) / sizeof(Result);
|
|
|
|
results.resize(count);
|
|
|
|
// Sort results by GPU id
|
|
std::sort(results.begin(), results.end());
|
|
|
|
// Print results
|
|
for(auto r : results) {
|
|
std::cout << std::endl << "GPU " << r.device << std::endl;
|
|
print_result(r, mask);
|
|
}
|
|
|
|
Result total;
|
|
for(auto r : results) {
|
|
total.valu_fp16 += r.valu_fp16;
|
|
total.valu_fp32 += r.valu_fp32;
|
|
total.valu_fp64 += r.valu_fp64;
|
|
total.valu_int32 += r.valu_int32;
|
|
total.mfma_fp16 += r.mfma_fp16;
|
|
total.mfma_fp32 += r.mfma_fp32;
|
|
total.smfmac_fp16 += r.smfmac_fp16;
|
|
}
|
|
std::cout << std::endl << "System total" << std::endl;
|
|
print_result(total, mask);
|
|
}
|
|
|
|
|
|
void usage()
|
|
{
|
|
std::cout << "--device ID Use device with the given numerical ID" << std::endl;
|
|
std::cout << "--devices IDS | ALL Comma-separated list of device Ids (e.g., 1,2,3)" << std::endl;
|
|
std::cout << " ALL for all devices" << std::endl;
|
|
std::cout << "--runs RUNS Number of times each kernel is dispatched" << std::endl;
|
|
|
|
std::cout << "--fp16 Run FP16 (VALU) test" << std::endl;
|
|
std::cout << "--fp32 Run FP32 (VALU) test" << std::endl;
|
|
std::cout << "--fp64 Run FP64 (VALU) test" << std::endl;
|
|
std::cout << "--int32 Run INT32 (VALU) test" << std::endl;
|
|
std::cout << "--matfp16 Run FP16 (MFMA) test" << std::endl;
|
|
std::cout << "--matfp32 Run FP32 (MFMA) test" << std::endl;
|
|
std::cout << "--smatfp16 Run FP16 (SMFMAC) test" << std::endl;
|
|
}
|
|
|
|
int main(int argc, char** argv)
|
|
{
|
|
if(std::string(argv[0]) == "CHILD") {
|
|
int device = atoi(argv[1]);
|
|
int runs = atoi(argv[2]);
|
|
uint32_t mask = atoi(argv[3]);
|
|
int fd = atoi(argv[4]);
|
|
|
|
Result res = run_tests(device, runs, mask);
|
|
|
|
write(fd, &res, sizeof(res));
|
|
return 0;
|
|
}
|
|
|
|
int runs = 1;
|
|
|
|
uint32_t mask = 0;
|
|
bool all_devices = false;
|
|
std::vector<int> devices;
|
|
int device_count;
|
|
int device = 0;
|
|
|
|
HIP_CALL(hipGetDeviceCount(&device_count));
|
|
|
|
int i = 1;
|
|
while(i < argc) {
|
|
std::string arg = std::string(argv[i]);
|
|
|
|
if(arg == "--help") {
|
|
usage();
|
|
return 0;
|
|
} else if(arg == "--device") {
|
|
devices.push_back(atoi(argv[i + 1]));
|
|
// Skip next
|
|
i++;
|
|
} else if(arg == "--devices") {
|
|
// Parse comma-separated string of numbers
|
|
std::string s(argv[i + 1]);
|
|
|
|
if(s == "all" || s == "ALL") {
|
|
all_devices = true;
|
|
} else {
|
|
std::stringstream ss(s);
|
|
std::string r;
|
|
while(getline(ss, r, ',')) {
|
|
devices.push_back(std::stoi(r));
|
|
}
|
|
}
|
|
// Skip next
|
|
i++;
|
|
} else if(arg == "--runs") {
|
|
runs = atoi(argv[i + 1]);
|
|
|
|
// Skip next
|
|
i++;
|
|
} else if(arg == "--fp32") {
|
|
mask |= VALU_FP32;
|
|
} else if(arg == "--fp64") {
|
|
mask |= VALU_FP64;
|
|
} else if(arg == "--fp16") {
|
|
mask |= VALU_FP16;
|
|
} else if(arg == "--int32") {
|
|
mask |= VALU_INT32;
|
|
} else if(arg == "--matfp16") {
|
|
mask |= MATRIX_FP16;
|
|
} else if(arg == "--matfp32") {
|
|
mask |= MATRIX_FP32;
|
|
} else if(arg == "--smatfp16") {
|
|
mask |= SMATRIX_FP16;
|
|
} else {
|
|
std::cout << "Invalid argument '" << arg << "'" << std::endl;
|
|
std::cout << std::endl;
|
|
usage();
|
|
return 1;
|
|
}
|
|
|
|
i++;
|
|
}
|
|
|
|
if(all_devices) {
|
|
for(int i = 0; i < device_count; i++ ){
|
|
devices.push_back(i);
|
|
}
|
|
}
|
|
|
|
// Verify device ID's
|
|
for(auto d : devices) {
|
|
if(d >= device_count) {
|
|
std::cout << "Invalid device ordinal: " << d << std::endl;
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
if(devices.size() == 0) {
|
|
devices.push_back(0);
|
|
}
|
|
|
|
if(mask == 0) {
|
|
mask = ALL;
|
|
}
|
|
|
|
run(devices, runs, mask);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|