// Copied from https://github.com/benrichard-amd/rocflop/tree/82f197e12314bab694fc70451a2b495b4f51bf90 #include #include #include #include #include #include #include #include #include using float16 = _Float16; // Vector types. Useful for packed math (where supported) and MFMA inputs. template using vecT = T __attribute__((ext_vector_type(Rank))); template using vec4 = vecT; template using vec8 = vecT; // Kernels template __global__ void fma_throughput(vec4* buffer, int count) { const T k = 1.0; const int grid_size = gridDim.x * blockDim.x; const int tid = blockDim.x * blockIdx.x + threadIdx.x; vec4* ptr = buffer; vec4 value0 = ptr[0 * grid_size + tid]; vec4 value1 = ptr[1 * grid_size + tid]; vec4 value2 = ptr[2 * grid_size + tid]; vec4 value3 = ptr[3 * grid_size + tid]; for(int j = 0; j < count; j++) { for(int j = 0; j < 64; j++) { // 16 FMA ops value0 = value0 * value0 + k; value1 = value1 * value1 + k; value2 = value2 * value2 + k; value3 = value3 * value3 + k; } } ptr[tid] = value0 + value1 + value2 + value3; } // MFMA instructions are only available on gfx908 and later (not supported on gfx906) #if !defined(__gfx906__) __global__ void matmul_fp16_throughput(vec4* inputs, vec4* outputs, int count) { int grid_size = gridDim.x * blockDim.x; int tid = blockDim.x * blockIdx.x + threadIdx.x; vec4* ptr = inputs; vec4 value0 = ptr[0 * grid_size + tid]; vec4 value1 = ptr[1 * grid_size + tid]; vec4 value2 = ptr[2 * grid_size + tid]; vec4 value3 = ptr[3 * grid_size + tid]; vec4 accum0; vec4 accum1; vec4 accum2; vec4 accum3; for(int i = 0; i < count; i++) { for(int j = 0; j < 64; j++) { // 4 MFMA ops accum0 = __builtin_amdgcn_mfma_f32_16x16x16f16(value0, value0, accum0, 0, 0, 0); accum1 = __builtin_amdgcn_mfma_f32_16x16x16f16(value1, value1, accum1, 0, 0, 0); accum2 = __builtin_amdgcn_mfma_f32_16x16x16f16(value2, value2, accum2, 0, 0, 0); accum3 = __builtin_amdgcn_mfma_f32_16x16x16f16(value3, value3, accum3, 0, 0, 0); } } outputs[tid] = accum0 + accum1 + accum2 + accum3; } __global__ void matmul_fp32_throughput(float* inputs, vec4* outputs, int count) { int grid_size = gridDim.x * blockDim.x; int tid = blockDim.x * blockIdx.x + threadIdx.x; float* ptr = inputs; float value0 = ptr[0 * grid_size + tid]; float value1 = ptr[1 * grid_size + tid]; float value2 = ptr[2 * grid_size + tid]; float value3 = ptr[2 * grid_size + tid]; vec4 accum0; vec4 accum1; vec4 accum2; vec4 accum3; for(int i = 0; i < count; i++) { for(int j = 0; j < 64; j++) { // 4 MFMA ops accum0 = __builtin_amdgcn_mfma_f32_16x16x4f32(value0, value0, accum0, 0, 0, 0); accum1 = __builtin_amdgcn_mfma_f32_16x16x4f32(value1, value1, accum1, 0, 0, 0); accum2 = __builtin_amdgcn_mfma_f32_16x16x4f32(value2, value2, accum2, 0, 0, 0); accum3 = __builtin_amdgcn_mfma_f32_16x16x4f32(value3, value3, accum3, 0, 0, 0); } } outputs[tid] = accum0 + accum1 + accum2 + accum3; } #endif // !defined(__gfx906__) // SMFMAC (Sparse MFMA) instructions are only available on gfx940 and later (not on gfx906, gfx908, or gfx90a) #if !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__) __global__ void sparse_matmul_fp16_throughput(vec4* input0, vec8* input1, vec4* outputs, int count) { int grid_size = gridDim.x * blockDim.x; int tid = blockDim.x * blockIdx.x + threadIdx.x; vec4* x_ptr = input0; vec8* y_ptr = input1; vec4 x0 = x_ptr[0 * grid_size + tid]; vec4 x1 = x_ptr[1 * grid_size + tid]; vec4 x2 = x_ptr[2 * grid_size + tid]; vec4 x3 = x_ptr[3 * grid_size + tid]; vec8 y0 = y_ptr[0 * grid_size + tid]; vec8 y1 = y_ptr[1 * grid_size + tid]; vec8 y2 = y_ptr[2 * grid_size + tid]; vec8 y3 = y_ptr[3 * grid_size + tid]; vec4 accum0; vec4 accum1; vec4 accum2; vec4 accum3; for(int i = 0; i < count; i++) { for(int j = 0; j < 64; j++) { // 4 SMFMAC ops accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0); accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0); accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0); accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0); } } outputs[tid] = accum0 + accum1 + accum2 + accum3; } #endif // !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__) void HIP_CALL(hipError_t err) { if(err != hipSuccess) { std::cout << "HIP Error: " << (int)err << " " << hipGetErrorString(err) << std::endl; exit(1); } } struct GCNArch { int major; int minor; int rev; }; GCNArch get_gcn_arch(int device) { hipDeviceProp_t props; HIP_CALL(hipGetDeviceProperties(&props, device)); // Example: gfx908:sramecc+:xnack- std::string arch_full(props.gcnArchName); // Extract number e.g. "908" std::string gfx_str = arch_full.substr(3, arch_full.find_first_of(':')); int gfx_num = std::stoi(gfx_str, nullptr, 16); GCNArch arch; arch.major = (gfx_num & 0xff00) >> 8; arch.minor = (gfx_num & 0x00f0) >> 4; arch.rev = (gfx_num & 0x000f); return arch; } enum : uint32_t { VALU_FP32 = 1 << 0, VALU_FP16 = 1 << 1, VALU_FP64 = 1 << 2, MATRIX_FP16 = 1 << 3, MATRIX_FP32 = 1 << 4, SMATRIX_FP16 = 1 << 5, VALU_INT32 = 1 << 6, ALL = (uint32_t)-1 }; // Timer for measuring kernel duration class HIPTimer { private: hipEvent_t m_start; hipEvent_t m_stop; public: HIPTimer() { HIP_CALL(hipEventCreate(&m_start)); HIP_CALL(hipEventCreate(&m_stop)); } void start() { HIP_CALL(hipEventRecord(m_start)); } void stop() { HIP_CALL(hipEventRecord(m_stop)); } double elapsed() { float ms; HIP_CALL(hipEventElapsedTime(&ms, m_start, m_stop)); return (double)ms / 1000.0; } }; // Host code template double fma_throughput_test(int device, int count, int runs = 1) { vec4* buffer = nullptr; hipDeviceProp_t props; HIP_CALL(hipGetDeviceProperties(&props, device)); int blocks = props.multiProcessorCount * 512; int threads_per_block = 64; int total_threads = blocks * threads_per_block; HIP_CALL(hipMalloc(&buffer, sizeof(vec4) * total_threads * 4)); HIPTimer t; t.start(); for(int i = 0; i < runs; i++) { fma_throughput<<>>(buffer, count); } t.stop(); HIP_CALL(hipDeviceSynchronize()); double elapsed = t.elapsed(); double ops = (double)total_threads * count * 64 * 16 * runs; double flops = (double)ops * 2.0 / elapsed; HIP_CALL(hipFree(buffer)); return flops; } #if !defined(__gfx906__) template double matmul_throughput_test(int device, int count, int runs = 1) { const int wave_size = 64; int k; int m; int n; if(std::is_same::value) { m = 16; n = 16; k = 16; } else if(std::is_same::value) { m = 16; n = 16; k = 4; } else { assert(false); } int ops_per_matmul = k * m * n * 2; void* buffer = nullptr; void* accum = nullptr; hipDeviceProp_t props; HIP_CALL(hipGetDeviceProperties(&props, device)); int blocks = props.multiProcessorCount * 512; int threads_per_block = wave_size; int total_threads = blocks * threads_per_block; HIP_CALL(hipMalloc(&buffer, 4 * sizeof(matT) * m * k * total_threads)); HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads)); HIPTimer t; t.start(); for(int i = 0; i < runs; i++) { if(std::is_same::value && std::is_same::value) { matmul_fp16_throughput<<>>((vec4*)buffer, (vec4*)accum, count); } else if(std::is_same::value && std::is_same::value) { matmul_fp32_throughput<<>>((float*)buffer, (vec4*)accum, count); } } t.stop(); HIP_CALL(hipDeviceSynchronize()); double elapsed = t.elapsed(); double ops = (double)blocks * count * 64 * 4 * runs; double flops = (double)ops * ops_per_matmul / elapsed; HIP_CALL(hipFree(buffer)); HIP_CALL(hipFree(accum)); return flops; } #endif // !defined(__gfx906__) #if !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__) template double sparse_matmul_throughput_test(int device, int count, int runs = 1) { const int wave_size = 64; int k; int m; int n; if(std::is_same::value) { m = 16; n = 16; k = 32; } else { assert(false); } int ops_per_matmul = k * m * n * 2; void* buffer1 = nullptr; void* buffer2 = nullptr; void* accum = nullptr; hipDeviceProp_t props; HIP_CALL(hipGetDeviceProperties(&props, device)); int blocks = props.multiProcessorCount * 512; int threads_per_block = wave_size; int total_threads = blocks * threads_per_block; HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(matT) * m * k * total_threads)); HIP_CALL(hipMalloc(&buffer2, 8 * sizeof(matT) * n * k * total_threads)); HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads)); HIPTimer t; t.start(); for(int i = 0; i < runs; i++) { if(std::is_same::value && std::is_same::value) { sparse_matmul_fp16_throughput<<>>((vec4*)buffer1, (vec8*)buffer2, (vec4*)accum, count); } } t.stop(); HIP_CALL(hipDeviceSynchronize()); double elapsed = t.elapsed(); double ops = (double)blocks * count * 64 * 4 * runs; double flops = (double)ops * ops_per_matmul / elapsed; HIP_CALL(hipFree(buffer1)); HIP_CALL(hipFree(buffer2)); HIP_CALL(hipFree(accum)); return flops; } #endif // !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__) struct Result { int device = -1; double valu_fp16 = 0; double valu_fp32 = 0; double valu_fp64 = 0; double valu_int32 = 0; double mfma_fp16 = 0; double mfma_fp32 = 0; double smfmac_fp16 = 0; // Used for sorting bool operator<(const Result& other) { return device < other.device; } }; void print_result(const Result& res, uint32_t mask) { if(mask & VALU_FP16) { printf("VALU FP16: %8.2f TFLOPS\n", res.valu_fp16 / 1e12); } if(mask & VALU_FP32) { printf("VALU FP32: %8.2f TFLOPS\n", res.valu_fp32 / 1e12); } if(mask & VALU_FP64) { printf("VALU FP64: %8.2f TFLOPS\n", res.valu_fp64 / 1e12); } if(mask & VALU_INT32) { printf("VALU INT32: %8.2f TIOPS\n", res.valu_int32 / 1e12); } if(mask & MATRIX_FP16) { printf("MFMA FP16: %8.2f TFLOPS\n", res.mfma_fp16 / 1e12); } if(mask & MATRIX_FP32) { printf("MFMA FP32: %8.2f TFLOPS\n", res.mfma_fp32 / 1e12); } if(mask & SMATRIX_FP16) { printf("SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16 / 1e12); } } Result run_tests(int device, int runs, uint32_t mask) { int device_count; HIP_CALL(hipGetDeviceCount(&device_count)); if(device >= device_count) { std::cout << "Device " << device << " does not exist. Skipping..." << std::endl; exit(1); } HIP_CALL(hipSetDevice(device)); GCNArch arch = get_gcn_arch(device); Result res = {.device = device}; if(mask & VALU_FP16) { res.valu_fp16 = fma_throughput_test(device, 4096, runs); } if(mask & VALU_FP32) { res.valu_fp32 = fma_throughput_test(device, 4096, runs); } if(mask & VALU_FP64) { res.valu_fp64 = fma_throughput_test(device, 4096, runs); } if(mask & VALU_INT32) { res.valu_int32 = fma_throughput_test(device, 4096, runs); } #if !defined(__gfx906__) // MFMA available on gfx908+ (excludes gfx906 with rev=6) bool has_mfma = arch.major == 0x9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8)); if(mask & MATRIX_FP16) { if(has_mfma) { res.mfma_fp16 = matmul_throughput_test(device, 4096, runs); } else { res.mfma_fp16 = 0; } } if(mask & MATRIX_FP32) { if(has_mfma) { res.mfma_fp32 = matmul_throughput_test(device, 4096, runs); } else { res.mfma_fp32 = 0; } } #else // MFMA not available when compiling for gfx906 if(mask & MATRIX_FP16) { res.mfma_fp16 = 0; } if(mask & MATRIX_FP32) { res.mfma_fp32 = 0; } #endif #if !defined(__gfx906__) && !defined(__gfx908__) && !defined(__gfx90a__) if(mask & SMATRIX_FP16) { // SMFMAC only available on gfx940 (MI300) and later, not on gfx906, gfx908, or gfx90a if(arch.major == 0x9 && arch.minor >= 0x4) { res.smfmac_fp16 = sparse_matmul_throughput_test(device, 4096, runs); } else { res.smfmac_fp16 = 0; } } #else // SMFMAC not available when compiling for gfx906, gfx908, or gfx90a if(mask & SMATRIX_FP16) { res.smfmac_fp16 = 0; } #endif return res; } // Use fork() followed by exec() to run child process. For some reason // rocprof does not pick up the child processes when only fork() is // used. pid_t fork_process(int device, int runs, uint32_t mask, int fd) { pid_t pid = fork(); if(pid != 0) { return pid; } std::string str_device = std::to_string(device); std::string str_runs = std::to_string(runs); std::string str_mask = std::to_string(mask); std::string str_fd = std::to_string(fd); char* const args[] = { (char*)"CHILD", (char*)str_device.c_str(), (char*)str_runs.c_str(), (char*)str_mask.c_str(), (char*)str_fd.c_str(), NULL }; execv("/proc/self/exe", args); std::cout << "execv() failed: " << std::strerror(errno) << std::endl; exit(1); } void run(std::vector& devices, int runs, uint32_t mask) { std::vector pids; // We will receive results from the child processes using a pipe int fd[2]; if(pipe(fd)) { std::cout << std::strerror(errno) << std::endl; exit(1); } // Start a new process for each GPU for(auto d : devices) { pid_t pid = fork_process(d, runs, mask, fd[1]); pids.push_back(pid); } // Wait for all processes to finish for(auto pid : pids) { int status; waitpid(pid, &status, 0); } // Set the read to non-blocking int flags = fcntl(fd[0], F_GETFL, 0); fcntl(fd[0], F_SETFL, flags | O_NONBLOCK); // Read records from pipe std::vector results(pids.size()); int count = read(fd[0], results.data(), results.size() * sizeof(Result)) / sizeof(Result); results.resize(count); // Sort results by GPU id std::sort(results.begin(), results.end()); // Print results for(auto r : results) { std::cout << std::endl << "GPU " << r.device << std::endl; print_result(r, mask); } Result total; for(auto r : results) { total.valu_fp16 += r.valu_fp16; total.valu_fp32 += r.valu_fp32; total.valu_fp64 += r.valu_fp64; total.valu_int32 += r.valu_int32; total.mfma_fp16 += r.mfma_fp16; total.mfma_fp32 += r.mfma_fp32; total.smfmac_fp16 += r.smfmac_fp16; } std::cout << std::endl << "System total" << std::endl; print_result(total, mask); } void usage() { std::cout << "--device ID Use device with the given numerical ID" << std::endl; std::cout << "--devices IDS | ALL Comma-separated list of device Ids (e.g., 1,2,3)" << std::endl; std::cout << " ALL for all devices" << std::endl; std::cout << "--runs RUNS Number of times each kernel is dispatched" << std::endl; std::cout << "--fp16 Run FP16 (VALU) test" << std::endl; std::cout << "--fp32 Run FP32 (VALU) test" << std::endl; std::cout << "--fp64 Run FP64 (VALU) test" << std::endl; std::cout << "--int32 Run INT32 (VALU) test" << std::endl; std::cout << "--matfp16 Run FP16 (MFMA) test" << std::endl; std::cout << "--matfp32 Run FP32 (MFMA) test" << std::endl; std::cout << "--smatfp16 Run FP16 (SMFMAC) test" << std::endl; } int main(int argc, char** argv) { if(std::string(argv[0]) == "CHILD") { int device = atoi(argv[1]); int runs = atoi(argv[2]); uint32_t mask = atoi(argv[3]); int fd = atoi(argv[4]); Result res = run_tests(device, runs, mask); write(fd, &res, sizeof(res)); return 0; } int runs = 1; uint32_t mask = 0; bool all_devices = false; std::vector devices; int device_count; int device = 0; HIP_CALL(hipGetDeviceCount(&device_count)); int i = 1; while(i < argc) { std::string arg = std::string(argv[i]); if(arg == "--help") { usage(); return 0; } else if(arg == "--device") { devices.push_back(atoi(argv[i + 1])); // Skip next i++; } else if(arg == "--devices") { // Parse comma-separated string of numbers std::string s(argv[i + 1]); if(s == "all" || s == "ALL") { all_devices = true; } else { std::stringstream ss(s); std::string r; while(getline(ss, r, ',')) { devices.push_back(std::stoi(r)); } } // Skip next i++; } else if(arg == "--runs") { runs = atoi(argv[i + 1]); // Skip next i++; } else if(arg == "--fp32") { mask |= VALU_FP32; } else if(arg == "--fp64") { mask |= VALU_FP64; } else if(arg == "--fp16") { mask |= VALU_FP16; } else if(arg == "--int32") { mask |= VALU_INT32; } else if(arg == "--matfp16") { mask |= MATRIX_FP16; } else if(arg == "--matfp32") { mask |= MATRIX_FP32; } else if(arg == "--smatfp16") { mask |= SMATRIX_FP16; } else { std::cout << "Invalid argument '" << arg << "'" << std::endl; std::cout << std::endl; usage(); return 1; } i++; } if(all_devices) { for(int i = 0; i < device_count; i++ ){ devices.push_back(i); } } // Verify device ID's for(auto d : devices) { if(d >= device_count) { std::cout << "Invalid device ordinal: " << d << std::endl; return 1; } } if(devices.size() == 0) { devices.push_back(0); } if(mask == 0) { mask = ALL; } run(devices, runs, mask); return 0; }