diff --git a/projects/rocprofiler-compute/sample/rocflop.cpp b/projects/rocprofiler-compute/sample/rocflop.cpp index e7511084bb..e8133da9e0 100644 --- a/projects/rocprofiler-compute/sample/rocflop.cpp +++ b/projects/rocprofiler-compute/sample/rocflop.cpp @@ -51,6 +51,8 @@ template __global__ void fma_throughput(vec4* buffer, int count) ptr[tid] = value0 + value1 + value2 + value3; } +// MFMA instructions are only available on gfx908 and later (not supported on gfx906) +#if !defined(__gfx906__) __global__ void matmul_fp16_throughput(vec4* inputs, vec4* outputs, int count) { int grid_size = gridDim.x * blockDim.x; @@ -108,6 +110,46 @@ __global__ void matmul_fp32_throughput(float* inputs, vec4* outputs, int outputs[tid] = accum0 + accum1 + accum2 + accum3; } +#endif // !defined(__gfx906__) + +// SMFMAC (Sparse MFMA) instructions are only available on gfx90a and later (not on gfx906 or gfx908) +#if !defined(__gfx906__) && !defined(__gfx908__) +__global__ void sparse_matmul_fp16_throughput(vec4* input0, vec8* input1, vec4* outputs, int count) +{ + int grid_size = gridDim.x * blockDim.x; + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + vec4* x_ptr = input0; + vec8* y_ptr = input1; + + vec4 x0 = x_ptr[0 * grid_size + tid]; + vec4 x1 = x_ptr[1 * grid_size + tid]; + vec4 x2 = x_ptr[2 * grid_size + tid]; + vec4 x3 = x_ptr[3 * grid_size + tid]; + + vec8 y0 = y_ptr[0 * grid_size + tid]; + vec8 y1 = y_ptr[1 * grid_size + tid]; + vec8 y2 = y_ptr[2 * grid_size + tid]; + vec8 y3 = y_ptr[3 * grid_size + tid]; + + vec4 accum0; + vec4 accum1; + vec4 accum2; + vec4 accum3; + + for(int i = 0; i < count; i++) { + for(int j = 0; j < 64; j++) { + // 4 SMFMAC ops + accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0); + accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0); + accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0); + accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0); + } + } + + outputs[tid] = accum0 + accum1 + accum2 + accum3; +} +#endif // !defined(__gfx906__) && !defined(__gfx908__) void HIP_CALL(hipError_t err) { @@ -151,6 +193,7 @@ enum : uint32_t { VALU_FP64 = 1 << 2, MATRIX_FP16 = 1 << 3, MATRIX_FP32 = 1 << 4, + SMATRIX_FP16 = 1 << 5, VALU_INT32 = 1 << 6, ALL = (uint32_t)-1 @@ -221,6 +264,7 @@ template double fma_throughput_test(int device, int count, int runs return flops; } +#if !defined(__gfx906__) template double matmul_throughput_test(int device, int count, int runs = 1) { const int wave_size = 64; @@ -276,6 +320,63 @@ template double matmul_throughput_test(int devic return flops; } +#endif // !defined(__gfx906__) + +#if !defined(__gfx906__) && !defined(__gfx908__) +template double sparse_matmul_throughput_test(int device, int count, int runs = 1) +{ + const int wave_size = 64; + int k; + int m; + int n; + + if(std::is_same::value) { + m = 16; + n = 16; + k = 32; + } else { + assert(false); + } + + int ops_per_matmul = k * m * n * 2; + + void* buffer1 = nullptr; + void* buffer2 = nullptr; + void* accum = nullptr; + + hipDeviceProp_t props; + HIP_CALL(hipGetDeviceProperties(&props, device)); + + int blocks = props.multiProcessorCount * 512; + int threads_per_block = wave_size; + int total_threads = blocks * threads_per_block; + + HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(matT) * m * k * total_threads)); + HIP_CALL(hipMalloc(&buffer2, 8 * sizeof(matT) * n * k * total_threads)); + HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads)); + + HIPTimer t; + t.start(); + for(int i = 0; i < runs; i++) { + if(std::is_same::value && std::is_same::value) { + sparse_matmul_fp16_throughput<<>>((vec4*)buffer1, + (vec8*)buffer2, (vec4*)accum, count); + } + } + t.stop(); + HIP_CALL(hipDeviceSynchronize()); + + double elapsed = t.elapsed(); + double ops = (double)blocks * count * 64 * 4 * runs; + double flops = (double)ops * ops_per_matmul / elapsed; + + HIP_CALL(hipFree(buffer1)); + HIP_CALL(hipFree(buffer2)); + HIP_CALL(hipFree(accum)); + + return flops; +} +#endif // !defined(__gfx906__) && !defined(__gfx908__) struct Result { int device = -1; @@ -285,6 +386,7 @@ struct Result { double valu_int32 = 0; double mfma_fp16 = 0; double mfma_fp32 = 0; + double smfmac_fp16 = 0; // Used for sorting bool operator<(const Result& other) { @@ -312,6 +414,9 @@ void print_result(const Result& res, uint32_t mask) if(mask & MATRIX_FP32) { printf("MFMA FP32: %8.2f TFLOPS\n", res.mfma_fp32 / 1e12); } + if(mask & SMATRIX_FP16) { + printf("SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16 / 1e12); + } } Result run_tests(int device, int runs, uint32_t mask) @@ -346,8 +451,12 @@ Result run_tests(int device, int runs, uint32_t mask) res.valu_int32 = fma_throughput_test(device, 4096, runs); } +#if !defined(__gfx906__) + // MFMA available on gfx908+ (excludes gfx906 with rev=6) + bool has_mfma = arch.major == 0x9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8)); + if(mask & MATRIX_FP16) { - if(arch.major == 0x9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8))) { + if(has_mfma) { res.mfma_fp16 = matmul_throughput_test(device, 4096, runs); } else { res.mfma_fp16 = 0; @@ -355,12 +464,37 @@ Result run_tests(int device, int runs, uint32_t mask) } if(mask & MATRIX_FP32) { - if(arch.major == 0x9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8))) { + if(has_mfma) { res.mfma_fp32 = matmul_throughput_test(device, 4096, runs); } else { res.mfma_fp32 = 0; } } +#else + // MFMA not available when compiling for gfx906 + if(mask & MATRIX_FP16) { + res.mfma_fp16 = 0; + } + if(mask & MATRIX_FP32) { + res.mfma_fp32 = 0; + } +#endif + +#if !defined(__gfx906__) && !defined(__gfx908__) + if(mask & SMATRIX_FP16) { + // SMFMAC only available on gfx90a (MI200) and later, not on gfx906 or gfx908 + if(arch.major == 0x9 && (arch.minor > 0x4 || (arch.minor == 0 && arch.rev >= 0xa))) { + res.smfmac_fp16 = sparse_matmul_throughput_test(device, 4096, runs); + } else { + res.smfmac_fp16 = 0; + } + } +#else + // SMFMAC not available when compiling for gfx906 or gfx908 + if(mask & SMATRIX_FP16) { + res.smfmac_fp16 = 0; + } +#endif return res; } @@ -447,6 +581,7 @@ void run(std::vector& devices, int runs, uint32_t mask) total.valu_int32 += r.valu_int32; total.mfma_fp16 += r.mfma_fp16; total.mfma_fp32 += r.mfma_fp32; + total.smfmac_fp16 += r.smfmac_fp16; } std::cout << std::endl << "System total" << std::endl; print_result(total, mask); @@ -463,8 +598,10 @@ void usage() std::cout << "--fp16 Run FP16 (VALU) test" << std::endl; std::cout << "--fp32 Run FP32 (VALU) test" << std::endl; std::cout << "--fp64 Run FP64 (VALU) test" << std::endl; + std::cout << "--int32 Run INT32 (VALU) test" << std::endl; std::cout << "--matfp16 Run FP16 (MFMA) test" << std::endl; std::cout << "--matfp32 Run FP32 (MFMA) test" << std::endl; + std::cout << "--smatfp16 Run FP16 (SMFMAC) test" << std::endl; } int main(int argc, char** argv) @@ -534,6 +671,8 @@ int main(int argc, char** argv) mask |= MATRIX_FP16; } else if(arg == "--matfp32") { mask |= MATRIX_FP32; + } else if(arg == "--smatfp16") { + mask |= SMATRIX_FP16; } else { std::cout << "Invalid argument '" << arg << "'" << std::endl; std::cout << std::endl;