From ea3fb1b810d37a8ef234544b41f68df1403beff0 Mon Sep 17 00:00:00 2001 From: vedithal-amd Date: Sat, 27 Dec 2025 09:47:54 -0500 Subject: [PATCH] Remove SMFMAC functionality in rocflop sample since its not supported in MI100 (#2456) --- .../rocprofiler-compute/sample/rocflop.cpp | 107 ------------------ 1 file changed, 107 deletions(-) diff --git a/projects/rocprofiler-compute/sample/rocflop.cpp b/projects/rocprofiler-compute/sample/rocflop.cpp index a2fe1438ee..e7511084bb 100644 --- a/projects/rocprofiler-compute/sample/rocflop.cpp +++ b/projects/rocprofiler-compute/sample/rocflop.cpp @@ -80,42 +80,6 @@ __global__ void matmul_fp16_throughput(vec4* inputs, vec4* outpu outputs[tid] = accum0 + accum1 + accum2 + accum3; } -__global__ void sparse_matmul_fp16_throughput(vec4* input0, vec8* input1, vec4* outputs, int count) -{ - int grid_size = gridDim.x * blockDim.x; - int tid = blockDim.x * blockIdx.x + threadIdx.x; - - vec4* x_ptr = input0; - vec8* y_ptr = input1; - - vec4 x0 = x_ptr[0 * grid_size + tid]; - vec4 x1 = x_ptr[1 * grid_size + tid]; - vec4 x2 = x_ptr[2 * grid_size + tid]; - vec4 x3 = x_ptr[3 * grid_size + tid]; - - vec8 y0 = y_ptr[0 * grid_size + tid]; - vec8 y1 = y_ptr[1 * grid_size + tid]; - vec8 y2 = y_ptr[2 * grid_size + tid]; - vec8 y3 = y_ptr[3 * grid_size + tid]; - - vec4 accum0; - vec4 accum1; - vec4 accum2; - vec4 accum3; - - for(int i = 0; i < count; i++) { - for(int j = 0; j < 64; j++) { - // 4 SMFMAC ops - accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0); - accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0); - accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0); - accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0); - } - } - - outputs[tid] = accum0 + accum1 + accum2 + accum3; -} - __global__ void matmul_fp32_throughput(float* inputs, vec4* outputs, int count) { int grid_size = gridDim.x * blockDim.x; @@ -187,7 +151,6 @@ enum : uint32_t { VALU_FP64 = 1 << 2, MATRIX_FP16 = 1 << 3, MATRIX_FP32 = 1 << 4, - SMATRIX_FP16 = 1 << 5, VALU_INT32 = 1 << 6, ALL = (uint32_t)-1 @@ -314,60 +277,6 @@ template double matmul_throughput_test(int devic return flops; } -template double sparse_matmul_throughput_test(int device, int count, int runs = 1) -{ - const int wave_size = 64; - int k; - int m; - int n; - - if(std::is_same::value) { - m = 16; - n = 16; - k = 32; - } else { - assert(false); - } - - int ops_per_matmul = k * m * n * 2; - - void* buffer1 = nullptr; - void* buffer2 = nullptr; - void* accum = nullptr; - - hipDeviceProp_t props; - HIP_CALL(hipGetDeviceProperties(&props, device)); - - int blocks = props.multiProcessorCount * 512; - int threads_per_block = wave_size; - int total_threads = blocks * threads_per_block; - - HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(matT) * m * k * total_threads)); - HIP_CALL(hipMalloc(&buffer2, 8 * sizeof(matT) * n * k * total_threads)); - HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads)); - - HIPTimer t; - t.start(); - for(int i = 0; i < runs; i++) { - if(std::is_same::value && std::is_same::value) { - sparse_matmul_fp16_throughput<<>>((vec4*)buffer1, - (vec8*)buffer2, (vec4*)accum, count); - } - } - t.stop(); - HIP_CALL(hipDeviceSynchronize()); - - double elapsed = t.elapsed(); - double ops = (double)blocks * count * 64 * 4 * runs; - double flops = (double)ops * ops_per_matmul / elapsed; - - HIP_CALL(hipFree(buffer1)); - HIP_CALL(hipFree(buffer2)); - HIP_CALL(hipFree(accum)); - - return flops; -} - struct Result { int device = -1; double valu_fp16 = 0; @@ -376,7 +285,6 @@ struct Result { double valu_int32 = 0; double mfma_fp16 = 0; double mfma_fp32 = 0; - double smfmac_fp16 = 0; // Used for sorting bool operator<(const Result& other) { @@ -404,10 +312,6 @@ void print_result(const Result& res, uint32_t mask) if(mask & MATRIX_FP32) { printf("MFMA FP32: %8.2f TFLOPS\n", res.mfma_fp32 / 1e12); } - if(mask & SMATRIX_FP16) { - printf("SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16 / 1e12); - - } } Result run_tests(int device, int runs, uint32_t mask) @@ -458,13 +362,6 @@ Result run_tests(int device, int runs, uint32_t mask) } } - if(mask & SMATRIX_FP16) { - if(arch.major == 9 && arch.minor >= 4) { - res.smfmac_fp16 = sparse_matmul_throughput_test(device, 4096, runs); - } else { - res.smfmac_fp16 = 0; - } - } return res; } @@ -550,7 +447,6 @@ void run(std::vector& devices, int runs, uint32_t mask) total.valu_int32 += r.valu_int32; total.mfma_fp16 += r.mfma_fp16; total.mfma_fp32 += r.mfma_fp32; - total.smfmac_fp16 += r.smfmac_fp16; } std::cout << std::endl << "System total" << std::endl; print_result(total, mask); @@ -569,7 +465,6 @@ void usage() std::cout << "--fp64 Run FP64 (VALU) test" << std::endl; std::cout << "--matfp16 Run FP16 (MFMA) test" << std::endl; std::cout << "--matfp32 Run FP32 (MFMA) test" << std::endl; - std::cout << "--smatfp16 Run FP16 (SMFMAC) test" << std::endl; } int main(int argc, char** argv) @@ -639,8 +534,6 @@ int main(int argc, char** argv) mask |= MATRIX_FP16; } else if(arg == "--matfp32") { mask |= MATRIX_FP32; - } else if(arg == "--smatfp16") { - mask |= SMATRIX_FP16; } else { std::cout << "Invalid argument '" << arg << "'" << std::endl; std::cout << std::endl;