// MIT License // // Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // // undefine NDEBUG so asserts are implemented #ifdef NDEBUG # undef NDEBUG #endif #include #include #include #include #include #include "hip/hip_runtime.h" // Two waves per SIMD on MI300 #define DATA_SIZE (304 * 64 * 4 * 2) #define HIP_API_CALL(CALL) \ if((CALL) != hipSuccess) \ { \ abort(); \ } #define LDS_SIZE 1024 __global__ void divide_kernel(float* a, const float* b, const float* c, int /* unused */) { int index = blockDim.x * blockIdx.x + threadIdx.x; if(index >= DATA_SIZE) return; a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1; } __global__ void looping_lds_kernel(float* a, const float* b, const float* c, int loopcount) { __shared__ float interm[LDS_SIZE]; size_t index = blockDim.x * blockIdx.x + threadIdx.x; for(size_t i = index; i < DATA_SIZE; i += blockDim.x * gridDim.x) interm[threadIdx.x % LDS_SIZE] = b[index] + threadIdx.x; for(int it = 0; it < loopcount; it++) { __syncthreads(); float value = interm[(it + threadIdx.x + LDS_SIZE / 2) % LDS_SIZE]; __syncthreads(); interm[threadIdx.x % LDS_SIZE] += value; } a[index] = interm[threadIdx.x % LDS_SIZE] + c[index]; } __global__ void fifo_kernel(float* /* a */, const float* /* b */, const float* /* c */, int loops) { using _float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; __shared__ _float4 lds[LDS_SIZE]; lds[threadIdx.x] = _float4{float(threadIdx.x)}; lds[threadIdx.x + 512] = _float4{float(threadIdx.x)}; __syncthreads(); _float4 dst[16]; float res1 = 0, res2 = 0; for(int l = 0; l < loops; l++) { #pragma unroll 16 for(int i = 0; i < 16; i++) dst[i] = lds[threadIdx.x + i * 8]; __syncthreads(); #pragma unroll 16 for(int i = 0; i < 16; i++) { res1 += dst[i][0] + dst[i][1]; res2 += dst[i][2] + dst[i][3]; } asm volatile("v_add_f32 %0, %1, %2" : "=v"(res1) : "v"(res1), "v"(res2)); } }; class hipMemory { public: hipMemory(size_t size = DATA_SIZE) { HIP_API_CALL(hipMalloc(&ptr, size * sizeof(float))); HIP_API_CALL(hipMemset(ptr, 0, size * sizeof(float))); } ~hipMemory() { if(ptr) HIP_API_CALL(hipFree(ptr)); } hipMemory(hipMemory&& other) { ptr = other.ptr; other.ptr = nullptr; } float* ptr = nullptr; }; class HipStream { public: HipStream() { HIP_API_CALL(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); } ~HipStream() { HIP_API_CALL(hipStreamDestroy(stream)); } hipStream_t stream; hipMemory src1{}; hipMemory src2{}; hipMemory dst{}; }; #define Launch(kernel, stream, arglast) \ hipLaunchKernelGGL( \ kernel, DATA_SIZE / 512, 512, 0, 0, stream.dst.ptr, stream.src1.ptr, stream.src2.ptr, 6); int main(int /*argc*/, char** /*argv*/) { std::array streams{}; std::vector kernels{}; kernels.push_back(divide_kernel); kernels.push_back(looping_lds_kernel); kernels.push_back(fifo_kernel); for(size_t i = 0; i < streams.size() * kernels.size(); i++) { // Warmup then start if(i == streams.size()) { HIP_API_CALL(hipDeviceSynchronize()); roctxProfilerResume(0); } auto& stream = streams.at(i % streams.size()); auto& kernel = kernels.at(i % kernels.size()); Launch(kernel, stream, 3); HIP_API_CALL(hipGetLastError()); } HIP_API_CALL(hipDeviceSynchronize()); roctxProfilerPause(0); return 0; }