e898079a13
* Adding test and samples to decoder * Fix sample * Formatting * Fix multi test * Disable sample * Fix tests * Format * Version fix * Locking the decoder * Add atomic * Review comments * Format * Adding readme * merge conflict and adding PCS+ATT test * Review comments * Properly disable PCS test * Update tests/rocprofv3/advanced-thread-trace/CMakeLists.txt * Adding back env var test * Name fix * Preload sample * Addressing review comments * Update docs --------- Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>
179 lines
5.3 KiB
C++
179 lines
5.3 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
//
|
|
// undefine NDEBUG so asserts are implemented
|
|
#ifdef NDEBUG
|
|
# undef NDEBUG
|
|
#endif
|
|
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <iostream>
|
|
#include <vector>
|
|
|
|
#include <rocprofiler-sdk-roctx/roctx.h>
|
|
|
|
#include "hip/hip_runtime.h"
|
|
|
|
// Two waves per SIMD on MI300
|
|
#define DATA_SIZE (304 * 64 * 4 * 2)
|
|
#define HIP_API_CALL(CALL) \
|
|
if((CALL) != hipSuccess) \
|
|
{ \
|
|
abort(); \
|
|
}
|
|
|
|
#define LDS_SIZE 1024
|
|
|
|
__global__ void
|
|
divide_kernel(float* a, const float* b, const float* c, int /* unused */)
|
|
{
|
|
int index = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
if(index >= DATA_SIZE) return;
|
|
|
|
a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1;
|
|
}
|
|
|
|
__global__ void
|
|
looping_lds_kernel(float* a, const float* b, const float* c, int loopcount)
|
|
{
|
|
__shared__ float interm[LDS_SIZE];
|
|
|
|
size_t index = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
for(size_t i = index; i < DATA_SIZE; i += blockDim.x * gridDim.x)
|
|
interm[threadIdx.x % LDS_SIZE] = b[index] + threadIdx.x;
|
|
|
|
for(int it = 0; it < loopcount; it++)
|
|
{
|
|
__syncthreads();
|
|
float value = interm[(it + threadIdx.x + LDS_SIZE / 2) % LDS_SIZE];
|
|
__syncthreads();
|
|
interm[threadIdx.x % LDS_SIZE] += value;
|
|
}
|
|
|
|
a[index] = interm[threadIdx.x % LDS_SIZE] + c[index];
|
|
}
|
|
|
|
__global__ void
|
|
fifo_kernel(float* /* a */, const float* /* b */, const float* /* c */, int loops)
|
|
{
|
|
using _float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float;
|
|
|
|
__shared__ _float4 lds[LDS_SIZE];
|
|
lds[threadIdx.x] = _float4{float(threadIdx.x)};
|
|
lds[threadIdx.x + 512] = _float4{float(threadIdx.x)};
|
|
|
|
__syncthreads();
|
|
|
|
_float4 dst[16];
|
|
|
|
float res1 = 0, res2 = 0;
|
|
|
|
for(int l = 0; l < loops; l++)
|
|
{
|
|
#pragma unroll 16
|
|
for(int i = 0; i < 16; i++)
|
|
dst[i] = lds[threadIdx.x + i * 8];
|
|
|
|
__syncthreads();
|
|
|
|
#pragma unroll 16
|
|
for(int i = 0; i < 16; i++)
|
|
{
|
|
res1 += dst[i][0] + dst[i][1];
|
|
res2 += dst[i][2] + dst[i][3];
|
|
}
|
|
asm volatile("v_add_f32 %0, %1, %2" : "=v"(res1) : "v"(res1), "v"(res2));
|
|
}
|
|
};
|
|
|
|
class hipMemory
|
|
{
|
|
public:
|
|
hipMemory(size_t size = DATA_SIZE)
|
|
{
|
|
HIP_API_CALL(hipMalloc(&ptr, size * sizeof(float)));
|
|
HIP_API_CALL(hipMemset(ptr, 0, size * sizeof(float)));
|
|
}
|
|
~hipMemory()
|
|
{
|
|
if(ptr) HIP_API_CALL(hipFree(ptr));
|
|
}
|
|
hipMemory(hipMemory&& other)
|
|
{
|
|
ptr = other.ptr;
|
|
other.ptr = nullptr;
|
|
}
|
|
float* ptr = nullptr;
|
|
};
|
|
|
|
class HipStream
|
|
{
|
|
public:
|
|
HipStream() { HIP_API_CALL(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); }
|
|
~HipStream() { HIP_API_CALL(hipStreamDestroy(stream)); }
|
|
|
|
hipStream_t stream;
|
|
|
|
hipMemory src1{};
|
|
hipMemory src2{};
|
|
hipMemory dst{};
|
|
};
|
|
|
|
#define Launch(kernel, stream, arglast) \
|
|
hipLaunchKernelGGL( \
|
|
kernel, DATA_SIZE / 512, 512, 0, 0, stream.dst.ptr, stream.src1.ptr, stream.src2.ptr, 6);
|
|
|
|
int
|
|
main(int /*argc*/, char** /*argv*/)
|
|
{
|
|
std::array<HipStream, 3> streams{};
|
|
std::vector<decltype(divide_kernel)*> kernels{};
|
|
|
|
kernels.push_back(divide_kernel);
|
|
kernels.push_back(looping_lds_kernel);
|
|
kernels.push_back(fifo_kernel);
|
|
|
|
for(size_t i = 0; i < streams.size() * kernels.size(); i++)
|
|
{
|
|
// Warmup then start
|
|
if(i == streams.size())
|
|
{
|
|
HIP_API_CALL(hipDeviceSynchronize());
|
|
roctxProfilerResume(0);
|
|
}
|
|
|
|
auto& stream = streams.at(i % streams.size());
|
|
auto& kernel = kernels.at(i % kernels.size());
|
|
|
|
Launch(kernel, stream, 3);
|
|
HIP_API_CALL(hipGetLastError());
|
|
}
|
|
|
|
HIP_API_CALL(hipDeviceSynchronize());
|
|
roctxProfilerPause(0);
|
|
|
|
return 0;
|
|
}
|