From dfe9034cab3e80c59a5d54ac3d9736d41477ebef Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Mon, 14 Aug 2023 20:47:53 +0530 Subject: [PATCH] SWDEV-384714 - Add RTC test for half data type warp shfl functions (#202) Change-Id: I7cbf6932c928974236df2a49c7b5dee863da72ed [ROCm/hip-tests commit: 08fa8258ec8ae0192f238e520315a81aa1730d61] --- .../hip-tests/catch/unit/rtc/CMakeLists.txt | 1 + projects/hip-tests/catch/unit/rtc/shfl.cc | 197 ++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 projects/hip-tests/catch/unit/rtc/shfl.cc diff --git a/projects/hip-tests/catch/unit/rtc/CMakeLists.txt b/projects/hip-tests/catch/unit/rtc/CMakeLists.txt index a1aea77c78..57b17b7807 100644 --- a/projects/hip-tests/catch/unit/rtc/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/rtc/CMakeLists.txt @@ -11,6 +11,7 @@ set(AMD_TEST_SRC customOptions.cc hipRtcBfloat16.cc linker.cc + shfl.cc ) if(HIP_PLATFORM MATCHES "nvidia") diff --git a/projects/hip-tests/catch/unit/rtc/shfl.cc b/projects/hip-tests/catch/unit/rtc/shfl.cc new file mode 100644 index 0000000000..7059d7934f --- /dev/null +++ b/projects/hip-tests/catch/unit/rtc/shfl.cc @@ -0,0 +1,197 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static constexpr int n = 32; + +static constexpr auto shfl { +R"( +template +__global__ void shflUpSum(T* a, int size) { + T val = a[threadIdx.x]; + for (unsigned int i = size / 2; i > 0; i /= 2) { + val += __shfl_up(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflDownSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_down(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflXorSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size/2; i > 0; i /= 2) + val += __shfl_xor(val, i, size); + a[threadIdx.x] = val; +} +)"}; + +void getFactor(int& fact) { fact = 101; } +void getFactor(__half& fact) { fact = 2.5; } + +template T sum(T* a) { + T cpuSum = 0; + T factor; + getFactor(factor); + for (int i = 0; i < n; i++) { + a[i] = i + factor; + cpuSum += a[i]; + } + return cpuSum; +} + +template bool compare(T gpuSum, T cpuSum) { + if (gpuSum != cpuSum) { + return true; + } + return false; +} + +template <> __half sum(__half* a) { + __half cpuSum = 0; + __half factor; + getFactor(factor); + for (int i = 0; i < n; i++) { + a[i] = i + __half2float(factor); + cpuSum = __half2float(cpuSum) + __half2float(a[i]); + } + return cpuSum; +} + +template <> bool compare(__half gpuSum, __half cpuSum) { + if (__half2float(gpuSum) != __half2float(cpuSum)) { + return true; + } + return false; +} + +template +void runTestShfl(int option) { + using namespace std; + hiprtcProgram prog; + hiprtcCreateProgram(&prog, // prog + shfl, // buffer + "shfl.cu", // name + 0, nullptr, nullptr); + + string str; + switch(option) { + case 1: + str = "shflUpSum<__half>"; break; + case 2: + str = "shflDownSum<__half>"; break; + case 3: + str = "shflXorSum<__half>"; break; + default: + INFO("Options 1,2,3 are supported, but the passed option is: " << option); + REQUIRE(false); + } + + hiprtcAddNameExpression(prog, str.c_str()); + + hiprtcResult compileResult{hiprtcCompileProgram(prog, 0, 0)}; + size_t logSize; + HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize)); + if (logSize) { + string log(logSize, '\0'); + HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0])); + std::cout << log << '\n'; + } + REQUIRE(compileResult == HIPRTC_SUCCESS); + size_t codeSize; + HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize)); + + vector code(codeSize); + HIPRTC_CHECK(hiprtcGetCode(prog, code.data())); + + // Do hip malloc first so that we donot need to do a cuInit manually before calling hipModule APIs + size_t bufferSize = n * sizeof(T); + + T a[n]; + T cpuSum = sum(a); + T* d_a; + HIP_CHECK(hipMalloc(&d_a, bufferSize)); + + hipModule_t module; + hipFunction_t kernel; + HIP_CHECK(hipModuleLoadData(&module, code.data())); + const char* name; + hiprtcGetLoweredName(prog, str.c_str(), &name); + HIP_CHECK(hipModuleGetFunction(&kernel, module, name)); + + HIP_CHECK(hipMemcpy(d_a, &a, bufferSize, hipMemcpyDefault)); + + struct { + T* a_; + int b_; + } args{d_a, n}; + + auto size = sizeof(args); + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END}; + + HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, n, 1, 1, 0, nullptr, nullptr, config)); + + HIP_CHECK(hipMemcpy(&a, d_a, bufferSize, hipMemcpyDefault)); + bool result; + switch (option) { + case 1: //shflUpSum + result = compare(a[n - 1], cpuSum); break; + case 2: //shflDownSum + case 3: //shflXorSum + result = compare(a[0], cpuSum); break; + } + + if (result) { + HIP_CHECK(hipFree(d_a)); + REQUIRE(false); + } + + HIP_CHECK(hipFree(d_a)); + HIP_CHECK(hipModuleUnload(module)); + HIPRTC_CHECK(hiprtcDestroyProgram(&prog)); + +} + +TEST_CASE("Unit_hiprtc_half_shuffle") { + runTestShfl<__half>(1); + runTestShfl<__half>(2); + runTestShfl<__half>(3); +}