Adding changes for hipExtLaunchKernel for rocCLR

Change-Id: Iba52bc3bde7c37f3fb375a55ba0947e87b3cdc9b


[ROCm/hip commit: 2d517fdcc6]
Bu işleme şunda yer alıyor:
Jatin
2020-05-08 18:18:36 +00:00
işlemeyi yapan: Rahul Garg
ebeveyn 1be23e215c
işleme 7b52f0a1ea
10 değiştirilmiş dosya ile 396 ekleme ve 21 silme
+43
Dosyayı Görüntüle
@@ -392,10 +392,53 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri
typedef int hipLaunchParm;
template <std::size_t n, typename... Ts,
typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}
template <std::size_t n, typename... Ts,
typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;
static_assert(!std::is_reference<T>{},
"A __global__ function cannot have a reference as one of its "
"arguments.");
#if defined(HIP_STRICT)
static_assert(std::is_trivially_copyable<T>{},
"Only TriviallyCopyable types can be arguments to a __global__ "
"function");
#endif
_vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
return pArgs<n + 1>(formals, _vargs);
}
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
std::tuple<Formals...> to_formals{std::move(actuals)};
return to_formals;
}
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
constexpr size_t count = sizeof...(Args);
auto tup_ = std::tuple<Args...>{args...};
auto tup = validateArgsCountType(kernel, tup_);
void* _Args[count];
pArgs<0>(tup, _Args);
auto k = reinterpret_cast<void*>(kernel);
hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
do { \
kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(__VA_ARGS__); \
} while (0)
#endif
#include <hip/hip_runtime_api.h>
+5
Dosyayı Görüntüle
@@ -3378,6 +3378,11 @@ hipError_t hipLaunchKernel(const void* function_address,
hipStream_t stream __dparm(0));
#if __HIP_ROCclr__ || !defined(__HCC__)
//TODO: Move this to hip_ext.h
hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
void** args, size_t sharedMemBytes, hipStream_t stream,
hipEvent_t startEvent, hipEvent_t stopEvent, int flags);
hipError_t hipBindTexture(
size_t* offset,
const textureReference* tex,
+27 -2
Dosyayı Görüntüle
@@ -23,6 +23,10 @@ THE SOFTWARE.
#ifndef HIP_INCLUDE_HIP_HIP_EXT_H
#define HIP_INCLUDE_HIP_HIP_EXT_H
#include "hip/hip_runtime.h"
#if defined(__cplusplus)
#include <tuple>
#include <type_traits>
#endif
#ifdef __HCC__
// Forward declarations:
@@ -109,8 +113,29 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
hipEvent_t stopEvent = nullptr)
__attribute__((deprecated("use hipExtModuleLaunchKernel instead")));
//#if !__HIP_ROCclr__ && defined(__cplusplus)
#if defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__)
#if defined(__HIP_ROCclr__) && defined(__cplusplus)
extern "C" hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks,
dim3 dimBlocks, void** args, size_t sharedMemBytes,
hipStream_t stream, hipEvent_t startEvent,
hipEvent_t stopEvent, int flags);
template <typename... Args, typename F = void (*)(Args...)>
inline void hipExtLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
std::uint32_t sharedMemBytes, hipStream_t stream,
hipEvent_t startEvent, hipEvent_t stopEvent, std::uint32_t flags,
Args... args) {
constexpr size_t count = sizeof...(Args);
auto tup_ = std::tuple<Args...>{args...};
auto tup = validateArgsCountType(kernel, tup_);
void* _Args[count];
pArgs<0>(tup, _Args);
auto k = reinterpret_cast<void*>(kernel);
hipExtLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream, startEvent,
stopEvent, (int)flags);
}
#elif defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__)
//kernel_descriptor and hip_impl::make_kernarg are in "grid_launch_GGL.hpp"
namespace hip_impl {
+1
Dosyayı Görüntüle
@@ -51,6 +51,7 @@ hipExtGetLinkTypeAndHopCount
hipExtLaunchMultiKernelMultiDevice
hipExtMallocWithFlags
hipExtModuleLaunchKernel
hipExtLaunchKernel
hipFree
hipFreeArray
hipFuncSetCacheConfig
+1
Dosyayı Görüntüle
@@ -52,6 +52,7 @@ global:
hipExtLaunchMultiKernelMultiDevice;
hipExtMallocWithFlags;
hipExtModuleLaunchKernel;
hipExtLaunchKernel;
hipFree;
hipFreeArray;
hipFuncSetCacheConfig;
+35
Dosyayı Görüntüle
@@ -29,6 +29,16 @@
hipError_t ihipModuleLoadData(hipModule_t* module, const void* mmap_ptr, size_t mmap_size);
extern hipError_t ihipLaunchKernel(const void* hostFunction,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMemBytes,
hipStream_t stream,
hipEvent_t startEvent,
hipEvent_t stopEvent,
int flags);
const std::string& FunctionName(const hipFunction_t f)
{
return hip::Function::asFunction(f)->function_->name();
@@ -539,6 +549,31 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
}
extern "C" hipError_t hipLaunchKernel(const void *hostFunction,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMemBytes,
hipStream_t stream)
{
HIP_INIT_API(NONE, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
HIP_RETURN(ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, nullptr, nullptr, 0));
}
extern "C" hipError_t hipExtLaunchKernel(const void* hostFunction,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMemBytes,
hipStream_t stream,
hipEvent_t startEvent,
hipEvent_t stopEvent,
int flags)
{
HIP_INIT_API(NONE, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
HIP_RETURN(ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, startEvent, stopEvent, flags));
}
hipError_t hipLaunchCooperativeKernel(const void* f,
dim3 gridDim, dim3 blockDim,
void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream)
+21 -13
Dosyayı Görüntüle
@@ -62,6 +62,14 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
hipDeviceptr_t* dptr, size_t* bytes);
extern hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
static bool isCompatibleCodeObject(const std::string& codeobj_target_id,
const char* device_name) {
// Workaround for device name mismatch.
@@ -1339,16 +1347,16 @@ void hipLaunchCooperativeKernelGGLImpl(
#endif // defined(ATI_OS_LINUX)
extern "C" hipError_t hipLaunchKernel(const void *hostFunction,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMemBytes,
hipStream_t stream)
hipError_t ihipLaunchKernel(const void* hostFunction,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMemBytes,
hipStream_t stream,
hipEvent_t startEvent,
hipEvent_t stopEvent,
int flags)
{
HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes,
stream);
hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
int deviceId = (s != nullptr)? s->DeviceId() : ihipGetDevice();
if (deviceId == -1) {
@@ -1368,10 +1376,10 @@ extern "C" hipError_t hipLaunchKernel(const void *hostFunction,
HIP_RETURN(hipErrorInvalidDeviceFunction);
#endif
}
HIP_RETURN(hipModuleLaunchKernel(func, gridDim.x, gridDim.y, gridDim.z,
blockDim.x, blockDim.y, blockDim.z,
sharedMemBytes, stream, args, nullptr));
HIP_RETURN(ihipModuleLaunchKernel(func, (gridDim.x * blockDim.x), (gridDim.y * blockDim.y),
(gridDim.z * blockDim.z), blockDim.x, blockDim.y, blockDim.z,
sharedMemBytes, stream, args, nullptr, startEvent, stopEvent,
flags));
}
// conversion routines between float and half precision
+24 -3
Dosyayı Görüntüle
@@ -1709,12 +1709,33 @@ hipError_t hipLaunchKernel(
const void* func_addr, dim3 numBlocks, dim3 dimBlocks, void** args,
size_t sharedMemBytes, hipStream_t stream)
{
HIP_INIT_API(hipLaunchKernel,func_addr,numBlocks,dimBlocks,args,sharedMemBytes,stream);
HIP_INIT_API(hipLaunchKernel,func_addr,numBlocks,dimBlocks,args,sharedMemBytes,stream);
hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)func_addr,
hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)func_addr,
hip_impl::target_agent(stream));
return hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
return hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
stream, args, nullptr);
}
hipError_t hipExtLaunchKernel(const void* function, dim3 numBlocks, dim3 dimBlocks, void** args,
size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent,
hipEvent_t stopEvent, int flags) {
HIP_INIT_API(hipExtLaunchKernel,function,numBlocks,dimBlocks,args,sharedMemBytes,stream,startEvent,stopEvent,flags);
hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)function,
hip_impl::target_agent(stream));
uint32_t globalWorkSizeX = numBlocks.x * dimBlocks.x;
uint32_t globalWorkSizeY = numBlocks.y * dimBlocks.y;
uint32_t globalWorkSizeZ = numBlocks.z * dimBlocks.z;
if (globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX ||
globalWorkSizeZ > UINT32_MAX) {
return hipErrorInvalidConfiguration;
}
return ihipLogStatus(ihipModuleLaunchKernel(
tls, kd, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, dimBlocks.x, dimBlocks.y,
dimBlocks.z, sharedMemBytes, stream, args, nullptr, startEvent, stopEvent, flags));
}
+193
Dosyayı Görüntüle
@@ -0,0 +1,193 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp HCC_OPTIONS -Xclang -fallow-half-arguments-and-returns CLANG_OPTIONS -Xclang -fallow-half-arguments-and-returns EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
#define HIP_TEMPLATE_KERNEL_LAUNCH
#include "hip/hip_runtime.h"
#include "test_common.h"
__global__ void kernel_abs_int64(long long* input, long long* output) {
int tx = threadIdx.x;
output[tx] = abs(input[tx]);
}
__global__ void kernel_lgamma_double(double* input, double* output) {
int tx = threadIdx.x;
output[tx] = lgamma(input[tx]);
}
#define CHECK_LGAMMA_DOUBLE(IN, OUT, EXP) \
{ \
if (OUT != EXP) { \
failed("check_abs_int64 failed on %f (output = %f, expected = %fd)\n", IN, OUT, EXP); \
} \
}
#define CHECK_ABS_INT64(IN, OUT, EXP) \
{ \
if (OUT != EXP) { \
failed("check_abs_int64 failed on %lld (output = %lld, expected = %lld)\n", IN, OUT, \
EXP); \
} \
}
void check_lgamma_double() {
using datatype_t = double;
const int NUM_INPUTS = 8;
auto memsize = NUM_INPUTS * sizeof(datatype_t);
// allocate memories
datatype_t* inputCPU = (datatype_t*)malloc(memsize);
datatype_t* outputCPU = (datatype_t*)malloc(memsize);
datatype_t* inputGPU = nullptr;
hipMalloc((void**)&inputGPU, memsize);
datatype_t* outputGPU = nullptr;
hipMalloc((void**)&outputGPU, memsize);
// populate input
for (int i = 0; i < NUM_INPUTS; i++) {
inputCPU[i] = -3.5 + i;
}
// copy inputs to device
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
// launch kernel
hipLaunchKernelGGL(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
// copy outputs from device
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
// check outputs
for (int i = 0; i < NUM_INPUTS; i++) {
CHECK_LGAMMA_DOUBLE(inputCPU[i], outputCPU[i], lgamma(inputCPU[i]));
}
// free memories
hipFree(inputGPU);
hipFree(outputGPU);
free(inputCPU);
free(outputCPU);
// done
return;
}
void check_abs_int64() {
using datatype_t = long long;
const int NUM_INPUTS = 8;
auto memsize = NUM_INPUTS * sizeof(datatype_t);
// allocate memories
datatype_t* inputCPU = (datatype_t*)malloc(memsize);
datatype_t* outputCPU = (datatype_t*)malloc(memsize);
datatype_t* inputGPU = nullptr;
hipMalloc((void**)&inputGPU, memsize);
datatype_t* outputGPU = nullptr;
hipMalloc((void**)&outputGPU, memsize);
// populate input
inputCPU[0] = -81985529216486895ll;
inputCPU[1] = 81985529216486895ll;
inputCPU[2] = -1250999896491ll;
inputCPU[3] = 1250999896491ll;
inputCPU[4] = -19088743ll;
inputCPU[5] = 19088743ll;
inputCPU[6] = -291ll;
inputCPU[7] = 291ll;
// copy inputs to device
hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
// launch kernel
hipLaunchKernelGGL(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
// copy outputs from device
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
// check outputs
CHECK_ABS_INT64(inputCPU[0], outputCPU[0], outputCPU[1]);
CHECK_ABS_INT64(inputCPU[1], outputCPU[1], outputCPU[1]);
CHECK_ABS_INT64(inputCPU[2], outputCPU[2], outputCPU[3]);
CHECK_ABS_INT64(inputCPU[3], outputCPU[3], outputCPU[3]);
CHECK_ABS_INT64(inputCPU[4], outputCPU[4], outputCPU[5]);
CHECK_ABS_INT64(inputCPU[5], outputCPU[5], outputCPU[5]);
CHECK_ABS_INT64(inputCPU[6], outputCPU[6], outputCPU[7]);
CHECK_ABS_INT64(inputCPU[7], outputCPU[7], outputCPU[7]);
// free memories
hipFree(inputGPU);
hipFree(outputGPU);
free(inputCPU);
free(outputCPU);
// done
return;
}
template <class T, class F>
__global__ void kernel_simple(F f, T* out) {
*out = f();
}
template <class T, class F>
void check_simple(F f, T expected, const char* file, unsigned line) {
auto memsize = sizeof(T);
T* outputCPU = (T*)malloc(memsize);
T* outputGPU = nullptr;
hipMalloc((void**)&outputGPU, memsize);
hipLaunchKernelGGL(kernel_simple, 1, 1, 0, 0, f, outputGPU);
hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
if (*outputCPU != expected) {
failed("%s line %u : check failed (output = %lf, expected = %lf)\n", file, line,
(double)(*outputCPU), (double)expected);
}
hipFree(outputGPU);
free(outputCPU);
}
#define CHECK_SIMPLE(lambda, expected) check_simple(lambda, expected, __FILE__, __LINE__);
void test_fp16() {
CHECK_SIMPLE([] __device__() { return max<__fp16>(1.0f, 2.0f); }, 2.0f);
CHECK_SIMPLE([] __device__() { return min<__fp16>(1.0f, 2.0f); }, 1.0f);
}
int main(int argc, char* argv[]) {
HipTest::parseStandardArguments(argc, argv, true);
check_abs_int64();
// check_lgamma_double();
test_fp16();
passed();
}
+46 -3
Dosyayı Görüntüle
@@ -28,9 +28,52 @@ THE SOFTWARE.
#include "hip/hip_ext.h"
#include "test_common.h"
struct _t {
double _a, _b, _c, _d, _e, _f, _g, _h, _i, _j;
};
typedef struct _t _T;
__global__ void sKernel(_T s, double *a) {
*a = s._a + s._b + s._c + s._d + s._e + s._f + s._g + s._h + s._i + s._j;
}
__global__ void mKernel(char f, short a, int b, double c, short d, int e, double* res) {
*res = a + b + c + d + e + f;
}
void testMixData() {
double m = 0;
double *d_m;
HIPCHECK(hipMalloc(&d_m, sizeof(double)));
int a = 1, e = 10;
short b = 2, d = 4;
double c = 3.0;
char ff = 10;
hipExtLaunchKernelGGL(mKernel, 1, 1, 0, 0, nullptr, nullptr, 0, ff, b, a, c, d, e, d_m);
HIPCHECK(hipMemcpy(&m, d_m, sizeof(double), hipMemcpyDeviceToHost));
if (m != 30.0) {
std::cout << "M is:: " << m << std::endl;
failed("Mismatch");
}
hipFree(d_m);
}
void testStruct() {
double m = 0;
double *d_m;
HIPCHECK(hipMalloc(&d_m, sizeof(double)));
_T s{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
hipExtLaunchKernelGGL(sKernel, 1, 1, 0, 0, nullptr, nullptr, 0, s, d_m);
HIPCHECK(hipMemcpy(&m, d_m, sizeof(double), hipMemcpyDeviceToHost));
if (m != 55.0) {
std::cout << "M is:: " << m << std::endl;
failed("Mismatch");
}
hipFree(d_m);
}
void test(size_t N) {
size_t Nbytes = N * sizeof(int);
#if defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__)
int *A_d, *B_d, *C_d;
int *A_h, *B_h, *C_h;
@@ -51,13 +94,13 @@ void test(size_t N) {
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
#endif
}
int main(int argc, char* argv[]) {
HipTest::parseStandardArguments(argc, argv, true);
test(N);
testStruct();
testMixData();
passed();
}