Fix C-style hipLaunchKernel (#1835)
* Fix bug in LaunchKernel test Instead of passing the address of the gpu buffer, pass the address of the pointer that holds the address of the gpu buffer * Fix hipLaunchKernel's kernarg buffer construction. The hipLaunchKernel implementation should rely on ihipModuleLaunchKernel to construct the kernarg buffer correctly based on kernel metadata. * Fix a bug in get_functions where the Kernel_descriptor wasn't constructed with the correct kernarg layout information. * Fix a bug in kernarg layout parsing dealing with kernel without any arg * teach ihipModuleLaunchKernel to handle kernel without any arg * Add a more interesting test
This commit is contained in:
@@ -188,7 +188,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
|
||||
} else {
|
||||
}
|
||||
else if (f->_kernarg_layout.size() != 0) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
@@ -1470,21 +1471,7 @@ hipError_t hipLaunchKernel(
|
||||
hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)func_addr,
|
||||
hip_impl::target_agent(stream));
|
||||
|
||||
if(kd == nullptr || kd->_header == nullptr)
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
|
||||
size_t szKernArg = kd->_header->kernarg_segment_byte_size;
|
||||
|
||||
if(args == NULL && szKernArg != 0)
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
|
||||
void* config[]{
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER,
|
||||
args,
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE,
|
||||
&szKernArg,
|
||||
HIP_LAUNCH_PARAM_END};
|
||||
|
||||
return ihipLogStatus(ihipModuleLaunchKernel(tls, kd, numBlocks.x, numBlocks.y, numBlocks.z,
|
||||
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes, stream, nullptr, (void**)&config, nullptr, nullptr, 0));
|
||||
return hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
|
||||
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
|
||||
stream, args, nullptr);
|
||||
}
|
||||
|
||||
@@ -613,7 +613,8 @@ public:
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
functions[aa].second.emplace(
|
||||
function.first,
|
||||
Kernel_descriptor{kernel_object(kernel_symbol), it->first});
|
||||
Kernel_descriptor{kernel_object(kernel_symbol), it->first,
|
||||
kernargs_size_align(function.first)});
|
||||
}
|
||||
}
|
||||
}, agent);
|
||||
@@ -672,11 +673,12 @@ public:
|
||||
auto dx1 = kernels_md.find("CodeProps", dx);
|
||||
dx = kernels_md.find("Args:", dx);
|
||||
|
||||
if (dx1 < dx) {
|
||||
if (dx1 < dx || dx == std::string::npos) {
|
||||
dx = dx1;
|
||||
// create an empty kernarg laybout vector for kernels without any arg
|
||||
kernargs[fn];
|
||||
continue;
|
||||
}
|
||||
if (dx == std::string::npos) break;
|
||||
|
||||
static constexpr decltype(kernels_md.size()) args_sz{5};
|
||||
dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]);
|
||||
|
||||
@@ -36,7 +36,7 @@ bool LaunchKernelArg()
|
||||
dim3 blocks = {1,1,1};
|
||||
dim3 threads = {1,1,1};
|
||||
|
||||
HIPCHECK(hipLaunchKernel(kernel, blocks, threads,NULL, 0, 0));
|
||||
HIPCHECK(hipLaunchKernel(kernel, blocks, threads, NULL, 0, 0));
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -50,9 +50,9 @@ bool LaunchKernelArg1()
|
||||
|
||||
// Allocate Device memory
|
||||
HIPCHECK(hipMalloc((void**)&A_d, sizeof(int)));
|
||||
|
||||
void* Args[]={A_d};
|
||||
HIPCHECK(hipLaunchKernel(kernel1, blocks, threads, Args,0,0));
|
||||
|
||||
void* Args[]={&A_d};
|
||||
HIPCHECK(hipLaunchKernel(kernel1, blocks, threads, Args, 0, 0));
|
||||
|
||||
// Get the result back to host memory
|
||||
HIPCHECK(hipMemcpy(&A, A_d, sizeof(int), hipMemcpyDeviceToHost));
|
||||
@@ -60,7 +60,7 @@ bool LaunchKernelArg1()
|
||||
HIPCHECK(hipFree(A_d));
|
||||
|
||||
if(A != 333)
|
||||
return false;
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -81,9 +81,9 @@ bool LaunchKernelArg2()
|
||||
HIPCHECK(hipMalloc((void**)&B_d, sizeof(int)));
|
||||
|
||||
// Copy data from host memory to device memory
|
||||
HIPCHECK(hipMemcpy(B_d,&B, sizeof(int), hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, &B, sizeof(int), hipMemcpyHostToDevice));
|
||||
|
||||
void* Args[]={A_d,B_d};
|
||||
void* Args[]={&A_d, &B_d};
|
||||
HIPCHECK(hipLaunchKernel(kernel2, blocks, threads, Args,0,0));
|
||||
|
||||
// Get the result back to host memory
|
||||
@@ -118,11 +118,11 @@ bool LaunchKernelArg3()
|
||||
HIPCHECK(hipMalloc((void**)&C_d, sizeof(int)));
|
||||
|
||||
// Copy data from host memory to device memory
|
||||
HIPCHECK(hipMemcpy(A_d,&A, sizeof(int), hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(A_d, &A, sizeof(int), hipMemcpyHostToDevice));
|
||||
|
||||
HIPCHECK(hipMemcpy(B_d,&B, sizeof(int), hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, &B, sizeof(int), hipMemcpyHostToDevice));
|
||||
|
||||
void* Args[]={A_d,B_d,C_d};
|
||||
void* Args[]={&A_d, &B_d, &C_d};
|
||||
HIPCHECK(hipLaunchKernel(kernel3, blocks, threads, Args,0,0));
|
||||
|
||||
// Get the result back to host memory
|
||||
@@ -138,14 +138,43 @@ bool LaunchKernelArg3()
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LaunchKernelArg4()
|
||||
{
|
||||
int A = 0;
|
||||
int *A_d = NULL;
|
||||
dim3 blocks = {1,1,1};
|
||||
dim3 threads = {1,1,1};
|
||||
|
||||
// Allocate Device memory
|
||||
HIPCHECK(hipMalloc((void**)&A_d, sizeof(int)));
|
||||
|
||||
char c = 1;
|
||||
short s = 10;
|
||||
int i = 100;
|
||||
struct things t = {2,20,200};
|
||||
|
||||
void* Args[]={&A_d, &c, &s, &i, &t};
|
||||
HIPCHECK(hipLaunchKernel(kernel4, blocks, threads, Args, 0, 0));
|
||||
|
||||
// Get the result back to host memory
|
||||
HIPCHECK(hipMemcpy(&A, A_d, sizeof(int), hipMemcpyDeviceToHost));
|
||||
|
||||
HIPCHECK(hipFree(A_d));
|
||||
|
||||
if (A != (c + s + i + t.c + t.s + t.i))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
if( LaunchKernelArg() &&
|
||||
LaunchKernelArg1() &&
|
||||
LaunchKernelArg2() &&
|
||||
LaunchKernelArg3())
|
||||
LaunchKernelArg3() &&
|
||||
LaunchKernelArg4())
|
||||
{
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
||||
@@ -17,7 +17,23 @@
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
extern __global__ void kernel();
|
||||
extern __global__ void kernel1(int*);
|
||||
extern __global__ void kernel2(int*,int*);
|
||||
extern __global__ void kernel3(int*,int*,int*);
|
||||
|
||||
struct things {
|
||||
char c;
|
||||
short s;
|
||||
int i;
|
||||
};
|
||||
extern __global__ void kernel4(int*, char, short, int, struct things);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -19,6 +19,7 @@
|
||||
|
||||
|
||||
#include<hip/hip_runtime.h>
|
||||
#include "LaunchKernel.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
@@ -43,4 +44,9 @@ __global__ void kernel3(int *a, int*b, int* c)
|
||||
*c = *a+*b;
|
||||
}
|
||||
|
||||
__global__ void kernel4(int *a, char c, short s, int i, struct things t)
|
||||
{
|
||||
*a = c + s + i + t.c + t.s + t.i;
|
||||
}
|
||||
|
||||
}//extern "C"
|
||||
|
||||
مرجع در شماره جدید
Block a user