[hip] Fix for bug introduced in #1770 when blockSize is non-power of 2 (#1864)

Fixes SWDEV-222161
이 커밋은 다음에 포함됨:
Sarbojit2019
2020-02-13 14:22:46 +05:30
커밋한 사람 GitHub
부모 fc5256fd28
커밋 1109cbff83
+60 -35
파일 보기
@@ -134,10 +134,10 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device);
return ihipLogStatus(hipStatus); \
}
hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSizeX,
uint32_t gridSizeY, uint32_t gridSizeZ,
uint32_t blockSizeX, uint32_t blockSizeY,
uint32_t blockSizeZ, size_t sharedMemBytes,
hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX,
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
uint32_t localWorkSizeZ, size_t sharedMemBytes,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
void** impCoopParams = 0) {
@@ -146,14 +146,6 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
size_t globalWorkSizeX = (size_t)gridSizeX * (size_t)blockSizeX;
size_t globalWorkSizeY = (size_t)gridSizeY * (size_t)blockSizeY;
size_t globalWorkSizeZ = (size_t)gridSizeZ * (size_t)blockSizeZ;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
if (ctx == nullptr) {
ret = hipErrorInvalidDevice;
@@ -211,8 +203,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
lp.dynamic_group_mem_bytes =
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
hStream = ihipPreLaunchKernel(
hStream, dim3(globalWorkSizeX/blockSizeX, globalWorkSizeY/blockSizeY, globalWorkSizeZ/blockSizeZ),
dim3(blockSizeX, blockSizeY, blockSizeZ), &lp, f->_name.c_str(), isStreamLocked);
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);
hsa_kernel_dispatch_packet_t aql;
@@ -221,9 +213,9 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
// aql.completion_signal._handle = 0;
// aql.kernarg_address = 0;
aql.workgroup_size_x = blockSizeX;
aql.workgroup_size_y = blockSizeY;
aql.workgroup_size_z = blockSizeZ;
aql.workgroup_size_x = localWorkSizeX;
aql.workgroup_size_y = localWorkSizeY;
aql.workgroup_size_z = localWorkSizeZ;
aql.grid_size_x = globalWorkSizeX;
aql.grid_size_y = globalWorkSizeY;
aql.grid_size_z = globalWorkSizeZ;
@@ -283,8 +275,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
void** kernelParams, void** extra) {
HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
hStream, kernelParams, extra);
size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX;
size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY;
size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
return ihipLogStatus(ihipModuleLaunchKernel(tls,
f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY,
blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0));
}
@@ -297,11 +298,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
return hipErrorInvalidValue;
return ihipLogStatus(ihipModuleLaunchKernel(tls,
f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
}
@@ -314,11 +312,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
return hipErrorInvalidValue;
return ihipLogStatus(ihipModuleLaunchKernel(tls,
f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
}
@@ -364,14 +359,26 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
}
GET_TLS();
size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
// launch kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x,
lp.gridDim.y,
lp.gridDim.z,
lp.gridDim.x * lp.blockDim.x,
lp.gridDim.y * lp.blockDim.y,
lp.gridDim.z * lp.blockDim.z,
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
@@ -416,6 +423,14 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
return hipErrorInvalidConfiguration;
}
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
// Prepare the kernel descriptor for initializing the GWS
hipFunction_t gwsKD = ps.kernel_descriptor(
reinterpret_cast<std::uintptr_t>(&init_gws),
@@ -475,9 +490,9 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
// launch the main kernel
result = ihipModuleLaunchKernel(tls, kd,
gridDim.x,
gridDim.y,
gridDim.z,
gridDim.x * blockDimX.x,
gridDim.y * blockDimX.y,
gridDim.z * blockDimX.z,
blockDimX.x, blockDimX.y, blockDimX.z,
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, true, impCoopParams);
@@ -612,6 +627,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
void* impCoopParams[1];
ulong prev_sum = 0;
size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
// launch the main kernels for each device
for (int i = 0; i < numDevices; ++i) {
const hipLaunchParams& lp = launchParamsList[i];
@@ -628,10 +645,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
impCoopParams[0] = &mg_info_ptr[i];
globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
{
return hipErrorInvalidConfiguration;
}
result = ihipModuleLaunchKernel(tls, kds[i],
lp.gridDim.x,
lp.gridDim.y,
lp.gridDim.z,
lp.gridDim.x * lp.blockDim.x,
lp.gridDim.y * lp.blockDim.y,
lp.gridDim.z * lp.blockDim.z,
lp.blockDim.x, lp.blockDim.y,
lp.blockDim.z, lp.sharedMem,
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,