Fixes SWDEV-222161
이 커밋은 다음에 포함됨:
+60
-35
@@ -134,10 +134,10 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device);
|
||||
return ihipLogStatus(hipStatus); \
|
||||
}
|
||||
|
||||
hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSizeX,
|
||||
uint32_t gridSizeY, uint32_t gridSizeZ,
|
||||
uint32_t blockSizeX, uint32_t blockSizeY,
|
||||
uint32_t blockSizeZ, size_t sharedMemBytes,
|
||||
hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0,
|
||||
void** impCoopParams = 0) {
|
||||
@@ -146,14 +146,6 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
hipError_t ret = hipSuccess;
|
||||
|
||||
size_t globalWorkSizeX = (size_t)gridSizeX * (size_t)blockSizeX;
|
||||
size_t globalWorkSizeY = (size_t)gridSizeY * (size_t)blockSizeY;
|
||||
size_t globalWorkSizeZ = (size_t)gridSizeZ * (size_t)blockSizeZ;
|
||||
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
|
||||
{
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
if (ctx == nullptr) {
|
||||
ret = hipErrorInvalidDevice;
|
||||
|
||||
@@ -211,8 +203,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
|
||||
lp.dynamic_group_mem_bytes =
|
||||
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
|
||||
hStream = ihipPreLaunchKernel(
|
||||
hStream, dim3(globalWorkSizeX/blockSizeX, globalWorkSizeY/blockSizeY, globalWorkSizeZ/blockSizeZ),
|
||||
dim3(blockSizeX, blockSizeY, blockSizeZ), &lp, f->_name.c_str(), isStreamLocked);
|
||||
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
|
||||
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked);
|
||||
|
||||
hsa_kernel_dispatch_packet_t aql;
|
||||
|
||||
@@ -221,9 +213,9 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi
|
||||
// aql.completion_signal._handle = 0;
|
||||
// aql.kernarg_address = 0;
|
||||
|
||||
aql.workgroup_size_x = blockSizeX;
|
||||
aql.workgroup_size_y = blockSizeY;
|
||||
aql.workgroup_size_z = blockSizeZ;
|
||||
aql.workgroup_size_x = localWorkSizeX;
|
||||
aql.workgroup_size_y = localWorkSizeY;
|
||||
aql.workgroup_size_z = localWorkSizeZ;
|
||||
aql.grid_size_x = globalWorkSizeX;
|
||||
aql.grid_size_y = globalWorkSizeY;
|
||||
aql.grid_size_z = globalWorkSizeZ;
|
||||
@@ -283,8 +275,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
|
||||
void** kernelParams, void** extra) {
|
||||
HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes,
|
||||
hStream, kernelParams, extra);
|
||||
|
||||
size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX;
|
||||
size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY;
|
||||
size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ;
|
||||
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
|
||||
{
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
return ihipLogStatus(ihipModuleLaunchKernel(tls,
|
||||
f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY,
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY,
|
||||
blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0));
|
||||
}
|
||||
|
||||
@@ -297,11 +298,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
|
||||
localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
|
||||
|
||||
if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
|
||||
return hipErrorInvalidValue;
|
||||
|
||||
return ihipLogStatus(ihipModuleLaunchKernel(tls,
|
||||
f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags));
|
||||
}
|
||||
|
||||
@@ -314,11 +312,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
|
||||
localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra);
|
||||
|
||||
if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0)
|
||||
return hipErrorInvalidValue;
|
||||
|
||||
return ihipLogStatus(ihipModuleLaunchKernel(tls,
|
||||
f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0));
|
||||
}
|
||||
|
||||
@@ -364,14 +359,26 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList
|
||||
}
|
||||
|
||||
GET_TLS();
|
||||
|
||||
size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
|
||||
|
||||
// launch kernels for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
|
||||
globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
|
||||
globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
|
||||
globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
|
||||
|
||||
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
|
||||
{
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
result = ihipModuleLaunchKernel(tls, kds[i],
|
||||
lp.gridDim.x,
|
||||
lp.gridDim.y,
|
||||
lp.gridDim.z,
|
||||
lp.gridDim.x * lp.blockDim.x,
|
||||
lp.gridDim.y * lp.blockDim.y,
|
||||
lp.gridDim.z * lp.blockDim.z,
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
@@ -416,6 +423,14 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x;
|
||||
size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y;
|
||||
size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z;
|
||||
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
|
||||
{
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
// Prepare the kernel descriptor for initializing the GWS
|
||||
hipFunction_t gwsKD = ps.kernel_descriptor(
|
||||
reinterpret_cast<std::uintptr_t>(&init_gws),
|
||||
@@ -475,9 +490,9 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
|
||||
// launch the main kernel
|
||||
result = ihipModuleLaunchKernel(tls, kd,
|
||||
gridDim.x,
|
||||
gridDim.y,
|
||||
gridDim.z,
|
||||
gridDim.x * blockDimX.x,
|
||||
gridDim.y * blockDimX.y,
|
||||
gridDim.z * blockDimX.z,
|
||||
blockDimX.x, blockDimX.y, blockDimX.z,
|
||||
sharedMemBytes, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, true, impCoopParams);
|
||||
@@ -612,6 +627,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
|
||||
void* impCoopParams[1];
|
||||
ulong prev_sum = 0;
|
||||
|
||||
size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0;
|
||||
// launch the main kernels for each device
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
const hipLaunchParams& lp = launchParamsList[i];
|
||||
@@ -628,10 +645,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
|
||||
impCoopParams[0] = &mg_info_ptr[i];
|
||||
|
||||
globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x;
|
||||
globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y;
|
||||
globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z;
|
||||
if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX)
|
||||
{
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
result = ihipModuleLaunchKernel(tls, kds[i],
|
||||
lp.gridDim.x,
|
||||
lp.gridDim.y,
|
||||
lp.gridDim.z,
|
||||
lp.gridDim.x * lp.blockDim.x,
|
||||
lp.gridDim.y * lp.blockDim.y,
|
||||
lp.gridDim.z * lp.blockDim.z,
|
||||
lp.blockDim.x, lp.blockDim.y,
|
||||
lp.blockDim.z, lp.sharedMem,
|
||||
lp.stream, lp.args, nullptr, nullptr, nullptr, 0,
|
||||
|
||||
새 이슈에서 참조
사용자 차단