diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 16606e8016..44f0f108a6 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -134,10 +134,10 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device); return ihipLogStatus(hipStatus); \ } -hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSizeX, - uint32_t gridSizeY, uint32_t gridSizeZ, - uint32_t blockSizeX, uint32_t blockSizeY, - uint32_t blockSizeZ, size_t sharedMemBytes, +hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t localWorkSizeX, uint32_t localWorkSizeY, + uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, void** impCoopParams = 0) { @@ -146,14 +146,6 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi auto ctx = ihipGetTlsDefaultCtx(); hipError_t ret = hipSuccess; - size_t globalWorkSizeX = (size_t)gridSizeX * (size_t)blockSizeX; - size_t globalWorkSizeY = (size_t)gridSizeY * (size_t)blockSizeY; - size_t globalWorkSizeZ = (size_t)gridSizeZ * (size_t)blockSizeZ; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) - { - return hipErrorInvalidConfiguration; - } - if (ctx == nullptr) { ret = hipErrorInvalidDevice; @@ -211,8 +203,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi lp.dynamic_group_mem_bytes = sharedMemBytes; // TODO - this should be part of preLaunchKernel. hStream = ihipPreLaunchKernel( - hStream, dim3(globalWorkSizeX/blockSizeX, globalWorkSizeY/blockSizeY, globalWorkSizeZ/blockSizeZ), - dim3(blockSizeX, blockSizeY, blockSizeZ), &lp, f->_name.c_str(), isStreamLocked); + hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ), + dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked); hsa_kernel_dispatch_packet_t aql; @@ -221,9 +213,9 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi // aql.completion_signal._handle = 0; // aql.kernarg_address = 0; - aql.workgroup_size_x = blockSizeX; - aql.workgroup_size_y = blockSizeY; - aql.workgroup_size_z = blockSizeZ; + aql.workgroup_size_x = localWorkSizeX; + aql.workgroup_size_y = localWorkSizeY; + aql.workgroup_size_z = localWorkSizeZ; aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; @@ -283,8 +275,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr void** kernelParams, void** extra) { HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); + + size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX; + size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY; + size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ; + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0)); } @@ -297,11 +298,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); - if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0) - return hipErrorInvalidValue; - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY, + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags)); } @@ -314,11 +312,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); - if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0) - return hipErrorInvalidValue; - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY, + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0)); } @@ -364,14 +359,26 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList } GET_TLS(); + + size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0; + // launch kernels for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; + globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; + globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; + globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; + + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + result = ihipModuleLaunchKernel(tls, kds[i], - lp.gridDim.x, - lp.gridDim.y, - lp.gridDim.z, + lp.gridDim.x * lp.blockDim.x, + lp.gridDim.y * lp.blockDim.y, + lp.gridDim.z * lp.blockDim.z, lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, @@ -416,6 +423,14 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, return hipErrorInvalidConfiguration; } + size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x; + size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y; + size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z; + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + // Prepare the kernel descriptor for initializing the GWS hipFunction_t gwsKD = ps.kernel_descriptor( reinterpret_cast(&init_gws), @@ -475,9 +490,9 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, // launch the main kernel result = ihipModuleLaunchKernel(tls, kd, - gridDim.x, - gridDim.y, - gridDim.z, + gridDim.x * blockDimX.x, + gridDim.y * blockDimX.y, + gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, nullptr, 0, true, impCoopParams); @@ -612,6 +627,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL void* impCoopParams[1]; ulong prev_sum = 0; + + size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0; // launch the main kernels for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; @@ -628,10 +645,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL impCoopParams[0] = &mg_info_ptr[i]; + globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; + globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; + globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + result = ihipModuleLaunchKernel(tls, kds[i], - lp.gridDim.x, - lp.gridDim.y, - lp.gridDim.z, + lp.gridDim.x * lp.blockDim.x, + lp.gridDim.y * lp.blockDim.y, + lp.gridDim.z * lp.blockDim.z, lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0,