diff --git a/projects/clr/hipamd/src/hip_graph_helper.hpp b/projects/clr/hipamd/src/hip_graph_helper.hpp index 3abbbd0f26..5904aab375 100644 --- a/projects/clr/hipamd/src/hip_graph_helper.hpp +++ b/projects/clr/hipamd/src/hip_graph_helper.hpp @@ -41,11 +41,9 @@ void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& s bool IsHtoHMemcpy(void* dst, const void* src); -hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, - uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, - uint32_t sharedMemBytes, void** kernelParams, void** extra, - int deviceId, uint32_t params); +hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params, + void** kernelParams, void** extra, int deviceId, + uint32_t params); hipError_t ihipMemset_validate(void* dst, int64_t value, size_t valueSize, size_t sizeBytes); @@ -53,9 +51,7 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte size_t sizeBytes); hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, - uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, - uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, - uint32_t blockDimZ, uint32_t sharedMemBytes, hip::Stream* stream, + amd::LaunchParams& launch_params, hip::Stream* stream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, uint32_t params, uint32_t gridId, uint32_t numGrids, uint64_t prevGridSum, diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 59e2e0c8a0..56084cc17f 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -1176,13 +1176,19 @@ class GraphKernelNode : public GraphNode { } } } + + amd::HIPLaunchParams launch_params(kernelParams_.gridDim.x, kernelParams_.gridDim.y, + kernelParams_.gridDim.z, kernelParams_.blockDim.x, + kernelParams_.blockDim.y, kernelParams_.blockDim.z, + kernelParams_.sharedMemBytes); + + if (!launch_params.IsValidConfig()) { + return hipErrorInvalidConfiguration; + } + status = ihipLaunchKernelCommand( - command, func, kernelParams_.gridDim.x * kernelParams_.blockDim.x, - kernelParams_.gridDim.y * kernelParams_.blockDim.y, - kernelParams_.gridDim.z * kernelParams_.blockDim.z, kernelParams_.blockDim.x, - kernelParams_.blockDim.y, kernelParams_.blockDim.z, kernelParams_.sharedMemBytes, stream, - kernelParams_.kernelParams, kernelParams_.extra, kernelEvents_.startEvent_, - kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0); + command, func, launch_params, stream, kernelParams_.kernelParams, kernelParams_.extra, + kernelEvents_.startEvent_, kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0); if (signal_is_required_) { // Optimize the barriers by adding a signal into the dispatch packet directly command->SetProfiling(); @@ -1315,15 +1321,18 @@ class GraphKernelNode : public GraphNode { static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams, hipFunction_t func, int devId) { - size_t globalWorkSizeX = static_cast(pNodeParams->gridDim.x) * pNodeParams->blockDim.x; - size_t globalWorkSizeY = static_cast(pNodeParams->gridDim.y) * pNodeParams->blockDim.y; - size_t globalWorkSizeZ = static_cast(pNodeParams->gridDim.z) * pNodeParams->blockDim.z; - hipError_t status = ihipLaunchKernel_validate( - func, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), pNodeParams->blockDim.x, pNodeParams->blockDim.y, - pNodeParams->blockDim.z, pNodeParams->sharedMemBytes, pNodeParams->kernelParams, - pNodeParams->extra, devId, 0); + amd::HIPLaunchParams launch_params(pNodeParams->gridDim.x, pNodeParams->gridDim.y, + pNodeParams->gridDim.z, pNodeParams->blockDim.x, + pNodeParams->blockDim.y, pNodeParams->blockDim.z, + pNodeParams->sharedMemBytes); + + if (!launch_params.IsValidConfig()) { + HIP_RETURN(hipErrorInvalidConfiguration); + } + + hipError_t status = ihipLaunchKernel_validate(func, launch_params, pNodeParams->kernelParams, + pNodeParams->extra, devId, 0); if (status != hipSuccess) { return status; } diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index c78f23c7db..b747ac14d8 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -23,6 +23,7 @@ #include #include "hip_internal.hpp" +#include "platform/ndrange.hpp" #include "platform/program.hpp" #include "hip_event.hpp" #include "hip_platform.hpp" @@ -255,11 +256,9 @@ hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config HIP_RETURN(hipSuccess); } -hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, - uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, - uint32_t sharedMemBytes, void** kernelParams, void** extra, - int deviceId, uint32_t params = 0) { +hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params, + void** kernelParams, void** extra, int deviceId, + uint32_t params = 0) { if (f == nullptr) { LogPrintfError("%s", "Function passed is null"); return hipErrorInvalidImage; @@ -270,21 +269,23 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, return hipErrorInvalidValue; } - if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) { + if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 + || launch_params.global_[2] == 0) { return hipErrorInvalidValue; } - if (blockDimX == 0 || blockDimY == 0 || blockDimZ == 0) { + if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 + || launch_params.local_[2] == 0) { return hipErrorInvalidConfiguration; } const amd::Device* device = g_devices[deviceId]->devices()[0]; const auto& info = device->info(); - if (sharedMemBytes > info.localMemSizePerCU_) { //sharedMemPerBlock + if (launch_params.sharedMemBytes_ > info.localMemSizePerCU_) { //sharedMemPerBlock return hipErrorInvalidValue; } // Make sure dispatch doesn't exceed max workgroup size limit - if (blockDimX * blockDimY * blockDimZ > info.maxWorkGroupSize_) { + if (launch_params.local_.product() > info.maxWorkGroupSize_) { return (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_LAUNCH_KERNEL) ? hipErrorInvalidConfiguration : hipErrorInvalidValue; @@ -301,10 +302,9 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, } // Make sure the launch params are not larger than if specified launch_bounds // If it exceeds, then return a failure - if (blockDimX * blockDimY * blockDimZ > - kernel->getDeviceKernel(*device)->workGroupInfo()->size_) { + if (launch_params.local_.product() > kernel->getDeviceKernel(*device)->workGroupInfo()->size_) { LogPrintfError("Launch params (%u, %u, %u) are larger than launch bounds (%lu) for kernel %s", - blockDimX, blockDimY, blockDimZ, + launch_params.local_[0], launch_params.local_[1], launch_params.local_[2], kernel->getDeviceKernel(*device)->workGroupInfo()->size_, function->name().c_str()); return hipErrorLaunchFailure; @@ -317,15 +317,14 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, int num_blocks = 0; int max_blocks_per_grid = 0; int best_block_size = 0; - int block_size = blockDimX * blockDimY * blockDimZ; + int block_size = launch_params.local_.product(); hipError_t err = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( - &num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size, sharedMemBytes, - true); + &num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size, + launch_params.sharedMemBytes_, true); if (err != hipSuccess) { return err; } - if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) > - unsigned(max_blocks_per_grid)) { + if (((launch_params.global_.product()) / block_size) > unsigned(max_blocks_per_grid)) { return hipErrorCooperativeLaunchTooLarge; } } @@ -338,10 +337,8 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX, } hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, - uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, - uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, - uint32_t blockDimZ, uint32_t sharedMemBytes, - hip::Stream* stream, void** kernelParams, void** extra, + amd::LaunchParams& launch_params, hip::Stream* stream, + void** kernelParams, void** extra, hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, @@ -350,9 +347,8 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, amd::Kernel* kernel = function->kernel(); size_t globalWorkOffset[3] = {0}; - size_t globalWorkSize[3] = {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ}; - size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ}; - amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize); + amd::NDRangeContainer ndrange(3, globalWorkOffset, launch_params.global_.Data(), + launch_params.local_.Data()); amd::Command::EventWaitList waitList; bool profileNDRange = (startEvent != nullptr || stopEvent != nullptr); @@ -362,9 +358,9 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, params |= amd::NDRangeKernelCommand::AnyOrderLaunch; } - amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand( - *stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, - allGridSum, firstDevice, profileNDRange); + amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(*stream, waitList, + *kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids, prevGridSum, + allGridSum, firstDevice, profileNDRange); if (!kernelCommand) { return hipErrorOutOfMemory; } @@ -414,14 +410,12 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, return hipSuccess; } -hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, - uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, - uint32_t sharedMemBytes, hipStream_t hStream, void** kernelParams, - void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, - uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, - uint32_t numGrids = 0, uint64_t prevGridSum = 0, - uint64_t allGridSum = 0, uint32_t firstDevice = 0) { +hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_params, + hipStream_t hStream, void** kernelParams, void** extra, + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, + uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, + uint64_t prevGridSum = 0, uint64_t allGridSum = 0, + uint32_t firstDevice = 0) { int deviceId = hip::Stream::DeviceId(hStream); HIP_RETURN_ONFAIL(PlatformState::instance().initStatManagedVarDevicePtr(deviceId)); @@ -433,32 +427,37 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, amd::Kernel* kernel = function->kernel(); amd::ScopedLock lock (DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_); - hipError_t status = ihipLaunchKernel_validate( - f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, - sharedMemBytes, kernelParams, extra, deviceId, params); + hipError_t status = ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId, + params); if (status != hipSuccess) { return status; } + // Make sure the app doesn't launch a workgroup bigger than the global size - if (globalWorkSizeX < blockDimX) blockDimX = globalWorkSizeX; - if (globalWorkSizeY < blockDimY) blockDimY = globalWorkSizeY; - if (globalWorkSizeZ < blockDimZ) blockDimZ = globalWorkSizeZ; + if (launch_params.global_[0] < launch_params.local_[0]) { + launch_params.local_[0] = launch_params.global_[0]; + } + if (launch_params.global_[1] < launch_params.local_[1]) { + launch_params.local_[1] = launch_params.global_[1]; + } + if (launch_params.global_[2] < launch_params.local_[2]) { + launch_params.local_[2] = launch_params.global_[2]; + } auto device = g_devices[deviceId]->devices()[0]; // Check if it's a uniform kernel and validate dimensions if (kernel->getDeviceKernel(*device)->getUniformWorkGroupSize()) { - if (((globalWorkSizeX % blockDimX) != 0) || - ((globalWorkSizeY % blockDimY) != 0) || - ((globalWorkSizeZ % blockDimZ) != 0)) { + if (((launch_params.global_[0] % launch_params.local_[0]) != 0) || + ((launch_params.global_[1] % launch_params.local_[1]) != 0) || + ((launch_params.global_[2] % launch_params.local_[2]) != 0)) { return hipErrorInvalidValue; } } amd::Command* command = nullptr; hip::Stream* hip_stream = hip::getStream(hStream); - status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, - blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream, - kernelParams, extra, startEvent, stopEvent, flags, params, - gridId, numGrids, prevGridSum, allGridSum, firstDevice); + status = ihipLaunchKernelCommand(command, f, launch_params, hip_stream, kernelParams, extra, + startEvent, stopEvent, flags, params, gridId, numGrids, + prevGridSum, allGridSum, firstDevice); if (status != hipSuccess) { return status; } @@ -514,16 +513,15 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr if (gridDimX > int32_max || gridDimY > uint16_max || gridDimZ > uint16_max) { HIP_RETURN(hipErrorInvalidValue); } - size_t globalWorkSizeX = static_cast(gridDimX) * blockDimX; - size_t globalWorkSizeY = static_cast(gridDimY) * blockDimY; - size_t globalWorkSizeZ = static_cast(gridDimZ) * blockDimZ; - if (globalWorkSizeX > std::numeric_limits::max()) { + + amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes); + if (!launch_params.IsValidConfig()) { HIP_RETURN(hipErrorInvalidConfiguration); } - HIP_RETURN(ihipModuleLaunchKernel( - f, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), blockDimX, blockDimY, blockDimZ, sharedMemBytes, - hStream, kernelParams, extra, nullptr, nullptr)); + + HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr, + nullptr)); } hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, @@ -544,9 +542,12 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, kernelParams, extra, startEvent, stopEvent, flags); - HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, - localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, - hStream, kernelParams, extra, startEvent, stopEvent, flags)); + amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, + localWorkSizeX, localWorkSizeY, localWorkSizeZ, + sharedMemBytes); + + HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent, + stopEvent, flags)); } @@ -559,9 +560,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); - HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, - blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, - extra, startEvent, stopEvent)); + amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, + blockDimY, blockDimZ, sharedMemBytes); + + HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent, + stopEvent)); } hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDimX, @@ -579,19 +582,15 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi STREAM_CAPTURE(hipModuleLaunchCooperativeKernel, stream, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, kernelParams); - size_t globalWorkSizeX = static_cast(gridDimX) * blockDimX; - size_t globalWorkSizeY = static_cast(gridDimY) * blockDimY; - size_t globalWorkSizeZ = static_cast(gridDimZ) * blockDimZ; - if (globalWorkSizeX > std::numeric_limits::max() || - globalWorkSizeY > std::numeric_limits::max() || - globalWorkSizeZ > std::numeric_limits::max()) { + amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes); + + if (!launch_params.IsValidConfig()) { HIP_RETURN(hipErrorInvalidConfiguration); } - HIP_RETURN(ihipModuleLaunchKernel(f, static_cast(globalWorkSizeX), - static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), blockDimX, blockDimY, - blockDimZ, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups)); + + HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, stream, kernelParams, nullptr, nullptr, + nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups)); } hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList, @@ -682,25 +681,22 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* } } - size_t globalWorkSizeX = static_cast(launch.gridDimX) * launch.blockDimX; - size_t globalWorkSizeY = static_cast(launch.gridDimY) * launch.blockDimY; - size_t globalWorkSizeZ = static_cast(launch.gridDimZ) * launch.blockDimZ; - if (globalWorkSizeX > std::numeric_limits::max() || - globalWorkSizeY > std::numeric_limits::max() || - globalWorkSizeZ > std::numeric_limits::max()) { + amd::HIPLaunchParams launch_params(launch.gridDimX, launch.gridDimY, launch.gridDimZ, + launch.blockDimX, launch.blockDimY, launch.blockDimZ, + launch.sharedMemBytes); + + if (!launch_params.IsValidConfig()) { return hipErrorInvalidConfiguration; } + result = ihipModuleLaunchKernel( - launch.function, static_cast(globalWorkSizeX), - static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), launch.blockDimX, launch.blockDimY, - launch.blockDimZ, launch.sharedMemBytes, launch.hStream, launch.kernelParams, + launch.function, launch_params, launch.hStream, launch.kernelParams, nullptr, nullptr, nullptr, flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice); if (result != hipSuccess) { break; } - prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ; + prevGridSize += launch_params.global_.product(); } // Sync the execution streams on all devices @@ -810,13 +806,12 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b return getStatFuncError; } const amd::Device* device = g_devices[deviceId]->devices()[0]; - size_t globalWorkSizeX = static_cast(gridDim.x) * blockDim.x; - size_t globalWorkSizeY = static_cast(gridDim.y) * blockDim.y; - size_t globalWorkSizeZ = static_cast(gridDim.z) * blockDim.z; - if (globalWorkSizeX > std::numeric_limits::max() || - globalWorkSizeY > std::numeric_limits::max() || - globalWorkSizeZ > std::numeric_limits::max() || - (blockDim.x * blockDim.y * blockDim.z > device->info().maxWorkGroupSize_)) { + + amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, + blockDim.z, sharedMemBytes); + + if (!launch_params.IsValidConfig() || + launch_params.local_.product() > device->info().maxWorkGroupSize_) { return hipErrorInvalidConfiguration; } @@ -824,15 +819,14 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b return hipErrorCooperativeLaunchTooLarge; } - if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) { + //if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) { + if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 + || launch_params.global_[2] == 0) { return hipErrorInvalidConfiguration; } - return ihipModuleLaunchKernel(func, static_cast(globalWorkSizeX), - static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), blockDim.x, blockDim.y, - blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr, - nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups); + return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr, + nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups); } hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim, @@ -1132,35 +1126,31 @@ hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f HIP_RETURN(hipErrorInvalidValue); } - size_t globalWorkSizeX = static_cast(config->gridDimX) * config->blockDimX; - size_t globalWorkSizeY = static_cast(config->gridDimY) * config->blockDimY; - size_t globalWorkSizeZ = static_cast(config->gridDimZ) * config->blockDimZ; - if (globalWorkSizeX > std::numeric_limits::max() || - globalWorkSizeY > std::numeric_limits::max() || - globalWorkSizeZ > std::numeric_limits::max()) { + amd::HIPLaunchParams launch_params(config->gridDimX, config->gridDimY, config->gridDimZ, + config->blockDimX, config->blockDimY, config->blockDimZ, + config->sharedMemBytes); + + if (!launch_params.IsValidConfig()) { HIP_RETURN(hipErrorInvalidConfiguration); } if (config->numAttrs == 0) { - HIP_RETURN(ihipModuleLaunchKernel( - f, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), config->blockDimX, config->blockDimY, - config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr, - nullptr, nullptr, 0)); + HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams, nullptr, + nullptr, nullptr, 0)); } for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) { hipLaunchAttribute& attr = config->attrs[attr_idx]; switch (attr.id) { case hipLaunchAttributeCooperative: + { if (attr.value.cooperative != 0) { - HIP_RETURN(ihipModuleLaunchKernel( - f, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), config->blockDimX, config->blockDimY, - config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr, - nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups)); + HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams, + nullptr, nullptr, nullptr, 0, + amd::NDRangeKernelCommand::CooperativeGroups)); } break; + } default: LogPrintfError("Attribute %u not supported", attr.id); break; diff --git a/projects/clr/hipamd/src/hip_platform.cpp b/projects/clr/hipamd/src/hip_platform.cpp index 137702afb2..ea095adf81 100644 --- a/projects/clr/hipamd/src/hip_platform.cpp +++ b/projects/clr/hipamd/src/hip_platform.cpp @@ -50,11 +50,10 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor hipDeviceptr_t* dptr, size_t* bytes); extern hipError_t ihipModuleLaunchKernel( - hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX, - uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream, - void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, - uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, - uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0); + hipFunction_t f, amd::LaunchParams& launch_params, hipStream_t hStream, void** kernelParams, + void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, + uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, + uint64_t allGridSum = 0, uint32_t firstDevice = 0); static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) { // Workaround for device name mismatch. // Device name may contain feature strings delimited by '+', e.g. @@ -641,18 +640,14 @@ hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDi return hipErrorInvalidConfiguration; } - size_t globalWorkSizeX = static_cast(gridDim.x) * blockDim.x; - size_t globalWorkSizeY = static_cast(gridDim.y) * blockDim.y; - size_t globalWorkSizeZ = static_cast(gridDim.z) * blockDim.z; - if (globalWorkSizeX > std::numeric_limits::max() || - globalWorkSizeY > std::numeric_limits::max() || - globalWorkSizeZ > std::numeric_limits::max()) { + amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, + blockDim.z, sharedMemBytes); + if (!launch_params.IsValidConfig()) { return hipErrorInvalidConfiguration; } - return ihipModuleLaunchKernel( - func, static_cast(globalWorkSizeX), static_cast(globalWorkSizeY), - static_cast(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes, - stream, args, nullptr, startEvent, stopEvent, flags); + + return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr, + startEvent, stopEvent, flags); } // conversion routines between float and half precision diff --git a/projects/clr/rocclr/platform/ndrange.hpp b/projects/clr/rocclr/platform/ndrange.hpp index cb6f926ce1..b4ff928464 100644 --- a/projects/clr/rocclr/platform/ndrange.hpp +++ b/projects/clr/rocclr/platform/ndrange.hpp @@ -56,6 +56,10 @@ class NDRange : public EmbeddedObject { //! Construct a new index space of the given dimensions. explicit NDRange(size_t dimensions); + NDRange(size_t dataX, size_t dataY, size_t dataZ) : dimensions_(3) { + data_[0] = dataX; data_[1] = dataY; data_[2] = dataZ; + } + //! Copy constructor. NDRange(const NDRange& space); @@ -112,6 +116,40 @@ class NDRange : public EmbeddedObject { //! Print this index space on the given stream. void printOn(FILE* file) const; #endif // DEBUG + + const size_t* Data() const { return data_; } +}; + +//! Stucture to store launch parameters. +struct LaunchParams { + NDRange global_; //!< Total number of work-items in N-dims + NDRange local_; //!< Number of work-items in N-dims in a workgroup. + uint32_t sharedMemBytes_; //!< Shared Memory bytes + bool validConfig_; //!< Flag will be set to false when config is not correct. + + LaunchParams(uint32_t globalX, uint32_t globalY, uint32_t globalZ, uint32_t localX, + uint32_t localY, uint32_t localZ, uint32_t sharedMemBytes) + : global_(globalX, globalY, globalZ), local_(localX, localY, localZ), + sharedMemBytes_ (sharedMemBytes), validConfig_(true) {} + + bool IsValidConfig() const { return validConfig_; } +}; + +//! Structure to store launch parameters in HIP Style (global and local size needs computation). +struct HIPLaunchParams : public LaunchParams { + public: + HIPLaunchParams(uint32_t gridX, uint32_t gridY, uint32_t gridZ, uint32_t blockX, + uint32_t blockY, uint32_t blockZ, uint32_t sharedMemBytes) + : LaunchParams(static_cast(gridX) * blockX, + static_cast(gridY) * blockY, + static_cast(gridZ) * blockZ, + blockX, blockY, blockZ, sharedMemBytes) { + if (global_[0] > std::numeric_limits::max() || + global_[1] > std::numeric_limits::max() || + global_[2] > std::numeric_limits::max()) { + validConfig_ = false; + } + } }; //! A container for the local and global worksizes.