SWDEV-493805 - Cleaning up launch parameters arguments. (#241)

[ROCm/clr commit: fa55557f46]
Этот коммит содержится в:
Jayaprakash, Karthik
2025-05-06 15:06:13 -04:00
коммит произвёл GitHub
родитель 086a1c289a
Коммит cde2a250ec
5 изменённых файлов: 178 добавлений и 150 удалений
+4 -8
Просмотреть файл
@@ -41,11 +41,9 @@ void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& s
bool IsHtoHMemcpy(void* dst, const void* src);
hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, void** kernelParams, void** extra,
int deviceId, uint32_t params);
hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params,
void** kernelParams, void** extra, int deviceId,
uint32_t params);
hipError_t ihipMemset_validate(void* dst, int64_t value, size_t valueSize, size_t sizeBytes);
@@ -53,9 +51,7 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte
size_t sizeBytes);
hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
uint32_t blockDimZ, uint32_t sharedMemBytes, hip::Stream* stream,
amd::LaunchParams& launch_params, hip::Stream* stream,
void** kernelParams, void** extra, hipEvent_t startEvent,
hipEvent_t stopEvent, uint32_t flags, uint32_t params,
uint32_t gridId, uint32_t numGrids, uint64_t prevGridSum,
+23 -14
Просмотреть файл
@@ -1176,13 +1176,19 @@ class GraphKernelNode : public GraphNode {
}
}
}
amd::HIPLaunchParams launch_params(kernelParams_.gridDim.x, kernelParams_.gridDim.y,
kernelParams_.gridDim.z, kernelParams_.blockDim.x,
kernelParams_.blockDim.y, kernelParams_.blockDim.z,
kernelParams_.sharedMemBytes);
if (!launch_params.IsValidConfig()) {
return hipErrorInvalidConfiguration;
}
status = ihipLaunchKernelCommand(
command, func, kernelParams_.gridDim.x * kernelParams_.blockDim.x,
kernelParams_.gridDim.y * kernelParams_.blockDim.y,
kernelParams_.gridDim.z * kernelParams_.blockDim.z, kernelParams_.blockDim.x,
kernelParams_.blockDim.y, kernelParams_.blockDim.z, kernelParams_.sharedMemBytes, stream,
kernelParams_.kernelParams, kernelParams_.extra, kernelEvents_.startEvent_,
kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0);
command, func, launch_params, stream, kernelParams_.kernelParams, kernelParams_.extra,
kernelEvents_.startEvent_, kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0);
if (signal_is_required_) {
// Optimize the barriers by adding a signal into the dispatch packet directly
command->SetProfiling();
@@ -1315,15 +1321,18 @@ class GraphKernelNode : public GraphNode {
static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
hipFunction_t func, int devId) {
size_t globalWorkSizeX = static_cast<size_t>(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
size_t globalWorkSizeY = static_cast<size_t>(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
size_t globalWorkSizeZ = static_cast<size_t>(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;
hipError_t status = ihipLaunchKernel_validate(
func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), pNodeParams->blockDim.x, pNodeParams->blockDim.y,
pNodeParams->blockDim.z, pNodeParams->sharedMemBytes, pNodeParams->kernelParams,
pNodeParams->extra, devId, 0);
amd::HIPLaunchParams launch_params(pNodeParams->gridDim.x, pNodeParams->gridDim.y,
pNodeParams->gridDim.z, pNodeParams->blockDim.x,
pNodeParams->blockDim.y, pNodeParams->blockDim.z,
pNodeParams->sharedMemBytes);
if (!launch_params.IsValidConfig()) {
HIP_RETURN(hipErrorInvalidConfiguration);
}
hipError_t status = ihipLaunchKernel_validate(func, launch_params, pNodeParams->kernelParams,
pNodeParams->extra, devId, 0);
if (status != hipSuccess) {
return status;
}
+103 -113
Просмотреть файл
@@ -23,6 +23,7 @@
#include <fstream>
#include "hip_internal.hpp"
#include "platform/ndrange.hpp"
#include "platform/program.hpp"
#include "hip_event.hpp"
#include "hip_platform.hpp"
@@ -255,11 +256,9 @@ hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config
HIP_RETURN(hipSuccess);
}
hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, void** kernelParams, void** extra,
int deviceId, uint32_t params = 0) {
hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params,
void** kernelParams, void** extra, int deviceId,
uint32_t params = 0) {
if (f == nullptr) {
LogPrintfError("%s", "Function passed is null");
return hipErrorInvalidImage;
@@ -270,21 +269,23 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
return hipErrorInvalidValue;
}
if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|| launch_params.global_[2] == 0) {
return hipErrorInvalidValue;
}
if (blockDimX == 0 || blockDimY == 0 || blockDimZ == 0) {
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
|| launch_params.local_[2] == 0) {
return hipErrorInvalidConfiguration;
}
const amd::Device* device = g_devices[deviceId]->devices()[0];
const auto& info = device->info();
if (sharedMemBytes > info.localMemSizePerCU_) { //sharedMemPerBlock
if (launch_params.sharedMemBytes_ > info.localMemSizePerCU_) { //sharedMemPerBlock
return hipErrorInvalidValue;
}
// Make sure dispatch doesn't exceed max workgroup size limit
if (blockDimX * blockDimY * blockDimZ > info.maxWorkGroupSize_) {
if (launch_params.local_.product() > info.maxWorkGroupSize_) {
return (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_LAUNCH_KERNEL) ?
hipErrorInvalidConfiguration :
hipErrorInvalidValue;
@@ -301,10 +302,9 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
}
// Make sure the launch params are not larger than if specified launch_bounds
// If it exceeds, then return a failure
if (blockDimX * blockDimY * blockDimZ >
kernel->getDeviceKernel(*device)->workGroupInfo()->size_) {
if (launch_params.local_.product() > kernel->getDeviceKernel(*device)->workGroupInfo()->size_) {
LogPrintfError("Launch params (%u, %u, %u) are larger than launch bounds (%lu) for kernel %s",
blockDimX, blockDimY, blockDimZ,
launch_params.local_[0], launch_params.local_[1], launch_params.local_[2],
kernel->getDeviceKernel(*device)->workGroupInfo()->size_,
function->name().c_str());
return hipErrorLaunchFailure;
@@ -317,15 +317,14 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
int num_blocks = 0;
int max_blocks_per_grid = 0;
int best_block_size = 0;
int block_size = blockDimX * blockDimY * blockDimZ;
int block_size = launch_params.local_.product();
hipError_t err = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size, sharedMemBytes,
true);
&num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size,
launch_params.sharedMemBytes_, true);
if (err != hipSuccess) {
return err;
}
if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) >
unsigned(max_blocks_per_grid)) {
if (((launch_params.global_.product()) / block_size) > unsigned(max_blocks_per_grid)) {
return hipErrorCooperativeLaunchTooLarge;
}
}
@@ -338,10 +337,8 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
}
hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
uint32_t blockDimZ, uint32_t sharedMemBytes,
hip::Stream* stream, void** kernelParams, void** extra,
amd::LaunchParams& launch_params, hip::Stream* stream,
void** kernelParams, void** extra,
hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr,
uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
uint32_t numGrids = 0, uint64_t prevGridSum = 0,
@@ -350,9 +347,8 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
amd::Kernel* kernel = function->kernel();
size_t globalWorkOffset[3] = {0};
size_t globalWorkSize[3] = {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ};
size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ};
amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
amd::NDRangeContainer ndrange(3, globalWorkOffset, launch_params.global_.Data(),
launch_params.local_.Data());
amd::Command::EventWaitList waitList;
bool profileNDRange = (startEvent != nullptr || stopEvent != nullptr);
@@ -362,9 +358,9 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
params |= amd::NDRangeKernelCommand::AnyOrderLaunch;
}
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(
*stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum,
allGridSum, firstDevice, profileNDRange);
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(*stream, waitList,
*kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids, prevGridSum,
allGridSum, firstDevice, profileNDRange);
if (!kernelCommand) {
return hipErrorOutOfMemory;
}
@@ -414,14 +410,12 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
return hipSuccess;
}
hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
uint32_t sharedMemBytes, hipStream_t hStream, void** kernelParams,
void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
uint32_t numGrids = 0, uint64_t prevGridSum = 0,
uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_params,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
uint64_t prevGridSum = 0, uint64_t allGridSum = 0,
uint32_t firstDevice = 0) {
int deviceId = hip::Stream::DeviceId(hStream);
HIP_RETURN_ONFAIL(PlatformState::instance().initStatManagedVarDevicePtr(deviceId));
@@ -433,32 +427,37 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
amd::Kernel* kernel = function->kernel();
amd::ScopedLock lock (DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_);
hipError_t status = ihipLaunchKernel_validate(
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
sharedMemBytes, kernelParams, extra, deviceId, params);
hipError_t status = ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId,
params);
if (status != hipSuccess) {
return status;
}
// Make sure the app doesn't launch a workgroup bigger than the global size
if (globalWorkSizeX < blockDimX) blockDimX = globalWorkSizeX;
if (globalWorkSizeY < blockDimY) blockDimY = globalWorkSizeY;
if (globalWorkSizeZ < blockDimZ) blockDimZ = globalWorkSizeZ;
if (launch_params.global_[0] < launch_params.local_[0]) {
launch_params.local_[0] = launch_params.global_[0];
}
if (launch_params.global_[1] < launch_params.local_[1]) {
launch_params.local_[1] = launch_params.global_[1];
}
if (launch_params.global_[2] < launch_params.local_[2]) {
launch_params.local_[2] = launch_params.global_[2];
}
auto device = g_devices[deviceId]->devices()[0];
// Check if it's a uniform kernel and validate dimensions
if (kernel->getDeviceKernel(*device)->getUniformWorkGroupSize()) {
if (((globalWorkSizeX % blockDimX) != 0) ||
((globalWorkSizeY % blockDimY) != 0) ||
((globalWorkSizeZ % blockDimZ) != 0)) {
if (((launch_params.global_[0] % launch_params.local_[0]) != 0) ||
((launch_params.global_[1] % launch_params.local_[1]) != 0) ||
((launch_params.global_[2] % launch_params.local_[2]) != 0)) {
return hipErrorInvalidValue;
}
}
amd::Command* command = nullptr;
hip::Stream* hip_stream = hip::getStream(hStream);
status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream,
kernelParams, extra, startEvent, stopEvent, flags, params,
gridId, numGrids, prevGridSum, allGridSum, firstDevice);
status = ihipLaunchKernelCommand(command, f, launch_params, hip_stream, kernelParams, extra,
startEvent, stopEvent, flags, params, gridId, numGrids,
prevGridSum, allGridSum, firstDevice);
if (status != hipSuccess) {
return status;
}
@@ -514,16 +513,15 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
if (gridDimX > int32_max || gridDimY > uint16_max || gridDimZ > uint16_max) {
HIP_RETURN(hipErrorInvalidValue);
}
size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max()) {
amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
sharedMemBytes);
if (!launch_params.IsValidConfig()) {
HIP_RETURN(hipErrorInvalidConfiguration);
}
HIP_RETURN(ihipModuleLaunchKernel(
f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), blockDimX, blockDimY, blockDimZ, sharedMemBytes,
hStream, kernelParams, extra, nullptr, nullptr));
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr,
nullptr));
}
hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
@@ -544,9 +542,12 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
kernelParams, extra, startEvent, stopEvent, flags);
HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
hStream, kernelParams, extra, startEvent, stopEvent, flags));
amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
sharedMemBytes);
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
stopEvent, flags));
}
@@ -559,9 +560,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra,
startEvent, stopEvent);
HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams,
extra, startEvent, stopEvent));
amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
blockDimY, blockDimZ, sharedMemBytes);
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
stopEvent));
}
hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDimX,
@@ -579,19 +582,15 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
STREAM_CAPTURE(hipModuleLaunchCooperativeKernel, stream, f, gridDimX, gridDimY, gridDimZ,
blockDimX, blockDimY, blockDimZ, sharedMemBytes, kernelParams);
size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
sharedMemBytes);
if (!launch_params.IsValidConfig()) {
HIP_RETURN(hipErrorInvalidConfiguration);
}
HIP_RETURN(ihipModuleLaunchKernel(f, static_cast<uint32_t>(globalWorkSizeX),
static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), blockDimX, blockDimY,
blockDimZ, sharedMemBytes, stream, kernelParams, nullptr, nullptr,
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, stream, kernelParams, nullptr, nullptr,
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
}
hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
@@ -682,25 +681,22 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
}
}
size_t globalWorkSizeX = static_cast<size_t>(launch.gridDimX) * launch.blockDimX;
size_t globalWorkSizeY = static_cast<size_t>(launch.gridDimY) * launch.blockDimY;
size_t globalWorkSizeZ = static_cast<size_t>(launch.gridDimZ) * launch.blockDimZ;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
amd::HIPLaunchParams launch_params(launch.gridDimX, launch.gridDimY, launch.gridDimZ,
launch.blockDimX, launch.blockDimY, launch.blockDimZ,
launch.sharedMemBytes);
if (!launch_params.IsValidConfig()) {
return hipErrorInvalidConfiguration;
}
result = ihipModuleLaunchKernel(
launch.function, static_cast<uint32_t>(globalWorkSizeX),
static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), launch.blockDimX, launch.blockDimY,
launch.blockDimZ, launch.sharedMemBytes, launch.hStream, launch.kernelParams,
launch.function, launch_params, launch.hStream, launch.kernelParams,
nullptr, nullptr, nullptr, flags, extFlags,
i, numDevices, prevGridSize, allGridSize, firstDevice);
if (result != hipSuccess) {
break;
}
prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
prevGridSize += launch_params.global_.product();
}
// Sync the execution streams on all devices
@@ -810,13 +806,12 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
return getStatFuncError;
}
const amd::Device* device = g_devices[deviceId]->devices()[0];
size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max() ||
(blockDim.x * blockDim.y * blockDim.z > device->info().maxWorkGroupSize_)) {
amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
blockDim.z, sharedMemBytes);
if (!launch_params.IsValidConfig() ||
launch_params.local_.product() > device->info().maxWorkGroupSize_) {
return hipErrorInvalidConfiguration;
}
@@ -824,15 +819,14 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
return hipErrorCooperativeLaunchTooLarge;
}
if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
//if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|| launch_params.global_[2] == 0) {
return hipErrorInvalidConfiguration;
}
return ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y,
blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr,
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr,
nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
}
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
@@ -1132,35 +1126,31 @@ hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f
HIP_RETURN(hipErrorInvalidValue);
}
size_t globalWorkSizeX = static_cast<size_t>(config->gridDimX) * config->blockDimX;
size_t globalWorkSizeY = static_cast<size_t>(config->gridDimY) * config->blockDimY;
size_t globalWorkSizeZ = static_cast<size_t>(config->gridDimZ) * config->blockDimZ;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
amd::HIPLaunchParams launch_params(config->gridDimX, config->gridDimY, config->gridDimZ,
config->blockDimX, config->blockDimY, config->blockDimZ,
config->sharedMemBytes);
if (!launch_params.IsValidConfig()) {
HIP_RETURN(hipErrorInvalidConfiguration);
}
if (config->numAttrs == 0) {
HIP_RETURN(ihipModuleLaunchKernel(
f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), config->blockDimX, config->blockDimY,
config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr,
nullptr, nullptr, 0));
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams, nullptr,
nullptr, nullptr, 0));
}
for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) {
hipLaunchAttribute& attr = config->attrs[attr_idx];
switch (attr.id) {
case hipLaunchAttributeCooperative:
{
if (attr.value.cooperative != 0) {
HIP_RETURN(ihipModuleLaunchKernel(
f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), config->blockDimX, config->blockDimY,
config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr,
nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams,
nullptr, nullptr, nullptr, 0,
amd::NDRangeKernelCommand::CooperativeGroups));
}
break;
}
default:
LogPrintfError("Attribute %u not supported", attr.id);
break;
+10 -15
Просмотреть файл
@@ -50,11 +50,10 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
hipDeviceptr_t* dptr, size_t* bytes);
extern hipError_t ihipModuleLaunchKernel(
hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX,
uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream,
void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
hipFunction_t f, amd::LaunchParams& launch_params, hipStream_t hStream, void** kernelParams,
void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0,
uint64_t allGridSum = 0, uint32_t firstDevice = 0);
static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) {
// Workaround for device name mismatch.
// Device name may contain feature strings delimited by '+', e.g.
@@ -641,18 +640,14 @@ hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDi
return hipErrorInvalidConfiguration;
}
size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
blockDim.z, sharedMemBytes);
if (!launch_params.IsValidConfig()) {
return hipErrorInvalidConfiguration;
}
return ihipModuleLaunchKernel(
func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes,
stream, args, nullptr, startEvent, stopEvent, flags);
return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr,
startEvent, stopEvent, flags);
}
// conversion routines between float and half precision
+38
Просмотреть файл
@@ -56,6 +56,10 @@ class NDRange : public EmbeddedObject {
//! Construct a new index space of the given dimensions.
explicit NDRange(size_t dimensions);
NDRange(size_t dataX, size_t dataY, size_t dataZ) : dimensions_(3) {
data_[0] = dataX; data_[1] = dataY; data_[2] = dataZ;
}
//! Copy constructor.
NDRange(const NDRange& space);
@@ -112,6 +116,40 @@ class NDRange : public EmbeddedObject {
//! Print this index space on the given stream.
void printOn(FILE* file) const;
#endif // DEBUG
const size_t* Data() const { return data_; }
};
//! Stucture to store launch parameters.
struct LaunchParams {
NDRange global_; //!< Total number of work-items in N-dims
NDRange local_; //!< Number of work-items in N-dims in a workgroup.
uint32_t sharedMemBytes_; //!< Shared Memory bytes
bool validConfig_; //!< Flag will be set to false when config is not correct.
LaunchParams(uint32_t globalX, uint32_t globalY, uint32_t globalZ, uint32_t localX,
uint32_t localY, uint32_t localZ, uint32_t sharedMemBytes)
: global_(globalX, globalY, globalZ), local_(localX, localY, localZ),
sharedMemBytes_ (sharedMemBytes), validConfig_(true) {}
bool IsValidConfig() const { return validConfig_; }
};
//! Structure to store launch parameters in HIP Style (global and local size needs computation).
struct HIPLaunchParams : public LaunchParams {
public:
HIPLaunchParams(uint32_t gridX, uint32_t gridY, uint32_t gridZ, uint32_t blockX,
uint32_t blockY, uint32_t blockZ, uint32_t sharedMemBytes)
: LaunchParams(static_cast<uint32_t>(gridX) * blockX,
static_cast<uint32_t>(gridY) * blockY,
static_cast<uint32_t>(gridZ) * blockZ,
blockX, blockY, blockZ, sharedMemBytes) {
if (global_[0] > std::numeric_limits<uint32_t>::max() ||
global_[1] > std::numeric_limits<uint32_t>::max() ||
global_[2] > std::numeric_limits<uint32_t>::max()) {
validConfig_ = false;
}
}
};
//! A container for the local and global worksizes.