SWDEV-493805 - Cleaning up launch parameters arguments. (#241)
[ROCm/clr commit: fa55557f46]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
086a1c289a
Коммит
cde2a250ec
@@ -41,11 +41,9 @@ void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& s
|
||||
|
||||
bool IsHtoHMemcpy(void* dst, const void* src);
|
||||
|
||||
hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
|
||||
uint32_t sharedMemBytes, void** kernelParams, void** extra,
|
||||
int deviceId, uint32_t params);
|
||||
hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params,
|
||||
void** kernelParams, void** extra, int deviceId,
|
||||
uint32_t params);
|
||||
|
||||
hipError_t ihipMemset_validate(void* dst, int64_t value, size_t valueSize, size_t sizeBytes);
|
||||
|
||||
@@ -53,9 +51,7 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte
|
||||
size_t sizeBytes);
|
||||
|
||||
hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
|
||||
uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
|
||||
uint32_t blockDimZ, uint32_t sharedMemBytes, hip::Stream* stream,
|
||||
amd::LaunchParams& launch_params, hip::Stream* stream,
|
||||
void** kernelParams, void** extra, hipEvent_t startEvent,
|
||||
hipEvent_t stopEvent, uint32_t flags, uint32_t params,
|
||||
uint32_t gridId, uint32_t numGrids, uint64_t prevGridSum,
|
||||
|
||||
@@ -1176,13 +1176,19 @@ class GraphKernelNode : public GraphNode {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
amd::HIPLaunchParams launch_params(kernelParams_.gridDim.x, kernelParams_.gridDim.y,
|
||||
kernelParams_.gridDim.z, kernelParams_.blockDim.x,
|
||||
kernelParams_.blockDim.y, kernelParams_.blockDim.z,
|
||||
kernelParams_.sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
status = ihipLaunchKernelCommand(
|
||||
command, func, kernelParams_.gridDim.x * kernelParams_.blockDim.x,
|
||||
kernelParams_.gridDim.y * kernelParams_.blockDim.y,
|
||||
kernelParams_.gridDim.z * kernelParams_.blockDim.z, kernelParams_.blockDim.x,
|
||||
kernelParams_.blockDim.y, kernelParams_.blockDim.z, kernelParams_.sharedMemBytes, stream,
|
||||
kernelParams_.kernelParams, kernelParams_.extra, kernelEvents_.startEvent_,
|
||||
kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0);
|
||||
command, func, launch_params, stream, kernelParams_.kernelParams, kernelParams_.extra,
|
||||
kernelEvents_.startEvent_, kernelEvents_.stopEvent_, flags, coopKernel_, 0, 0, 0, 0, 0);
|
||||
if (signal_is_required_) {
|
||||
// Optimize the barriers by adding a signal into the dispatch packet directly
|
||||
command->SetProfiling();
|
||||
@@ -1315,15 +1321,18 @@ class GraphKernelNode : public GraphNode {
|
||||
|
||||
static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
|
||||
hipFunction_t func, int devId) {
|
||||
size_t globalWorkSizeX = static_cast<size_t>(pNodeParams->gridDim.x) * pNodeParams->blockDim.x;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(pNodeParams->gridDim.y) * pNodeParams->blockDim.y;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(pNodeParams->gridDim.z) * pNodeParams->blockDim.z;
|
||||
|
||||
hipError_t status = ihipLaunchKernel_validate(
|
||||
func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), pNodeParams->blockDim.x, pNodeParams->blockDim.y,
|
||||
pNodeParams->blockDim.z, pNodeParams->sharedMemBytes, pNodeParams->kernelParams,
|
||||
pNodeParams->extra, devId, 0);
|
||||
amd::HIPLaunchParams launch_params(pNodeParams->gridDim.x, pNodeParams->gridDim.y,
|
||||
pNodeParams->gridDim.z, pNodeParams->blockDim.x,
|
||||
pNodeParams->blockDim.y, pNodeParams->blockDim.z,
|
||||
pNodeParams->sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
HIP_RETURN(hipErrorInvalidConfiguration);
|
||||
}
|
||||
|
||||
hipError_t status = ihipLaunchKernel_validate(func, launch_params, pNodeParams->kernelParams,
|
||||
pNodeParams->extra, devId, 0);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include <fstream>
|
||||
|
||||
#include "hip_internal.hpp"
|
||||
#include "platform/ndrange.hpp"
|
||||
#include "platform/program.hpp"
|
||||
#include "hip_event.hpp"
|
||||
#include "hip_platform.hpp"
|
||||
@@ -255,11 +256,9 @@ hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
|
||||
uint32_t sharedMemBytes, void** kernelParams, void** extra,
|
||||
int deviceId, uint32_t params = 0) {
|
||||
hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& launch_params,
|
||||
void** kernelParams, void** extra, int deviceId,
|
||||
uint32_t params = 0) {
|
||||
if (f == nullptr) {
|
||||
LogPrintfError("%s", "Function passed is null");
|
||||
return hipErrorInvalidImage;
|
||||
@@ -270,21 +269,23 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|
||||
|| launch_params.global_[2] == 0) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (blockDimX == 0 || blockDimY == 0 || blockDimZ == 0) {
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
|
||||
|| launch_params.local_[2] == 0) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
const amd::Device* device = g_devices[deviceId]->devices()[0];
|
||||
const auto& info = device->info();
|
||||
if (sharedMemBytes > info.localMemSizePerCU_) { //sharedMemPerBlock
|
||||
if (launch_params.sharedMemBytes_ > info.localMemSizePerCU_) { //sharedMemPerBlock
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
// Make sure dispatch doesn't exceed max workgroup size limit
|
||||
if (blockDimX * blockDimY * blockDimZ > info.maxWorkGroupSize_) {
|
||||
if (launch_params.local_.product() > info.maxWorkGroupSize_) {
|
||||
return (DEBUG_HIP_7_PREVIEW & amd::CHANGE_HIP_LAUNCH_KERNEL) ?
|
||||
hipErrorInvalidConfiguration :
|
||||
hipErrorInvalidValue;
|
||||
@@ -301,10 +302,9 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
}
|
||||
// Make sure the launch params are not larger than if specified launch_bounds
|
||||
// If it exceeds, then return a failure
|
||||
if (blockDimX * blockDimY * blockDimZ >
|
||||
kernel->getDeviceKernel(*device)->workGroupInfo()->size_) {
|
||||
if (launch_params.local_.product() > kernel->getDeviceKernel(*device)->workGroupInfo()->size_) {
|
||||
LogPrintfError("Launch params (%u, %u, %u) are larger than launch bounds (%lu) for kernel %s",
|
||||
blockDimX, blockDimY, blockDimZ,
|
||||
launch_params.local_[0], launch_params.local_[1], launch_params.local_[2],
|
||||
kernel->getDeviceKernel(*device)->workGroupInfo()->size_,
|
||||
function->name().c_str());
|
||||
return hipErrorLaunchFailure;
|
||||
@@ -317,15 +317,14 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
int num_blocks = 0;
|
||||
int max_blocks_per_grid = 0;
|
||||
int best_block_size = 0;
|
||||
int block_size = blockDimX * blockDimY * blockDimZ;
|
||||
int block_size = launch_params.local_.product();
|
||||
hipError_t err = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size, sharedMemBytes,
|
||||
true);
|
||||
&num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size,
|
||||
launch_params.sharedMemBytes_, true);
|
||||
if (err != hipSuccess) {
|
||||
return err;
|
||||
}
|
||||
if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) >
|
||||
unsigned(max_blocks_per_grid)) {
|
||||
if (((launch_params.global_.product()) / block_size) > unsigned(max_blocks_per_grid)) {
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
}
|
||||
@@ -338,10 +337,8 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
}
|
||||
|
||||
hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
|
||||
uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
|
||||
uint32_t blockDimZ, uint32_t sharedMemBytes,
|
||||
hip::Stream* stream, void** kernelParams, void** extra,
|
||||
amd::LaunchParams& launch_params, hip::Stream* stream,
|
||||
void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr,
|
||||
uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
|
||||
uint32_t numGrids = 0, uint64_t prevGridSum = 0,
|
||||
@@ -350,9 +347,8 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
amd::Kernel* kernel = function->kernel();
|
||||
|
||||
size_t globalWorkOffset[3] = {0};
|
||||
size_t globalWorkSize[3] = {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ};
|
||||
size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ};
|
||||
amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
amd::NDRangeContainer ndrange(3, globalWorkOffset, launch_params.global_.Data(),
|
||||
launch_params.local_.Data());
|
||||
amd::Command::EventWaitList waitList;
|
||||
|
||||
bool profileNDRange = (startEvent != nullptr || stopEvent != nullptr);
|
||||
@@ -362,9 +358,9 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
params |= amd::NDRangeKernelCommand::AnyOrderLaunch;
|
||||
}
|
||||
|
||||
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(
|
||||
*stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum,
|
||||
allGridSum, firstDevice, profileNDRange);
|
||||
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(*stream, waitList,
|
||||
*kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids, prevGridSum,
|
||||
allGridSum, firstDevice, profileNDRange);
|
||||
if (!kernelCommand) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -414,14 +410,12 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
|
||||
uint32_t sharedMemBytes, hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
|
||||
uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0,
|
||||
uint32_t numGrids = 0, uint64_t prevGridSum = 0,
|
||||
uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
|
||||
hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_params,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
|
||||
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
|
||||
uint64_t prevGridSum = 0, uint64_t allGridSum = 0,
|
||||
uint32_t firstDevice = 0) {
|
||||
int deviceId = hip::Stream::DeviceId(hStream);
|
||||
HIP_RETURN_ONFAIL(PlatformState::instance().initStatManagedVarDevicePtr(deviceId));
|
||||
|
||||
@@ -433,32 +427,37 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
amd::Kernel* kernel = function->kernel();
|
||||
amd::ScopedLock lock (DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_);
|
||||
|
||||
hipError_t status = ihipLaunchKernel_validate(
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
|
||||
sharedMemBytes, kernelParams, extra, deviceId, params);
|
||||
hipError_t status = ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId,
|
||||
params);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Make sure the app doesn't launch a workgroup bigger than the global size
|
||||
if (globalWorkSizeX < blockDimX) blockDimX = globalWorkSizeX;
|
||||
if (globalWorkSizeY < blockDimY) blockDimY = globalWorkSizeY;
|
||||
if (globalWorkSizeZ < blockDimZ) blockDimZ = globalWorkSizeZ;
|
||||
if (launch_params.global_[0] < launch_params.local_[0]) {
|
||||
launch_params.local_[0] = launch_params.global_[0];
|
||||
}
|
||||
if (launch_params.global_[1] < launch_params.local_[1]) {
|
||||
launch_params.local_[1] = launch_params.global_[1];
|
||||
}
|
||||
if (launch_params.global_[2] < launch_params.local_[2]) {
|
||||
launch_params.local_[2] = launch_params.global_[2];
|
||||
}
|
||||
|
||||
auto device = g_devices[deviceId]->devices()[0];
|
||||
// Check if it's a uniform kernel and validate dimensions
|
||||
if (kernel->getDeviceKernel(*device)->getUniformWorkGroupSize()) {
|
||||
if (((globalWorkSizeX % blockDimX) != 0) ||
|
||||
((globalWorkSizeY % blockDimY) != 0) ||
|
||||
((globalWorkSizeZ % blockDimZ) != 0)) {
|
||||
if (((launch_params.global_[0] % launch_params.local_[0]) != 0) ||
|
||||
((launch_params.global_[1] % launch_params.local_[1]) != 0) ||
|
||||
((launch_params.global_[2] % launch_params.local_[2]) != 0)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
amd::Command* command = nullptr;
|
||||
hip::Stream* hip_stream = hip::getStream(hStream);
|
||||
status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
|
||||
blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream,
|
||||
kernelParams, extra, startEvent, stopEvent, flags, params,
|
||||
gridId, numGrids, prevGridSum, allGridSum, firstDevice);
|
||||
status = ihipLaunchKernelCommand(command, f, launch_params, hip_stream, kernelParams, extra,
|
||||
startEvent, stopEvent, flags, params, gridId, numGrids,
|
||||
prevGridSum, allGridSum, firstDevice);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -514,16 +513,15 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
|
||||
if (gridDimX > int32_max || gridDimY > uint16_max || gridDimZ > uint16_max) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
|
||||
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max()) {
|
||||
|
||||
amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
|
||||
sharedMemBytes);
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
HIP_RETURN(hipErrorInvalidConfiguration);
|
||||
}
|
||||
HIP_RETURN(ihipModuleLaunchKernel(
|
||||
f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), blockDimX, blockDimY, blockDimZ, sharedMemBytes,
|
||||
hStream, kernelParams, extra, nullptr, nullptr));
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr,
|
||||
nullptr));
|
||||
}
|
||||
|
||||
hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
@@ -544,9 +542,12 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
|
||||
kernelParams, extra, startEvent, stopEvent, flags);
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
|
||||
localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
|
||||
hStream, kernelParams, extra, startEvent, stopEvent, flags));
|
||||
amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
|
||||
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
|
||||
sharedMemBytes);
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
|
||||
stopEvent, flags));
|
||||
}
|
||||
|
||||
|
||||
@@ -559,9 +560,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra,
|
||||
startEvent, stopEvent);
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
|
||||
blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams,
|
||||
extra, startEvent, stopEvent));
|
||||
amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX,
|
||||
blockDimY, blockDimZ, sharedMemBytes);
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
|
||||
stopEvent));
|
||||
}
|
||||
|
||||
hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDimX,
|
||||
@@ -579,19 +582,15 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
|
||||
STREAM_CAPTURE(hipModuleLaunchCooperativeKernel, stream, f, gridDimX, gridDimY, gridDimZ,
|
||||
blockDimX, blockDimY, blockDimZ, sharedMemBytes, kernelParams);
|
||||
|
||||
size_t globalWorkSizeX = static_cast<size_t>(gridDimX) * blockDimX;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(gridDimY) * blockDimY;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(gridDimZ) * blockDimZ;
|
||||
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
|
||||
amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
|
||||
sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
HIP_RETURN(hipErrorInvalidConfiguration);
|
||||
}
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, static_cast<uint32_t>(globalWorkSizeX),
|
||||
static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), blockDimX, blockDimY,
|
||||
blockDimZ, sharedMemBytes, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, stream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
|
||||
}
|
||||
|
||||
hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
|
||||
@@ -682,25 +681,22 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
}
|
||||
}
|
||||
|
||||
size_t globalWorkSizeX = static_cast<size_t>(launch.gridDimX) * launch.blockDimX;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(launch.gridDimY) * launch.blockDimY;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(launch.gridDimZ) * launch.blockDimZ;
|
||||
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
|
||||
amd::HIPLaunchParams launch_params(launch.gridDimX, launch.gridDimY, launch.gridDimZ,
|
||||
launch.blockDimX, launch.blockDimY, launch.blockDimZ,
|
||||
launch.sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
result = ihipModuleLaunchKernel(
|
||||
launch.function, static_cast<uint32_t>(globalWorkSizeX),
|
||||
static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), launch.blockDimX, launch.blockDimY,
|
||||
launch.blockDimZ, launch.sharedMemBytes, launch.hStream, launch.kernelParams,
|
||||
launch.function, launch_params, launch.hStream, launch.kernelParams,
|
||||
nullptr, nullptr, nullptr, flags, extFlags,
|
||||
i, numDevices, prevGridSize, allGridSize, firstDevice);
|
||||
if (result != hipSuccess) {
|
||||
break;
|
||||
}
|
||||
prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
|
||||
prevGridSize += launch_params.global_.product();
|
||||
}
|
||||
|
||||
// Sync the execution streams on all devices
|
||||
@@ -810,13 +806,12 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
|
||||
return getStatFuncError;
|
||||
}
|
||||
const amd::Device* device = g_devices[deviceId]->devices()[0];
|
||||
size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
|
||||
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max() ||
|
||||
(blockDim.x * blockDim.y * blockDim.z > device->info().maxWorkGroupSize_)) {
|
||||
|
||||
amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
|
||||
blockDim.z, sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig() ||
|
||||
launch_params.local_.product() > device->info().maxWorkGroupSize_) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
@@ -824,15 +819,14 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
|
||||
if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
|
||||
//if (globalWorkSizeX == 0 || globalWorkSizeY == 0 || globalWorkSizeZ == 0) {
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|
||||
|| launch_params.global_[2] == 0) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
return ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
|
||||
static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y,
|
||||
blockDim.z, sharedMemBytes, hStream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
|
||||
return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr,
|
||||
nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
|
||||
@@ -1132,35 +1126,31 @@ hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
size_t globalWorkSizeX = static_cast<size_t>(config->gridDimX) * config->blockDimX;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(config->gridDimY) * config->blockDimY;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(config->gridDimZ) * config->blockDimZ;
|
||||
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
|
||||
amd::HIPLaunchParams launch_params(config->gridDimX, config->gridDimY, config->gridDimZ,
|
||||
config->blockDimX, config->blockDimY, config->blockDimZ,
|
||||
config->sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
HIP_RETURN(hipErrorInvalidConfiguration);
|
||||
}
|
||||
|
||||
if (config->numAttrs == 0) {
|
||||
HIP_RETURN(ihipModuleLaunchKernel(
|
||||
f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), config->blockDimX, config->blockDimY,
|
||||
config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr,
|
||||
nullptr, nullptr, 0));
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams, nullptr,
|
||||
nullptr, nullptr, 0));
|
||||
}
|
||||
|
||||
for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) {
|
||||
hipLaunchAttribute& attr = config->attrs[attr_idx];
|
||||
switch (attr.id) {
|
||||
case hipLaunchAttributeCooperative:
|
||||
{
|
||||
if (attr.value.cooperative != 0) {
|
||||
HIP_RETURN(ihipModuleLaunchKernel(
|
||||
f, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), config->blockDimX, config->blockDimY,
|
||||
config->blockDimZ, config->sharedMemBytes, config->hStream, kernelParams, nullptr,
|
||||
nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups));
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams,
|
||||
nullptr, nullptr, nullptr, 0,
|
||||
amd::NDRangeKernelCommand::CooperativeGroups));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogPrintfError("Attribute %u not supported", attr.id);
|
||||
break;
|
||||
|
||||
@@ -50,11 +50,10 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor
|
||||
hipDeviceptr_t* dptr, size_t* bytes);
|
||||
|
||||
extern hipError_t ihipModuleLaunchKernel(
|
||||
hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX,
|
||||
uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream,
|
||||
void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent,
|
||||
uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
|
||||
uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0);
|
||||
hipFunction_t f, amd::LaunchParams& launch_params, hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
|
||||
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0,
|
||||
uint64_t allGridSum = 0, uint32_t firstDevice = 0);
|
||||
static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) {
|
||||
// Workaround for device name mismatch.
|
||||
// Device name may contain feature strings delimited by '+', e.g.
|
||||
@@ -641,18 +640,14 @@ hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDi
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
|
||||
size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
|
||||
size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
|
||||
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
|
||||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
|
||||
amd::HIPLaunchParams launch_params(gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
|
||||
blockDim.z, sharedMemBytes);
|
||||
if (!launch_params.IsValidConfig()) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
return ihipModuleLaunchKernel(
|
||||
func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
|
||||
static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes,
|
||||
stream, args, nullptr, startEvent, stopEvent, flags);
|
||||
|
||||
return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr,
|
||||
startEvent, stopEvent, flags);
|
||||
}
|
||||
|
||||
// conversion routines between float and half precision
|
||||
|
||||
@@ -56,6 +56,10 @@ class NDRange : public EmbeddedObject {
|
||||
//! Construct a new index space of the given dimensions.
|
||||
explicit NDRange(size_t dimensions);
|
||||
|
||||
NDRange(size_t dataX, size_t dataY, size_t dataZ) : dimensions_(3) {
|
||||
data_[0] = dataX; data_[1] = dataY; data_[2] = dataZ;
|
||||
}
|
||||
|
||||
//! Copy constructor.
|
||||
NDRange(const NDRange& space);
|
||||
|
||||
@@ -112,6 +116,40 @@ class NDRange : public EmbeddedObject {
|
||||
//! Print this index space on the given stream.
|
||||
void printOn(FILE* file) const;
|
||||
#endif // DEBUG
|
||||
|
||||
const size_t* Data() const { return data_; }
|
||||
};
|
||||
|
||||
//! Stucture to store launch parameters.
|
||||
struct LaunchParams {
|
||||
NDRange global_; //!< Total number of work-items in N-dims
|
||||
NDRange local_; //!< Number of work-items in N-dims in a workgroup.
|
||||
uint32_t sharedMemBytes_; //!< Shared Memory bytes
|
||||
bool validConfig_; //!< Flag will be set to false when config is not correct.
|
||||
|
||||
LaunchParams(uint32_t globalX, uint32_t globalY, uint32_t globalZ, uint32_t localX,
|
||||
uint32_t localY, uint32_t localZ, uint32_t sharedMemBytes)
|
||||
: global_(globalX, globalY, globalZ), local_(localX, localY, localZ),
|
||||
sharedMemBytes_ (sharedMemBytes), validConfig_(true) {}
|
||||
|
||||
bool IsValidConfig() const { return validConfig_; }
|
||||
};
|
||||
|
||||
//! Structure to store launch parameters in HIP Style (global and local size needs computation).
|
||||
struct HIPLaunchParams : public LaunchParams {
|
||||
public:
|
||||
HIPLaunchParams(uint32_t gridX, uint32_t gridY, uint32_t gridZ, uint32_t blockX,
|
||||
uint32_t blockY, uint32_t blockZ, uint32_t sharedMemBytes)
|
||||
: LaunchParams(static_cast<uint32_t>(gridX) * blockX,
|
||||
static_cast<uint32_t>(gridY) * blockY,
|
||||
static_cast<uint32_t>(gridZ) * blockZ,
|
||||
blockX, blockY, blockZ, sharedMemBytes) {
|
||||
if (global_[0] > std::numeric_limits<uint32_t>::max() ||
|
||||
global_[1] > std::numeric_limits<uint32_t>::max() ||
|
||||
global_[2] > std::numeric_limits<uint32_t>::max()) {
|
||||
validConfig_ = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//! A container for the local and global worksizes.
|
||||
|
||||
Ссылка в новой задаче
Block a user