diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp
index 614fa02be4..4f7ef96a48 100644
--- a/hipamd/src/hip_module.cpp
+++ b/hipamd/src/hip_module.cpp
@@ -274,31 +274,6 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
       return hipErrorLaunchFailure;
     }
   }
-  address kernargs = nullptr;
-  // 'extra' is a struct that contains the following info: {
-  //   HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
-  //   HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
-  //   HIP_LAUNCH_PARAM_END }
-  if (extra != nullptr) {
-    if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE ||
-        extra[4] != HIP_LAUNCH_PARAM_END) {
-      return hipErrorInvalidValue;
-    }
-    kernargs = reinterpret_cast<address>(extra[1]);
-  }
-
-  for (size_t i = 0; i < signature.numParameters(); ++i) {
-    const amd::KernelParameterDescriptor& desc = signature.at(i);
-    if (kernelParams == nullptr) {
-      assert(kernargs != nullptr);
-      kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
-                               desc.type_ == T_POINTER /*svmBound*/);
-    } else {
-      assert(extra == nullptr);
-      kernel->parameters().set(i, desc.size_, kernelParams[i],
-                               desc.type_ == T_POINTER /*svmBound*/);
-    }
-  }
   return hipSuccess;
 }
 
@@ -319,7 +294,6 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
   size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ};
   amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
   amd::Command::EventWaitList waitList;
-  address kernargs = nullptr;
 
   bool profileNDRange = (startEvent != nullptr || stopEvent != nullptr);
 
@@ -335,10 +309,46 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
     return hipErrorOutOfMemory;
   }
 
-  // Capture the kernel arguments
-  if (CL_SUCCESS != kernelCommand->captureAndValidate()) {
-    kernelCommand->release();
-    return hipErrorOutOfMemory;
+  address kernargs = nullptr;
+  // 'extra' is a struct that contains the following info: {
+  //   HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
+  //   HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
+  //   HIP_LAUNCH_PARAM_END }
+  if (extra != nullptr) {
+    assert(kernelParams == nullptr);
+    if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE ||
+        extra[4] != HIP_LAUNCH_PARAM_END) {
+      // The command is already allocated at this point; release it before bailing out
+      kernelCommand->release();
+      return hipErrorInvalidValue;
+    }
+    kernargs = reinterpret_cast<address>(extra[1]);
+  }
+
+  if (DEBUG_HIP_KERNARG_COPY_OPT) {
+    if (CL_SUCCESS != kernelCommand->AllocCaptureSetValidate(kernelParams, kernargs)) {
+      kernelCommand->release();
+      return hipErrorOutOfMemory;
+    }
+
+  } else {
+    for (size_t i = 0; i < kernel->signature().numParameters(); ++i) {
+      const amd::KernelParameterDescriptor& desc = kernel->signature().at(i);
+      if (kernelParams == nullptr) {
+        assert(kernargs != nullptr);
+        kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
+                                 desc.type_ == T_POINTER /*svmBound*/);
+      } else {
+        kernel->parameters().set(i, desc.size_, kernelParams[i],
+                                 desc.type_ == T_POINTER /*svmBound*/);
+      }
+    }
+
+    // Capture the kernel arguments
+    if (CL_SUCCESS != kernelCommand->captureAndValidate()) {
+      kernelCommand->release();
+      return hipErrorOutOfMemory;
+    }
   }
 
   command = kernelCommand;
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 7424279ac8..5fa0cc6445 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -643,6 +643,27 @@ bool MigrateMemObjectsCommand::validateMemory() {
   return true;
 }
 
+// =================================================================================================
+int32_t NDRangeKernelCommand::AllocCaptureSetValidate(void** kernelParams, address kernArgs) {
+  const amd::Device& device = queue()->device();
+  // Validate the kernel before submission
+  if (!queue()->device().validateKernel(kernel(), queue()->vdev(), cooperativeGroups())) {
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  parameters_ = kernel().parameters().alloc(*queue()->vdev());
+  if (parameters_ == nullptr) {
+    LogError("Cannot allocate memory for parameters_");
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  if (!kernel().parameters().captureAndSet(kernelParams, kernArgs, parameters_)) {
+    LogError("Cannot capture and set the kernel parameters");
+    return CL_OUT_OF_RESOURCES;
+  }
+  return CL_SUCCESS;
+}
+
 int32_t NDRangeKernelCommand::captureAndValidate() {
   const amd::Device& device = queue()->device();
   // Validate the kernel before submission
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 03ecaaf4c8..5655f06602 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -1177,7 +1177,11 @@ class NDRangeKernelCommand : public Command {
     numWorkgroups_ = numWorkgroups;
   }
 
+  // Capture kernel parameters and validate
   int32_t captureAndValidate();
+
+  // Validate the kernel, then allocate, capture and set kernel parameters
+  int32_t AllocCaptureSetValidate(void** kernelParams, address kernArgs);
 };
 
 class NativeFnCommand : public Command {
diff --git a/rocclr/platform/kernel.cpp b/rocclr/platform/kernel.cpp
index 52d4936a9b..2fb08f4e92 100644
--- a/rocclr/platform/kernel.cpp
+++ b/rocclr/platform/kernel.cpp
@@ -85,6 +85,91 @@ size_t KernelParameters::localMemSize(size_t minDataTypeAlignment) const {
   return memSize;
 }
 
+// =================================================================================================
+address KernelParameters::alloc(device::VirtualDevice& vDev) {
+
+  //! Information about which arguments are SVM pointers is stored after
+  // the actual parameters, but only if the device has any SVM capability
+  const size_t execInfoSize = getNumberOfSvmPtr() * sizeof(void*);
+
+  address mem = vDev.allocKernelArguments(totalSize_ + execInfoSize, 128);
+  if (mem == nullptr) {
+    mem = reinterpret_cast<address>(AlignedMemory::allocate(totalSize_ + execInfoSize,
+                                                            PARAMETERS_MIN_ALIGNMENT));
+  } else {
+    deviceKernelArgs_ = true;
+  }
+
+  return mem;
+}
+
+// =================================================================================================
+bool KernelParameters::captureAndSet(void** kernelParams, address kernArgs, address mem) {
+
+  for (size_t idx = 0; idx < signature_.numParameters(); ++idx) {
+    KernelParameterDescriptor& desc = signature_.params()[idx];
+    void* value = nullptr;
+    if (kernelParams != nullptr) {
+      value = kernelParams[idx];
+    } else {
+      value = kernArgs + desc.offset_;
+    }
+    void* param = mem + desc.offset_;
+    uint32_t uint32_value = 0;
+    uint64_t uint64_value = 0;
+    Memory* memArg = nullptr;
+    amd::Memory** memories = reinterpret_cast