From 892071aeb255dd199c5ac79f49056bb431cdb6fc Mon Sep 17 00:00:00 2001
From: kjayapra-amd
Date: Wed, 15 May 2024 20:27:08 -0400
Subject: [PATCH] SWDEV-460948 - Changes to alloc, set, capture under single function.

Add NDRangeKernelCommand::AllocCaptureSetValidate(), which validates the
kernel, allocates the argument buffer (KernelParameters::alloc) and writes
the launch arguments into it (KernelParameters::captureAndSet).

ihipLaunchKernelCommand() takes this path when DEBUG_HIP_KERNARG_COPY_OPT is
set (default: true). Otherwise it keeps the previous sequence: one
KernelParameters::set() per argument followed by captureAndValidate().

The parsing of 'extra' moves from ihipLaunchKernel_validate() into
ihipLaunchKernelCommand(). The command object already exists at that point,
so it is released before hipErrorInvalidValue is returned for a malformed
'extra'.

Change-Id: I7b2d40e99e812b97c53535c5e63c41ad64a8f543
---
 hipamd/src/hip_module.cpp   | 72 ++++++++++++++++------------
 rocclr/platform/command.cpp | 25 ++++++++++
 rocclr/platform/command.hpp |  4 ++
 rocclr/platform/kernel.cpp  | 94 +++++++++++++++++++++++++++++++++++++
 rocclr/platform/kernel.hpp  |  6 +++
 rocclr/utils/flags.hpp      |  2 +
 6 files changed, 173 insertions(+), 30 deletions(-)

diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp
index 614fa02be4..4f7ef96a48 100644
--- a/hipamd/src/hip_module.cpp
+++ b/hipamd/src/hip_module.cpp
@@ -274,31 +274,6 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, uint32_t globalWorkSizeX,
       return hipErrorLaunchFailure;
     }
   }
-  address kernargs = nullptr;
-  // 'extra' is a struct that contains the following info: {
-  //   HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
-  //   HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
-  //   HIP_LAUNCH_PARAM_END }
-  if (extra != nullptr) {
-    if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE ||
-        extra[4] != HIP_LAUNCH_PARAM_END) {
-      return hipErrorInvalidValue;
-    }
-    kernargs = reinterpret_cast<address>(extra[1]);
-  }
-
-  for (size_t i = 0; i < signature.numParameters(); ++i) {
-    const amd::KernelParameterDescriptor& desc = signature.at(i);
-    if (kernelParams == nullptr) {
-      assert(kernargs != nullptr);
-      kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
-                               desc.type_ == T_POINTER /*svmBound*/);
-    } else {
-      assert(extra == nullptr);
-      kernel->parameters().set(i, desc.size_, kernelParams[i],
-                               desc.type_ == T_POINTER /*svmBound*/);
-    }
-  }
   return hipSuccess;
 }
 
@@ -319,7 +294,6 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
   size_t localWorkSize[3] = {blockDimX, blockDimY, blockDimZ};
   amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
   amd::Command::EventWaitList waitList;
-  address kernargs = nullptr;
 
   bool profileNDRange = (startEvent != nullptr || stopEvent != nullptr);
 
@@ -335,10 +309,48 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
     return hipErrorOutOfMemory;
   }
 
-  // Capture the kernel arguments
-  if (CL_SUCCESS != kernelCommand->captureAndValidate()) {
-    kernelCommand->release();
-    return hipErrorOutOfMemory;
+  address kernargs = nullptr;
+  // 'extra' is a struct that contains the following info: {
+  //   HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs,
+  //   HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernargs_size,
+  //   HIP_LAUNCH_PARAM_END }
+  if (extra != nullptr) {
+    assert(kernelParams == nullptr);
+    if (extra[0] != HIP_LAUNCH_PARAM_BUFFER_POINTER || extra[2] != HIP_LAUNCH_PARAM_BUFFER_SIZE ||
+        extra[4] != HIP_LAUNCH_PARAM_END) {
+      // The command already exists at this point, so release it before bailing out
+      kernelCommand->release();
+      return hipErrorInvalidValue;
+    }
+    kernargs = reinterpret_cast<address>(extra[1]);
+  }
+
+  if (DEBUG_HIP_KERNARG_COPY_OPT) {
+    // Single pass: allocate the argument buffer and write the arguments directly into it
+    if (CL_SUCCESS != kernelCommand->AllocCaptureSetValidate(kernelParams, kernargs)) {
+      kernelCommand->release();
+      return hipErrorOutOfMemory;
+    }
+
+  } else {
+    // Previous path: set every argument on the kernel, then capture them into the command
+    for (size_t i = 0; i < kernel->signature().numParameters(); ++i) {
+      const amd::KernelParameterDescriptor& desc = kernel->signature().at(i);
+      if (kernelParams == nullptr) {
+        assert(kernargs != nullptr);
+        kernel->parameters().set(i, desc.size_, kernargs + desc.offset_,
+                                 desc.type_ == T_POINTER /*svmBound*/);
+      } else {
+        kernel->parameters().set(i, desc.size_, kernelParams[i],
+                                 desc.type_ == T_POINTER /*svmBound*/);
+      }
+    }
+
+    // Capture the kernel arguments
+    if (CL_SUCCESS != kernelCommand->captureAndValidate()) {
+      kernelCommand->release();
+      return hipErrorOutOfMemory;
+    }
   }
 
   command = kernelCommand;
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
index 7424279ac8..5fa0cc6445 100644
--- a/rocclr/platform/command.cpp
+++ b/rocclr/platform/command.cpp
@@ -643,6 +643,31 @@ bool MigrateMemObjectsCommand::validateMemory() {
   return true;
 }
 
+// =================================================================================================
+// Validates the kernel for this queue's device, allocates the argument buffer and writes the
+// launch arguments directly into it.
+//   kernelParams - array of pointers to the argument values, or nullptr
+//   kernArgs     - packed argument buffer, read at each argument's offset if kernelParams is nullptr
+// Returns CL_SUCCESS, or CL_OUT_OF_RESOURCES if validation, allocation or capture fails.
+int32_t NDRangeKernelCommand::AllocCaptureSetValidate(void** kernelParams, address kernArgs) {
+  // Validate the kernel before submission
+  if (!queue()->device().validateKernel(kernel(), queue()->vdev(), cooperativeGroups())) {
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  parameters_ = kernel().parameters().alloc(*queue()->vdev());
+  if (parameters_ == nullptr) {
+    LogError("Cannot allocate memory for parameters_");
+    return CL_OUT_OF_RESOURCES;
+  }
+
+  if (!kernel().parameters().captureAndSet(kernelParams, kernArgs, parameters_)) {
+    LogError("Cannot capture and set the kernel parameters");
+    return CL_OUT_OF_RESOURCES;
+  }
+  return CL_SUCCESS;
+}
+
 int32_t NDRangeKernelCommand::captureAndValidate() {
   const amd::Device& device = queue()->device();
   // Validate the kernel before submission
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
index 03ecaaf4c8..5655f06602 100644
--- a/rocclr/platform/command.hpp
+++ b/rocclr/platform/command.hpp
@@ -1177,7 +1177,11 @@ class NDRangeKernelCommand : public Command {
     numWorkgroups_ = numWorkgroups;
   }
 
+  // Capture kernel parameters and validate
   int32_t captureAndValidate();
+
+  // Validate the kernel, then allocate the argument buffer and write the arguments into it
+  int32_t AllocCaptureSetValidate(void** kernelParams, address kernArgs);
 };
 
 class NativeFnCommand : public Command {
diff --git a/rocclr/platform/kernel.cpp b/rocclr/platform/kernel.cpp
index 52d4936a9b..2fb08f4e92 100644
--- a/rocclr/platform/kernel.cpp
+++ b/rocclr/platform/kernel.cpp
@@ -85,6 +85,100 @@ size_t KernelParameters::localMemSize(size_t minDataTypeAlignment) const {
   return memSize;
 }
 
+// =================================================================================================
+// Allocates the buffer that receives the kernel arguments. Memory from the virtual device is
+// tried first (deviceKernelArgs_ is set on success); otherwise AlignedMemory is used.
+// The caller is expected to check the returned address for nullptr.
+address KernelParameters::alloc(device::VirtualDevice& vDev) {
+
+  //! Information about which arguments are SVM pointers is stored after
+  // the actual parameters, but only if the device has any SVM capability
+  const size_t execInfoSize = getNumberOfSvmPtr() * sizeof(void*);
+
+  address mem = vDev.allocKernelArguments(totalSize_ + execInfoSize, 128);
+  if (mem == nullptr) {
+    mem = reinterpret_cast<address>(AlignedMemory::allocate(totalSize_ + execInfoSize,
+                                                            PARAMETERS_MIN_ALIGNMENT));
+  } else {
+    deviceKernelArgs_ = true;
+  }
+
+  return mem;
+}
+
+// =================================================================================================
+// Writes every kernel argument into 'mem' at the offset given by its descriptor.
+// Values are read from kernelParams[idx] when kernelParams is not nullptr, otherwise from
+// kernArgs + offset. Non-local pointer arguments are looked up in MemObjMap and the memory
+// object found (if any) is retained. Sampler and queue arguments are rejected.
+// Returns false on a rejected argument, true otherwise.
+bool KernelParameters::captureAndSet(void** kernelParams, address kernArgs, address mem) {
+  // The memory object table lives inside 'mem'; its location is the same for every argument
+  amd::Memory** memories = reinterpret_cast<amd::Memory**>(mem + memoryObjOffset());
+
+  for (size_t idx = 0; idx < signature_.numParameters(); ++idx) {
+    KernelParameterDescriptor& desc = signature_.params()[idx];
+    void* value = nullptr;
+    if (kernelParams != nullptr) {
+      value = kernelParams[idx];
+    } else {
+      value = kernArgs + desc.offset_;
+    }
+    void* param = mem + desc.offset_;
+    uint32_t uint32_value = 0;
+    uint64_t uint64_value = 0;
+    Memory* memArg = nullptr;
+    if (desc.type_ == T_POINTER && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
+      LP64_SWITCH(uint32_value, uint64_value) = *(LP64_SWITCH(uint32_t*, uint64_t*))value;
+      memArg = amd::MemObjMap::FindMemObj(*reinterpret_cast<const void* const*>(value));
+      memories[desc.info_.arrayIndex_] = memArg;
+      if (memArg != nullptr) {
+        memArg->retain();
+      }
+      desc.info_.rawPointer_ = true;
+    } else if (desc.type_ == T_SAMPLER) {
+      LogError("Cannot handle Sampler now");
+      return false;
+    } else if (desc.type_ == T_QUEUE) {
+      LogError("Cannot handle Queue now");
+      return false;
+    } else {
+      switch (desc.size_) {
+        case 4:
+          if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
+            uint32_value = desc.size_;
+          } else {
+            uint32_value = *(static_cast<const uint32_t*>(value));
+          }
+          break;
+        case 8:
+          if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
+            uint64_value = desc.size_;
+          } else {
+            uint64_value = *(static_cast<const uint64_t*>(value));
+          }
+          break;
+      }
+    }
+
+    switch (desc.size_) {
+      case sizeof(uint32_t):
+        *static_cast<uint32_t*>(param) = uint32_value;
+        break;
+      case sizeof(uint64_t):
+        *static_cast<uint64_t*>(param) = uint64_value;
+        break;
+      default:
+        ::memcpy(param, value, desc.size_);
+        break;
+    }
+    desc.info_.defined_ = true;
+  }
+
+  execInfoOffset_ = totalSize_;
+  return true;
+}
+
 void KernelParameters::set(size_t index, size_t size, const void* value, bool svmBound) {
   KernelParameterDescriptor& desc = signature_.params()[index];
 
diff --git a/rocclr/platform/kernel.hpp b/rocclr/platform/kernel.hpp
index d441b18b0d..445326284f 100644
--- a/rocclr/platform/kernel.hpp
+++ b/rocclr/platform/kernel.hpp
@@ -284,6 +284,12 @@ class KernelParameters : protected HeapObject {
 
   //! Returns true if arguemnts were allocated on device
   bool deviceKernelArgs() const { return (deviceKernelArgs_ == 1); }
+
+  //! Allocate the buffer that receives the kernel arguments (nullptr on failure).
+  address alloc(device::VirtualDevice& vDev);
+
+  //! Write the arguments into mem, reading from kernelParams if set, else from kernArgs.
+  bool captureAndSet(void** kernelParams, address kernArgs, address mem);
 };
 
 /*! \brief Encapsulates a __kernel function and the argument values
diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp
index 362fcb298b..b6553cb78e 100644
--- a/rocclr/utils/flags.hpp
+++ b/rocclr/utils/flags.hpp
@@ -251,6 +251,8 @@ release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false,                               \
         "Enable/Disable graph debug dot print dump")                          \
 release(bool, HIP_ALWAYS_USE_NEW_COMGR_UNBUNDLING_ACTION, false,              \
         "Force to always use new comgr unbundling action")                    \
+release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true,                               \
+        "Enable/Disable the single-pass kernel argument alloc/capture/set path") \
 
 namespace amd {
 