diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index bfcb18d81f..9b9d57d2a6 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1214,6 +1214,9 @@ class VirtualDevice : public amd::HeapObject { virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); } virtual void profilerAttach(bool enable) = 0; + + virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } + //! Get the blit manager object device::BlitManager& blitMgr() const { return *blitMgr_; } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 460f8503e2..5091785324 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -733,10 +733,6 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para const_address srcArgPtr = params + desc.offset_; if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { void* mem = allocKernArg(desc.size_, 128); - if (mem == nullptr) { - LogError("Out of memory"); - return false; - } memcpy(mem, srcArgPtr, desc.size_); const auto it = hsaKernel.patch().find(desc.offset_); WriteAqlArgAt(const_cast
(params), &mem, sizeof(void*), it->second); @@ -1240,6 +1236,17 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { return result; } +// ================================================================================================ +address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { + if (ROCR_SKIP_KERNEL_ARG_COPY) { + // Make sure VirtualGPU has exclusive access to the resources + amd::ScopedLock lock(execution()); + return reinterpret_cast
(allocKernArg(size, alignment)); + } else { + return nullptr; + } +} + // ================================================================================================ /* profilingBegin, when profiling is enabled, creates a timestamp to save in * virtualgpu's timestamp_, and calls start() to get the current host @@ -2703,17 +2710,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } } - // Find all parameters for the current kernel - - // Allocate buffer to hold kernel arguments - address argBuffer = (address)allocKernArg(gpuKernel.KernargSegmentByteSize(), - gpuKernel.KernargSegmentAlignment()); - - if (argBuffer == nullptr) { - LogError("Out of memory"); - return false; - } - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "ShaderName : %s", gpuKernel.name().c_str()); // Check if runtime has to setup hidden arguments @@ -2817,8 +2813,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } } - // Load all kernel arguments - WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0); + address argBuffer = const_cast
(parameters); + // Find all parameters for the current kernel + if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { + // Allocate buffer to hold kernel arguments + argBuffer = reinterpret_cast
(allocKernArg(gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment())); + // Load all kernel arguments + WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0); + } + // Note: In a case of structs the size won't match, // since HSAIL compiler expects a reference... assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() && diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 30cb1c8016..27840770c4 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -322,6 +322,9 @@ class VirtualGPU : public device::VirtualDevice { void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){} + + virtual address allocKernelArguments(size_t size, size_t alignment) final; + /** * @brief Waits on an outstanding kernel without regard to how * it was dispatched - with or without a signal diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index b8da15fa82..a8d907d051 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -633,7 +633,8 @@ int32_t NDRangeKernelCommand::captureAndValidate() { int32_t error; uint64_t lclMemSize = kernel().getDeviceKernel(device)->workGroupInfo()->localMemSize_; - parameters_ = kernel().parameters().capture(device, sharedMemBytes_ + lclMemSize, &error); + parameters_ = kernel().parameters().capture(*queue()->vdev(), + sharedMemBytes_ + lclMemSize, &error); return error; } diff --git a/rocclr/platform/kernel.cpp b/rocclr/platform/kernel.cpp index 3fc579972b..80cb1a12be 100644 --- a/rocclr/platform/kernel.cpp +++ b/rocclr/platform/kernel.cpp @@ -154,14 +154,21 @@ void KernelParameters::set(size_t index, size_t size, const void* value, bool sv desc.info_.defined_ = true; } -address KernelParameters::capture(const Device& device, uint64_t lclMemSize, int32_t* error) { +address KernelParameters::capture(device::VirtualDevice& vDev, 
uint64_t lclMemSize, int32_t* error) { + const Device& device = vDev.device(); *error = CL_SUCCESS; + //! Information about which arguments are SVM pointers is stored after // the actual parameters, but only if the device has any SVM capability const size_t execInfoSize = getNumberOfSvmPtr() * sizeof(void*); - address mem = reinterpret_cast
(AlignedMemory::allocate( - totalSize_ + execInfoSize, PARAMETERS_MIN_ALIGNMENT)); + address mem = vDev.allocKernelArguments(totalSize_ + execInfoSize, 128); + if (mem == nullptr) { + mem = reinterpret_cast
(AlignedMemory::allocate(totalSize_ + execInfoSize, + PARAMETERS_MIN_ALIGNMENT)); + } else { + deviceKernelArgs_ = true; + } if (mem != nullptr) { ::memcpy(mem, values_, totalSize_); @@ -278,7 +285,9 @@ void KernelParameters::release(address mem, const amd::Device& device) const { } } - AlignedMemory::deallocate(mem); + if (!deviceKernelArgs()) { + AlignedMemory::deallocate(mem); + } } KernelSignature::KernelSignature(const std::vector& params, diff --git a/rocclr/platform/kernel.hpp b/rocclr/platform/kernel.hpp index 305eab180c..e46fc5a794 100644 --- a/rocclr/platform/kernel.hpp +++ b/rocclr/platform/kernel.hpp @@ -137,10 +137,11 @@ class KernelParameters : protected HeapObject { uint32_t totalSize_; //!< The total size of all captured parameters struct { - uint32_t validated_ : 1; //!< True if all parameters are defined. - uint32_t execNewVcop_ : 1; //!< special new VCOP for kernel execution - uint32_t execPfpaVcop_ : 1; //!< special PFPA VCOP for kernel execution - uint32_t unused : 29; //!< unused + uint32_t validated_ : 1; //!< True if all parameters are defined. + uint32_t execNewVcop_ : 1; //!< special new VCOP for kernel execution + uint32_t execPfpaVcop_ : 1; //!< special PFPA VCOP for kernel execution + uint32_t deviceKernelArgs_:1; //!< Kernel arguments allocated on device + uint32_t unused : 28; //!< unused }; public: @@ -154,7 +155,8 @@ class KernelParameters : protected HeapObject { queueObjects_(nullptr), validated_(0), execNewVcop_(0), - execPfpaVcop_(0) { + execPfpaVcop_(0), + deviceKernelArgs_(false) { totalSize_ = signature.paramsSize() + (signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*); values_ = reinterpret_cast
(this) + alignUp(sizeof(KernelParameters), 16); @@ -179,7 +181,8 @@ class KernelParameters : protected HeapObject { totalSize_(rhs.totalSize_), validated_(rhs.validated_), execNewVcop_(rhs.execNewVcop_), - execPfpaVcop_(rhs.execPfpaVcop_) { + execPfpaVcop_(rhs.execPfpaVcop_), + deviceKernelArgs_(false) { values_ = reinterpret_cast
(this) + alignUp(sizeof(KernelParameters), 16); memoryObjOffset_ = signature_.paramsSize(); memoryObjects_ = reinterpret_cast(values_ + memoryObjOffset_); @@ -210,7 +213,7 @@ class KernelParameters : protected HeapObject { size_t localMemSize(size_t minDataTypeAlignment) const; //! Capture the state of the parameters and return the stack base pointer. - address capture(const Device& device, uint64_t lclMemSize, int32_t* error); + address capture(device::VirtualDevice& vDev, uint64_t lclMemSize, int32_t* error); //! Release the captured state of the parameters. void release(address parameters, const amd::Device& device) const; @@ -278,6 +281,9 @@ class KernelParameters : protected HeapObject { //! get the PFPA VCOP in the execInfo container bool getExecPfpaVcop() const { return (execPfpaVcop_ == 1); } + + //! Returns true if arguments were allocated on device + bool deviceKernelArgs() const { return (deviceKernelArgs_ == 1); } }; /*! \brief Encapsulates a __kernel function and the argument values diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index 8d12e99fd5..5f0891af98 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -266,6 +266,8 @@ release(bool, ROC_USE_FGS_KERNARG, true, \ "Use fine grain kernel args segment for supported asics") \ release(uint, ROC_AQL_QUEUE_SIZE, 4096, \ "AQL queue size in AQL packets") \ +release(bool, ROCR_SKIP_KERNEL_ARG_COPY, false, \ + "If true, then runtime can skip kernel arg copy") \ release(bool, GPU_STREAMOPS_CP_WAIT, false, \ "Force the stream wait memory operation to wait on CP.")