diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index bfcb18d81f..9b9d57d2a6 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1214,6 +1214,9 @@ class VirtualDevice : public amd::HeapObject { virtual void submitStreamOperation(amd::StreamOperationCommand& cmd) { ShouldNotReachHere(); } virtual void profilerAttach(bool enable) = 0; + + virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; } + //! Get the blit manager object device::BlitManager& blitMgr() const { return *blitMgr_; } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 460f8503e2..5091785324 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -733,10 +733,6 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para const_address srcArgPtr = params + desc.offset_; if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) { void* mem = allocKernArg(desc.size_, 128); - if (mem == nullptr) { - LogError("Out of memory"); - return false; - } memcpy(mem, srcArgPtr, desc.size_); const auto it = hsaKernel.patch().find(desc.offset_); WriteAqlArgAt(const_cast
(params), &mem, sizeof(void*), it->second); @@ -1240,6 +1236,17 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { return result; } +// ================================================================================================ +address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) { + if (ROCR_SKIP_KERNEL_ARG_COPY) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + return reinterpret_cast(allocKernArg(size, alignment)); + } else { + return nullptr; + } +} + // ================================================================================================ /* profilingBegin, when profiling is enabled, creates a timestamp to save in * virtualgpu's timestamp_, and calls start() to get the current host @@ -2703,17 +2710,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } } - // Find all parameters for the current kernel - - // Allocate buffer to hold kernel arguments - address argBuffer = (address)allocKernArg(gpuKernel.KernargSegmentByteSize(), - gpuKernel.KernargSegmentAlignment()); - - if (argBuffer == nullptr) { - LogError("Out of memory"); - return false; - } - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "ShaderName : %s", gpuKernel.name().c_str()); // Check if runtime has to setup hidden arguments @@ -2817,8 +2813,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } } - // Load all kernel arguments - WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0); + address argBuffer = const_cast(parameters); + // Find all parameters for the current kernel + if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) { + // Allocate buffer to hold kernel arguments + argBuffer = reinterpret_cast(allocKernArg(gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment())); + // Load all kernel arguments + WriteAqlArgAt(argBuffer, parameters, gpuKernel.KernargSegmentByteSize(), 0); + } + // Note: In a case of structs the size won't match, // since HSAIL compiler expects a reference... assert(gpuKernel.KernargSegmentByteSize() <= signature.paramsSize() && diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index 30cb1c8016..27840770c4 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -322,6 +322,9 @@ class VirtualGPU : public device::VirtualDevice { void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){} + + virtual address allocKernelArguments(size_t size, size_t alignment) final; + /** * @brief Waits on an outstanding kernel without regard to how * it was dispatched - with or without a signal diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp index b8da15fa82..a8d907d051 100644 --- a/rocclr/platform/command.cpp +++ b/rocclr/platform/command.cpp @@ -633,7 +633,8 @@ int32_t NDRangeKernelCommand::captureAndValidate() { int32_t error; uint64_t lclMemSize = kernel().getDeviceKernel(device)->workGroupInfo()->localMemSize_; - parameters_ = kernel().parameters().capture(device, sharedMemBytes_ + lclMemSize, &error); + parameters_ = kernel().parameters().capture(*queue()->vdev(), + sharedMemBytes_ + lclMemSize, &error); return error; } diff --git a/rocclr/platform/kernel.cpp b/rocclr/platform/kernel.cpp index 3fc579972b..80cb1a12be 100644 --- a/rocclr/platform/kernel.cpp +++ b/rocclr/platform/kernel.cpp @@ -154,14 +154,21 @@ void KernelParameters::set(size_t index, size_t size, const void* value, bool sv desc.info_.defined_ = true; } -address KernelParameters::capture(const Device& device, uint64_t lclMemSize, int32_t* error) { +address KernelParameters::capture(device::VirtualDevice& vDev, uint64_t lclMemSize, int32_t* error) { + const Device& device = vDev.device(); *error = CL_SUCCESS; + //! Information about which arguments are SVM pointers is stored after // the actual parameters, but only if the device has any SVM capability const size_t execInfoSize = getNumberOfSvmPtr() * sizeof(void*); - address mem = reinterpret_cast(AlignedMemory::allocate( - totalSize_ + execInfoSize, PARAMETERS_MIN_ALIGNMENT)); + address mem = vDev.allocKernelArguments(totalSize_ + execInfoSize, 128); + if (mem == nullptr) { + mem = reinterpret_cast(AlignedMemory::allocate(totalSize_ + execInfoSize, + PARAMETERS_MIN_ALIGNMENT)); + } else { + deviceKernelArgs_ = true; + } if (mem != nullptr) { ::memcpy(mem, values_, totalSize_); @@ -278,7 +285,9 @@ void KernelParameters::release(address mem, const amd::Device& device) const { } } - AlignedMemory::deallocate(mem); + if (!deviceKernelArgs()) { + AlignedMemory::deallocate(mem); + } } KernelSignature::KernelSignature(const std::vector