diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp index 175085d612..281fe5b4af 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp @@ -1532,43 +1532,37 @@ void Device::updateFreeMemory(size_t size, bool free) { void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { amd::Memory* mem = nullptr; + if (nullptr == svmPtr) { - bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; - void* ptr = hostAlloc(size, alignment, atomics); - - if (ptr != nullptr) { - // Copy paste from ORCA code. - // create a hidden buffer, which will allocated on the device later - mem = new (context) amd::Buffer(context, (CL_MEM_USE_HOST_PTR | (flags & 0xFFFF0000)), size, ptr); - if (mem == nullptr) { - LogError("failed to create a svm mem object!"); - return nullptr; - } - - if (!mem->create(ptr)) { - LogError("failed to create a svm hidden buffer!"); - mem->release(); - return nullptr; - } - - // add the information to context so that we can use it later. - amd::MemObjMap::AddMemObj(ptr, mem); - - return ptr; - } else { + // create a hidden buffer, which will allocated on the device later + mem = new (context) amd::Buffer(context, flags, size, reinterpret_cast(1)); + if (mem == nullptr) { + LogError("failed to create a svm mem object!"); return nullptr; } + + if (!mem->create(nullptr)) { + LogError("failed to create a svm hidden buffer!"); + mem->release(); + return nullptr; + } + // if the device supports SVM FGS, return the committed CPU address directly. + Memory* gpuMem = getRocMemory(mem); + + // add the information to context so that we can use it later. + amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem); + svmPtr = mem->getSvmPtr(); } else { - // Copy paste from ORCA code. 
// Find the existing amd::mem object mem = amd::MemObjMap::FindMemObj(svmPtr); - if (nullptr == mem) { return nullptr; } - return svmPtr; + svmPtr = mem->getSvmPtr(); } + + return svmPtr; } void Device::svmFree(void* ptr) const { @@ -1577,7 +1571,6 @@ void Device::svmFree(void* ptr) const { if (nullptr != svmMem) { svmMem->release(); amd::MemObjMap::RemoveMemObj(ptr); - hostFree(ptr); } } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp index fab34ad063..7f8096bc3a 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp @@ -581,7 +581,27 @@ void Buffer::destroy() { return; } - const cl_mem_flags memFlags = owner()->getMemFlags(); + cl_mem_flags memFlags = owner()->getMemFlags(); + + if (owner()->getSvmPtr() != nullptr) { + if (!dev().settings().enableCoarseGrainSVM_) { + memFlags |= CL_MEM_SVM_FINE_GRAIN_BUFFER; + } + const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER; + + if (isFineGrain) { + dev().hostFree(deviceMemory_, size()); + } else { + dev().memFree(deviceMemory_, size()); + } + + if (dev().settings().apuSystem_ || !isFineGrain) { + const_cast(dev()).updateFreeMemory(size(), true); + } + + return; + } + #ifdef WITH_AMDGPU_PRO if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) { dev().iPro().FreeDmaBuffer(deviceMemory_); @@ -628,6 +648,34 @@ bool Buffer::create() { return false; } + // Allocate backing storage in device local memory unless UHP or AHP are set + cl_mem_flags memFlags = owner()->getMemFlags(); + + if (owner()->getSvmPtr() != nullptr) { + if (!dev().settings().enableCoarseGrainSVM_) { + memFlags |= CL_MEM_SVM_FINE_GRAIN_BUFFER; + flags_ |= HostMemoryDirectAccess; + } + const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER; + + if (owner()->getSvmPtr() == reinterpret_cast(1)) { + if (isFineGrain) { + deviceMemory_ = dev().hostAlloc(size(), 1, false); + } else { + 
deviceMemory_ = dev().deviceLocalAlloc(size()); + } + owner()->setSvmPtr(deviceMemory_); + } else { + deviceMemory_ = owner()->getSvmPtr(); + } + + if (dev().settings().apuSystem_ || !isFineGrain) { + const_cast(dev()).updateFreeMemory(size(), false); + } + + return deviceMemory_ != nullptr; + } + // Interop buffer if (owner()->isInterop()) return createInteropBuffer(GL_ARRAY_BUFFER, 0); @@ -658,9 +706,6 @@ bool Buffer::create() { return true; } - // Allocate backing storage in device local memory unless UHP or AHP are set - const cl_mem_flags memFlags = owner()->getMemFlags(); - #ifdef WITH_AMDGPU_PRO if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) { void* host_ptr = nullptr; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp index 6c6c7d71da..039016781d 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp @@ -20,6 +20,7 @@ Settings::Settings() { pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION; enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; + enableCoarseGrainSVM_ = HSA_ENABLE_COARSE_GRAIN_SVM; maxWorkGroupSize_ = 1024; preferredWorkGroupSize_ = 256; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp index d3c601de4c..d7d6dd11ef 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp @@ -22,6 +22,7 @@ class Settings : public device::Settings { uint doublePrecision_ : 1; //!< Enables double precision support uint pollCompletion_ : 1; //!< Enables polling in HSA uint enableLocalMemory_ : 1; //!< Enable GPUVM memory + uint enableCoarseGrainSVM_ : 1; //!< Enable device memory for coarse grain SVM allocations uint enableNCMode_ : 1; //!< Enable Non Coherent mode for system memory uint enablePartialDispatch_ : 1; //!< Enable support for Partial 
Dispatch uint imageDMA_ : 1; //!< Enable direct image DMA transfers @@ -29,7 +30,7 @@ class Settings : public device::Settings { uint stagedXferWrite_ : 1; //!< Uses a staged buffer write uint singleFpDenorm_ : 1; //!< Support Single FP Denorm uint apuSystem_ : 1; //!< APU system - uint reserved_ : 20; + uint reserved_ : 21; }; uint value_; }; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index 9d9cdecd34..47b2f424d0 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1109,94 +1109,70 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { profilingEnd(cmd); } -void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); - amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); - amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); - profilingEnd(cmd); -} - -void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - Memory* srcDevMem = dev().getRocMemory(&cmd.source()); - Memory* dstDevMem = dev().getRocMemory(&cmd.destination()); +bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem, + bool entire, const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) { + Memory* srcDevMem = dev().getRocMemory(&srcMem); + Memory* dstDevMem = dev().getRocMemory(&dstMem); // 
Synchronize source and destination memory device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = cmd.isEntireMemory(); + syncFlags.skipEntire_ = entire; dstDevMem->syncCacheFromHost(*this, syncFlags); srcDevMem->syncCacheFromHost(*this); - amd::Coord3D size = cmd.size(); - - cl_command_type type = cmd.type(); bool result = false; bool srcImageBuffer = false; bool dstImageBuffer = false; // Force buffer copy for IMAGE1D_BUFFER - if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + if (srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { srcImageBuffer = true; type = CL_COMMAND_COPY_BUFFER; } - if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + if (dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { dstImageBuffer = true; type = CL_COMMAND_COPY_BUFFER; } - switch (cmd.type()) { + switch (type) { + case CL_COMMAND_SVM_MEMCPY: case CL_COMMAND_COPY_BUFFER: { - amd::Coord3D srcOrigin(cmd.srcOrigin()[0]); - amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); + amd::Coord3D realSrcOrigin(srcOrigin[0]); + amd::Coord3D realDstOrigin(dstOrigin[0]); + amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]); if (srcImageBuffer) { - const size_t elemSize = cmd.source().asImage()->getImageFormat().getElementSize(); - srcOrigin.c[0] *= elemSize; + const size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize(); + realSrcOrigin.c[0] *= elemSize; if (dstImageBuffer) { - dstOrigin.c[0] *= elemSize; + realDstOrigin.c[0] *= elemSize; } - size.c[0] *= elemSize; + realSize.c[0] *= elemSize; } else if (dstImageBuffer) { - const size_t elemSize = cmd.destination().asImage()->getImageFormat().getElementSize(); - dstOrigin.c[0] *= elemSize; - size.c[0] *= elemSize; + const size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize(); + realDstOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; } - result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, - cmd.isEntireMemory()); + result = 
blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize, entire); break; } case CL_COMMAND_COPY_BUFFER_RECT: { - result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, - cmd.isEntireMemory()); + result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, srcRect, dstRect, size, entire); break; } case CL_COMMAND_COPY_IMAGE: { - result = blitMgr().copyImage(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), size, - cmd.isEntireMemory()); + result = blitMgr().copyImage(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire); break; } case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { - result = blitMgr().copyImageToBuffer(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), - size, cmd.isEntireMemory()); + result = blitMgr().copyImageToBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire); break; } case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { - result = blitMgr().copyBufferToImage(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), - size, cmd.isEntireMemory()); + result = blitMgr().copyBufferToImage(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire); break; } default: @@ -1206,11 +1182,103 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { if (!result) { LogError("submitCopyMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); + return false; } - cmd.destination().signalWrite(&dev()); + // Mark this as the most-recently written cache of the destination + dstMem.signalWrite(&dev()); + return true; +} +void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); + + profilingBegin(cmd); + + cl_command_type type = cmd.type(); + bool entire = cmd.isEntireMemory(); + + if (!copyMemory(type, cmd.source(), cmd.destination(), entire, cmd.srcOrigin(), + cmd.dstOrigin(), cmd.size(), cmd.srcRect(), cmd.dstRect())) { + cmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(cmd); +} + +void 
VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { + // in-order semantics: previous commands need to be done before we start + releaseGpuMemoryFence(); + + profilingBegin(cmd); + // no op for FGS supported device + if (!dev().isFineGrainedSystem() && + dev().settings().enableCoarseGrainSVM_) { + amd::Coord3D srcOrigin(0, 0, 0); + amd::Coord3D dstOrigin(0, 0, 0); + amd::Coord3D size(cmd.srcSize(), 1, 1); + amd::BufferRect srcRect; + amd::BufferRect dstRect; + + bool result = false; + amd::Memory* srcMem = amd::MemObjMap::FindMemObj(cmd.src()); + amd::Memory* dstMem = amd::MemObjMap::FindMemObj(cmd.dst()); + + device::Memory::SyncFlags syncFlags; + if (nullptr != srcMem) { + srcMem->commitSvmMemory(); + srcOrigin.c[0] = + static_cast(cmd.src()) - static_cast
(srcMem->getSvmPtr()); + if (!(srcMem->validateRegion(srcOrigin, size))) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + if (nullptr != dstMem) { + dstMem->commitSvmMemory(); + dstOrigin.c[0] = + static_cast(cmd.dst()) - static_cast
(dstMem->getSvmPtr()); + if (!(dstMem->validateRegion(dstOrigin, size))) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + + if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space + amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize()); + result = true; + } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space + Memory* memory = dev().getRocMemory(dstMem); + // Synchronize source and destination memory + syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); + memory->syncCacheFromHost(*this, syncFlags); + + result = blitMgr().writeBuffer(cmd.src(), *memory, dstOrigin, size, + dstMem->isEntirelyCovered(dstOrigin, size)); + // Mark this as the most-recently written cache of the destination + dstMem->signalWrite(&dev()); + } else if (nullptr != srcMem && nullptr == dstMem) { // dst not in svm space + Memory* memory = dev().getRocMemory(srcMem); + // Synchronize source and destination memory + memory->syncCacheFromHost(*this); + + result = blitMgr().readBuffer(*memory, cmd.dst(), srcOrigin, size, + srcMem->isEntirelyCovered(srcOrigin, size)); + } else if (nullptr != srcMem && nullptr != dstMem) { // both in svm space + bool entire = + srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size); + result = + copyMemory(cmd.type(), *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect); + } + + if (!result) { + cmd.setStatus(CL_INVALID_OPERATION); + } + } else { + // direct memcpy for FGS enabled system + amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); + } profilingEnd(cmd); } @@ -1480,53 +1548,48 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { profilingEnd(cmd); } -void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); +bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, + size_t patternSize, const 
amd::Coord3D& origin, + const amd::Coord3D& size) { + Memory* memory = dev().getRocMemory(amdMemory); - profilingBegin(cmd); - - Memory* memory = dev().getRocMemory(&cmd.memory()); - - bool entire = cmd.isEntireMemory(); + bool entire = amdMemory->isEntirelyCovered(origin, size); // Synchronize memory from host if necessary device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; memory->syncCacheFromHost(*this, syncFlags); - cl_command_type type = cmd.type(); bool result = false; bool imageBuffer = false; float fillValue[4]; // Force fill buffer for IMAGE1D_BUFFER - if ((type == CL_COMMAND_FILL_IMAGE) && (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { type = CL_COMMAND_FILL_BUFFER; imageBuffer = true; } // Find the the right fill operation switch (type) { + case CL_COMMAND_SVM_MEMFILL: case CL_COMMAND_FILL_BUFFER: { - const void* pattern = cmd.pattern(); - size_t patternSize = cmd.patternSize(); - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); + amd::Coord3D realOrigin(origin[0]); + amd::Coord3D realSize(size[0]); // Reprogram fill parameters if it's an IMAGE1D_BUFFER object if (imageBuffer) { - size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; + size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize(); + realOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; memset(fillValue, 0, sizeof(fillValue)); - cmd.memory().asImage()->getImageFormat().formatColor(pattern, fillValue); + amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); pattern = fillValue; patternSize = elemSize; } - result = blitMgr().fillBuffer(*memory, pattern, patternSize, origin, size, entire); + result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, entire); break; } case CL_COMMAND_FILL_IMAGE: { - result = 
blitMgr().fillImage(*memory, cmd.pattern(), cmd.origin(), cmd.size(), entire); + result = blitMgr().fillImage(*memory, pattern, origin, size, entire); break; } default: @@ -1536,10 +1599,62 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { if (!result) { LogError("submitFillMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); + // Propagate the failure: callers set the command status when fillMemory() returns false + return false; } - cmd.memory().signalWrite(&dev()); + amdMemory->signalWrite(&dev()); + return true; +} + +void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); + + profilingBegin(cmd); + + if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(), + cmd.size())) { + cmd.setStatus(CL_INVALID_OPERATION); + } + profilingEnd(cmd); +} + +void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { + // in-order semantics: previous commands need to be done before we start + releaseGpuMemoryFence(); + + profilingBegin(cmd); + + if (!dev().isFineGrainedSystem() && + dev().settings().enableCoarseGrainSVM_) { + size_t patternSize = cmd.patternSize(); + size_t fillSize = patternSize * cmd.times(); + amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst()); + assert(dstMemory && "No svm Buffer to fill with!"); + size_t offset = reinterpret_cast(cmd.dst()) - + reinterpret_cast(dstMemory->getSvmPtr()); + + Memory* memory = dev().getRocMemory(dstMemory); + + amd::Coord3D origin(offset, 0, 0); + amd::Coord3D size(fillSize, 1, 1); + + assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size); + memory->syncCacheFromHost(*this, syncFlags); + + if (!fillMemory(cmd.type(), dstMemory, cmd.pattern(), cmd.patternSize(), origin, size)) { + cmd.setStatus(CL_INVALID_OPERATION); + } + // Mark this as the most-recently written cache of the destination +
dstMemory->signalWrite(&dev()); + } else { + // for FGS capable device, fill CPU memory directly + amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); + } profilingEnd(cmd); } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp index 40758f8fd5..32b4237403 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -179,6 +179,10 @@ class VirtualGPU : public device::VirtualDevice { void submitFillMemory(amd::FillMemoryCommand& cmd); void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); + void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); + void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); + void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); + // { roc OpenCL integration // Added these stub (no-ops) implementation of pure virtual methods, // when integrating HSA and OpenCL branches. @@ -187,9 +191,6 @@ class VirtualGPU : public device::VirtualDevice { virtual void submitSignal(amd::SignalCommand& cmd) {} virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); @@ -262,6 +263,27 @@ class VirtualGPU : public device::VirtualDevice { //! Returns TRUE if virtual queue was successfully allocatted bool createVirtualQueue(uint deviceQueueSize); + //! 
Common function for fill memory used by both svm Fill and non-svm fill + bool fillMemory(cl_command_type type, //!< the command type + amd::Memory* amdMemory, //!< memory object to fill + const void* pattern, //!< pattern to fill the memory + size_t patternSize, //!< pattern size + const amd::Coord3D& origin, //!< memory origin + const amd::Coord3D& size //!< memory size for filling + ); + + //! Common function for memory copy used by both svm Copy and non-svm Copy + bool copyMemory(cl_command_type type, //!< the command type + amd::Memory& srcMem, //!< source memory object + amd::Memory& dstMem, //!< destination memory object + bool entire, //!< flag of entire memory copy + const amd::Coord3D& srcOrigin, //!< source memory origin + const amd::Coord3D& dstOrigin, //!< destination memory object + const amd::Coord3D& size, //!< copy size + const amd::BufferRect& srcRect, //!< region of source for copy + const amd::BufferRect& dstRect //!< region of destination for copy + ); + //! Updates AQL header for the upcomming dispatch void setAqlHeader(uint16_t header) { aqlHeader_ = header; } diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index 27017cf5de..5fe68572cc 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -145,6 +145,8 @@ release(uint, HSA_SIGNAL_POOL_SIZE, 16, \ "Signal object pool size") \ release(bool, HSA_ENABLE_ATOMICS_32B, false, \ "1 = Enable SVM atomics in 32 bits (HSA backend-only). Any other value keeps then disabled.") \ +release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, IS_HIP, \ + "Enable device memory for coarse grain SVM allocations") \ release(bool, GPU_IFH_MODE, false, \ "1 = Enable GPU IFH (infinitely fast hardware) mode. 
Any other value keeps setting disabled.") \ release(bool, GPU_MIPMAP, true, \ diff --git a/projects/clr/rocclr/runtime/utils/macros.hpp b/projects/clr/rocclr/runtime/utils/macros.hpp index 576e44c78d..30a549a45a 100644 --- a/projects/clr/rocclr/runtime/utils/macros.hpp +++ b/projects/clr/rocclr/runtime/utils/macros.hpp @@ -107,6 +107,10 @@ #define NOT_LIGHTNING(x) x #endif /* !WITH_LIGHTNING_COMPILER */ +#ifdef BUILD_HIP +#define IS_HIP true +#endif + #ifndef IS_LINUX #define IS_LINUX false #endif @@ -119,6 +123,9 @@ #ifndef IS_LIGHTNING #define IS_LIGHTNING false #endif +#ifndef IS_HIP +#define IS_HIP false +#endif #define IF_LEFT_true(x) x #define IF_LEFT_false(x)