diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp index 5baa2bca91..b6d3b77974 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp @@ -1337,6 +1337,12 @@ gpu::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ? Resource::Remote : Resource::Local; + + // Check if runtime can force a tiny buffer into USWC memory + if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) && + (owner.getMemFlags() & CL_MEM_READ_ONLY)) { + type = Resource::RemoteUSWC; + } if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) { type = Resource::BusAddressable; diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp index 3796aa2e9a..bdb7687f54 100644 --- a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp @@ -416,7 +416,8 @@ Memory::~Memory() { if ((owner() != nullptr) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) && (memoryType() != Resource::ExternalPhysical)) { // Unmap memory if direct access was requested - unmap(nullptr); + // Note: runtime will perform unmap on the actual resource destruction + //unmap(nullptr); } } diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp index f06b85268d..36a971ec85 100644 --- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -744,6 +744,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) { } } } + // Check if memory is locked already and restore CPU pointer + if (memRef_->cpuAddress_ != nullptr) { + address_ = memRef_->cpuAddress_; + memRef_->cpuAddress_ = nullptr; + mapCount_++; + } Pal::BufferViewInfo viewInfo = {}; viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); viewInfo.range = memRef_->iMem()->Desc().size; @@ -890,6 +896,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) { LogWarning("Image is bigger than the original mem object!"); } } + // Check if memory is locked already and restore CPU pointer + if (memRef_->cpuAddress_ != nullptr) { + address_ = memRef_->cpuAddress_; + memRef_->cpuAddress_ = nullptr; + mapCount_++; + } result = image_->BindGpuMemory(memRef_->gpuMem_, offset_); if (result != Pal::Result::Success) { @@ -1070,6 +1082,12 @@ bool Resource::create(MemoryType memType, CreateParams* params) { return false; } } + // Check if memory is locked already and restore CPU pointer + if (memRef_->cpuAddress_ != nullptr) { + address_ = memRef_->cpuAddress_; + memRef_->cpuAddress_ = nullptr; + mapCount_++; + } return true; } @@ -1079,7 +1097,7 @@ void Resource::free() { } // Sanity check for the map calls - if (mapCount_ != 0) { + if ((mapCount_ != 0) && (memoryType() != Remote) && (memoryType() != RemoteUSWC)) { LogWarning("Resource wasn't unlocked, but destroyed!"); } const bool wait = @@ -1105,10 +1123,16 @@ void Resource::free() { if (renames_.size() == 0) { // Destroy GSL resource if (iMem() != 0) { - //! @note: This is a workaround for bad applications that - //! don't unmap memory if (mapCount_ != 0) { - unmap(nullptr); + if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) { + //! @note: This is a workaround for bad applications that + //! don't unmap memory + unmap(nullptr); + } else { + // Delay CPU address unmap until memRef_ destruction + assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address"); + memRef_->cpuAddress_ = address_; + } } // Add resource to the cache if it's not assigned to a specific queue diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 855574e0e2..95c9667e41 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -298,7 +298,7 @@ bool VirtualGPU::Queue::flush() { palSdiRefs_.resize(0); // Remove old memory references - if ((memReferences_.size() > 1024) || (residency_size_ > residency_limit_)) { + if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) { for (auto it = memReferences_.begin(); it != memReferences_.end();) { if (it->second == cmdBufIdSlot_) { palMems_.push_back(it->first->iMem()); diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index b6e9c4997a..ef4d1644a7 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -41,8 +41,8 @@ class VirtualGPU : public device::VirtualDevice { public: class Queue : public amd::HeapObject { public: - static const uint MaxCmdBuffers = 8; - static const uint MaxCommands = 512; + static const uint MaxCmdBuffers = 16; + static const uint MaxCommands = 256; static const uint StartCmdBufIdx = 1; static const uint FirstMemoryReference = 0x80000000; static const uint64_t WaitTimeoutInNsec = 6000000000;