From 5b5b3b8cdcbe77c2d6fbf44d69d8ce123cec835d Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 4 Apr 2016 11:25:36 -0400 Subject: [PATCH] P4 to Git Change 1254144 by gandryey@gera-rcf-lnx on 2016/04/04 11:14:17 SWDEV-79445 - OCL generic changes and code clean-up - Move prepinned logic to the abstraction layer Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#193 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#270 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#543 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#158 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#398 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.cpp#61 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.hpp#29 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.cpp#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#62 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#3 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#3 edit [ROCm/clr commit: 8756fa14cbc6b533e562056e61446c4454006b3b] --- projects/clr/rocclr/runtime/device/device.cpp | 83 ++++++++++++++++++- projects/clr/rocclr/runtime/device/device.hpp | 16 +++- .../rocclr/runtime/device/gpu/gpudevice.cpp | 82 +----------------- .../rocclr/runtime/device/gpu/gpudevice.hpp | 30 ------- .../rocclr/runtime/device/gpu/gpuvirtual.cpp | 4 +- .../rocclr/runtime/device/pal/paldevice.cpp | 82 +----------------- .../rocclr/runtime/device/pal/paldevice.hpp | 30 ------- .../rocclr/runtime/device/pal/palvirtual.cpp | 4 +- 8 files changed, 109 insertions(+), 222 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp index e085459449..449eb52202 100644 --- a/projects/clr/rocclr/runtime/device/device.cpp +++ b/projects/clr/rocclr/runtime/device/device.cpp @@ -225,7 +225,13 @@ Device::tearDown() } Device::Device(Device* parent) - : settings_(NULL), online_(true), blitProgram_(NULL), hwDebugMgr_(NULL), parent_(parent) + : settings_(NULL) + , online_(true) + , blitProgram_(NULL) + , hwDebugMgr_(NULL) + , parent_(parent) + , vaCacheAccess_(nullptr) + , vaCacheMap_(nullptr) { memset(&info_, '\0', sizeof(info_)); if (parent_ != NULL) { @@ -235,6 +241,11 @@ Device::Device(Device* parent) Device::~Device() { + CondLog((vaCacheMap_ != nullptr) && + (vaCacheMap_->size() != 0), "Application didn't unmap all host memory!"); + delete vaCacheMap_; + delete vaCacheAccess_; + // Destroy device settings if (settings_ != NULL) { delete settings_; @@ -255,6 +266,20 @@ Device::~Device() } } +bool +Device::create() +{ + vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true); + if (NULL == vaCacheAccess_) { + return false; + } + vaCacheMap_ = new std::map(); + if (NULL == vaCacheMap_) { + return false; + } + return true; +} + bool Device::isAncestor(const Device* sub) const { @@ -319,6 +344,62 @@ Device::registerDevice() 
devices_->push_back(this); } +void +Device::addVACache(device::Memory* memory) const +{ + // Make sure system memory has direct access + if (memory->isHostMemDirectAccess()) { + // VA cache access must be serialised + amd::ScopedLock lk(*vaCacheAccess_); + void* start = memory->owner()->getHostMem(); + size_t offset; + device::Memory* doubleMap = findMemoryFromVA(start, &offset); + + if (doubleMap == nullptr) { + // Insert the new entry + vaCacheMap_->insert(std::pair + (reinterpret_cast(start), memory)); + } + else { + LogError("Unexpected double map() call from the app!"); + } + } +} + +void +Device::removeVACache(const device::Memory* memory) const +{ + // Make sure system memory has direct access + if (memory->isHostMemDirectAccess() && memory->owner()) { + // VA cache access must be serialised + amd::ScopedLock lk(*vaCacheAccess_); + void* start = memory->owner()->getHostMem(); + vaCacheMap_->erase(reinterpret_cast(start)); + } +} + +device::Memory* +Device::findMemoryFromVA(const void* ptr, size_t* offset) const +{ + // VA cache access must be serialised + amd::ScopedLock lk(*vaCacheAccess_); + + uintptr_t key = reinterpret_cast(ptr); + std::map::iterator it = vaCacheMap_->upper_bound( + reinterpret_cast(ptr)); + if (it == vaCacheMap_->begin()) { + return nullptr; + } + + --it; + device::Memory* mem = it->second; + if (key >= it->first && key < (it->first + mem->size())) { + // ptr is in the range + *offset = key - it->first; + return mem; + } + return nullptr; +} bool IsHsaRequested(cl_device_type requestedType) { // Depending on HSA_RUNTIME and hint flags CL_HSA_XXXXX_AMD, diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp index a94ada3787..71e1bb0493 100644 --- a/projects/clr/rocclr/runtime/device/device.hpp +++ b/projects/clr/rocclr/runtime/device/device.hpp @@ -1519,6 +1519,9 @@ public: Device(Device* parent = NULL); virtual ~Device(); + //! 
Initializes abstraction layer device object + bool create(); + //! Increment the reference count uint retain() { // Only increment the reference count of sub-devices @@ -1733,6 +1736,15 @@ public: //! Remove the Hardware Debug Manager virtual void hwDebugManagerRemove() {} + //! Adds GPU memory to the VA cache list + void addVACache(device::Memory* memory) const; + + //! Removes GPU memory from the VA cache list + void removeVACache(const device::Memory* memory) const; + + //! Finds GPU memory from virtual address + device::Memory* findMemoryFromVA(const void* ptr, size_t* offset) const; + protected: //! Enable the specified extension char* getExtensionString(); @@ -1757,7 +1769,9 @@ private: typedef std::vector::iterator device_iterator; static std::vector* devices_; //!< All known devices - Device* parent_; //!< This device's parent + Device* parent_; //!< This device's parent + Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access + std::map* vaCacheMap_; //!< VA cache map }; struct KernelParameterDescriptor diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp index 792b96a4e4..d15cea66be 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp @@ -735,8 +735,6 @@ Device::Device() , mapCacheOps_(NULL) , xferRead_(NULL) , xferWrite_(NULL) - , vaCacheAccess_(NULL) - , vaCacheList_(NULL) , mapCache_(NULL) , resourceCache_(NULL) , heapInitComplete_(false) @@ -752,9 +750,6 @@ Device::~Device() delete hwDebugMgr_; hwDebugMgr_ = NULL; - CondLog(vaCacheList_ == NULL || - (vaCacheList_->size() != 0), "Application didn't unmap all host memory!"); - delete srdManager_; for (uint s = 0; s < scratch_.size(); ++s) { @@ -795,8 +790,6 @@ Device::~Device() delete vgpusAccess_; delete scratchAlloc_; delete mapCacheOps_; - delete vaCacheAccess_; - delete vaCacheList_; if (context_ != NULL) { context_->release(); @@ -811,6 +804,10 @@ 
extern const char* SchedulerSourceCode; bool Device::create(CALuint ordinal, CALuint numOfDevices) { + if (!amd::Device::create()) { + return false; + } + appProfile_.init(); bool smallMemSystem = false; @@ -875,15 +872,6 @@ Device::create(CALuint ordinal, CALuint numOfDevices) return false; } - vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true); - if (NULL == vaCacheAccess_) { - return false; - } - vaCacheList_ = new std::list(); - if (NULL == vaCacheList_) { - return false; - } - mapCache_ = new std::vector(); if (mapCache_ == NULL) { return false; @@ -1895,68 +1883,6 @@ Device::globalFreeMemory(size_t* freeMemory) const return true; } -void -Device::addVACache(Memory* memory) const -{ - // Make sure system memory has direct access - if (memory->isHostMemDirectAccess()) { - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - void* start = memory->owner()->getHostMem(); - void* end = reinterpret_cast
(start) + memory->owner()->getSize(); - size_t offset; - Memory* doubleMap = findMemoryFromVA(start, &offset); - - if (doubleMap == NULL) { - // Allocate a new entry - VACacheEntry* entry = new VACacheEntry(start, end, memory); - if (entry != NULL) { - vaCacheList_->push_back(entry); - } - } - else { - LogError("Unexpected double map() call from the app!"); - } - } -} - -void -Device::removeVACache(const Memory* memory) const -{ - // Make sure system memory has direct access - if (memory->isHostMemDirectAccess() && memory->owner()) { - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - void* start = memory->owner()->getHostMem(); - void* end = reinterpret_cast
(start) + memory->owner()->getSize(); - - // Find VA cache entry for the specified memory - for (const auto& entry : *vaCacheList_) { - if (entry->startAddress_ == start) { - CondLog((entry->endAddress_ != end), "Incorrect VA range"); - delete entry; - vaCacheList_->remove(entry); - break; - } - } - } -} - -Memory* -Device::findMemoryFromVA(const void* ptr, size_t* offset) const -{ - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - for (const auto& entry : *vaCacheList_) { - if ((entry->startAddress_ <= ptr) && (entry->endAddress_ > ptr)) { - *offset = static_cast(reinterpret_cast(ptr) - - reinterpret_cast(entry->startAddress_)); - return entry->memory_; - } - } - return NULL; -} - amd::Memory* Device::findMapTarget(size_t size) const { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp index 02957051a2..55328d48ae 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp @@ -310,25 +310,6 @@ public: const Device& gpuDevice_; //!< GPU device object }; - //! Virtual address cache entry - struct VACacheEntry : public amd::HeapObject - { - void* startAddress_; //!< Start virtual address - void* endAddress_; //!< End virtual address - Memory* memory_; //!< GPU memory, associated with the range - - //! Constructor - VACacheEntry( - void* startAddress, //!< Start virtual address - void* endAddress, //!< End virtual address - Memory* memory //!< GPU memory object - ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {} - - private: - //! Disable default constructor - VACacheEntry(); - }; - struct ScratchBuffer : public amd::HeapObject { uint regNum_; //!< The number of used scratch registers @@ -502,15 +483,6 @@ public: //! Returns transfer buffer object XferBuffers& xferRead() const { return *xferRead_; } - //! 
Adds GPU memory to the VA cache list - void addVACache(Memory* memory) const; - - //! Removes GPU memory from the VA cache list - void removeVACache(const Memory* memory) const; - - //! Finds GPU memory from virtual address - Memory* findMemoryFromVA(const void* ptr, size_t* offset) const; - //! Finds an appropriate map target amd::Memory* findMapTarget(size_t size) const; @@ -618,8 +590,6 @@ private: XferBuffers* xferRead_; //!< Transfer buffers read XferBuffers* xferWrite_; //!< Transfer buffers write - amd::Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access - std::list* vaCacheList_; //!< VA cache list std::vector* mapCache_; //!< Map cache info structure ResourceCache* resourceCache_; //!< Resource cache Engines engines_; //!< Available engines on device diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index 843c7adf1c..30c268429c 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -660,7 +660,7 @@ VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) size_t offset = 0; // Find if virtual address is a CL allocation - gpu::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); profilingBegin(vcmd, true); @@ -765,7 +765,7 @@ VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) gpu::Memory* memory = dev().getGpuMemory(&vcmd.destination()); size_t offset = 0; // Find if virtual address is a CL allocation - gpu::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); profilingBegin(vcmd, true); diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index f8b754fda3..2406795ae8 100644 --- 
a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -559,8 +559,6 @@ Device::Device() , mapCacheOps_(nullptr) , xferRead_(nullptr) , xferWrite_(nullptr) - , vaCacheAccess_(nullptr) - , vaCacheList_(nullptr) , mapCache_(nullptr) , resourceCache_(nullptr) , numComputeEngines_(0) @@ -578,9 +576,6 @@ Device::~Device() delete hwDebugMgr_; hwDebugMgr_ = nullptr; - CondLog(vaCacheList_ == nullptr || - (vaCacheList_->size() != 0), "Application didn't unmap all host memory!"); - delete srdManager_; for (uint s = 0; s < scratch_.size(); ++s) { @@ -618,8 +613,6 @@ Device::~Device() delete vgpusAccess_; delete scratchAlloc_; delete mapCacheOps_; - delete vaCacheAccess_; - delete vaCacheList_; if (context_ != nullptr) { context_->release(); @@ -633,6 +626,10 @@ extern const char* SchedulerSourceCode; bool Device::create(Pal::IDevice* device) { + if (!amd::Device::create()) { + return false; + } + appProfile_.init(); device_ = device; Pal::Result result; @@ -721,15 +718,6 @@ Device::create(Pal::IDevice* device) return false; } - vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true); - if (nullptr == vaCacheAccess_) { - return false; - } - vaCacheList_ = new std::list(); - if (nullptr == vaCacheList_) { - return false; - } - mapCache_ = new std::vector(); if (mapCache_ == nullptr) { return false; @@ -1630,68 +1618,6 @@ Device::globalFreeMemory(size_t* freeMemory) const return true; } -void -Device::addVACache(Memory* memory) const -{ - // Make sure system memory has direct access - if (memory->isHostMemDirectAccess()) { - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - void* start = memory->owner()->getHostMem(); - void* end = reinterpret_cast
(start) + memory->owner()->getSize(); - size_t offset; - Memory* doubleMap = findMemoryFromVA(start, &offset); - - if (doubleMap == nullptr) { - // Allocate a new entry - VACacheEntry* entry = new VACacheEntry(start, end, memory); - if (entry != nullptr) { - vaCacheList_->push_back(entry); - } - } - else { - LogError("Unexpected double map() call from the app!"); - } - } -} - -void -Device::removeVACache(const Memory* memory) const -{ - // Make sure system memory has direct access - if (memory->isHostMemDirectAccess() && memory->owner()) { - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - void* start = memory->owner()->getHostMem(); - void* end = reinterpret_cast
(start) + memory->owner()->getSize(); - - // Find VA cache entry for the specified memory - for (const auto& entry : *vaCacheList_) { - if (entry->startAddress_ == start) { - CondLog((entry->endAddress_ != end), "Incorrect VA range"); - delete entry; - vaCacheList_->remove(entry); - break; - } - } - } -} - -Memory* -Device::findMemoryFromVA(const void* ptr, size_t* offset) const -{ - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - for (const auto& entry : *vaCacheList_) { - if ((entry->startAddress_ <= ptr) && (entry->endAddress_ > ptr)) { - *offset = static_cast(reinterpret_cast(ptr) - - reinterpret_cast(entry->startAddress_)); - return entry->memory_; - } - } - return nullptr; -} - amd::Memory* Device::findMapTarget(size_t size) const { diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp index 7439682df1..a1c32b595d 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp @@ -242,25 +242,6 @@ public: const Device& gpuDevice_; //!< GPU device object }; - //! Virtual address cache entry - struct VACacheEntry : public amd::HeapObject - { - void* startAddress_; //!< Start virtual address - void* endAddress_; //!< End virtual address - Memory* memory_; //!< GPU memory, associated with the range - - //! Constructor - VACacheEntry( - void* startAddress, //!< Start virtual address - void* endAddress, //!< End virtual address - Memory* memory //!< GPU memory object - ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {} - - private: - //! Disable default constructor - VACacheEntry(); - }; - struct ScratchBuffer : public amd::HeapObject { uint regNum_; //!< The number of used scratch registers @@ -418,15 +399,6 @@ public: //! Returns transfer buffer object XferBuffers& xferRead() const { return *xferRead_; } - //! 
Adds GPU memory to the VA cache list - void addVACache(Memory* memory) const; - - //! Removes GPU memory from the VA cache list - void removeVACache(const Memory* memory) const; - - //! Finds GPU memory from virtual address - Memory* findMemoryFromVA(const void* ptr, size_t* offset) const; - //! Finds an appropriate map target amd::Memory* findMapTarget(size_t size) const; @@ -569,8 +541,6 @@ private: amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources XferBuffers* xferRead_; //!< Transfer buffers read XferBuffers* xferWrite_; //!< Transfer buffers write - amd::Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access - std::list* vaCacheList_; //!< VA cache list std::vector* mapCache_; //!< Map cache info structure ResourceCache* resourceCache_; //!< Resource cache uint numComputeEngines_; //!< The number of available compute engines diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 1cc00ef49f..1b1c80c93b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -922,7 +922,7 @@ VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) size_t offset = 0; // Find if virtual address is a CL allocation - pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); profilingBegin(vcmd, true); @@ -1027,7 +1027,7 @@ VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) pal::Memory* memory = dev().getGpuMemory(&vcmd.destination()); size_t offset = 0; // Find if virtual address is a CL allocation - pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); profilingBegin(vcmd, true);