From 5b5b3b8cdcbe77c2d6fbf44d69d8ce123cec835d Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Mon, 4 Apr 2016 11:25:36 -0400
Subject: [PATCH] P4 to Git Change 1254144 by gandryey@gera-rcf-lnx on
 2016/04/04 11:14:17

	SWDEV-79445 - OCL generic changes and code clean-up
	- Move prepinned logic to the abstraction layer

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#193 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#270 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#543 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#158 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#398 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.cpp#61 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.hpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.cpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#3 edit


[ROCm/clr commit: 8756fa14cbc6b533e562056e61446c4454006b3b]
---
 projects/clr/rocclr/runtime/device/device.cpp | 83 ++++++++++++++++++-
 projects/clr/rocclr/runtime/device/device.hpp | 16 +++-
 .../rocclr/runtime/device/gpu/gpudevice.cpp   | 82 +-----------------
 .../rocclr/runtime/device/gpu/gpudevice.hpp   | 30 -------
 .../rocclr/runtime/device/gpu/gpuvirtual.cpp  |  4 +-
 .../rocclr/runtime/device/pal/paldevice.cpp   | 82 +-----------------
 .../rocclr/runtime/device/pal/paldevice.hpp   | 30 -------
 .../rocclr/runtime/device/pal/palvirtual.cpp  |  4 +-
 8 files changed, 109 insertions(+), 222 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp
index e085459449..449eb52202 100644
--- a/projects/clr/rocclr/runtime/device/device.cpp
+++ b/projects/clr/rocclr/runtime/device/device.cpp
@@ -225,7 +225,13 @@ Device::tearDown()
 }
 
 Device::Device(Device* parent)
-  : settings_(NULL), online_(true), blitProgram_(NULL), hwDebugMgr_(NULL), parent_(parent)
+    : settings_(NULL)
+    , online_(true)
+    , blitProgram_(NULL)
+    , hwDebugMgr_(NULL)
+    , parent_(parent)
+    , vaCacheAccess_(nullptr)
+    , vaCacheMap_(nullptr)
 {
     memset(&info_, '\0', sizeof(info_));
     if (parent_ != NULL) {
@@ -235,6 +241,11 @@ Device::Device(Device* parent)
 
 Device::~Device()
 {
+    CondLog((vaCacheMap_ != nullptr) &&
+        (vaCacheMap_->size() != 0), "Application didn't unmap all host memory!");
+    delete vaCacheMap_;
+    delete vaCacheAccess_;
+
     // Destroy device settings
     if (settings_ != NULL) {
         delete settings_;
@@ -255,6 +266,20 @@ Device::~Device()
     }
 }
 
+bool
+Device::create()
+{
+    // Lock guarding every access to the VA cache map
+    vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true);
+    if (vaCacheAccess_ == nullptr) {
+        return false;
+    }
+    // Lookup table: host VA start address -> device memory object
+    vaCacheMap_ = new std::map<uintptr_t, device::Memory*>();
+    // Both objects are released by ~Device(), including after a failure here
+    return (vaCacheMap_ != nullptr);
+}
+
 bool
 Device::isAncestor(const Device* sub) const
 {
@@ -319,6 +344,62 @@ Device::registerDevice()
     devices_->push_back(this);
 }
 
+void
+Device::addVACache(device::Memory* memory) const
+{
+    // Only directly accessible host memory with a valid owner is tracked
+    // (same guard as removeVACache(); owner() supplies the host pointer)
+    if (memory->isHostMemDirectAccess() && memory->owner()) {
+        // VA cache access must be serialised
+        amd::ScopedLock lk(*vaCacheAccess_);
+        void*   start = memory->owner()->getHostMem();
+        size_t  offset;
+        // A hit means the start address lies inside a range already cached
+        if (findMemoryFromVA(start, &offset) == nullptr) {
+            // Insert the new entry, keyed by the host start address
+            vaCacheMap_->insert(std::pair<uintptr_t, device::Memory*>
+                (reinterpret_cast<uintptr_t>(start), memory));
+        }
+        else {
+            LogError("Unexpected double map() call from the app!");
+        }
+    }
+}
+
+void
+Device::removeVACache(const device::Memory* memory) const
+{
+    // Skip memory that is not direct-access host memory or has no owner
+    if (!memory->isHostMemDirectAccess() || !memory->owner()) {
+        return;
+    }
+    // VA cache access must be serialised
+    amd::ScopedLock lk(*vaCacheAccess_);
+    vaCacheMap_->erase(reinterpret_cast<uintptr_t>(memory->owner()->getHostMem()));
+}
+
+device::Memory*
+Device::findMemoryFromVA(const void* ptr, size_t* offset) const
+{
+    // VA cache access must be serialised
+    amd::ScopedLock lk(*vaCacheAccess_);
+
+    const uintptr_t key = reinterpret_cast<uintptr_t>(ptr);
+    // First entry that starts above ptr; its predecessor is the only candidate
+    std::map<uintptr_t, device::Memory*>::iterator it = vaCacheMap_->upper_bound(key);
+    if (it == vaCacheMap_->begin()) {
+        return nullptr;
+    }
+
+    --it;
+    // it->first <= key is guaranteed here, so the subtraction cannot wrap
+    const size_t delta = static_cast<size_t>(key - it->first);
+    if (delta < it->second->size()) {
+        *offset = delta;
+        return it->second;
+    }
+    return nullptr;
+}
 
 bool IsHsaRequested(cl_device_type requestedType) {
 // Depending on HSA_RUNTIME and hint flags CL_HSA_XXXXX_AMD,
diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp
index a94ada3787..71e1bb0493 100644
--- a/projects/clr/rocclr/runtime/device/device.hpp
+++ b/projects/clr/rocclr/runtime/device/device.hpp
@@ -1519,6 +1519,9 @@ public:
     Device(Device* parent = NULL);
     virtual ~Device();
 
+    //! Initializes abstraction layer device object
+    bool create();
+
     //! Increment the reference count
     uint retain() {
         // Only increment the reference count of sub-devices
@@ -1733,6 +1736,15 @@ public:
     //! Remove the Hardware Debug Manager
     virtual void hwDebugManagerRemove() {}
 
+    //! Adds GPU memory to the VA cache list
+    void addVACache(device::Memory* memory) const;
+
+    //! Removes GPU memory from the VA cache list
+    void removeVACache(const device::Memory* memory) const;
+
+    //! Finds GPU memory from virtual address
+    device::Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
+
 protected:
     //! Enable the specified extension
     char* getExtensionString();
@@ -1757,7 +1769,9 @@ private:
     typedef std::vector<Device*>::iterator device_iterator;
     static std::vector<Device*>* devices_; //!< All known devices
 
-    Device*         parent_;   //!< This device's parent
+    Device*     parent_;        //!< This device's parent
+    Monitor*    vaCacheAccess_; //!< Lock to serialize VA caching access
+    std::map<uintptr_t, device::Memory*>* vaCacheMap_;  //!< VA cache map
 };
 
 struct KernelParameterDescriptor
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index 792b96a4e4..d15cea66be 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -735,8 +735,6 @@ Device::Device()
     , mapCacheOps_(NULL)
     , xferRead_(NULL)
     , xferWrite_(NULL)
-    , vaCacheAccess_(NULL)
-    , vaCacheList_(NULL)
     , mapCache_(NULL)
     , resourceCache_(NULL)
     , heapInitComplete_(false)
@@ -752,9 +750,6 @@ Device::~Device()
     delete hwDebugMgr_;
     hwDebugMgr_ = NULL;
 
-    CondLog(vaCacheList_ == NULL ||
-        (vaCacheList_->size() != 0), "Application didn't unmap all host memory!");
-
     delete srdManager_;
 
     for (uint s = 0; s < scratch_.size(); ++s) {
@@ -795,8 +790,6 @@ Device::~Device()
     delete vgpusAccess_;
     delete scratchAlloc_;
     delete mapCacheOps_;
-    delete vaCacheAccess_;
-    delete vaCacheList_;
 
     if (context_ != NULL) {
         context_->release();
@@ -811,6 +804,10 @@ extern const char* SchedulerSourceCode;
 bool
 Device::create(CALuint ordinal, CALuint numOfDevices)
 {
+    if (!amd::Device::create()) {
+        return false;
+    }
+
     appProfile_.init();
 
     bool smallMemSystem = false;
@@ -875,15 +872,6 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
         return false;
     }
 
-    vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true);
-    if (NULL == vaCacheAccess_) {
-        return false;
-    }
-    vaCacheList_ = new std::list<VACacheEntry*>();
-    if (NULL == vaCacheList_) {
-        return false;
-    }
-
     mapCache_ = new std::vector<amd::Memory*>();
     if (mapCache_ == NULL) {
         return false;
@@ -1895,68 +1883,6 @@ Device::globalFreeMemory(size_t* freeMemory) const
     return true;
 }
 
-void
-Device::addVACache(Memory* memory) const
-{
-    // Make sure system memory has direct access
-    if (memory->isHostMemDirectAccess()) {
-        // VA cache access must be serialised
-        amd::ScopedLock lk(*vaCacheAccess_);
-        void*   start = memory->owner()->getHostMem();
-        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();
-        size_t  offset;
-        Memory*   doubleMap = findMemoryFromVA(start, &offset);
-
-        if (doubleMap == NULL) {
-            // Allocate a new entry
-            VACacheEntry*   entry = new VACacheEntry(start, end, memory);
-            if (entry != NULL) {
-                vaCacheList_->push_back(entry);
-            }
-        }
-        else {
-            LogError("Unexpected double map() call from the app!");
-        }
-    }
-}
-
-void
-Device::removeVACache(const Memory* memory) const
-{
-    // Make sure system memory has direct access
-    if (memory->isHostMemDirectAccess() && memory->owner()) {
-        // VA cache access must be serialised
-        amd::ScopedLock lk(*vaCacheAccess_);
-        void*   start = memory->owner()->getHostMem();
-        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();
-
-        // Find VA cache entry for the specified memory
-        for (const auto& entry : *vaCacheList_) {
-            if (entry->startAddress_ == start) {
-                CondLog((entry->endAddress_ != end), "Incorrect VA range");
-                delete entry;
-                vaCacheList_->remove(entry);
-                break;
-            }
-        }
-    }
-}
-
-Memory*
-Device::findMemoryFromVA(const void* ptr, size_t* offset) const
-{
-    // VA cache access must be serialised
-    amd::ScopedLock lk(*vaCacheAccess_);
-    for (const auto& entry : *vaCacheList_) {
-        if ((entry->startAddress_ <= ptr) && (entry->endAddress_ > ptr)) {
-            *offset = static_cast<size_t>(reinterpret_cast<const char*>(ptr) -
-                reinterpret_cast<char*>(entry->startAddress_));
-            return entry->memory_;
-        }
-    }
-    return NULL;
-}
-
 amd::Memory*
 Device::findMapTarget(size_t size) const
 {
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
index 02957051a2..55328d48ae 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -310,25 +310,6 @@ public:
         const Device&           gpuDevice_;     //!< GPU device object
     };
 
-    //! Virtual address cache entry
-    struct VACacheEntry : public amd::HeapObject
-    {
-        void*   startAddress_;  //!< Start virtual address
-        void*   endAddress_;    //!< End virtual address
-        Memory* memory_;        //!< GPU memory, associated with the range
-
-        //! Constructor
-        VACacheEntry(
-            void*   startAddress,   //!< Start virtual address
-            void*   endAddress,     //!< End virtual address
-            Memory* memory          //!< GPU memory object
-            ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
-
-    private:
-        //! Disable default constructor
-        VACacheEntry();
-    };
-
     struct ScratchBuffer : public amd::HeapObject
     {
         uint    regNum_;    //!< The number of used scratch registers
@@ -502,15 +483,6 @@ public:
     //! Returns transfer buffer object
     XferBuffers& xferRead() const { return *xferRead_; }
 
-    //! Adds GPU memory to the VA cache list
-    void addVACache(Memory* memory) const;
-
-    //! Removes GPU memory from the VA cache list
-    void removeVACache(const Memory* memory) const;
-
-    //! Finds GPU memory from virtual address
-    Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
-
     //! Finds an appropriate map target
     amd::Memory* findMapTarget(size_t size) const;
 
@@ -618,8 +590,6 @@ private:
     XferBuffers*    xferRead_;      //!< Transfer buffers read
     XferBuffers*    xferWrite_;     //!< Transfer buffers write
 
-    amd::Monitor*   vaCacheAccess_; //!< Lock to serialize VA caching access
-    std::list<VACacheEntry*>*   vaCacheList_; //!< VA cache list
     std::vector<amd::Memory*>*  mapCache_;  //!< Map cache info structure
     ResourceCache*  resourceCache_; //!< Resource cache
     Engines         engines_;       //!< Available engines on device
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 843c7adf1c..30c268429c 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -660,7 +660,7 @@ VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd)
 
     size_t offset = 0;
     // Find if virtual address is a CL allocation
-    gpu::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
+    device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
 
     profilingBegin(vcmd, true);
 
@@ -765,7 +765,7 @@ VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd)
     gpu::Memory* memory = dev().getGpuMemory(&vcmd.destination());
     size_t offset = 0;
     // Find if virtual address is a CL allocation
-    gpu::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
+    device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
 
     profilingBegin(vcmd, true);
 
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
index f8b754fda3..2406795ae8 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp
@@ -559,8 +559,6 @@ Device::Device()
     , mapCacheOps_(nullptr)
     , xferRead_(nullptr)
     , xferWrite_(nullptr)
-    , vaCacheAccess_(nullptr)
-    , vaCacheList_(nullptr)
     , mapCache_(nullptr)
     , resourceCache_(nullptr)
     , numComputeEngines_(0)
@@ -578,9 +576,6 @@ Device::~Device()
     delete hwDebugMgr_;
     hwDebugMgr_ = nullptr;
 
-    CondLog(vaCacheList_ == nullptr ||
-        (vaCacheList_->size() != 0), "Application didn't unmap all host memory!");
-
     delete srdManager_;
 
     for (uint s = 0; s < scratch_.size(); ++s) {
@@ -618,8 +613,6 @@ Device::~Device()
     delete vgpusAccess_;
     delete scratchAlloc_;
     delete mapCacheOps_;
-    delete vaCacheAccess_;
-    delete vaCacheList_;
 
     if (context_ != nullptr) {
         context_->release();
@@ -633,6 +626,10 @@ extern const char* SchedulerSourceCode;
 bool
 Device::create(Pal::IDevice* device)
 {
+    if (!amd::Device::create()) {
+        return false;
+    }
+
     appProfile_.init();
     device_ = device;
     Pal::Result result;
@@ -721,15 +718,6 @@ Device::create(Pal::IDevice* device)
         return false;
     }
 
-    vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true);
-    if (nullptr == vaCacheAccess_) {
-        return false;
-    }
-    vaCacheList_ = new std::list<VACacheEntry*>();
-    if (nullptr == vaCacheList_) {
-        return false;
-    }
-
     mapCache_ = new std::vector<amd::Memory*>();
     if (mapCache_ == nullptr) {
         return false;
@@ -1630,68 +1618,6 @@ Device::globalFreeMemory(size_t* freeMemory) const
     return true;
 }
 
-void
-Device::addVACache(Memory* memory) const
-{
-    // Make sure system memory has direct access
-    if (memory->isHostMemDirectAccess()) {
-        // VA cache access must be serialised
-        amd::ScopedLock lk(*vaCacheAccess_);
-        void*   start = memory->owner()->getHostMem();
-        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();
-        size_t  offset;
-        Memory*   doubleMap = findMemoryFromVA(start, &offset);
-
-        if (doubleMap == nullptr) {
-            // Allocate a new entry
-            VACacheEntry*   entry = new VACacheEntry(start, end, memory);
-            if (entry != nullptr) {
-                vaCacheList_->push_back(entry);
-            }
-        }
-        else {
-            LogError("Unexpected double map() call from the app!");
-        }
-    }
-}
-
-void
-Device::removeVACache(const Memory* memory) const
-{
-    // Make sure system memory has direct access
-    if (memory->isHostMemDirectAccess() && memory->owner()) {
-        // VA cache access must be serialised
-        amd::ScopedLock lk(*vaCacheAccess_);
-        void*   start = memory->owner()->getHostMem();
-        void*   end = reinterpret_cast<address>(start) + memory->owner()->getSize();
-
-        // Find VA cache entry for the specified memory
-        for (const auto& entry : *vaCacheList_) {
-            if (entry->startAddress_ == start) {
-                CondLog((entry->endAddress_ != end), "Incorrect VA range");
-                delete entry;
-                vaCacheList_->remove(entry);
-                break;
-            }
-        }
-    }
-}
-
-Memory*
-Device::findMemoryFromVA(const void* ptr, size_t* offset) const
-{
-    // VA cache access must be serialised
-    amd::ScopedLock lk(*vaCacheAccess_);
-    for (const auto& entry : *vaCacheList_) {
-        if ((entry->startAddress_ <= ptr) && (entry->endAddress_ > ptr)) {
-            *offset = static_cast<size_t>(reinterpret_cast<const char*>(ptr) -
-                reinterpret_cast<char*>(entry->startAddress_));
-            return entry->memory_;
-        }
-    }
-    return nullptr;
-}
-
 amd::Memory*
 Device::findMapTarget(size_t size) const
 {
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
index 7439682df1..a1c32b595d 100644
--- a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -242,25 +242,6 @@ public:
         const Device&           gpuDevice_;     //!< GPU device object
     };
 
-    //! Virtual address cache entry
-    struct VACacheEntry : public amd::HeapObject
-    {
-        void*   startAddress_;  //!< Start virtual address
-        void*   endAddress_;    //!< End virtual address
-        Memory* memory_;        //!< GPU memory, associated with the range
-
-        //! Constructor
-        VACacheEntry(
-            void*   startAddress,   //!< Start virtual address
-            void*   endAddress,     //!< End virtual address
-            Memory* memory          //!< GPU memory object
-            ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
-
-    private:
-        //! Disable default constructor
-        VACacheEntry();
-    };
-
     struct ScratchBuffer : public amd::HeapObject
     {
         uint    regNum_;    //!< The number of used scratch registers
@@ -418,15 +399,6 @@ public:
     //! Returns transfer buffer object
     XferBuffers& xferRead() const { return *xferRead_; }
 
-    //! Adds GPU memory to the VA cache list
-    void addVACache(Memory* memory) const;
-
-    //! Removes GPU memory from the VA cache list
-    void removeVACache(const Memory* memory) const;
-
-    //! Finds GPU memory from virtual address
-    Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
-
     //! Finds an appropriate map target
     amd::Memory* findMapTarget(size_t size) const;
 
@@ -569,8 +541,6 @@ private:
     amd::Monitor*   mapCacheOps_;   //!< Lock to serialise cache for the map resources
     XferBuffers*    xferRead_;      //!< Transfer buffers read
     XferBuffers*    xferWrite_;     //!< Transfer buffers write
-    amd::Monitor*   vaCacheAccess_; //!< Lock to serialize VA caching access
-    std::list<VACacheEntry*>*   vaCacheList_; //!< VA cache list
     std::vector<amd::Memory*>*  mapCache_;  //!< Map cache info structure
     ResourceCache*  resourceCache_; //!< Resource cache
     uint            numComputeEngines_; //!< The number of available compute engines
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 1cc00ef49f..1b1c80c93b 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -922,7 +922,7 @@ VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd)
 
     size_t offset = 0;
     // Find if virtual address is a CL allocation
-    pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
+    device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
 
     profilingBegin(vcmd, true);
 
@@ -1027,7 +1027,7 @@ VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd)
     pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
     size_t offset = 0;
     // Find if virtual address is a CL allocation
-    pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
+    device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
 
     profilingBegin(vcmd, true);