P4 to Git Change 1101352 by gandryey@gera-dev-w7 on 2014/11/28 18:03:18

ECR #304775 - Optimize read maps of USWC memory - If the runtime detects a map with a read operation on USWC memory, it switches to an indirect map. This should improve map-read performance on APU(s) when USWC memory is used instead of the frame buffer. Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_memobj.cpp#72 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.cpp#269 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.hpp#89 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#172 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#234 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#486 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#134 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#112 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.hpp#43 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#340 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.cpp#88 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.hpp#45 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.cpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsavirtual.cpp#98 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.cpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsadevice.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsamemory.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#26 edit [ROCm/clr commit: 6d464be252]
2014-11-28 18:11:36 -05:00
commit f43897e51e
@@ -1123,6 +1123,7 @@ Device::allocMapTarget(
    amd::Memory&        mem,
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
+    uint                mapFlags,
    size_t*             rowPitch,
    size_t*             slicePitch)
 {
@@ -143,6 +143,7 @@ public:
        amd::Memory&    mem,        //!< Abstraction layer memory object
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        );
@@ -572,15 +572,21 @@ Kernel::~Kernel()
 }

 void
-Memory::saveWriteMapInfo(
+Memory::saveMapInfo(
    const amd::Coord3D  origin,
    const amd::Coord3D  region,
+    uint                mapFlags,
    bool                entire)
 {
-    writeMapInfo_.origin_ = origin;
-    writeMapInfo_.region_ = region;
-    writeMapInfo_.entire_ = entire;
-    flags_ |= UnmapWrite;
+    if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
+        writeMapInfo_.origin_ = origin;
+        writeMapInfo_.region_ = region;
+        writeMapInfo_.entire_ = entire;
+        flags_ |= UnmapWrite;
+    }
+    if (mapFlags & CL_MAP_READ) {
+        flags_ |= UnmapRead;
+    }
 }

 Program::Program(amd::Device& device)
@@ -727,20 +727,24 @@ public:
    //! Saves map info for this object
    //! @note: It's not a thread safe operation, the app must implement
    //! synchronization for the multiple write maps if necessary
-    void saveWriteMapInfo(
+    void saveMapInfo(
        const amd::Coord3D  origin, //!< Origin of the map location
        const amd::Coord3D  region, //!< Mapped region
+        uint                mapFlags,   //!< Map flags
        bool                entire  //!< True if the enitre memory was mapped
        );

    const WriteMapInfo* writeMapInfo() const { return &writeMapInfo_; }

    //! Clear memory object as mapped read only
-    void clearUnmapWrite() { flags_ &= ~UnmapWrite; }
+    void clearUnmapFlags() { flags_ &= ~(UnmapWrite | UnmapRead); }

-    //! Returns state of map read only flag
+    //! Returns state of map write flag
    bool isUnmapWrite() const { return (flags_ & UnmapWrite) ? true : false; }

+    //! Returns state of map read flag
+    bool isUnmapRead() const { return (flags_ & UnmapRead) ? true : false; }
+
    //! Returns state of memory direct access flag
    bool isHostMemDirectAccess() const
        { return (flags_ & HostMemoryDirectAccess) ? true : false; }
@@ -754,9 +758,10 @@ protected:
        HostMemoryDirectAccess  = 0x00000001,   //!< GPU has direct access to the host memory
        MapResourceAlloced      = 0x00000002,   //!< Map resource was allocated
        PinnedMemoryAlloced     = 0x00000004,   //!< An extra pinned resource was allocated
-        UnmapWrite              = 0x00000008,   //!< Memory was mapped read-only
+        UnmapWrite              = 0x00000008,   //!< Memory was mapped for write
        SubMemoryObject         = 0x00000010,   //!< Memory is sub-memory
        HostMemoryRegistered    = 0x00000020,   //!< Host memory was registered
+        UnmapRead               = 0x00000040,   //!< Memory was mapped for read
    };
    uint        flags_;         //!< Memory object flags

@@ -1587,6 +1592,7 @@ public:
        amd::Memory&    mem,        //!< Abstraction layer memory object
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        ) = 0;
@@ -2098,6 +2098,7 @@ Device::allocMapTarget(
    amd::Memory&        mem,
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
+    uint                mapFlags,
    size_t*             rowPitch,
    size_t*             slicePitch)
 {
@@ -2109,7 +2110,7 @@ Device::allocMapTarget(
    }

    // Pass request over to memory
-    return memory->allocMapTarget(origin, region, rowPitch, slicePitch);
+    return memory->allocMapTarget(origin, region, mapFlags, rowPitch, slicePitch);
 }

 bool
@@ -111,6 +111,7 @@ public:
        amd::Memory&    mem,        //!< Abstraction layer memory object
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        ) { return NULL; }
@@ -437,6 +438,7 @@ public:
        amd::Memory&    mem,        //!< Abstraction layer memory object
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        );
@@ -906,6 +906,7 @@ void*
 Memory::allocMapTarget(
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
+    uint                mapFlags,
    size_t*             rowPitch,
    size_t*             slicePitch)
 {
@@ -921,7 +922,8 @@ Memory::allocMapTarget(
    incIndMapCount();

    // If host memory exists, use it
-    if (owner()->getHostMem() != NULL) {
+    if ((owner()->getHostMem() != NULL) &&
+        (isCacheable() || !isHostMemDirectAccess() || !(mapFlags & CL_MAP_READ))) {
        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
    }
    // If resource is a persistent allocation, we can use it directly
@@ -1226,6 +1228,7 @@ void*
 Image::allocMapTarget(
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
+    uint                mapFlags,
    size_t*             rowPitch,
    size_t*             slicePitch)
 {
@@ -1245,7 +1248,8 @@ Image::allocMapTarget(
    incIndMapCount();

    // If host memory exists, use it
-    if (owner()->getHostMem() != NULL) {
+    if ((owner()->getHostMem() != NULL) &&
+        (isCacheable() || !isHostMemDirectAccess() || !(mapFlags & CL_MAP_READ))) {
        useRemoteResource = false;
        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
        amd::Image* amdImage = owner()->asImage();
@@ -121,6 +121,7 @@ public:
    virtual void* allocMapTarget(
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        );
@@ -288,6 +289,7 @@ public:
    virtual void* allocMapTarget(
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        );
@@ -1085,14 +1085,15 @@ VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd)

    gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory());

-    // Save write map info for unmap copy
-    if (vcmd.mapFlags() & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
-        memory->saveWriteMapInfo(vcmd.origin(),
-            vcmd.size(), vcmd.isEntireMemory());
-    }
+    // Save map info for unmap operation
+    memory->saveMapInfo(vcmd.origin(), vcmd.size(),
+        vcmd.mapFlags(), vcmd.isEntireMemory());

    // If we have host memory, use it
-    if (memory->owner()->getHostMem() != NULL) {
+    if ((memory->owner()->getHostMem() != NULL) &&
+        (memory->isCacheable() ||
+         !memory->isHostMemDirectAccess() ||
+         !(vcmd.mapFlags() & CL_MAP_READ))) {
        if (!memory->isHostMemDirectAccess()) {
            // Make sure GPU finished operation before
            // synchronization with the backing store
@@ -1177,7 +1178,10 @@ VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd)
    amd::Memory* owner = memory->owner();

    // We used host memory
-    if (owner->getHostMem() != NULL) {
+    if ((owner->getHostMem() != NULL) &&
+        (memory->isCacheable() ||
+         !memory->isHostMemDirectAccess() ||
+         !memory->isUnmapRead())) {
        if (memory->isUnmapWrite()) {
            // Target is the backing store, so sync
            owner->signalWrite(NULL);
@@ -1254,8 +1258,8 @@ VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd)
        vcmd.setStatus(CL_INVALID_VALUE);
    }

-    // Clear read only flag
-    memory->clearUnmapWrite();
+    // Clear unmap flags
+    memory->clearUnmapFlags();

    profilingEnd(vcmd);
 }
@@ -1357,31 +1361,20 @@ VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd)

    profilingBegin(vcmd, true);

-    //check if the ptr is in the svm space
-    amd::Memory* svmMem = vcmd.getSvmMem();
-    if (NULL == svmMem) {
-        LogWarning("wrong svm address ");
-        vcmd.setStatus(CL_INVALID_VALUE);
-        return;
-    }
-
    // Make sure we have memory for the command execution
-    gpu::Memory* memory = dev().getGpuMemory(svmMem);
+    gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());

-    if (vcmd.mapFlags() & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
-        memory->saveWriteMapInfo(vcmd.origin(), vcmd.size(), vcmd.isEntireMemory());
-    }
+    memory->saveMapInfo(vcmd.origin(), vcmd.size(),
+        vcmd.mapFlags(), vcmd.isEntireMemory());

    if (memory->mapMemory() != NULL) {
        if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
            amd::Coord3D dstOrigin(0, 0, 0);
-            if (memory->cal()->buffer_) {
-                if (!blitMgr().copyBuffer(*memory,
-                    *memory->mapMemory(), vcmd.origin(), dstOrigin,
-                    vcmd.size(), vcmd.isEntireMemory())) {
-                    LogError("submitSVMMapMemory() - copy failed");
-                    vcmd.setStatus(CL_MAP_FAILURE);
-                }
+            assert(memory->cal()->buffer_ && "SVM memory can't be an image");
+            if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(),
+                vcmd.origin(), dstOrigin, vcmd.size(), vcmd.isEntireMemory())) {
+                LogError("submitSVMMapMemory() - copy failed");
+                vcmd.setStatus(CL_MAP_FAILURE);
            }
        }
    }
@@ -1399,30 +1392,18 @@ VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd)
    amd::ScopedLock lock(execution());
    profilingBegin(vcmd, true);

-    amd::Memory* svmMem = vcmd.getSvmMem();
-    if (NULL == svmMem) {
-        LogWarning("wrong svm address ");
-        vcmd.setStatus(CL_INVALID_VALUE);
-        return;
-    }
-
-    gpu::Memory* memory = dev().getGpuMemory(svmMem);
+    gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());

    if (memory->mapMemory() != NULL) {
        if (memory->isUnmapWrite()) {
            amd::Coord3D srcOrigin(0, 0, 0);
            // Target is a remote resource, so copy
-            assert(memory->mapMemory() != NULL);
-            if (memory->cal()->buffer_) {
-                if (!blitMgr().copyBuffer(
-                    *memory->mapMemory(), *memory,
-                    srcOrigin,
-                    memory->writeMapInfo()->origin_,
-                    memory->writeMapInfo()->region_,
-                    memory->writeMapInfo()->entire_)) {
-                    LogError("submitUnmapMemory() - copy failed");
-                    vcmd.setStatus(CL_OUT_OF_RESOURCES);
-                }
+            assert(memory->cal()->buffer_ && "SVM memory can't be an image");
+            if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, srcOrigin,
+                memory->writeMapInfo()->origin_, memory->writeMapInfo()->region_,
+                memory->writeMapInfo()->entire_)) {
+                LogError("submitSvmUnmapMemory() - copy failed");
+                vcmd.setStatus(CL_OUT_OF_RESOURCES);
            }
        }
    }
@@ -685,6 +685,7 @@ Device::allocMapTarget(
    amd::Memory&        mem,
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
+    uint                mapFlags,
    size_t*             rowPitch,
    size_t*             slicePitch)
 {
@@ -696,7 +697,7 @@ Device::allocMapTarget(
    }

    // Pass request over to memory
-    return memory->allocMapTarget(origin, region, rowPitch, slicePitch);
+    return memory->allocMapTarget(origin, region, mapFlags, rowPitch, slicePitch);
 }

 bool
@@ -191,6 +191,7 @@ public:
        amd::Memory&    mem,        //!< Abstraction layer memory object
        const amd::Coord3D& origin, //!< The map location in memory
        const amd::Coord3D& region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t* rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t* slicePitch = NULL   //!< Slice for the mapped memory
        ) {
@@ -323,6 +324,7 @@ public:
        amd::Memory &mem,   //!< Abstraction layer memory object
        const amd::Coord3D &origin, //!< The map location in memory
        const amd::Coord3D &region, //!< The map region in memory
+        uint    mapFlags,           //!< Map flags
        size_t *rowPitch = NULL,    //!< Row pitch for the mapped memory
        size_t *slicePitch = NULL   //!< Slice for the mapped memory
        );
@@ -79,6 +79,7 @@ Memory::freeMapMemory()
 void *
 Memory::allocMapTarget(const amd::Coord3D &origin,
                       const amd::Coord3D &region,
+                       uint mapFlags,
                       size_t *rowPitch,
                       size_t *slicePitch) 
 {
@@ -146,7 +147,7 @@ Memory::cpuMap(
 {
    // Create the map target.
    void * mapTarget =
-        allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), rowPitch, slicePitch);
+        allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch);

    // Sync to map target if no direct access.
    if (!isHostMemDirectAccess()) {
@@ -862,6 +863,7 @@ Image::createView(Image &parent)

 void* Image::allocMapTarget(const amd::Coord3D& origin,
    const amd::Coord3D& region,
+    uint    mapFlags,
    size_t* rowPitch,
    size_t* slicePitch)
 {
@@ -35,6 +35,7 @@ class Memory : public device::Memory {
  // of an indirect map for a given memory object
  virtual void *allocMapTarget(const amd::Coord3D &origin,
                               const amd::Coord3D &region,
+                               uint mapFlags,
                               size_t *rowPitch,
                               size_t *slicePitch);

@@ -168,6 +169,7 @@ public:
    //! of an indirect map for a given memory object
    virtual void* allocMapTarget(const amd::Coord3D& origin,
        const amd::Coord3D& region,
+        uint    mapFlags,
        size_t* rowPitch,
        size_t* slicePitch);

@@ -589,9 +589,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)

    // Save map write requirement.
    if (mapFlag & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) {
-        devMemory->saveWriteMapInfo(cmd.origin(),
-                                    cmd.size(),
-                                    cmd.isEntireMemory());
+        devMemory->saveMapInfo(cmd.origin(), cmd.size(),
+            mapFlag, cmd.isEntireMemory());
    }

    // Sync to the map target.
@@ -686,7 +685,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
            }
        }

-        devMemory->clearUnmapWrite();
+        devMemory->clearUnmapFlags();

        cmd.memory().signalWrite(&dev());
    }