diff --git a/include/hcc_detail/staging_buffer.h b/include/hcc_detail/staging_buffer.h
index 4dd4b251e7..799de58c3e 100644
--- a/include/hcc_detail/staging_buffer.h
+++ b/include/hcc_detail/staging_buffer.h
@@ -26,11 +26,11 @@ THE SOFTWARE.
 
 //-------------------------------------------------------------------------------------------------
 // An optimized "staging buffer" used to implement Host-To-Device and Device-To-Host copies.
-// Some GPUs may not be able to directly access host memory, and in these cases we need to 
+// Some GPUs may not be able to directly access host memory, and in these cases we need to
 // stage the copy through a pinned staging buffer.  For example, the CopyHostToDevice
 // uses the CPU to copy to a pinned "staging buffer", and then use the GPU DMA engine to copy
 // from the staging buffer to the final destination.  The copy is broken into buffer-sized chunks
-// to limit the size of the buffer and also to provide better performance by overlapping the CPU copies 
+// to limit the size of the buffer and also to provide better performance by overlapping the CPU copies
 // with the DMA copies.
 //
 // PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA
@@ -41,7 +41,7 @@ struct StagingBuffer {
 
     static const int _max_buffers = 4;
 
-    StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) ;
+    StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers) ;
     ~StagingBuffer();
 
     void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
@@ -55,13 +55,14 @@ struct StagingBuffer {
 
 private:
     hsa_agent_t     _hsa_agent;
+    hsa_agent_t     _cpu_agent;
     size_t          _bufferSize;  // Size of the buffers.
     int             _numBuffers;
 
     char            *_pinnedStagingBuffer[_max_buffers];
     hsa_signal_t     _completion_signal[_max_buffers];
     hsa_signal_t     _completion_signal2[_max_buffers]; // P2P needs another set of signals.
-    std::mutex       _copy_lock;    // provide thread-safe access 
+    std::mutex       _copy_lock;    // provide thread-safe access
 };
 
 #endif
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index c6c8691419..b4796d006f 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -183,8 +183,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
     if (! assertQueueEmpty) {
         tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
         _av.wait();
-    } 
-     
+    }
+
     if (crit->_last_copy_signal) {
         tprintf (DB_SYNC, "stream %p wait for lastCopy:#%lu...\n", this, lastCopySeqId(crit) );
         this->waitCopy(crit, crit->_last_copy_signal);
@@ -212,7 +212,7 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty)
 
 // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
 // The packed _peerAgents can efficiently be used on each memory allocation.
-template<> 
+template<>
 void ihipDeviceCriticalBase_t<DeviceMutex>::recomputePeerAgents()
 {
     _peerCnt = 0;
@@ -223,7 +223,7 @@ void ihipDeviceCriticalBase_t<DeviceMutex>::recomputePeerAgents()
 
 
 template<>
-bool ihipDeviceCriticalBase_t<DeviceMutex>::isPeer(const ihipDevice_t *peer) 
+bool ihipDeviceCriticalBase_t<DeviceMutex>::isPeer(const ihipDevice_t *peer)
 {
     auto match = std::find(_peers.begin(), _peers.end(), peer);
     return (match != std::end(_peers));
@@ -231,7 +231,7 @@ bool ihipDeviceCriticalBase_t<DeviceMutex>::isPeer(const ihipDevice_t *peer)
 
 
 template<>
-bool ihipDeviceCriticalBase_t<DeviceMutex>::addPeer(ihipDevice_t *peer) 
+bool ihipDeviceCriticalBase_t<DeviceMutex>::addPeer(ihipDevice_t *peer)
 {
     auto match = std::find(_peers.begin(), _peers.end(), peer);
     if (match == std::end(_peers)) {
@@ -247,7 +247,7 @@ bool ihipDeviceCriticalBase_t<DeviceMutex>::addPeer(ihipDevice_t *peer)
 
 
 template<>
-bool ihipDeviceCriticalBase_t<DeviceMutex>::removePeer(ihipDevice_t *peer) 
+bool ihipDeviceCriticalBase_t<DeviceMutex>::removePeer(ihipDevice_t *peer)
 {
     auto match = std::find(_peers.begin(), _peers.end(), peer);
     if (match != std::end(_peers)) {
@@ -281,7 +281,7 @@ void ihipDeviceCriticalBase_t<DeviceMutex>::addStream(ihipStream_t *stream)
 
 //---
 //Flavor that takes device index.
-ihipDevice_t * getDevice(unsigned deviceIndex) 
+ihipDevice_t * getDevice(unsigned deviceIndex)
 {
     if (ihipIsValidDevice(deviceIndex)) {
         return &g_devices[deviceIndex];
@@ -512,7 +512,7 @@ void ihipDevice_t::locked_reset()
         ihipStream_t *stream = *streamI;
         (*streamI)->locked_wait();
         tprintf(DB_SYNC, " delete stream=%p\n", stream);
-        
+
         delete stream;
     }
     // Clear the list.
@@ -562,10 +562,8 @@ void ihipDevice_t::init(unsigned device_index, unsigned deviceCnt, hc::accelerat
 
     tprintf(DB_SYNC, "created device with default_stream=%p\n", _default_stream);
 
-    hsa_region_t *pinnedHostRegion;
-    pinnedHostRegion = static_cast<hsa_region_t*>(_acc.get_hsa_am_system_region());
-    _staging_buffer[0] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
-    _staging_buffer[1] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
+    _staging_buffer[0] = new StagingBuffer(_hsa_agent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
+    _staging_buffer[1] = new StagingBuffer(_hsa_agent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
 
 };
 
@@ -608,13 +606,8 @@ void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) {
   }
 }
 
-// CPU agent used for verification
-hsa_agent_t cpu_agent_;
 hsa_agent_t gpu_agent_;
-int gpu_region_count;
-// System region
-hsa_amd_memory_pool_t sys_region_;
-hsa_amd_memory_pool_t gpu_region_;
+hsa_amd_memory_pool_t gpu_pool_;
 
 hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
     if (data == NULL) {
@@ -636,27 +629,7 @@ hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
     return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) {
-    if (data == NULL) {
-        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
-    }
-
-    hsa_device_type_t hsa_device_type;
-    hsa_status_t hsa_error_code =
-    hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type);
-    if (hsa_error_code != HSA_STATUS_SUCCESS) {
-        return hsa_error_code;
-    }
-
-    if (hsa_device_type == HSA_DEVICE_TYPE_CPU) {
-        *((hsa_agent_t*)data) = agent;
-        return HSA_STATUS_INFO_BREAK;
-    }
-
-    return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t GetDeviceRegion(hsa_amd_memory_pool_t region, void* data) {
+hsa_status_t GetDevicePool(hsa_amd_memory_pool_t pool, void* data) {
     if (NULL == data) {
         return HSA_STATUS_ERROR_INVALID_ARGUMENT;
     }
@@ -665,50 +638,21 @@ hsa_status_t GetDeviceRegion(hsa_amd_memory_pool_t region, void* data) {
     hsa_amd_segment_t segment;
     uint32_t flag;
 
-    err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
     ErrorCheck(err);
     if (HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS;
-    err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
     ErrorCheck(err);
-    *((hsa_amd_memory_pool_t*)data) = region;
+    *((hsa_amd_memory_pool_t*)data) = pool;
     return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t FindGlobalRegion(hsa_amd_memory_pool_t region, void* data) {
-    if (NULL == data) {
-        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
-    }
-
-    hsa_status_t err;
-    hsa_amd_segment_t segment;
-    uint32_t flag;
-    err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
-    ErrorCheck(err);
-
-    err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
-    ErrorCheck(err);
-    if ((HSA_AMD_SEGMENT_GLOBAL == segment) &&
-        (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) {
-        *((hsa_amd_memory_pool_t*)data) = region;
-    }
-    return HSA_STATUS_SUCCESS;
-}
-
-void FindDeviceRegion()
+void FindDevicePool()  // Fills gpu_agent_ via FindGpuDevice, then fills gpu_pool_ from that agent's memory pools.
 {
     hsa_status_t err = hsa_iterate_agents(FindGpuDevice, &gpu_agent_);
     ErrorCheck(err);
 
-    err = hsa_amd_agent_iterate_memory_pools(gpu_agent_, GetDeviceRegion, &gpu_region_);
-    ErrorCheck(err);
-}
-
-void FindSystemRegion()
-{
-    hsa_status_t err = hsa_iterate_agents(FindCpuDevice, &cpu_agent_);
-    ErrorCheck(err);
-
-    err = hsa_amd_agent_iterate_memory_pools(cpu_agent_, FindGlobalRegion, &sys_region_);
+    err = hsa_amd_agent_iterate_memory_pools(gpu_agent_, GetDevicePool, &gpu_pool_);  // GetDevicePool stores a GLOBAL-segment pool into gpu_pool_
     ErrorCheck(err);
 }
 
@@ -857,9 +801,8 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop)
     /* Computemode for HSA Devices is always : cudaComputeModeDefault */
     prop->computeMode = 0;
 
-    FindSystemRegion();
-    FindDeviceRegion();
-    int access=checkAccess(cpu_agent_, gpu_region_);
+    FindDevicePool();
+    int access=checkAccess(g_cpu_agent, gpu_pool_);
     if(0!= access){
         isLargeBar= 1;
     }
@@ -1166,6 +1109,12 @@ void ihipInit()
         }
     }
 
+    hsa_status_t err = hsa_iterate_agents(findCpuAgent, &g_cpu_agent);
+    if (err != HSA_STATUS_INFO_BREAK) {
+        // didn't find a CPU.
+        throw ihipException(hipErrorRuntimeOther);
+    }
+
     g_devices = new ihipDevice_t[deviceCnt];
     g_deviceCnt = 0;
     for (int i=0; i<accs.size(); i++) {
@@ -1186,14 +1135,6 @@ void ihipInit()
         assert(deviceCnt == g_deviceCnt);
     }
 
-
-    hsa_status_t err = hsa_iterate_agents(findCpuAgent, &g_cpu_agent);
-    if (err != HSA_STATUS_INFO_BREAK) {
-        // didn't find a CPU.
-        throw ihipException(hipErrorRuntimeOther);
-    }
-
-
     tprintf(DB_SYNC, "pid=%u %-30s\n", getpid(), "<ihipInit>");
 }
 
@@ -1260,7 +1201,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
 {
     HIP_INIT_API(stream, grid, block, lp);
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20 
+#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid.x;
     lp->grid_dim.y = grid.y;
     lp->grid_dim.z = grid.z;
@@ -1289,7 +1230,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri
 {
     HIP_INIT_API(stream, grid, block, lp);
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20 
+#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid;
     lp->grid_dim.y = 1;
     lp->grid_dim.z = 1;
@@ -1319,7 +1260,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri
 {
     HIP_INIT_API(stream, grid, block, lp);
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20 
+#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid.x;
     lp->grid_dim.y = grid.y;
     lp->grid_dim.z = grid.z;
@@ -1349,7 +1290,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
 {
     HIP_INIT_API(stream, grid, block, lp);
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20 
+#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid;
     lp->grid_dim.y = 1;
     lp->grid_dim.z = 1;
@@ -1479,7 +1420,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked,
 
 
 // Setup the copyCommandType and the copy agents (for hsa_amd_memory_async_copy)
-// srcPhysAcc is the physical location of the src data.  For many copies this is the 
+// srcPhysAcc is the physical location of the src data.  For many copies this is the
 void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent)
 {
     // current* represents the device associated with the specified stream.
@@ -1669,8 +1610,8 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
 
         } else {
             assert(0); // currently no fallback for this path.
-        } 
-        
+        }
+
     } else {
         // If not special case - these can all be handled by the hsa async copy:
         ihipCommand_t commandType;
diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp
index c6c23089bd..69f22e38b0 100644
--- a/src/staging_buffer.cpp
+++ b/src/staging_buffer.cpp
@@ -28,28 +28,64 @@ THE SOFTWARE.
 #include "hcc_detail/hip_hcc.h"
 #define THROW_ERROR(e) throw ihipException(e)
 #else
-#define THROW_ERROR(e) throw 
-#define tprintf(trace_level, ...) 
+#define THROW_ERROR(e) throw
+#define tprintf(trace_level, ...)
 #endif
 
-extern hsa_agent_t g_cpu_agent; // defined in hip_hcc.cpp
+void error_check1(hsa_status_t hsa_error_code, int line_num, std::string str) {  // prints a diagnostic for any status other than SUCCESS / INFO_BREAK
+  const bool benign = (hsa_error_code == HSA_STATUS_SUCCESS) || (hsa_error_code == HSA_STATUS_INFO_BREAK);
+  if (benign) return;  // nothing to report; str is the file name and line_num the line supplied by the caller
+  printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(),line_num);
+}
+
+#define ErrorCheck(x) error_check1(x, __LINE__, __FILE__)
+hsa_amd_memory_pool_t sys_pool_;  // file-scope pool handle written by the StagingBuffer constructor via FindGlobalPool; shared by all instances
+
+// Memory-pool iteration callback: stores into *data every GLOBAL-segment, coarse-grained pool it visits.
+hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data) {
+    if (NULL == data) {
+        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    hsa_amd_segment_t segment;
+    uint32_t flag;
+    hsa_status_t err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+    ErrorCheck(err);
+    if (err != HSA_STATUS_SUCCESS) return HSA_STATUS_SUCCESS;  // query failed: segment is unset, so skip this pool
+    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+    ErrorCheck(err);
+    if (err != HSA_STATUS_SUCCESS) return HSA_STATUS_SUCCESS;  // query failed: flag is unset, so skip this pool
+    if ((HSA_AMD_SEGMENT_GLOBAL == segment) &&
+        (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
+        *((hsa_amd_memory_pool_t*)data) = pool;  // iteration is not stopped here, so the last matching pool wins
+    }
+    return HSA_STATUS_SUCCESS;
+}
 
 //-------------------------------------------------------------------------------------------------
-StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) :
+StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers) :  // numBuffers is clamped to _max_buffers below
     _hsa_agent(hsaAgent),
+    _cpu_agent(cpuAgent),  // agent whose memory pools are searched for the staging-buffer pool
     _bufferSize(bufferSize),
     _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers)
 {
+    hsa_status_t err = hsa_amd_agent_iterate_memory_pools(_cpu_agent, FindGlobalPool, &sys_pool_);  // NOTE(review): sys_pool_ is a file-scope global, rewritten on every construction
+    ErrorCheck(err);  // only prints on failure; construction continues
     for (int i=0; i<_numBuffers; i++) {
         // TODO - experiment with alignment here.
-        hsa_status_t s1 = hsa_memory_allocate(systemRegion, _bufferSize, (void**) (&_pinnedStagingBuffer[i]) );
+        err = hsa_amd_memory_pool_allocate(sys_pool_, _bufferSize, 0, (void**)(&_pinnedStagingBuffer[i]));  // one _bufferSize-byte buffer per slot
+        ErrorCheck(err);
 
-        if ((s1 != HSA_STATUS_SUCCESS) || (_pinnedStagingBuffer[i] == NULL)) {
+        if ((err != HSA_STATUS_SUCCESS) || (_pinnedStagingBuffer[i] == NULL)) {
             THROW_ERROR(hipErrorMemoryAllocation);
         }
+        // Grant hsaAgent access to the new buffer; a failure here is printed by ErrorCheck but not thrown.
+        err = hsa_amd_agents_allow_access(1, &hsaAgent, NULL, _pinnedStagingBuffer[i]);
+        ErrorCheck(err);
+        // Two signals per buffer slot; the return values of hsa_signal_create are not checked.
         hsa_signal_create(0, 0, NULL, &_completion_signal[i]);
         hsa_signal_create(0, 0, NULL, &_completion_signal2[i]);
     }
+
 };
 
 
@@ -58,7 +94,7 @@ StagingBuffer::~StagingBuffer()
 {
     for (int i=0; i<_numBuffers; i++) {
         if (_pinnedStagingBuffer[i]) {
-            hsa_memory_free(_pinnedStagingBuffer[i]);
+            hsa_amd_memory_pool_free(_pinnedStagingBuffer[i]);
             _pinnedStagingBuffer[i] = NULL;
         }
         hsa_signal_destroy(_completion_signal[i]);
@@ -88,11 +124,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_
         THROW_ERROR (hipErrorInvalidValue);
     }
     int bufferIndex = 0;
-#if 0
-    for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ;  bytesRemaining -= _bufferSize) {
 
-        size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining;
-#endif
     size_t theseBytes= sizeBytes;
     //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle);
     //hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
@@ -110,7 +142,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_
 
     hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
 
-    hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+    hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, _cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
     //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);
 
     if (hsa_status != HSA_STATUS_SUCCESS) {
@@ -119,26 +151,8 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_
     tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle);
     hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
     hsa_amd_memory_unlock(const_cast<char*> (srcp));
-#if 0
-        srcp += theseBytes;
-        dstp += theseBytes;
-        if (++bufferIndex >= _numBuffers) {
-            bufferIndex = 0;
-        }
-#endif
-    // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 
-    waitFor = NULL; 
-#if 0
-//    }
-
-    // TODO -
-    printf ("unpin the memory\n");
-
-
-    for (int i=0; i<_numBuffers; i++) {
-        hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
-    }
-#endif
+    // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
+    waitFor = NULL;
 }
 
 
@@ -177,10 +191,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte
 
 
         hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
-
-        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], _cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
         tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);
-
         if (hsa_status != HSA_STATUS_SUCCESS) {
             THROW_ERROR ((hipErrorRuntimeMemory));
         }
@@ -191,8 +203,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte
             bufferIndex = 0;
         }
 
-        // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 
-        waitFor = NULL; 
+        // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
+        waitFor = NULL;
     }
 
 
@@ -229,7 +241,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_
 
     hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
 
-    hsa_status = hsa_amd_memory_async_copy(locked_destp,g_cpu_agent , srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+    hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpu_agent , srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
 
     if (hsa_status != HSA_STATUS_SUCCESS) {
         THROW_ERROR (hipErrorRuntimeMemory);
@@ -273,7 +285,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte
 
             tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu  async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
             hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
-            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
             if (hsa_status != HSA_STATUS_SUCCESS) {
                 THROW_ERROR (hipErrorRuntimeMemory);
             }
@@ -281,8 +293,8 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte
             srcp0 += theseBytes;
 
 
-            // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 
-            waitFor = NULL; 
+            // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
+            waitFor = NULL;
         }
 
         // Now unload the staging buffers:
@@ -337,7 +349,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void*
 
             tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu  async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
             hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
-            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
             if (hsa_status != HSA_STATUS_SUCCESS) {
                 THROW_ERROR (hipErrorRuntimeMemory);
             }
@@ -345,8 +357,8 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void*
             srcp0 += theseBytes;
 
 
-            // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 
-            waitFor = NULL; 
+            // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1
+            waitFor = NULL;
         }
 
         // Now unload the staging buffers:
@@ -365,8 +377,8 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void*
 
             tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
             hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1);
-            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent /*not used*/, theseBytes,
-                                      hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], 
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpu_agent /*not used*/, theseBytes,
+                                      hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex],
                                       _completion_signal2[bufferIndex]);
 
             dstp1 += theseBytes;