diff --git a/include/hcc_detail/staging_buffer.h b/include/hcc_detail/staging_buffer.h index 4dd4b251e7..799de58c3e 100644 --- a/include/hcc_detail/staging_buffer.h +++ b/include/hcc_detail/staging_buffer.h @@ -26,11 +26,11 @@ THE SOFTWARE. //------------------------------------------------------------------------------------------------- // An optimized "staging buffer" used to implement Host-To-Device and Device-To-Host copies. -// Some GPUs may not be able to directly access host memory, and in these cases we need to +// Some GPUs may not be able to directly access host memory, and in these cases we need to // stage the copy through a pinned staging buffer. For example, the CopyHostToDevice // uses the CPU to copy to a pinned "staging buffer", and then use the GPU DMA engine to copy // from the staging buffer to the final destination. The copy is broken into buffer-sized chunks -// to limit the size of the buffer and also to provide better performance by overlapping the CPU copies +// to limit the size of the buffer and also to provide better performance by overlapping the CPU copies // with the DMA copies. // // PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA @@ -41,7 +41,7 @@ struct StagingBuffer { static const int _max_buffers = 4; - StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) ; + StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); @@ -55,13 +55,14 @@ struct StagingBuffer { private: hsa_agent_t _hsa_agent; + hsa_agent_t _cpu_agent; size_t _bufferSize; // Size of the buffers. int _numBuffers; char *_pinnedStagingBuffer[_max_buffers]; hsa_signal_t _completion_signal[_max_buffers]; hsa_signal_t _completion_signal2[_max_buffers]; // P2P needs another set of signals. - std::mutex _copy_lock; // provide thread-safe access + std::mutex _copy_lock; // provide thread-safe access }; #endif diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index c6c8691419..b4796d006f 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -183,8 +183,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty if (! assertQueueEmpty) { tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this); _av.wait(); - } - + } + if (crit->_last_copy_signal) { tprintf (DB_SYNC, "stream %p wait for lastCopy:#%lu...\n", this, lastCopySeqId(crit) ); this->waitCopy(crit, crit->_last_copy_signal); @@ -212,7 +212,7 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty) // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted. // The packed _peerAgents can efficiently be used on each memory allocation. -template<> +template<> void ihipDeviceCriticalBase_t::recomputePeerAgents() { _peerCnt = 0; @@ -223,7 +223,7 @@ void ihipDeviceCriticalBase_t::recomputePeerAgents() template<> -bool ihipDeviceCriticalBase_t::isPeer(const ihipDevice_t *peer) +bool ihipDeviceCriticalBase_t::isPeer(const ihipDevice_t *peer) { auto match = std::find(_peers.begin(), _peers.end(), peer); return (match != std::end(_peers)); @@ -231,7 +231,7 @@ bool ihipDeviceCriticalBase_t::isPeer(const ihipDevice_t *peer) template<> -bool ihipDeviceCriticalBase_t::addPeer(ihipDevice_t *peer) +bool ihipDeviceCriticalBase_t::addPeer(ihipDevice_t *peer) { auto match = std::find(_peers.begin(), _peers.end(), peer); if (match == std::end(_peers)) { @@ -247,7 +247,7 @@ bool ihipDeviceCriticalBase_t::addPeer(ihipDevice_t *peer) template<> -bool ihipDeviceCriticalBase_t::removePeer(ihipDevice_t *peer) +bool ihipDeviceCriticalBase_t::removePeer(ihipDevice_t *peer) { auto match = std::find(_peers.begin(), _peers.end(), peer); if (match != std::end(_peers)) { @@ -281,7 +281,7 @@ void ihipDeviceCriticalBase_t::addStream(ihipStream_t *stream) //--- //Flavor that takes device index. -ihipDevice_t * getDevice(unsigned deviceIndex) +ihipDevice_t * getDevice(unsigned deviceIndex) { if (ihipIsValidDevice(deviceIndex)) { return &g_devices[deviceIndex]; @@ -512,7 +512,7 @@ void ihipDevice_t::locked_reset() ihipStream_t *stream = *streamI; (*streamI)->locked_wait(); tprintf(DB_SYNC, " delete stream=%p\n", stream); - + delete stream; } // Clear the list. @@ -562,10 +562,8 @@ void ihipDevice_t::init(unsigned device_index, unsigned deviceCnt, hc::accelerat tprintf(DB_SYNC, "created device with default_stream=%p\n", _default_stream); - hsa_region_t *pinnedHostRegion; - pinnedHostRegion = static_cast(_acc.get_hsa_am_system_region()); - _staging_buffer[0] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); - _staging_buffer[1] = new StagingBuffer(_hsa_agent, *pinnedHostRegion, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[0] = new StagingBuffer(_hsa_agent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[1] = new StagingBuffer(_hsa_agent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); }; @@ -608,13 +606,8 @@ void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) { } } -// CPU agent used for verification -hsa_agent_t cpu_agent_; hsa_agent_t gpu_agent_; -int gpu_region_count; -// System region -hsa_amd_memory_pool_t sys_region_; -hsa_amd_memory_pool_t gpu_region_; +hsa_amd_memory_pool_t gpu_pool_; hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { if (data == NULL) { @@ -636,27 +629,7 @@ hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { return HSA_STATUS_SUCCESS; } -hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_device_type_t hsa_device_type; - hsa_status_t hsa_error_code = - hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); - if (hsa_error_code != HSA_STATUS_SUCCESS) { - return hsa_error_code; - } - - if (hsa_device_type == HSA_DEVICE_TYPE_CPU) { - *((hsa_agent_t*)data) = agent; - return HSA_STATUS_INFO_BREAK; - } - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t GetDeviceRegion(hsa_amd_memory_pool_t region, void* data) { +hsa_status_t GetDevicePool(hsa_amd_memory_pool_t pool, void* data) { if (NULL == data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -665,50 +638,21 @@ hsa_status_t GetDeviceRegion(hsa_amd_memory_pool_t region, void* data) { hsa_amd_segment_t segment; uint32_t flag; - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); ErrorCheck(err); if (HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS; - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); ErrorCheck(err); - *((hsa_amd_memory_pool_t*)data) = region; + *((hsa_amd_memory_pool_t*)data) = pool; return HSA_STATUS_SUCCESS; } -hsa_status_t FindGlobalRegion(hsa_amd_memory_pool_t region, void* data) { - if (NULL == data) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_status_t err; - hsa_amd_segment_t segment; - uint32_t flag; - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - ErrorCheck(err); - - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); - ErrorCheck(err); - if ((HSA_AMD_SEGMENT_GLOBAL == segment) && - (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { - *((hsa_amd_memory_pool_t*)data) = region; - } - return HSA_STATUS_SUCCESS; -} - -void FindDeviceRegion() +void FindDevicePool() { hsa_status_t err = hsa_iterate_agents(FindGpuDevice, &gpu_agent_); ErrorCheck(err); - err = hsa_amd_agent_iterate_memory_pools(gpu_agent_, GetDeviceRegion, &gpu_region_); - ErrorCheck(err); -} - -void FindSystemRegion() -{ - hsa_status_t err = hsa_iterate_agents(FindCpuDevice, &cpu_agent_); - ErrorCheck(err); - - err = hsa_amd_agent_iterate_memory_pools(cpu_agent_, FindGlobalRegion, &sys_region_); + err = hsa_amd_agent_iterate_memory_pools(gpu_agent_, GetDevicePool, &gpu_pool_); ErrorCheck(err); } @@ -857,9 +801,8 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; - FindSystemRegion(); - FindDeviceRegion(); - int access=checkAccess(cpu_agent_, gpu_region_); + FindDevicePool(); + int access=checkAccess(g_cpu_agent, gpu_pool_); if(0!= access){ isLargeBar= 1; } @@ -1166,6 +1109,12 @@ void ihipInit() } } + hsa_status_t err = hsa_iterate_agents(findCpuAgent, &g_cpu_agent); + if (err != HSA_STATUS_INFO_BREAK) { + // didn't find a CPU. + throw ihipException(hipErrorRuntimeOther); + } + g_devices = new ihipDevice_t[deviceCnt]; g_deviceCnt = 0; for (int i=0; i"); } @@ -1260,7 +1201,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ { HIP_INIT_API(stream, grid, block, lp); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 +#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; @@ -1289,7 +1230,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri { HIP_INIT_API(stream, grid, block, lp); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 +#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid; lp->grid_dim.y = 1; lp->grid_dim.z = 1; @@ -1319,7 +1260,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri { HIP_INIT_API(stream, grid, block, lp); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 +#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; @@ -1349,7 +1290,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g { HIP_INIT_API(stream, grid, block, lp); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 +#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid; lp->grid_dim.y = 1; lp->grid_dim.z = 1; @@ -1479,7 +1420,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, // Setup the copyCommandType and the copy agents (for hsa_amd_memory_async_copy) -// srcPhysAcc is the physical location of the src data. For many copies this is the +// srcPhysAcc is the physical location of the src data. For many copies this is the void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent) { // current* represents the device associated with the specified stream. @@ -1669,8 +1610,8 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const } else { assert(0); // currently no fallback for this path. - } - + } + } else { // If not special case - these can all be handled by the hsa async copy: ihipCommand_t commandType; diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp index c6c23089bd..69f22e38b0 100644 --- a/src/staging_buffer.cpp +++ b/src/staging_buffer.cpp @@ -28,28 +28,64 @@ THE SOFTWARE. #include "hcc_detail/hip_hcc.h" #define THROW_ERROR(e) throw ihipException(e) #else -#define THROW_ERROR(e) throw -#define tprintf(trace_level, ...) +#define THROW_ERROR(e) throw +#define tprintf(trace_level, ...) #endif -extern hsa_agent_t g_cpu_agent; // defined in hip_hcc.cpp +void error_check1(hsa_status_t hsa_error_code, int line_num, std::string str) { + if ((hsa_error_code != HSA_STATUS_SUCCESS)&& (hsa_error_code != HSA_STATUS_INFO_BREAK)) { + printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(),line_num); + } +} + +#define ErrorCheck(x) error_check1(x, __LINE__, __FILE__) +hsa_amd_memory_pool_t sys_pool_; + +hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data) { + if (NULL == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(err); + + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(err); + if ((HSA_AMD_SEGMENT_GLOBAL == segment) && + (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) { + *((hsa_amd_memory_pool_t*)data) = pool; + } + return HSA_STATUS_SUCCESS; +} //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) : +StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers) : _hsa_agent(hsaAgent), + _cpu_agent(cpuAgent), _bufferSize(bufferSize), _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) { + hsa_status_t err = hsa_amd_agent_iterate_memory_pools(_cpu_agent, FindGlobalPool, &sys_pool_); + ErrorCheck(err); for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. - hsa_status_t s1 = hsa_memory_allocate(systemRegion, _bufferSize, (void**) (&_pinnedStagingBuffer[i]) ); + err = hsa_amd_memory_pool_allocate(sys_pool_, _bufferSize, 0, (void**)(&_pinnedStagingBuffer[i])); + ErrorCheck(err); - if ((s1 != HSA_STATUS_SUCCESS) || (_pinnedStagingBuffer[i] == NULL)) { + if ((err != HSA_STATUS_SUCCESS) || (_pinnedStagingBuffer[i] == NULL)) { THROW_ERROR(hipErrorMemoryAllocation); } + + err = hsa_amd_agents_allow_access(1, &hsaAgent, NULL, _pinnedStagingBuffer[i]); + ErrorCheck(err); + hsa_signal_create(0, 0, NULL, &_completion_signal[i]); hsa_signal_create(0, 0, NULL, &_completion_signal2[i]); } + }; @@ -58,7 +94,7 @@ StagingBuffer::~StagingBuffer() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { - hsa_memory_free(_pinnedStagingBuffer[i]); + hsa_amd_memory_pool_free(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } hsa_signal_destroy(_completion_signal[i]); @@ -88,11 +124,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ THROW_ERROR (hipErrorInvalidValue); } int bufferIndex = 0; -#if 0 - for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { - size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; -#endif size_t theseBytes= sizeBytes; //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); //hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); @@ -110,7 +142,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, _cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -119,26 +151,8 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast (srcp)); -#if 0 - srcp += theseBytes; - dstp += theseBytes; - if (++bufferIndex >= _numBuffers) { - bufferIndex = 0; - } -#endif - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; -#if 0 -// } - - // TODO - - printf ("unpin the memory\n"); - - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } -#endif + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } @@ -177,10 +191,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], _cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR ((hipErrorRuntimeMemory)); } @@ -191,8 +203,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte bufferIndex = 0; } - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } @@ -229,7 +241,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_ hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(locked_destp,g_cpu_agent , srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpu_agent , srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); @@ -273,7 +285,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -281,8 +293,8 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte srcp0 += theseBytes; - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } // Now unload the staging buffers: @@ -337,7 +349,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -345,8 +357,8 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* srcp0 += theseBytes; - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } // Now unload the staging buffers: @@ -365,8 +377,8 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent /*not used*/, theseBytes, - hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpu_agent /*not used*/, theseBytes, + hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], _completion_signal2[bufferIndex]); dstp1 += theseBytes;