From 42a3ed544c9a6881dd0ce2bfe12fc18de2e330f5 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Sat, 9 Jul 2016 19:29:55 +0530 Subject: [PATCH] D2H and H2D unpinned memory transfer support Change-Id: If6d6c970f435e5d917d5cc6cddc2ee2918cd1c37 Conflicts: src/hip_hcc.cpp --- include/hcc_detail/hip_hcc.h | 2 +- src/hip_hcc.cpp | 201 ++++++++++++++++++++++++++++++++--- src/staging_buffer.cpp | 94 ++++++++++++---- 3 files changed, 260 insertions(+), 37 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 10c6cf5ff0..635b65f384 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -606,7 +606,7 @@ public: // Data, set at initialization: unsigned _compute_units; StagingBuffer *_staging_buffer[2]; // one buffer for each direction. - + int isLargeBar; unsigned _device_flags; diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 097cd702e4..70890d9374 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -35,7 +35,7 @@ THE SOFTWARE. #include #include #include - +#include #include #include @@ -50,6 +50,9 @@ extern const char *ihipErrorString(hipError_t hip_error); const int release = 1; +#define MEMCPY_D2H_STAGING_VS_PININPLACE_COPY_THRESHOLD 4194304 /* 4 MiB */ +#define MEMCPY_H2D_DIRECT_VS_STAGING_COPY_THRESHOLD 65536 /* 64 KiB */ +#define MEMCPY_H2D_STAGING_VS_PININPLACE_COPY_THRESHOLD 1048576 /* 1 MiB */ int HIP_LAUNCH_BLOCKING = 0; @@ -60,6 +63,10 @@ int HIP_DB= 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. 
int HIP_PININPLACE = 0; +int HIP_OPTIMAL_MEM_TRANSFER = 0; //ENV Variable to test different memory transfer logics +int HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING = 0; +int HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE = 0; +int HIP_D2H_MEM_TRANSFER_THRESHOLD = 0; int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ @@ -616,11 +623,124 @@ ihipDevice_t::~ihipDevice_t() #define ErrorCheck(x) error_check(x, __LINE__, __FILE__) void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) { - if (hsa_error_code != HSA_STATUS_SUCCESS) { + if ((hsa_error_code != HSA_STATUS_SUCCESS)&& (hsa_error_code != HSA_STATUS_INFO_BREAK)) { printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(),line_num); } } +// CPU agent used for verification +hsa_agent_t cpu_agent_; +hsa_agent_t gpu_agent_; +int gpu_region_count; +// System region +hsa_amd_memory_pool_t sys_region_; +hsa_amd_memory_pool_t gpu_region_; + +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_CPU) { + *((hsa_agent_t*)data) = agent; + 
return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t GetDeviceRegion(hsa_amd_memory_pool_t region, void* data) { + if (NULL == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(err); + if (HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS; + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(err); + *((hsa_amd_memory_pool_t*)data) = region; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindGlobalRegion(hsa_amd_memory_pool_t region, void* data) { + if (NULL == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(err); + + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(err); + if ((HSA_AMD_SEGMENT_GLOBAL == segment) && + (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { + *((hsa_amd_memory_pool_t*)data) = region; + } + return HSA_STATUS_SUCCESS; +} + +void FindDeviceRegion() +{ + hsa_status_t err = hsa_iterate_agents(FindGpuDevice, &gpu_agent_); + ErrorCheck(err); + + err = hsa_amd_agent_iterate_memory_pools(gpu_agent_, GetDeviceRegion, &gpu_region_); + ErrorCheck(err); +} + +void FindSystemRegion() +{ + hsa_status_t err = hsa_iterate_agents(FindCpuDevice, &cpu_agent_); + ErrorCheck(err); + + err = hsa_amd_agent_iterate_memory_pools(cpu_agent_, FindGlobalRegion, &sys_region_); + ErrorCheck(err); +} + +int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) +{ + hsa_status_t err; + hsa_amd_memory_pool_access_t access; + err = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(err); + return access; +} 
+ hsa_status_t get_region_info(hsa_region_t region, void* data) { hsa_status_t err; @@ -757,6 +877,17 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; + FindSystemRegion(); + FindDeviceRegion(); + int access=checkAccess(cpu_agent_, gpu_region_); + if(0!= access){ + isLargeBar= 1; + } + else{ + isLargeBar=0; + } + + // Get Max Threads Per Multiprocessor HsaSystemProperties props; @@ -994,13 +1125,30 @@ void ihipInit() READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy."); READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy. Under development."); + READ_ENV_I(release, HIP_OPTIMAL_MEM_TRANSFER, 0, "For optimal memory transfers for unpinned memory.Under testing."); + READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing."); + READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, 0, "Threshold value for H2D unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); + READ_ENV_I(release, HIP_D2H_MEM_TRANSFER_THRESHOLD, 0, "Threshold value for D2H unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); READ_ENV_I(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, "Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence" ); READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies 
before kernel commands - instead wait for dependency on host. -1 means ignore these dependencies. (debug mode)"); READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands - instead wait for dependency on host. -1 means ifnore these dependencies (debug mode)"); + if (HIP_OPTIMAL_MEM_TRANSFER && !HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING) { + HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING= MEMCPY_H2D_DIRECT_VS_STAGING_COPY_THRESHOLD; + fprintf (stderr, "warning: env var HIP_OPTIMAL_MEM_TRANSFER=0x%x but HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING=0.Using default value for this.\n", HIP_OPTIMAL_MEM_TRANSFER); + } + if (HIP_OPTIMAL_MEM_TRANSFER && !HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE) { + HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE= MEMCPY_H2D_STAGING_VS_PININPLACE_COPY_THRESHOLD; + fprintf (stderr, "warning: env var HIP_OPTIMAL_MEM_TRANSFER=0x%x but HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE=0.Using default value for this.\n", HIP_OPTIMAL_MEM_TRANSFER); + } + + if (HIP_OPTIMAL_MEM_TRANSFER && !HIP_D2H_MEM_TRANSFER_THRESHOLD) { + HIP_D2H_MEM_TRANSFER_THRESHOLD= MEMCPY_D2H_STAGING_VS_PININPLACE_COPY_THRESHOLD; + fprintf (stderr, "warning: env var HIP_OPTIMAL_MEM_TRANSFER=0x%x but HIP_D2H_MEM_TRANSFER_THRESHOLD=0.Using default value for this.\n", HIP_OPTIMAL_MEM_TRANSFER); + } // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { fprintf (stderr, "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. 
(perhaps enable COMPILE_HIP_DB in src code before compiling?)", HIP_DB); @@ -1437,16 +1585,32 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if(!srcTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - - if (HIP_PININPLACE) { - device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } else { - device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + if(HIP_OPTIMAL_MEM_TRANSFER) + { + if((device->isLargeBar)&&(sizeBytes < HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING)){ + memcpy(dst,src,sizeBytes); + std::atomic_thread_fence(std::memory_order_release); + } + else{ + if(sizeBytes > HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE){ + //if (HIP_PININPLACE) { + device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } else { + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } + // The copy waits for inputs and then completes before returning so can reset queue to empty: + this->wait(crit, true); + } } - - // The copy waits for inputs and then completes before returning so can reset queue to empty: - this->wait(crit, true); - } else { + else { + if (HIP_PININPLACE) { + device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } else { + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } + /* Default (non-optimal) path: the staged copy has completed before returning, so reset the queue to empty exactly as the pre-patch code did. */ this->wait(crit, true); } + } + else { // TODO - remove, slow path. tprintf(DB_COPY1, "H2D && ! 
srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); #if USE_AV_COPY @@ -1481,10 +1645,21 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); //printf ("staged-copy- read dep signals\n"); - device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + if(HIP_OPTIMAL_MEM_TRANSFER) + { + if(sizeBytes> HIP_D2H_MEM_TRANSFER_THRESHOLD){ + device->_staging_buffer[1]->CopyDeviceToHostPinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + }else { + //printf ("staged-copy- read dep signals\n"); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } + }else + { + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } if(crit->_last_command_type == ihipCommandKernel){ std::cout<<"Destroying depSignal MemcpySync"<wait(crit, true); diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp index be5058e47b..c6c23089bd 100644 --- a/src/staging_buffer.cpp +++ b/src/staging_buffer.cpp @@ -88,42 +88,48 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ THROW_ERROR (hipErrorInvalidValue); } int bufferIndex = 0; +#if 0 for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; +#endif + size_t theseBytes= sizeBytes; + //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); + //hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - tprintf (DB_COPY2, "H2D: waiting... 
on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO + void *locked_srcp; + //hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsa_agent, 1, &locked_srcp); + hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (srcp), theseBytes, &_hsa_agent, 1, &locked_srcp); + //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); + //printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp); - void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO - void *locked_srcp; - hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsa_agent, 1, &locked_srcp); - //hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (srcp), theseBytes, &_hsa_agent, 1, &locked_srcp); - tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); - printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp); + if (hsa_status != HSA_STATUS_SUCCESS) { + THROW_ERROR (hipErrorRuntimeMemory); + } - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - - hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, g_cpu_agent, theseBytes, waitFor ? 
1:0, waitFor, _completion_signal[bufferIndex]); - tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } + hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); + if (hsa_status != HSA_STATUS_SUCCESS) { + /* Copy was never submitted: unpin the source range before reporting the error so it is not left locked. */ hsa_amd_memory_unlock(const_cast<char*> (srcp)); THROW_ERROR (hipErrorRuntimeMemory); + } + tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_amd_memory_unlock(const_cast<char*> (srcp)); +#if 0 srcp += theseBytes; dstp += theseBytes; if (++bufferIndex >= _numBuffers) { bufferIndex = 0; } - - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; - } +#endif + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; +#if 0 +// } // TODO - printf ("unpin the memory\n"); @@ -132,6 +138,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ for (int i=0; i<_numBuffers; i++) { hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } +#endif } @@ -194,6 +201,47 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte } } +/* Copies sizeBytes from src to host memory dst in one shot: pins the whole dst range with hsa_amd_memory_lock, submits a single hsa_amd_memory_async_copy (waitFor, if non-NULL, is its only dependency), blocks on the completion signal, then unpins dst. Raises hipErrorInvalidValue via THROW_ERROR when sizeBytes >= UINT64_MAX/2 and hipErrorRuntimeMemory when pinning or copy submission fails. */ +void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + std::lock_guard<std::mutex> l 
(_copy_lock); + + const char *srcp = static_cast<const char*> (src); + char *dstp = static_cast<char*> (dst); + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + if (sizeBytes >= UINT64_MAX/2) { + THROW_ERROR (hipErrorInvalidValue); + } + int bufferIndex = 0; + size_t theseBytes= sizeBytes; + void *locked_destp; + + hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (dstp), theseBytes, &_hsa_agent, 1, &locked_destp); + + + if (hsa_status != HSA_STATUS_SUCCESS) { + THROW_ERROR (hipErrorRuntimeMemory); + } + + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + + hsa_status = hsa_amd_memory_async_copy(locked_destp,g_cpu_agent , srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + + if (hsa_status != HSA_STATUS_SUCCESS) { + /* Copy was never submitted: unpin the destination range before reporting the error so it is not left locked. */ hsa_amd_memory_unlock(const_cast<char*> (dstp)); THROW_ERROR (hipErrorRuntimeMemory); + } + tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_amd_memory_unlock(const_cast<char*> (dstp)); + + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; +} + //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsa_agent).