diff --git a/projects/hip/docs/markdown/hip_porting_guide.md b/projects/hip/docs/markdown/hip_porting_guide.md index cb599a5c4a..f4b1f8a7e4 100644 --- a/projects/hip/docs/markdown/hip_porting_guide.md +++ b/projects/hip/docs/markdown/hip_porting_guide.md @@ -115,7 +115,7 @@ directory names. ```shell -> hipconverinplace.sh MY_SRC_DIR +> hipconvertinplace.sh MY_SRC_DIR ``` diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 1607131e32..8df29aadb8 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -498,9 +498,14 @@ inline bool ihipStream_t::preKernelCommand() this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id) } else { - tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", - this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); - this->waitAndReclaimOlder(_last_copy_signal); + if (HIP_DISABLE_HW_KERNEL_DEP != -1) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); + this->waitAndReclaimOlder(_last_copy_signal); + } else { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (IGNORE dependency)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); + } } } _last_command_type = ihipCommandKernel; @@ -548,9 +553,15 @@ inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitS } if (HIP_DISABLE_HW_COPY_DEP && needSync) { - // do the wait here on the host, and disable the device-side command resolution. - hsa_signal_wait_acquire(*waitSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - needSync = 0; + if (HIP_DISABLE_HW_COPY_DEP == -1) { + tprintf (TRACE_SYNC, "IGNORE copy dependency\n") + + } else { + tprintf (TRACE_SYNC, "HOST-wait for copy dependency\n") + // do the wait here on the host, and disable the device-side command resolution. 
+ hsa_signal_wait_acquire(*waitSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + needSync = 0; + } } _last_command_type = copyType; @@ -937,8 +948,8 @@ void ihipInit() READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); READ_ENV_I(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, "Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence" ); - READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands - instead wait for dependency on host."); - READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands - instead wait for dependency on host."); + READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands - instead wait for dependency on host. -1 means ignore these dependencies. (debug mode)"); + READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands - instead wait for dependency on host. -1 means ignore these dependencies (debug mode)"); READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device"); READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy. 
If clear, set the dep for each copy."); @@ -2262,15 +2273,14 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB } } + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (HIP_STAGING_BUFFERS) { std::lock_guard l (device->_copy_lock[0]); //printf ("staged-copy- read dep signals\n"); - hsa_signal_t depSignal; - int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); - if (HIP_PININPLACE) { device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { @@ -2287,14 +2297,17 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB if (HIP_STAGING_BUFFERS) { std::lock_guard l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]); //printf ("staged-copy- read dep signals\n"); - hsa_signal_t depSignal; - int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyD2H); device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { // TODO - remove, slow path. hc::am_copy(dst, src, sizeBytes); } } else if (kind == hipMemcpyHostToHost) { // TODO-refactor. + + if (depSignalCnt) { + // host waits before doing host memory copy. + hsa_signal_wait_acquire(depSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } memcpy(dst, src, sizeBytes); } else { @@ -2315,8 +2328,6 @@ void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeB #if USE_ROCR_V2 - hsa_signal_t depSignal; - int depSignalCnt = stream->copyCommand(NULL, &depSignal, copyType); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0x0, device->_copy_signal); #else hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal);