diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index dc72714e3e..860c6f87a8 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -88,6 +88,10 @@ int HIP_HOST_COHERENT = 1; int HIP_SYNC_HOST_ALLOC = 1; +int HIP_SYNC_STREAM_WAIT = 0; + +int HIP_FORCE_NULL_STREAM=0; + #if (__hcc_workweek__ >= 17300) // Make sure we have required bug fix in HCC @@ -1249,6 +1253,9 @@ void HipReadEnv() READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); + READ_ENV_I(release, HIP_FORCE_NULL_STREAM, 0, "Force all stream allocations to secretly return the null stream"); + + READ_ENV_I(release, HIP_SYNC_STREAM_WAIT, 0, "hipStreamWaitEvent will synchronize to host"); READ_ENV_I(release, HIP_HOST_COHERENT, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index 56ca37b3e2..88717e48bf 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -65,8 +65,10 @@ extern int HIP_HIDDEN_FREE_MEM; //--- // Chicken bits for disabling functionality to work around potential issues: extern int HIP_SYNC_HOST_ALLOC; +extern int HIP_SYNC_STREAM_WAIT; extern int HIP_SYNC_NULL_STREAM; +extern int HIP_FORCE_NULL_STREAM; // TODO - remove when this is standard behavior. 
extern int HCC_OPT_FLUSH; diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp index 51aeb01412..7dd6efd39c 100644 --- a/hipamd/src/hip_stream.cpp +++ b/hipamd/src/hip_stream.cpp @@ -38,21 +38,26 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags) hipError_t e = hipSuccess; if (ctx) { - hc::accelerator acc = ctx->getWriteableDevice()->_acc; - // TODO - se try-catch loop to detect memory exception? - // - //Note this is an execute_in_order queue, so all kernels submitted will atuomatically wait for prev to complete: - //This matches CUDA stream behavior: + if (HIP_FORCE_NULL_STREAM) { + *stream = 0; + } else { + hc::accelerator acc = ctx->getWriteableDevice()->_acc; - { - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); + // TODO - use try-catch loop to detect memory exception? + // + //Note this is an execute_in_order queue, so all kernels submitted will automatically wait for prev to complete: + //This matches CUDA stream behavior: - auto istream = new ihipStream_t(ctx, acc.create_view(), flags); + { + // Obtain mutex access to the device critical data, release by destructor + LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); - ctxCrit->addStream(istream); - *stream = istream; + auto istream = new ihipStream_t(ctx, acc.create_view(), flags); + + ctxCrit->addStream(istream); + *stream = istream; + } } tprintf(DB_SYNC, "hipStreamCreate, %s\n", ToString(*stream).c_str()); @@ -93,18 +98,15 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int } else if (event->_state != hipEventStatusUnitialized) { - if (stream != hipStreamNull) { - + if (HIP_SYNC_STREAM_WAIT || (HIP_SYNC_NULL_STREAM && (stream == 0))) { + // conservative wait on host for the specified event to complete: + event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? 
hc::hcWaitModeBlocked : hc::hcWaitModeActive); + } else { + stream = ihipSyncAndResolveStream(stream); // This will user create_blocking_marker to wait on the specified queue. stream->locked_streamWaitEvent(event); - - } else { - // TODO-hcc Convert to use create_blocking_marker(...) functionality. - // Currently we have a super-conservative version of this - block on host, and drain the queue. - // This should create a barrier packet in the target queue. - // TODO-HIP_SYNC_NULL_STREAM - stream->locked_wait(); } + } // else event not recorded, return immediately and don't create marker. return ihipLogStatus(e); @@ -122,15 +124,14 @@ hipError_t hipStreamQuery(hipStream_t stream) stream = device->_defaultStream; } - int pendingOps = 0; + bool isEmpty = 0; { LockedAccessor_StreamCrit_t crit(stream->_criticalData); - pendingOps = crit->_av.get_pending_async_ops(); + isEmpty = crit->_av.get_is_empty(); } - - hipError_t e = (pendingOps > 0) ? hipErrorNotReady : hipSuccess; + hipError_t e = isEmpty ? 
hipSuccess : hipErrorNotReady ; return ihipLogStatus(e); } @@ -170,7 +171,9 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { - e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream + if (!HIP_FORCE_NULL_STREAM) { + e = hipErrorInvalidResourceHandle; + } } else { stream->locked_wait(); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 9bbd43828c..f5b1b79550 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -446,9 +446,41 @@ int main(int argc, char *argv[]) if (p_tests & 0x1000) { - printf ("==> Test 0x1000 try null stream\n"); - hipStreamQuery(0/* try null stream*/); + printf ("==> Test 0x1000 simple null stream tests\n"); + // try some null stream: + hipStreamQuery(0); + + + hipStream_t s1; + hipEvent_t e1; + + { + // stream null waits on event in s1 stream: + HIPCHECK(hipStreamCreate(&s1)); + HIPCHECK(hipEventCreate(&e1)); + + HIPCHECK(hipEventRecord(e1, s1)) + + HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); + + HIPCHECK(hipStreamDestroy(s1)); + HIPCHECK(hipEventDestroy(e1)); + } + + { + // stream s1 waits on event in null stream: + HIPCHECK(hipStreamCreate(&s1)); + HIPCHECK(hipEventCreate(&e1)); + + HIPCHECK(hipEventRecord(e1, hipStream_t(0))) + + HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); + + HIPCHECK(hipStreamDestroy(s1)); + HIPCHECK(hipEventDestroy(e1)); + } + } @@ -471,8 +503,8 @@ int main(int argc, char *argv[]) } - { - printf ("test: alternating memcpy/count-reverse followed by event\n"); + if (p_tests & 0x4000 ) { + printf ("test: %x alternating memcpy/count-reverse followed by event\n", p_tests); RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, 
sideStreams[0], false), true); }