From a55ce5bee421434bdfdaaa8d30e1778bca7525f1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:04:23 -0500 Subject: [PATCH] Add initial HIP_SYNC_NULL_STREAM=0 mode. This eliminates host-synchronization for null stream. Instead, the null-stream uses GPU-side events to wait for other streams. Default is OFF pending additional testing. Add enhanced null-stream test. Also refine HIP_TRACE_API. [ROCm/hip commit: 27877f8854c88c6b806b5528c81faf9009ccab78] --- .../include/hip/hcc_detail/hip_runtime_api.h | 7 +- projects/hip/src/grid_launch.cpp | 2 +- projects/hip/src/hip_device.cpp | 2 +- projects/hip/src/hip_event.cpp | 16 +- projects/hip/src/hip_hcc.cpp | 145 +++++++++++----- projects/hip/src/hip_hcc_internal.h | 12 +- projects/hip/src/hip_memory.cpp | 8 +- projects/hip/src/hip_module.cpp | 4 +- projects/hip/src/hip_stream.cpp | 4 +- .../src/runtimeApi/stream/hipNullStream.cpp | 156 ++++++++++++++---- projects/hip/tests/src/test_common.h | 60 ++++++- 11 files changed, 320 insertions(+), 96 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 175fd64d29..e1aecef1e8 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -602,9 +602,12 @@ hipError_t hipStreamQuery(hipStream_t stream); * * @return #hipSuccess, #hipErrorInvalidResourceHandle * - * If the null stream is specified, this command blocks until all + * This command is host-synchronous : the host will block until the specified stream is empty. + * + * This command follows standard null-stream semantics. Specifically, specifying the null stream will cause the + * command to wait for other streams on the same device to complete all pending operations. + * * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking. - * This command is host-synchronous : the host will block until the stream is empty. * * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy * diff --git a/projects/hip/src/grid_launch.cpp b/projects/hip/src/grid_launch.cpp index cac01df7dc..ffa50dec95 100644 --- a/projects/hip/src/grid_launch.cpp +++ b/projects/hip/src/grid_launch.cpp @@ -52,7 +52,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream) { - if ((HIP_TRACE_API & (1 << TRACE_CMD)) || + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) { std::stringstream os; diff --git a/projects/hip/src/hip_device.cpp b/projects/hip/src/hip_device.cpp index 01a213190f..93c1c20484 100644 --- a/projects/hip/src/hip_device.cpp +++ b/projects/hip/src/hip_device.cpp @@ -298,7 +298,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device) hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) { HIP_INIT_API(props, device); - return ihipGetDeviceProperties(props, device); + return ihipLogStatus(ihipGetDeviceProperties(props, device)); } hipError_t hipSetDeviceFlags( unsigned int flags) diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp index 61ac5cd3ab..fbaf5cc463 100644 --- a/projects/hip/src/hip_event.cpp +++ b/projects/hip/src/hip_event.cpp @@ -114,14 +114,17 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) HIP_INIT_API(event, stream); if (event && event->_state != hipEventStatusUnitialized) { + stream = ihipSyncAndResolveStream(stream); + event->_stream = stream; - if (stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream == NULL) { + + // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 + // If stream == NULL, wait on all queues. - // TODO-HCC fix this - is this conservative or still uses device timestamps? - // TODO-HCC can we use barrier or event marker to implement better solution? ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); event->_state = hipEventStatusRecorded; @@ -164,9 +167,10 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (event->_stream == NULL) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { auto *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + // TODO-HIP_SYNC_NULL_STREAM - can remove this code + ctx->locked_syncDefaultStream(true, true); return ihipLogStatus(hipSuccess); } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 07604fe85d..979a2e5028 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -92,6 +92,9 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; +// Sync on host between +int HIP_SYNC_NULL_STREAM = 1; + int HCC_OPT_FLUSH = 0; @@ -289,6 +292,32 @@ inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCri assert(streamCrit->_hasQueue); } +hc::hcWaitMode ihipStream_t::waitMode() const +{ + hc::hcWaitMode waitMode = hc::hcWaitModeActive; + + if (_scheduleMode == Auto) { + if (g_deviceCnt > g_numLogicalThreads) { + waitMode = hc::hcWaitModeActive; + } else { + waitMode = hc::hcWaitModeBlocked; + } + } else if (_scheduleMode == Spin) { + waitMode = hc::hcWaitModeActive; + } else if (_scheduleMode == Yield) { + waitMode = hc::hcWaitModeBlocked; + } else { + assert(0); // bad wait mode. + } + + if (HIP_WAIT_MODE == 1) { + waitMode = hc::hcWaitModeBlocked; + } else if (HIP_WAIT_MODE == 2) { + waitMode = hc::hcWaitModeActive; + } + + return waitMode; +} //Wait for all kernel and data copy commands in this stream to complete. //This signature should be used in routines that already have locked the stream mutex @@ -296,29 +325,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { if (crit->_hasQueue) { tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - hc::hcWaitMode waitMode = hc::hcWaitModeActive; - if (_scheduleMode == Auto) { - if (g_deviceCnt > g_numLogicalThreads) { - waitMode = hc::hcWaitModeActive; - } else { - waitMode = hc::hcWaitModeBlocked; - } - } else if (_scheduleMode == Spin) { - waitMode = hc::hcWaitModeActive; - } else if (_scheduleMode == Yield) { - waitMode = hc::hcWaitModeBlocked; - } else { - assert(0); // bad wait mode. - } - - if (HIP_WAIT_MODE == 1) { - waitMode = hc::hcWaitModeBlocked; - } else if (HIP_WAIT_MODE == 2) { - waitMode = hc::hcWaitModeActive; - } - - crit->_av.wait(waitMode); + crit->_av.wait(waitMode()); } else { tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); } @@ -337,7 +345,7 @@ void ihipStream_t::locked_wait() }; // Causes current stream to wait for specified event to complete: -// Note this does not require any kind of host serialization. +// Note this does not provide any kind of host serialization. void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); @@ -1061,26 +1069,57 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) // Implement "default" stream syncronization // This waits for all other streams to drain before continuing. // If waitOnSelf is set, this additionally waits for the default stream to empty. -void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf) +// In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other +// activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. +void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { LockedAccessor_CtxCrit_t crit(_criticalData); - tprintf(DB_SYNC, "syncDefaultStream\n"); + tprintf(DB_SYNC, "syncDefaultStream \n"); + + // Vector of ops sent to each stream that will complete before ops sent to null stream: + std::vector depOps; for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; - // Don't wait for streams that have "opted-out" of syncing with NULL stream. - // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { + if (HIP_SYNC_NULL_STREAM) { - if (waitOnSelf || (stream != _defaultStream)) { - // TODO-hcc - use blocking or active wait here? - // TODO-sync - cudaDeviceBlockingSync - stream->locked_wait(); + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream + if (!(stream->_flags & hipStreamNonBlocking)) { + + if (waitOnSelf || (stream != _defaultStream)) { + stream->locked_wait(); + } + } + } else { + if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); + + // The last marker will provide appropriate visibility: + if (!streamCrit->_av.get_is_empty()) { + depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + } } } } + + + + // Enqueue a barrier to wait on all the barriers we sent above: + if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { + LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); + if (syncHost) { + defaultCf.wait(); // TODO - account for active or blocking here. + } + } + + tprintf(DB_SYNC, " syncDefaultStream depOps=%zu\n", depOps.size()); + } @@ -1267,6 +1306,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_FAIL_SOC, 0, "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation. Bit 0x1=Fail on async copy with unpinned memory. Bit 0x2=Fail peer copy rather than use staging buffer copy"); READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); + READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); // TODO - review, can we remove this? READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); @@ -1274,7 +1314,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impact HCC. When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. When set, use agent-scope flush rather than system-scope flush when possible."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { @@ -1415,17 +1455,44 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { - ihipCtx_t *device = ihipGetTlsDefaultCtx(); + ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); #ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - device->locked_syncDefaultStream(false); + ctx->locked_syncDefaultStream(false, false); #endif - return device->_defaultStream; + return ctx->_defaultStream; } else { - // ALl streams have to wait for legacy default stream to be empty: + // All streams have to wait for legacy default stream to be empty: if (!(stream->_flags & hipStreamNonBlocking)) { - tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str()); - stream->getCtx()->_defaultStream->locked_wait(); + if (HIP_SYNC_NULL_STREAM) { + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + stream->getCtx()->_defaultStream->locked_wait(); + } else { + ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; + + tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); + + bool needMarker = false; + hc::completion_future dcf; + { + LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); + // TODO - could call create_blocking_marker(queue) + if (!defaultStreamCrit->_av.get_is_empty()) { + needMarker = true; + + // TODO - add "none_scope". + dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } + } + + if (needMarker) { + // ensure any commands sent to this stream wait on the NULL stream before continuing + LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); + // TODO - could be "noret" version of create_blocking_marker + thisStreamCrit->_av.create_blocking_marker(dcf); + } + } } return stream; diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 7787242ca7..0d080f9225 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -66,6 +66,8 @@ extern int HIP_COHERENT_HOST_ALLOC; // Chicken bits for disabling functionality to work around potential issues: extern int HIP_SYNC_HOST_ALLOC; +extern int HIP_SYNC_NULL_STREAM; + // TODO - remove when this is standard behavior. extern int HCC_OPT_FLUSH; @@ -187,11 +189,11 @@ extern const char *API_COLOR_END; //--- -//HIP Trace modes +//HIP Trace modes - use with HIP_TRACE_API=... #define TRACE_ALL 0 // 0x1 #define TRACE_KCMD 1 // 0x2, kernel command #define TRACE_MCMD 2 // 0x4, memory command -#define TRACE_MEM 3 // 0x8 +#define TRACE_MEM 3 // 0x8, memory allocation or deallocation. //--- @@ -276,7 +278,7 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); API_TRACE(0, __VA_ARGS__); -// Like above, but will trace with TRACE_CMD. +// Like above, but will trace with a specified "special" bit. // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU: // kernel launches, copy commands, memory sets, etc. #define HIP_INIT_SPECIAL_API(tbit, ...) \ @@ -521,8 +523,10 @@ public: void locked_waitEvent(hipEvent_t event); void locked_recordEvent(hipEvent_t event); + ihipStreamCritical_t &criticalData() { return _criticalData; }; //--- + hc::hcWaitMode waitMode() const; // Use this if we already have the stream critical data mutex: void wait(LockedAccessor_StreamCrit_t &crit); @@ -786,7 +790,7 @@ public: // Functions: void locked_removeStream(ihipStream_t *s); void locked_reset(); void locked_waitAllStreams(); - void locked_syncDefaultStream(bool waitOnSelf); + void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); // Will allocate a queue and assign it to the needyStream: hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index cef676b572..5501fec734 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -525,7 +525,7 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind); if(symbolName == nullptr) { @@ -598,7 +598,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind, stream); if(symbolName == nullptr) { @@ -807,7 +807,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); if(width > dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); hipError_t e = hipSuccess; @@ -1041,7 +1041,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index b359e7a63c..da01f23769 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -352,14 +352,14 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *func = sym; hmod->funcTrack.push_back(*func); } - return ihipLogStatus(ret); + return ret; } hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name){ HIP_INIT_API(hfunc, hmod, name); - return ihipModuleGetSymbol(hfunc, hmod, name); + return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name)); } diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index d7f8717725..34b4bc8851 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -150,7 +150,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; @@ -174,7 +174,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; diff --git a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp index f8d201cb51..380979f6bc 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -27,8 +27,9 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include -unsigned p_streams = 6; +unsigned p_streams =16; int p_repeat = 10; +int p_db = 0; template @@ -45,7 +46,7 @@ vectorADDRepeat(hipLaunchParm lp, for (int j=1; j<=repeat;j++) { for (size_t i=offset; i::Streamer(size_t numElements, bool useNullStream) : HIPCHECK(hipStreamCreate(&_stream)); } HIPCHECK(hipEventCreate(&_event)); + + H2D(); + }; +template +void Streamer::H2D() +{ + HIPCHECK(hipMemcpy(_A_d, _A_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(_B_d, _B_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); +} + +template +void Streamer::D2H() +{ + HIPCHECK(hipMemcpy(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost)); +} + +template +void Streamer::reset() +{ + HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); + H2D(); + +} + + template void Streamer::enqueAsync() { @@ -131,6 +161,10 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--repeat") || (!strcmp(arg, "-r"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_repeat)) { + failed("Bad repeat argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -138,6 +172,15 @@ void parseMyArguments(int argc, char *argv[]) }; +void +printBuffer(std::string name, int *f, size_t numElements) +{ + std::cout << name << "\n"; + for (size_t i=0; i FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + std::vector streamers; size_t numElements = N; - float *expected_H = (float*)malloc(numElements*sizeof(float)); + int *expected_H = (int*)malloc(numElements*sizeof(int)); - auto nullStreamer = new FloatStreamer(numElements, true); + auto nullStreamer = new IntStreamer(numElements, true); + + // Expected resultr - last streamer runs vectorADDRepeat, then nullstreamer adds lastStreamer->_C_d + lastStreamer->_C_d for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat) *2; } for (int i=0; i Test 0x1 runAsnc\n"); - for (int i=0; ienqueAsync(); + for (int s=1; s Test %x runAsnc, #streams=%d\n", (1<reset(); + + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + + if (p_db) { + HIPCHECK(hipDeviceSynchronize()); + lastStreamer->D2H(); + printBuffer("lastStream _A_h", lastStreamer->_A_h, min(numElements, size_t(20))); + printBuffer("lastStream _B_h", lastStreamer->_B_h, min(numElements, size_t(20))); + printBuffer("lastStream _C_h", lastStreamer->_C_h, min(numElements, size_t(20))); + } + nullStreamer->D2H(); + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - - auto lastStreamer = streamers[p_streams - 1]; - - // Dispatch to NULL stream, should wait for prior async activity to complete. - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); - HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); - HIPCHECK(hipStreamSynchronize(0)); - - - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - if (p_tests & 0x2) { - printf ("==> Test 0x2 runAsnc-odd-only\n"); - for (int i=0; ienqueAsync(); + for (int s=1; sreset(); + printf ("==> Test %x runAsnc-odd-only, #streams=%d\n", tmask, s); + for (int i=0; ienqueAsync(); + } } + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + nullStreamer->D2H(); + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } + // Expected resultr - last streamer runs vectorADDRepeat + for (size_t i=0; i_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat); + } + + if (p_tests & 0x20000) { + + assert (p_streams >=2); // need a couple streams in order to run this test. + nullStreamer->reset(); + printf ("\n==> Test hipStreamSynchronize with defaultStream \n"); + + // Enqueue a long-running job to stream1 + streamers[0]->enqueAsync(); + + // Check to see if synchronizing on a null stream synchronizes all other streams or just the null stream. + // This function follows null stream semantics and will wait for all other blocking streams before returning. + // This will wait on the host + HIPCHECK(hipStreamSynchronize(0)); + + // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. + HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); + + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + } + passed(); } diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h index 633ee6f825..1a6e51e08e 100644 --- a/projects/hip/tests/src/test_common.h +++ b/projects/hip/tests/src/test_common.h @@ -184,6 +184,20 @@ addCountReverse( const T *A_d, } +void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) +{ + // Initialize the host data: + for (size_t i=0; i void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) @@ -217,15 +231,10 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - // Initialize the host data: - for (size_t i=0; i void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, @@ -367,6 +376,43 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } +// Assumes C_h contains vector add of A_h + B_h +// Calls the test "failed" macro if a mismatch is detected. +template +void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch=true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for (size_t i=0; i