From bb0afa4e38c28deecbe57f69d7ef84e347c6b8ac Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 30 Aug 2016 17:29:50 -0500 Subject: [PATCH] Refactor for stream->_av. - move _av into stream critical section. ( HCC accelerator_view is not thread-safe but HIP steram is. ) - Refactored many places in code that need to acquire critical section. some were previously thread races, ie enqueueing marker. -remove support for GRID_LAUNCH_VERSION < 20 -Enable USE_AV_COPY based on HCC work-week. - Review hipModule docs, some calrity/editing. Change-Id: I3ce7c25ece048c3504f55ecd4683e506bb1fc8b6 [ROCm/hip commit: e76a272d48138acdea06bb98089b57e4df214787] --- projects/hip/include/hcc_detail/hip_hcc.h | 57 ++++--- projects/hip/include/hcc_detail/hip_runtime.h | 19 +-- .../hip/include/hcc_detail/hip_runtime_api.h | 16 +- projects/hip/src/hip_event.cpp | 19 ++- projects/hip/src/hip_hcc.cpp | 142 ++++++++---------- projects/hip/src/hip_memory.cpp | 54 ++----- projects/hip/src/hip_stream.cpp | 4 + 7 files changed, 128 insertions(+), 183 deletions(-) diff --git a/projects/hip/include/hcc_detail/hip_hcc.h b/projects/hip/include/hcc_detail/hip_hcc.h index 01c36afde4..fb10af1106 100644 --- a/projects/hip/include/hcc_detail/hip_hcc.h +++ b/projects/hip/include/hcc_detail/hip_hcc.h @@ -32,7 +32,7 @@ THE SOFTWARE. // #define USE_MEMCPYTOSYMBOL // //Use the new HCC accelerator_view::copy instead of am_copy -#define USE_AV_COPY 0 +#define USE_AV_COPY (__hcc_workweek__ >= 16351) // Compile peer-to-peer support. // >= 2 : use HCC hc:accelerator::get_is_peer @@ -353,18 +353,36 @@ struct LockedBase { }; +class ihipModule_t{ +public: + hsa_executable_t executable; + hsa_code_object_t object; + std::string fileName; + void *ptr; + size_t size; +}; + + +class ihipFunction_t{ +public: + hsa_executable_symbol_t kernel_symbol; + uint64_t kernel; +}; + + template class ihipStreamCriticalBase_t : public LockedBase { public: - ihipStreamCriticalBase_t() : + ihipStreamCriticalBase_t(hc::accelerator_view av) : _last_command_type(ihipCommandCopyH2H), _last_copy_signal(NULL), _signalCursor(0), _oldest_live_sig_id(1), _streamSigId(0), _kernelCnt(0), - _signalCnt(0) + _signalCnt(0), + _av(av) { _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); }; @@ -395,27 +413,14 @@ public: // 2 are required if a barrier packet is inserted. uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id. + + hc::accelerator_view _av; }; typedef ihipStreamCriticalBase_t ihipStreamCritical_t; typedef LockedAccessor LockedAccessor_StreamCrit_t; -class ihipModule_t{ -public: - hsa_executable_t executable; - hsa_code_object_t object; - std::string fileName; - void *ptr; - size_t size; -}; - - -class ihipFunction_t{ -public: - hsa_executable_symbol_t kernel_symbol; - uint64_t kernel; -}; // Internal stream structure. class ihipStream_t { @@ -431,16 +436,23 @@ typedef uint64_t SeqNum_t ; void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind); + int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + //--- - // Thread-safe accessors - these acquire / release mutex: - bool lockopen_preKernelCommand(); + // Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex. + LockedAccessor_StreamCrit_t lockopen_preKernelCommand(); void lockclose_postKernelCommand(hc::completion_future &kernel_future); - int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); void locked_reclaimSignals(SIGSEQNUM sigNum); void locked_wait(bool assertQueueEmpty=false); - SIGSEQNUM locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); }; + + hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); }; + + void locked_waitEvent(hipEvent_t event); + void locked_recordEvent(hipEvent_t event); + + //--- // Use this if we already have the stream critical data mutex: void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false); @@ -462,7 +474,6 @@ public: //--- //Public member vars - these are set at initialization and never change: SeqNum_t _id; // monotonic sequence ID - hc::accelerator_view _av; unsigned _flags; diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h index 727604d8d8..108f73a7eb 100644 --- a/projects/hip/include/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hcc_detail/hip_runtime.h @@ -50,9 +50,8 @@ THE SOFTWARE. #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0. -#define USE_GRID_LAUNCH_20 1 #else -#define USE_GRID_LAUNCH_20 0 +#error (HCC must support GRID_LAUNCH_20) #endif #define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*) 0x01) @@ -633,7 +632,6 @@ extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp); #define KNRM "\x1B[0m" #define KGRN "\x1B[32m" -#if USE_GRID_LAUNCH_20 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \ do {\ grid_launch_parm lp;\ @@ -645,21 +643,6 @@ do {\ _kernelName (lp, ##__VA_ARGS__);\ ihipPostLaunchKernel(trueStream, lp);\ } while(0) -#else -#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \ -do {\ - grid_launch_parm lp;\ - lp.groupMemBytes = _groupMemBytes; \ - hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \ - if (HIP_TRACE_API) {\ - fprintf(stderr, KGRN "<_state != hipEventStatusUnitialized) { - eh->_stream = stream; + if (event && event->_state != hipEventStatusUnitialized) { + event->_stream = stream; if (stream == NULL) { // If stream == NULL, wait on all queues. @@ -83,16 +82,16 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true); - eh->_timestamp = hc::get_system_ticks(); - eh->_state = hipEventStatusRecorded; + event->_timestamp = hc::get_system_ticks(); + event->_state = hipEventStatusRecorded; return ihipLogStatus(hipSuccess); } else { - eh->_state = hipEventStatusRecording; + event->_state = hipEventStatusRecording; // Clear timestamps - eh->_timestamp = 0; - eh->_marker = stream->_av.create_marker(); - - eh->_copySeqId = stream->locked_lastCopySeqId(); + event->_timestamp = 0; + + // Record the event in the stream: + stream->locked_recordEvent(event); return ihipLogStatus(hipSuccess); } diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index ea8604b7c4..ca650e5283 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -195,9 +195,9 @@ ihipSignal_t::~ihipSignal_t() //--- ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags) : _id(0), // will be set by add function. - _av(av), _flags(flags), - _ctx(ctx) + _ctx(ctx), + _criticalData(av) { tprintf(DB_SYNC, " streamCreate: stream=%p\n", this); }; @@ -246,7 +246,7 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty { if (! assertQueueEmpty) { tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this); - _av.wait(); + crit->_av.wait(); } if (crit->_last_copy_signal) { @@ -273,6 +273,18 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty) }; + +// Create a marker in this stream. +// Save state in the event so it can track the status of the event. +void ihipStream_t::locked_recordEvent(hipEvent_t event) +{ + // Lock the stream to prevent simultaneous access + LockedAccessor_StreamCrit_t crit(_criticalData); + + event->_marker = crit->_av.create_marker(); + event->_copySeqId = lastCopySeqId(crit); +} + //============================================================================= @@ -388,11 +400,10 @@ int HIP_NUM_KERNELS_INFLIGHT = 128; //into the stream to mimic CUDA stream semantics. (some hardware uses separate //queues for data commands and kernel commands, and no implicit ordering is provided). // -bool ihipStream_t::lockopen_preKernelCommand() +LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() { LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/); - bool addedSync = false; if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){ this->wait(crit); @@ -402,9 +413,8 @@ bool ihipStream_t::lockopen_preKernelCommand() // If switching command types, we need to add a barrier packet to synchronize things. if (crit->_last_command_type != ihipCommandKernel) { if (crit->_last_copy_signal) { - addedSync = true; - hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); + hsa_queue_t * q = (hsa_queue_t*) (crit->_av.get_hsa_queue()); if (HIP_DISABLE_HW_KERNEL_DEP == 0) { this->enqueueBarrier(q, crit->_last_copy_signal, NULL); tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", @@ -422,7 +432,7 @@ bool ihipStream_t::lockopen_preKernelCommand() crit->_last_command_type = ihipCommandKernel; } - return addedSync; + return crit; } @@ -433,6 +443,11 @@ void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFutu // We locked _criticalData in the lockopen_preKernelCommand() so OK to access here: _criticalData._last_kernel_future = kernelFuture; + if (HIP_LAUNCH_BLOCKING) { + kernelFuture.wait(); + tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str()); + } + _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand. }; @@ -457,7 +472,7 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t needSync = 1; ihipSignal_t *depSignal = allocSignal(crit); hsa_signal_store_relaxed(depSignal->_hsaSignal,1); - this->enqueueBarrier(static_cast(_av.get_hsa_queue()), NULL, depSignal); + this->enqueueBarrier(static_cast(crit->_av.get_hsa_queue()), NULL, depSignal); *waitSignal = depSignal->_hsaSignal; } else if (crit->_last_copy_signal) { needSync = 1; @@ -500,11 +515,17 @@ void ihipStream_t::launchModuleKernel(hsa_signal_t signal, uint64_t kernel){ hsa_status_t status; void *kern; - hsa_amd_memory_pool_t *pool = reinterpret_cast(_av.get_hsa_kernarg_region()); + + // Lock stream to prevent other threads from enqueueing kernels at same time. + LockedAccessor_StreamCrit_t crit (_criticalData); + + hc::accelerator_view av = crit->_av; + + hsa_amd_memory_pool_t *pool = reinterpret_cast(av.get_hsa_kernarg_region()); status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern); - status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern); + status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)av.get_hsa_agent(), 0, kern); memcpy(kern, kernarg, kernSize); - hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue(); + hsa_queue_t *Queue = (hsa_queue_t*)av.get_hsa_queue(); const uint32_t queue_mask = Queue->size-1; uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue); hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]); @@ -1117,8 +1138,9 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c while (std::getline(ss, device_id, ',')) { if (atoi(device_id.c_str()) >= 0) { g_hip_visible_devices.push_back(atoi(device_id.c_str())); - }else// Any device number after invalid number will not present + } else { // Any device number after invalid number will not present break; + } } // Print out the number of ids if (HIP_PRINT_ENV) { @@ -1327,7 +1349,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ { HIP_INIT(); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; @@ -1336,27 +1357,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ lp->group_dim.z = block.z; lp->barrier_bit = barrier_bit_queue_default; lp->launch_fence = -1; -#else - lp->gridDim.x = grid.x; - lp->gridDim.y = grid.y; - lp->gridDim.z = grid.z; - lp->groupDim.x = block.x; - lp->groupDim.y = block.y; - lp->groupDim.z = block.z; -#endif - stream->lockopen_preKernelCommand(); -// *av = &stream->_av; - lp->av = &stream->_av; + + auto crit = stream->lockopen_preKernelCommand(); + lp->av = &(crit->_av); lp->cf = new hc::completion_future; -// lp->av = static_cast(av); -// lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); } + + hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp) { - HIP_INIT_API(stream, grid, block, lp); + HIP_INIT(); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid; lp->grid_dim.y = 1; lp->grid_dim.z = 1; @@ -1365,28 +1377,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri lp->group_dim.z = block.z; lp->barrier_bit = barrier_bit_queue_default; lp->launch_fence = -1; -#else - lp->gridDim.x = grid; - lp->gridDim.y = 1; - lp->gridDim.z = 1; - lp->groupDim.x = block.x; - lp->groupDim.y = block.y; - lp->groupDim.z = block.z; -#endif - stream->lockopen_preKernelCommand(); -// *av = &stream->_av; - lp->av = &stream->_av; + + auto crit = stream->lockopen_preKernelCommand(); + lp->av = &(crit->_av); lp->cf = new hc::completion_future; -// lp->av = static_cast(av); -// lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); } + hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp) { - HIP_INIT_API(stream, grid, block, lp); + HIP_INIT(); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid.x; lp->grid_dim.y = grid.y; lp->grid_dim.z = grid.z; @@ -1395,28 +1397,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri lp->group_dim.z = 1; lp->barrier_bit = barrier_bit_queue_default; lp->launch_fence = -1; -#else - lp->gridDim.x = grid.x; - lp->gridDim.y = grid.y; - lp->gridDim.z = grid.z; - lp->groupDim.x = block; - lp->groupDim.y = 1; - lp->groupDim.z = 1; -#endif - stream->lockopen_preKernelCommand(); -// *av = &stream->_av; - lp->av = &stream->_av; + + auto crit = stream->lockopen_preKernelCommand(); + lp->av = &(crit->_av); lp->cf = new hc::completion_future; -// lp->av = static_cast(av); -// lp->cf = static_cast(malloc(sizeof(hc::completion_future))); return (stream); } + hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp) { - HIP_INIT_API(stream, grid, block, lp); + HIP_INIT(); stream = ihipSyncAndResolveStream(stream); -#if USE_GRID_LAUNCH_20 lp->grid_dim.x = grid; lp->grid_dim.y = 1; lp->grid_dim.z = 1; @@ -1425,37 +1417,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g lp->group_dim.z = 1; lp->barrier_bit = barrier_bit_queue_default; lp->launch_fence = -1; -#else - lp->gridDim.x = grid; - lp->gridDim.y = 1; - lp->gridDim.z = 1; - lp->groupDim.x = block; - lp->groupDim.y = 1; - lp->groupDim.z = 1; -#endif - stream->lockopen_preKernelCommand(); -// *av = &stream->_av; - lp->av = &stream->_av; - lp->cf = new hc::completion_future; -// lp->av = static_cast(av); -// lp->cf = static_cast(malloc(sizeof(hc::completion_future))); + + auto crit = stream->lockopen_preKernelCommand(); + lp->av = &(crit->_av); + lp->cf = new hc::completion_future; // TODO, is this necessary? return (stream); } //--- //Called after kernel finishes execution. +//This releases the lock on the stream. void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp) { -// stream->lockclose_postKernelCommand(cf); - stream->lockclose_postKernelCommand(*lp.cf); - if (HIP_LAUNCH_BLOCKING) { - tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream); - } + stream->lockclose_postKernelCommand(*(lp.cf)); } -// //================================================================================================= // HIP API Implementation // @@ -1629,7 +1607,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const // TODO - remove, slow path. tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); #if USE_AV_COPY - _av.copy(src,dst,sizeBytes); + crit->_av.copy(src,dst,sizeBytes); #else hc::am_copy(dst, src, sizeBytes); #endif @@ -1677,7 +1655,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const // TODO - remove, slow path. tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); #if USE_AV_COPY - _av.copy(src, dst, sizeBytes); + crit->_av.copy(src, dst, sizeBytes); #else hc::am_copy(dst, src, sizeBytes); #endif @@ -1879,7 +1857,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a stream = device->_defaultStream; } - *av = &(stream->_av); + *av = stream->locked_getAv(); hipError_t err = hipSuccess; return ihipLogStatus(err); diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index df1b70a3f8..beb84bdb6f 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -678,9 +678,12 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, +// TODO - make member function of stream? template hc::completion_future -ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) +ihipMemsetKernel(hipStream_t stream, + LockedAccessor_StreamCrit_t &crit, + T * ptr, T val, size_t sizeBytes) { int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); const int threads_per_wg = 256; @@ -696,7 +699,7 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) hc::completion_future cf = hc::parallel_for_each( - stream->_av, + crit->_av, ext_tile, [=] (hc::tiled_index<1> idx) __attribute__((hc)) @@ -713,41 +716,6 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) return cf; } -template -hc::completion_future -ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes) -{ - int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits); - const int threads_per_wg = 256; - - int threads = wg * threads_per_wg; - if (threads > sizeBytes) { - threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg; - } - - - hc::extent<1> ext(threads); - auto ext_tile = ext.tile(threads_per_wg); - - hc::completion_future cf = - hc::parallel_for_each( - stream->_av, - ext_tile, - [=] (hc::tiled_index<1> idx) - __attribute__((hc)) - { - int offset = amp_get_global_id(0); - // TODO-HCC - change to hc_get_local_size() - int stride = amp_get_local_size(0) * hc_get_num_groups(0) ; - - for (int i=offset; ilockopen_preKernelCommand(); + auto crit = stream->lockopen_preKernelCommand(); hc::completion_future cf ; @@ -771,7 +739,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s try { value = value & 0xff; unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); + cf = ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -779,7 +747,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } else { // use a slow byte-per-workitem copy: try { - cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); + cf = ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -814,7 +782,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) stream = ihipSyncAndResolveStream(stream); if (stream) { - stream->lockopen_preKernelCommand(); + auto crit = stream->lockopen_preKernelCommand(); hc::completion_future cf ; @@ -823,7 +791,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) try { value = value & 0xff; unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); + cf = ihipMemsetKernel (stream, crit, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); } catch (std::exception &ex) { e = hipErrorInvalidValue; @@ -831,7 +799,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) } else { // use a slow byte-per-workitem copy: try { - cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); + cf = ihipMemsetKernel (stream, crit, static_cast (dst), value, sizeBytes); } catch (std::exception &ex) { e = hipErrorInvalidValue; diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index a204e2f79a..5d900fcaeb 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -84,6 +84,10 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int hipError_t e = hipSuccess; +#ifdef USE_AV_COPY + printf ("USE_AV_COPY\n"); +#endif + { // TODO-hcc Convert to use create_blocking_marker(...) functionality. // Currently we have a super-conservative version of this - block on host, and drain the queue.