Refactor for stream->_av.

- move _av into stream critical section. ( HCC accelerator_view is not thread-safe but HIP steram is. ) - Refactored many places in code that need to acquire critical section. some were previously thread races, ie enqueueing marker. -remove support for GRID_LAUNCH_VERSION < 20 -Enable USE_AV_COPY based on HCC work-week. - Review hipModule docs, some calrity/editing. Change-Id: I3ce7c25ece048c3504f55ecd4683e506bb1fc8b6 [ROCm/hip commit: e76a272d48]
2016-08-30 17:29:50 -05:00
@@ -32,7 +32,7 @@ THE SOFTWARE.
 // #define USE_MEMCPYTOSYMBOL
 //
 //Use the new HCC accelerator_view::copy instead of am_copy
-#define USE_AV_COPY 0
+#define USE_AV_COPY (__hcc_workweek__ >= 16351) 

 // Compile peer-to-peer support.
 // >= 2 : use HCC hc:accelerator::get_is_peer
@@ -353,18 +353,36 @@ struct LockedBase {
 };


+class ihipModule_t{
+public:
+  hsa_executable_t executable;
+  hsa_code_object_t object;
+  std::string fileName;
+  void *ptr;
+  size_t size;
+};
+
+
+class ihipFunction_t{
+public:
+  hsa_executable_symbol_t kernel_symbol;
+  uint64_t kernel;
+};
+
+
 template <typename MUTEX_TYPE>
 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
 {
 public:
-    ihipStreamCriticalBase_t() :
+    ihipStreamCriticalBase_t(hc::accelerator_view av) :
        _last_command_type(ihipCommandCopyH2H),
        _last_copy_signal(NULL),
        _signalCursor(0),
        _oldest_live_sig_id(1),
        _streamSigId(0),
        _kernelCnt(0),
-        _signalCnt(0)
+        _signalCnt(0),
+        _av(av)
    {
        _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
    };
@@ -395,27 +413,14 @@ public:
                                               //   2 are required if a barrier packet is inserted.
    uint32_t                    _kernelCnt;    // Count of inflight kernels in this stream.  Reset at ::wait().
    SIGSEQNUM                   _streamSigId;      // Monotonically increasing unique signal id.
+
+    hc::accelerator_view        _av;
 };


 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;

-class ihipModule_t{
-public:
-  hsa_executable_t executable;
-  hsa_code_object_t object;
-  std::string fileName;
-  void *ptr;
-  size_t size;
-};
-
-
-class ihipFunction_t{
-public:
-  hsa_executable_symbol_t kernel_symbol;
-  uint64_t kernel;
-};

 // Internal stream structure.
 class ihipStream_t {
@@ -431,16 +436,23 @@ typedef uint64_t SeqNum_t ;

    void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);

+    int                  preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
+
    //---
-    // Thread-safe accessors - these acquire / release mutex:
-    bool                 lockopen_preKernelCommand();
+    // Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex.
+    LockedAccessor_StreamCrit_t  lockopen_preKernelCommand();
    void                 lockclose_postKernelCommand(hc::completion_future &kernel_future);

-    int                  preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);

    void                 locked_reclaimSignals(SIGSEQNUM sigNum);
    void                 locked_wait(bool assertQueueEmpty=false);
-    SIGSEQNUM            locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); };
+
+    hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
+
+    void                 locked_waitEvent(hipEvent_t event);
+    void                 locked_recordEvent(hipEvent_t event);
+
+    //---

    // Use this if we already have the stream critical data mutex:
    void                 wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
@@ -462,7 +474,6 @@ public:
    //---
    //Public member vars - these are set at initialization and never change:
    SeqNum_t                    _id;   // monotonic sequence ID
-    hc::accelerator_view        _av;
    unsigned                    _flags;


@@ -50,9 +50,8 @@ THE SOFTWARE.

 #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
 // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
-#define USE_GRID_LAUNCH_20 1
 #else
-#define USE_GRID_LAUNCH_20 0
+#error (HCC must support GRID_LAUNCH_20)
 #endif

 #define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*) 0x01)
@@ -633,7 +632,6 @@ extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
 #define KNRM  "\x1B[0m"
 #define KGRN  "\x1B[32m"

-#if USE_GRID_LAUNCH_20
 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
 do {\
  grid_launch_parm lp;\
@@ -645,21 +643,6 @@ do {\
  _kernelName (lp, ##__VA_ARGS__);\
  ihipPostLaunchKernel(trueStream, lp);\
 } while(0)
-#else
-#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
-do {\
-  grid_launch_parm lp;\
-  lp.groupMemBytes = _groupMemBytes; \
-  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
-    if (HIP_TRACE_API) {\
-        fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
-                #_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
-    }\
-  _kernelName (lp, ##__VA_ARGS__);\
-  ihipPostLaunchKernel(trueStream, lp);\
-} while(0)
-
-#endif


 #elif defined (__HCC_C__)
@@ -1307,7 +1307,7 @@ hipError_t hipDriverGetVersion(int *driverVersion) ;
 hipError_t hipModuleLoad(hipModule_t *module, const char *fname);

 /**
- * @brief Freeing the module
+ * @brief Frees the module
 *
 * @param [in] module
 *
@@ -1319,7 +1319,7 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 hipError_t hipModuleUnload(hipModule_t module);

 /**
- * @brief Function with kname will be extracted present in module
+ * @brief Function with kname will be extracted if present in module
 *
 * @param [in] module
 * @param [in] kname
@@ -1330,19 +1330,20 @@ hipError_t hipModuleUnload(hipModule_t module);
 hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);

 /**
- * @brief returns device memory pointer and size of the kernel present in the module with symbol - name
+ * @brief returns device memory pointer and size of the kernel present in the module with symbol @p name
 *
- * @param [in] moodule
- * @param [in] name
 * @param [out] dptr
 * @param [out[ bytes
+ * @param [in] hmod
+ * @param [in] name
 *
 * @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
 */
 hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);

+
 /**
- * @brief builds module from code object which resides in host memory. And image is pointer to that location.
+ * @brief builds module from code object which resides in host memory. Image is pointer to that location.
 *
 * @param [in] image
 * @param [out] module
@@ -1351,8 +1352,9 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h
 */
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);

+
 /**
- * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
 *
 * @param [in[ f
 * @param [in] gridDimX
@@ -72,9 +72,8 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
 {
    HIP_INIT_API(event, stream);

-    ihipEvent_t *eh = event;
-    if (eh && eh->_state != hipEventStatusUnitialized)   {
-        eh->_stream = stream;
+    if (event && event->_state != hipEventStatusUnitialized)   {
+        event->_stream = stream;

        if (stream == NULL) {
            // If stream == NULL, wait on all queues.
@@ -83,16 +82,16 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
            ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
            ctx->locked_syncDefaultStream(true);

-            eh->_timestamp = hc::get_system_ticks();
-            eh->_state = hipEventStatusRecorded;
+            event->_timestamp = hc::get_system_ticks();
+            event->_state = hipEventStatusRecorded;
            return ihipLogStatus(hipSuccess);
        } else {
-            eh->_state  = hipEventStatusRecording;
+            event->_state  = hipEventStatusRecording;
            // Clear timestamps
-            eh->_timestamp = 0;
-            eh->_marker = stream->_av.create_marker();
-            
-            eh->_copySeqId = stream->locked_lastCopySeqId();
+            event->_timestamp = 0;
+
+            // Record the event in the stream:
+            stream->locked_recordEvent(event);

            return ihipLogStatus(hipSuccess);
        }
@@ -195,9 +195,9 @@ ihipSignal_t::~ihipSignal_t()
 //---
 ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags) :
    _id(0), // will be set by add function.
-    _av(av),
    _flags(flags),
-    _ctx(ctx)
+    _ctx(ctx),
+    _criticalData(av)
 {
    tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
 };
@@ -246,7 +246,7 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
 {
    if (! assertQueueEmpty) {
        tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
-        _av.wait();
+        crit->_av.wait();
    }

    if (crit->_last_copy_signal) {
@@ -273,6 +273,18 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty)

 };

+
+// Create a marker in this stream.
+// Save state in the event so it can track the status of the event.
+void ihipStream_t::locked_recordEvent(hipEvent_t event)
+{
+    // Lock the stream to prevent simultaneous access
+    LockedAccessor_StreamCrit_t crit(_criticalData);
+
+    event->_marker = crit->_av.create_marker();
+    event->_copySeqId = lastCopySeqId(crit);
+}
+
 //=============================================================================


@@ -388,11 +400,10 @@ int HIP_NUM_KERNELS_INFLIGHT = 128;
 //into the stream to mimic CUDA stream semantics.  (some hardware uses separate
 //queues for data commands and kernel commands, and no implicit ordering is provided).
 //
-bool ihipStream_t::lockopen_preKernelCommand()
+LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
 {
    LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);

-    bool addedSync = false;

    if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
        this->wait(crit);
@@ -402,9 +413,8 @@ bool ihipStream_t::lockopen_preKernelCommand()
    // If switching command types, we need to add a barrier packet to synchronize things.
    if (crit->_last_command_type != ihipCommandKernel) {
        if (crit->_last_copy_signal) {
-            addedSync = true;

-            hsa_queue_t * q =  (hsa_queue_t*)_av.get_hsa_queue();
+            hsa_queue_t * q =  (hsa_queue_t*) (crit->_av.get_hsa_queue());
            if (HIP_DISABLE_HW_KERNEL_DEP == 0) {
                this->enqueueBarrier(q, crit->_last_copy_signal, NULL);
                tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n",
@@ -422,7 +432,7 @@ bool ihipStream_t::lockopen_preKernelCommand()
        crit->_last_command_type = ihipCommandKernel;
    }

-    return addedSync;
+    return crit;
 }


@@ -433,6 +443,11 @@ void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFutu
    // We locked _criticalData in the lockopen_preKernelCommand() so OK to access here:
    _criticalData._last_kernel_future = kernelFuture;

+    if (HIP_LAUNCH_BLOCKING) {
+        kernelFuture.wait();
+        tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str());
+    }
+
    _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
 };

@@ -457,7 +472,7 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
            needSync = 1;
            ihipSignal_t *depSignal = allocSignal(crit);
            hsa_signal_store_relaxed(depSignal->_hsaSignal,1);
-            this->enqueueBarrier(static_cast<hsa_queue_t*>(_av.get_hsa_queue()), NULL, depSignal);
+            this->enqueueBarrier(static_cast<hsa_queue_t*>(crit->_av.get_hsa_queue()), NULL, depSignal);
            *waitSignal = depSignal->_hsaSignal;
        } else if (crit->_last_copy_signal) {
            needSync = 1;
@@ -500,11 +515,17 @@ void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
                        uint64_t kernel){
    hsa_status_t status;
    void *kern;
-    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
+
+    // Lock stream to prevent other threads from enqueueing kernels at same time.
+    LockedAccessor_StreamCrit_t crit (_criticalData);
+
+    hc::accelerator_view av = crit->_av;
+
+    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(av.get_hsa_kernarg_region());
    status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
-    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
+    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)av.get_hsa_agent(), 0, kern);
    memcpy(kern, kernarg, kernSize);
-    hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
+    hsa_queue_t *Queue = (hsa_queue_t*)av.get_hsa_queue();
    const uint32_t queue_mask = Queue->size-1;
    uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
    hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
@@ -1117,8 +1138,9 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c
        while (std::getline(ss, device_id, ',')) {
            if (atoi(device_id.c_str()) >= 0) {
                g_hip_visible_devices.push_back(atoi(device_id.c_str()));
-            }else// Any device number after invalid number will not present
+            } else { // Any device number after invalid number will not present
                break;
+            }
        }
        // Print out the number of ids
        if (HIP_PRINT_ENV) {
@@ -1327,7 +1349,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
 {
    HIP_INIT();
    stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
    lp->grid_dim.x = grid.x;
    lp->grid_dim.y = grid.y;
    lp->grid_dim.z = grid.z;
@@ -1336,27 +1357,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
    lp->group_dim.z = block.z;
    lp->barrier_bit = barrier_bit_queue_default;
    lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid.x;
-    lp->gridDim.y = grid.y;
-    lp->gridDim.z = grid.z;
-    lp->groupDim.x = block.x;
-    lp->groupDim.y = block.y;
-    lp->groupDim.z = block.z;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
    lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
 }
+
+
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
    stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
    lp->grid_dim.x = grid;
    lp->grid_dim.y = 1;
    lp->grid_dim.z = 1;
@@ -1365,28 +1377,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri
    lp->group_dim.z = block.z;
    lp->barrier_bit = barrier_bit_queue_default;
    lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid;
-    lp->gridDim.y = 1;
-    lp->gridDim.z = 1;
-    lp->groupDim.x = block.x;
-    lp->groupDim.y = block.y;
-    lp->groupDim.z = block.z;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
    lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
 }

+
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
    stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
    lp->grid_dim.x = grid.x;
    lp->grid_dim.y = grid.y;
    lp->grid_dim.z = grid.z;
@@ -1395,28 +1397,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri
    lp->group_dim.z = 1;
    lp->barrier_bit = barrier_bit_queue_default;
    lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid.x;
-    lp->gridDim.y = grid.y;
-    lp->gridDim.z = grid.z;
-    lp->groupDim.x = block;
-    lp->groupDim.y = 1;
-    lp->groupDim.z = 1;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
    lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
    return (stream);
 }

+
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
    stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
    lp->grid_dim.x = grid;
    lp->grid_dim.y = 1;
    lp->grid_dim.z = 1;
@@ -1425,37 +1417,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
    lp->group_dim.z = 1;
    lp->barrier_bit = barrier_bit_queue_default;
    lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid;
-    lp->gridDim.y = 1;
-    lp->gridDim.z = 1;
-    lp->groupDim.x = block;
-    lp->groupDim.y = 1;
-    lp->groupDim.z = 1;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
-    lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
+    lp->cf = new hc::completion_future; // TODO, is this necessary?
    return (stream);
 }


 //---
 //Called after kernel finishes execution.
+//This releases the lock on the stream.
 void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp)
 {
-//    stream->lockclose_postKernelCommand(cf);
-    stream->lockclose_postKernelCommand(*lp.cf);
-    if (HIP_LAUNCH_BLOCKING) {
-        tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream);
-    }
+    stream->lockclose_postKernelCommand(*(lp.cf));
 }


-//
 //=================================================================================================
 // HIP API Implementation
 //
@@ -1629,7 +1607,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
                // TODO - remove, slow path.
                tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
 #if USE_AV_COPY
-                _av.copy(src,dst,sizeBytes);
+                crit->_av.copy(src,dst,sizeBytes);
 #else
                hc::am_copy(dst, src, sizeBytes);
 #endif
@@ -1677,7 +1655,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
            // TODO - remove, slow path.
                tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
 #if USE_AV_COPY
-                _av.copy(src, dst, sizeBytes);
+                crit->_av.copy(src, dst, sizeBytes);
 #else
                hc::am_copy(dst, src, sizeBytes);
 #endif
@@ -1879,7 +1857,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
        stream = device->_defaultStream;
    }

-    *av = &(stream->_av);
+    *av = stream->locked_getAv();

    hipError_t err = hipSuccess;
    return ihipLogStatus(err);
@@ -678,9 +678,12 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,



+// TODO - make member function of stream?
 template <typename T>
 hc::completion_future
-ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
+ihipMemsetKernel(hipStream_t stream, 
+    LockedAccessor_StreamCrit_t &crit,
+    T * ptr, T val, size_t sizeBytes)
 {
    int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
    const int threads_per_wg = 256;
@@ -696,7 +699,7 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)

    hc::completion_future cf =
    hc::parallel_for_each(
-            stream->_av,
+            crit->_av,
            ext_tile,
            [=] (hc::tiled_index<1> idx)
            __attribute__((hc))
@@ -713,41 +716,6 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
    return cf;
 }

-template <typename T>
-hc::completion_future
-ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
-{
-    int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
-    const int threads_per_wg = 256;
-
-    int threads = wg * threads_per_wg;
-    if (threads > sizeBytes) {
-        threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
-    }
-
-
-    hc::extent<1> ext(threads);
-    auto ext_tile = ext.tile(threads_per_wg);
-
-    hc::completion_future cf =
-    hc::parallel_for_each(
-            stream->_av,
-            ext_tile,
-            [=] (hc::tiled_index<1> idx)
-            __attribute__((hc))
-    {
-        int offset = amp_get_global_id(0);
-        // TODO-HCC - change to hc_get_local_size()
-        int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
-
-        for (int i=offset; i<sizeBytes; i+=stride) {
-            c[i] = a[i];
-        }
-    });
-
-    return cf;
-}
-


 // TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
@@ -762,7 +730,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
    stream =  ihipSyncAndResolveStream(stream);

    if (stream) {
-        stream->lockopen_preKernelCommand();
+        auto crit = stream->lockopen_preKernelCommand();

        hc::completion_future cf ;

@@ -771,7 +739,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
            try {
                value = value & 0xff;
                unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
+                cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
            }
            catch (std::exception &ex) {
                e = hipErrorInvalidValue;
@@ -779,7 +747,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
        } else {
            // use a slow byte-per-workitem copy:
            try {
-                cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
+                cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
            }
            catch (std::exception &ex) {
                e = hipErrorInvalidValue;
@@ -814,7 +782,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
    stream =  ihipSyncAndResolveStream(stream);

    if (stream) {
-        stream->lockopen_preKernelCommand();
+        auto crit = stream->lockopen_preKernelCommand();

        hc::completion_future cf ;

@@ -823,7 +791,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
            try {
                value = value & 0xff;
                unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
+                cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
            }
            catch (std::exception &ex) {
                e = hipErrorInvalidValue;
@@ -831,7 +799,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
        } else {
            // use a slow byte-per-workitem copy:
            try {
-                cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
+                cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
            }
            catch (std::exception &ex) {
                e = hipErrorInvalidValue;
@@ -84,6 +84,10 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int

    hipError_t e = hipSuccess;

+#ifdef USE_AV_COPY
+     printf ("USE_AV_COPY\n");
+#endif
+
    {
        // TODO-hcc Convert to use create_blocking_marker(...) functionality.
        // Currently we have a super-conservative version of this - block on host, and drain the queue.