From bb0afa4e38c28deecbe57f69d7ef84e347c6b8ac Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Tue, 30 Aug 2016 17:29:50 -0500
Subject: [PATCH] Refactor for stream->_av.

- move _av into stream critical section.  ( HCC accelerator_view is not
  thread-safe but HIP steram is. )
- Refactored many places in code that need to acquire critical section.
some were previously thread races, ie enqueueing marker.

-remove support for GRID_LAUNCH_VERSION < 20
-Enable USE_AV_COPY based on HCC work-week.
- Review hipModule docs, some calrity/editing.

Change-Id: I3ce7c25ece048c3504f55ecd4683e506bb1fc8b6


[ROCm/hip commit: e76a272d48138acdea06bb98089b57e4df214787]
---
 projects/hip/include/hcc_detail/hip_hcc.h     |  57 ++++---
 projects/hip/include/hcc_detail/hip_runtime.h |  19 +--
 .../hip/include/hcc_detail/hip_runtime_api.h  |  16 +-
 projects/hip/src/hip_event.cpp                |  19 ++-
 projects/hip/src/hip_hcc.cpp                  | 142 ++++++++----------
 projects/hip/src/hip_memory.cpp               |  54 ++-----
 projects/hip/src/hip_stream.cpp               |   4 +
 7 files changed, 128 insertions(+), 183 deletions(-)
diff --git a/projects/hip/include/hcc_detail/hip_hcc.h b/projects/hip/include/hcc_detail/hip_hcc.h
index 01c36afde4..fb10af1106 100644
--- a/projects/hip/include/hcc_detail/hip_hcc.h
+++ b/projects/hip/include/hcc_detail/hip_hcc.h
@@ -32,7 +32,7 @@ THE SOFTWARE.
 // #define USE_MEMCPYTOSYMBOL
 //
 //Use the new HCC accelerator_view::copy instead of am_copy
-#define USE_AV_COPY 0
+#define USE_AV_COPY (__hcc_workweek__ >= 16351) 
 
 // Compile peer-to-peer support.
 // >= 2 : use HCC hc:accelerator::get_is_peer
@@ -353,18 +353,36 @@ struct LockedBase {
 };
 
 
+class ihipModule_t{
+public:
+  hsa_executable_t executable;
+  hsa_code_object_t object;
+  std::string fileName;
+  void *ptr;
+  size_t size;
+};
+
+
+class ihipFunction_t{
+public:
+  hsa_executable_symbol_t kernel_symbol;
+  uint64_t kernel;
+};
+
+
 template <typename MUTEX_TYPE>
 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
 {
 public:
-    ihipStreamCriticalBase_t() :
+    ihipStreamCriticalBase_t(hc::accelerator_view av) :
         _last_command_type(ihipCommandCopyH2H),
         _last_copy_signal(NULL),
         _signalCursor(0),
         _oldest_live_sig_id(1),
         _streamSigId(0),
         _kernelCnt(0),
-        _signalCnt(0)
+        _signalCnt(0),
+        _av(av)
     {
         _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
     };
@@ -395,27 +413,14 @@ public:
                                                //   2 are required if a barrier packet is inserted.
     uint32_t                    _kernelCnt;    // Count of inflight kernels in this stream.  Reset at ::wait().
     SIGSEQNUM                   _streamSigId;      // Monotonically increasing unique signal id.
+
+    hc::accelerator_view        _av;
 };
 
 
 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
 
-class ihipModule_t{
-public:
-  hsa_executable_t executable;
-  hsa_code_object_t object;
-  std::string fileName;
-  void *ptr;
-  size_t size;
-};
-
-
-class ihipFunction_t{
-public:
-  hsa_executable_symbol_t kernel_symbol;
-  uint64_t kernel;
-};
 
 // Internal stream structure.
 class ihipStream_t {
@@ -431,16 +436,23 @@ typedef uint64_t SeqNum_t ;
 
     void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
 
+    int                  preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
+
     //---
-    // Thread-safe accessors - these acquire / release mutex:
-    bool                 lockopen_preKernelCommand();
+    // Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex.
+    LockedAccessor_StreamCrit_t  lockopen_preKernelCommand();
     void                 lockclose_postKernelCommand(hc::completion_future &kernel_future);
 
-    int                  preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
 
     void                 locked_reclaimSignals(SIGSEQNUM sigNum);
     void                 locked_wait(bool assertQueueEmpty=false);
-    SIGSEQNUM            locked_lastCopySeqId() {LockedAccessor_StreamCrit_t crit(_criticalData); return lastCopySeqId(crit); };
+
+    hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
+
+    void                 locked_waitEvent(hipEvent_t event);
+    void                 locked_recordEvent(hipEvent_t event);
+
+    //---
 
     // Use this if we already have the stream critical data mutex:
     void                 wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
@@ -462,7 +474,6 @@ public:
     //---
     //Public member vars - these are set at initialization and never change:
     SeqNum_t                    _id;   // monotonic sequence ID
-    hc::accelerator_view        _av;
     unsigned                    _flags;
 
 
diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h
index 727604d8d8..108f73a7eb 100644
--- a/projects/hip/include/hcc_detail/hip_runtime.h
+++ b/projects/hip/include/hcc_detail/hip_runtime.h
@@ -50,9 +50,8 @@ THE SOFTWARE.
 
 #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
 // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
-#define USE_GRID_LAUNCH_20 1
 #else
-#define USE_GRID_LAUNCH_20 0
+#error (HCC must support GRID_LAUNCH_20)
 #endif
 
 #define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*) 0x01)
@@ -633,7 +632,6 @@ extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
 #define KNRM  "\x1B[0m"
 #define KGRN  "\x1B[32m"
 
-#if USE_GRID_LAUNCH_20
 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
 do {\
   grid_launch_parm lp;\
@@ -645,21 +643,6 @@ do {\
   _kernelName (lp, ##__VA_ARGS__);\
   ihipPostLaunchKernel(trueStream, lp);\
 } while(0)
-#else
-#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
-do {\
-  grid_launch_parm lp;\
-  lp.groupMemBytes = _groupMemBytes; \
-  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
-    if (HIP_TRACE_API) {\
-        fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
-                #_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
-    }\
-  _kernelName (lp, ##__VA_ARGS__);\
-  ihipPostLaunchKernel(trueStream, lp);\
-} while(0)
-
-#endif
 
 
 #elif defined (__HCC_C__)
diff --git a/projects/hip/include/hcc_detail/hip_runtime_api.h b/projects/hip/include/hcc_detail/hip_runtime_api.h
index 213928151a..fc390bbd23 100644
--- a/projects/hip/include/hcc_detail/hip_runtime_api.h
+++ b/projects/hip/include/hcc_detail/hip_runtime_api.h
@@ -1307,7 +1307,7 @@ hipError_t hipDriverGetVersion(int *driverVersion) ;
 hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 
 /**
- * @brief Freeing the module
+ * @brief Frees the module
  *
  * @param [in] module
  *
@@ -1319,7 +1319,7 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
 hipError_t hipModuleUnload(hipModule_t module);
 
 /**
- * @brief Function with kname will be extracted present in module
+ * @brief Function with kname will be extracted if present in module
  *
  * @param [in] module
  * @param [in] kname
@@ -1330,19 +1330,20 @@ hipError_t hipModuleUnload(hipModule_t module);
 hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
 
 /**
- * @brief returns device memory pointer and size of the kernel present in the module with symbol - name
+ * @brief returns device memory pointer and size of the kernel present in the module with symbol @p name
  *
- * @param [in] moodule
- * @param [in] name
  * @param [out] dptr
  * @param [out[ bytes
+ * @param [in] hmod
+ * @param [in] name
  *
  * @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
  */
 hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
 
+
 /**
- * @brief builds module from code object which resides in host memory. And image is pointer to that location.
+ * @brief builds module from code object which resides in host memory. Image is pointer to that location.
  *
  * @param [in] image
  * @param [out] module
@@ -1351,8 +1352,9 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h
  */
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 
+
 /**
- * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
  *
  * @param [in[ f
  * @param [in] gridDimX
diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp
index 4324583690..33ec0bc897 100644
--- a/projects/hip/src/hip_event.cpp
+++ b/projects/hip/src/hip_event.cpp
@@ -72,9 +72,8 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
 {
     HIP_INIT_API(event, stream);
 
-    ihipEvent_t *eh = event;
-    if (eh && eh->_state != hipEventStatusUnitialized)   {
-        eh->_stream = stream;
+    if (event && event->_state != hipEventStatusUnitialized)   {
+        event->_stream = stream;
 
         if (stream == NULL) {
             // If stream == NULL, wait on all queues.
@@ -83,16 +82,16 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
             ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
             ctx->locked_syncDefaultStream(true);
 
-            eh->_timestamp = hc::get_system_ticks();
-            eh->_state = hipEventStatusRecorded;
+            event->_timestamp = hc::get_system_ticks();
+            event->_state = hipEventStatusRecorded;
             return ihipLogStatus(hipSuccess);
         } else {
-            eh->_state  = hipEventStatusRecording;
+            event->_state  = hipEventStatusRecording;
             // Clear timestamps
-            eh->_timestamp = 0;
-            eh->_marker = stream->_av.create_marker();
-            
-            eh->_copySeqId = stream->locked_lastCopySeqId();
+            event->_timestamp = 0;
+
+            // Record the event in the stream:
+            stream->locked_recordEvent(event);
 
             return ihipLogStatus(hipSuccess);
         }
diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp
index ea8604b7c4..ca650e5283 100644
--- a/projects/hip/src/hip_hcc.cpp
+++ b/projects/hip/src/hip_hcc.cpp
@@ -195,9 +195,9 @@ ihipSignal_t::~ihipSignal_t()
 //---
 ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags) :
     _id(0), // will be set by add function.
-    _av(av),
     _flags(flags),
-    _ctx(ctx)
+    _ctx(ctx),
+    _criticalData(av)
 {
     tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
 };
@@ -246,7 +246,7 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
 {
     if (! assertQueueEmpty) {
         tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
-        _av.wait();
+        crit->_av.wait();
     }
 
     if (crit->_last_copy_signal) {
@@ -273,6 +273,18 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty)
 
 };
 
+
+// Create a marker in this stream.
+// Save state in the event so it can track the status of the event.
+void ihipStream_t::locked_recordEvent(hipEvent_t event)
+{
+    // Lock the stream to prevent simultaneous access
+    LockedAccessor_StreamCrit_t crit(_criticalData);
+
+    event->_marker = crit->_av.create_marker();
+    event->_copySeqId = lastCopySeqId(crit);
+}
+
 //=============================================================================
 
 
@@ -388,11 +400,10 @@ int HIP_NUM_KERNELS_INFLIGHT = 128;
 //into the stream to mimic CUDA stream semantics.  (some hardware uses separate
 //queues for data commands and kernel commands, and no implicit ordering is provided).
 //
-bool ihipStream_t::lockopen_preKernelCommand()
+LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
 {
     LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);
 
-    bool addedSync = false;
 
     if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
         this->wait(crit);
@@ -402,9 +413,8 @@ bool ihipStream_t::lockopen_preKernelCommand()
     // If switching command types, we need to add a barrier packet to synchronize things.
     if (crit->_last_command_type != ihipCommandKernel) {
         if (crit->_last_copy_signal) {
-            addedSync = true;
 
-            hsa_queue_t * q =  (hsa_queue_t*)_av.get_hsa_queue();
+            hsa_queue_t * q =  (hsa_queue_t*) (crit->_av.get_hsa_queue());
             if (HIP_DISABLE_HW_KERNEL_DEP == 0) {
                 this->enqueueBarrier(q, crit->_last_copy_signal, NULL);
                 tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n",
@@ -422,7 +432,7 @@ bool ihipStream_t::lockopen_preKernelCommand()
         crit->_last_command_type = ihipCommandKernel;
     }
 
-    return addedSync;
+    return crit;
 }
 
 
@@ -433,6 +443,11 @@ void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFutu
     // We locked _criticalData in the lockopen_preKernelCommand() so OK to access here:
     _criticalData._last_kernel_future = kernelFuture;
 
+    if (HIP_LAUNCH_BLOCKING) {
+        kernelFuture.wait();
+        tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str());
+    }
+
     _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
 };
 
@@ -457,7 +472,7 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
             needSync = 1;
             ihipSignal_t *depSignal = allocSignal(crit);
             hsa_signal_store_relaxed(depSignal->_hsaSignal,1);
-            this->enqueueBarrier(static_cast<hsa_queue_t*>(_av.get_hsa_queue()), NULL, depSignal);
+            this->enqueueBarrier(static_cast<hsa_queue_t*>(crit->_av.get_hsa_queue()), NULL, depSignal);
             *waitSignal = depSignal->_hsaSignal;
         } else if (crit->_last_copy_signal) {
             needSync = 1;
@@ -500,11 +515,17 @@ void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
                         uint64_t kernel){
     hsa_status_t status;
     void *kern;
-    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
+
+    // Lock stream to prevent other threads from enqueueing kernels at same time.
+    LockedAccessor_StreamCrit_t crit (_criticalData);
+
+    hc::accelerator_view av = crit->_av;
+
+    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(av.get_hsa_kernarg_region());
     status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
-    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
+    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)av.get_hsa_agent(), 0, kern);
     memcpy(kern, kernarg, kernSize);
-    hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
+    hsa_queue_t *Queue = (hsa_queue_t*)av.get_hsa_queue();
     const uint32_t queue_mask = Queue->size-1;
     uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
     hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
@@ -1117,8 +1138,9 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c
         while (std::getline(ss, device_id, ',')) {
             if (atoi(device_id.c_str()) >= 0) {
                 g_hip_visible_devices.push_back(atoi(device_id.c_str()));
-            }else// Any device number after invalid number will not present
+            } else { // Any device number after invalid number will not present
                 break;
+            }
         }
         // Print out the number of ids
         if (HIP_PRINT_ENV) {
@@ -1327,7 +1349,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
 {
     HIP_INIT();
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid.x;
     lp->grid_dim.y = grid.y;
     lp->grid_dim.z = grid.z;
@@ -1336,27 +1357,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
     lp->group_dim.z = block.z;
     lp->barrier_bit = barrier_bit_queue_default;
     lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid.x;
-    lp->gridDim.y = grid.y;
-    lp->gridDim.z = grid.z;
-    lp->groupDim.x = block.x;
-    lp->groupDim.y = block.y;
-    lp->groupDim.z = block.z;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
     lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
     return (stream);
 }
+
+
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid;
     lp->grid_dim.y = 1;
     lp->grid_dim.z = 1;
@@ -1365,28 +1377,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri
     lp->group_dim.z = block.z;
     lp->barrier_bit = barrier_bit_queue_default;
     lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid;
-    lp->gridDim.y = 1;
-    lp->gridDim.z = 1;
-    lp->groupDim.x = block.x;
-    lp->groupDim.y = block.y;
-    lp->groupDim.z = block.z;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
     lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
     return (stream);
 }
 
+
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid.x;
     lp->grid_dim.y = grid.y;
     lp->grid_dim.z = grid.z;
@@ -1395,28 +1397,18 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri
     lp->group_dim.z = 1;
     lp->barrier_bit = barrier_bit_queue_default;
     lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid.x;
-    lp->gridDim.y = grid.y;
-    lp->gridDim.z = grid.z;
-    lp->groupDim.x = block;
-    lp->groupDim.y = 1;
-    lp->groupDim.z = 1;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
     lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
     return (stream);
 }
 
+
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
     stream = ihipSyncAndResolveStream(stream);
-#if USE_GRID_LAUNCH_20
     lp->grid_dim.x = grid;
     lp->grid_dim.y = 1;
     lp->grid_dim.z = 1;
@@ -1425,37 +1417,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
     lp->group_dim.z = 1;
     lp->barrier_bit = barrier_bit_queue_default;
     lp->launch_fence = -1;
-#else
-    lp->gridDim.x = grid;
-    lp->gridDim.y = 1;
-    lp->gridDim.z = 1;
-    lp->groupDim.x = block;
-    lp->groupDim.y = 1;
-    lp->groupDim.z = 1;
-#endif
-    stream->lockopen_preKernelCommand();
-//    *av = &stream->_av;
-    lp->av = &stream->_av;
-    lp->cf = new hc::completion_future;
-//    lp->av = static_cast<void*>(av);
-//    lp->cf = static_cast<void*>(malloc(sizeof(hc::completion_future)));
+
+    auto crit = stream->lockopen_preKernelCommand();
+    lp->av = &(crit->_av);
+    lp->cf = new hc::completion_future; // TODO, is this necessary?
     return (stream);
 }
 
 
 //---
 //Called after kernel finishes execution.
+//This releases the lock on the stream.
 void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp)
 {
-//    stream->lockclose_postKernelCommand(cf);
-    stream->lockclose_postKernelCommand(*lp.cf);
-    if (HIP_LAUNCH_BLOCKING) {
-        tprintf(DB_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream);
-    }
+    stream->lockclose_postKernelCommand(*(lp.cf));
 }
 
 
-//
 //=================================================================================================
 // HIP API Implementation
 //
@@ -1629,7 +1607,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
                 // TODO - remove, slow path.
                 tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
 #if USE_AV_COPY
-                _av.copy(src,dst,sizeBytes);
+                crit->_av.copy(src,dst,sizeBytes);
 #else
                 hc::am_copy(dst, src, sizeBytes);
 #endif
@@ -1677,7 +1655,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
             // TODO - remove, slow path.
                 tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
 #if USE_AV_COPY
-                _av.copy(src, dst, sizeBytes);
+                crit->_av.copy(src, dst, sizeBytes);
 #else
                 hc::am_copy(dst, src, sizeBytes);
 #endif
@@ -1879,7 +1857,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
         stream = device->_defaultStream;
     }
 
-    *av = &(stream->_av);
+    *av = stream->locked_getAv();
 
     hipError_t err = hipSuccess;
     return ihipLogStatus(err);
diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp
index df1b70a3f8..beb84bdb6f 100644
--- a/projects/hip/src/hip_memory.cpp
+++ b/projects/hip/src/hip_memory.cpp
@@ -678,9 +678,12 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
 
 
 
+// TODO - make member function of stream?
 template <typename T>
 hc::completion_future
-ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
+ihipMemsetKernel(hipStream_t stream, 
+    LockedAccessor_StreamCrit_t &crit,
+    T * ptr, T val, size_t sizeBytes)
 {
     int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
     const int threads_per_wg = 256;
@@ -696,7 +699,7 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
 
     hc::completion_future cf =
     hc::parallel_for_each(
-            stream->_av,
+            crit->_av,
             ext_tile,
             [=] (hc::tiled_index<1> idx)
             __attribute__((hc))
@@ -713,41 +716,6 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
     return cf;
 }
 
-template <typename T>
-hc::completion_future
-ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
-{
-    int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
-    const int threads_per_wg = 256;
-
-    int threads = wg * threads_per_wg;
-    if (threads > sizeBytes) {
-        threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
-    }
-
-
-    hc::extent<1> ext(threads);
-    auto ext_tile = ext.tile(threads_per_wg);
-
-    hc::completion_future cf =
-    hc::parallel_for_each(
-            stream->_av,
-            ext_tile,
-            [=] (hc::tiled_index<1> idx)
-            __attribute__((hc))
-    {
-        int offset = amp_get_global_id(0);
-        // TODO-HCC - change to hc_get_local_size()
-        int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
-
-        for (int i=offset; i<sizeBytes; i+=stride) {
-            c[i] = a[i];
-        }
-    });
-
-    return cf;
-}
-
 
 
 // TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
@@ -762,7 +730,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
     stream =  ihipSyncAndResolveStream(stream);
 
     if (stream) {
-        stream->lockopen_preKernelCommand();
+        auto crit = stream->lockopen_preKernelCommand();
 
         hc::completion_future cf ;
 
@@ -771,7 +739,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
             try {
                 value = value & 0xff;
                 unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
+                cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
@@ -779,7 +747,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
         } else {
             // use a slow byte-per-workitem copy:
             try {
-                cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
+                cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
@@ -814,7 +782,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
     stream =  ihipSyncAndResolveStream(stream);
 
     if (stream) {
-        stream->lockopen_preKernelCommand();
+        auto crit = stream->lockopen_preKernelCommand();
 
         hc::completion_future cf ;
 
@@ -823,7 +791,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
             try {
                 value = value & 0xff;
                 unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-                cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
+                cf = ihipMemsetKernel<unsigned> (stream, crit, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
@@ -831,7 +799,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
         } else {
             // use a slow byte-per-workitem copy:
             try {
-                cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
+                cf = ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes);
             }
             catch (std::exception &ex) {
                 e = hipErrorInvalidValue;
diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp
index a204e2f79a..5d900fcaeb 100644
--- a/projects/hip/src/hip_stream.cpp
+++ b/projects/hip/src/hip_stream.cpp
@@ -84,6 +84,10 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
 
     hipError_t e = hipSuccess;
 
+#ifdef USE_AV_COPY
+     printf ("USE_AV_COPY\n");
+#endif
+
     {
         // TODO-hcc Convert to use create_blocking_marker(...) functionality.
         // Currently we have a super-conservative version of this - block on host, and drain the queue.