diff --git a/projects/hip/docs/markdown/hip_kernel_language.md b/projects/hip/docs/markdown/hip_kernel_language.md
index a4f4d5d47f..094d7531e8 100644
--- a/projects/hip/docs/markdown/hip_kernel_language.md
+++ b/projects/hip/docs/markdown/hip_kernel_language.md
@@ -699,8 +699,18 @@ for (int i=0; i<16; i++) ...
 
 ## In-Line Assembly
 
-In-line assembly, including in-line PTX, in-line HSAIL and in-line GCN ISA, is not supported. Users who need these features should employ conditional compilation to provide different functionally equivalent implementations on each target platform.
+GCN ISA In-line assembly, is supported. For example:
 
+```
+asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
+```
+
+We insert the GCN isa into the kernel using `asm()` Assembler statement.
+`volatile` keyword is used so that the optimizers must not change the number of volatile operations or change their order of execution relative to other volatile operations.
+`v_mac_f32_e32` is the GCN instruction, for more information please refer - [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/)
+Index for the respective operand in the ordered fashion is provided by `%` followed by position in the list of operands
+`"v"` is the constraint code (for target-specific AMDGPU) for 32-bit VGPR register, for more info please refer - [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list)
+Output Constraints are specified by an `"="` prefix as shown above ("=v"). This indicate that assemby will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint).
 
 ## C++ Support
 The following C++ features are not supported:
diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h
index 1092e22c47..2386ea08cb 100644
--- a/projects/hip/include/hip/hcc_detail/hip_runtime.h
+++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h
@@ -99,7 +99,7 @@ extern int HIP_TRACE_API;
 // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
 #if __HIP_DEVICE_COMPILE__ == 1
    #undef assert
-   #define assert(COND) { if (COND) {} }
+   #define assert(COND) { if (!COND) {abort();} }
 #endif
 
 
diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime.h b/projects/hip/include/hip/nvcc_detail/hip_runtime.h
index 8c08f3d151..fca9ab6e39 100644
--- a/projects/hip/include/hip/nvcc_detail/hip_runtime.h
+++ b/projects/hip/include/hip/nvcc_detail/hip_runtime.h
@@ -111,6 +111,8 @@ kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\
 
 #ifdef __HIP_DEVICE_COMPILE__
 #define abort() {asm("trap;");}
+#undef assert
+#define assert(COND) { if (!COND) {abort();} }
 #endif
 
 #endif
diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h
index 502d205ac4..cd8b2c17a0 100644
--- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h
+++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h
@@ -105,7 +105,7 @@ typedef cudaIpcMemHandle_t hipIpcMemHandle_t;
 typedef cudaLimit hipLimit_t;
 typedef cudaFuncCache hipFuncCache_t;
 typedef CUcontext hipCtx_t;
-typedef CUsharedconfig hipSharedMemConfig;
+typedef cudaSharedMemConfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
 typedef CUjit_option hipJitOption;
 typedef CUdevice hipDevice_t;
@@ -124,6 +124,11 @@ typedef cudaArray hipArray;
 //typedef cudaChannelFormatDesc hipChannelFormatDesc;
 #define hipChannelFormatDesc cudaChannelFormatDesc
 
+//adding code for hipmemSharedConfig
+#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
+#define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
+#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte
+
 inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
 switch(cuError) {
     case cudaSuccess                             : return hipSuccess;
@@ -833,12 +838,12 @@ inline static hipError_t  hipCtxSetCacheConfig (hipFuncCache cacheConfig)
 
 inline static hipError_t  hipCtxSetSharedMemConfig (hipSharedMemConfig config)
 {
-    return hipCUResultTohipError(cuCtxSetSharedMemConfig(config));
+    return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config));
 }
 
 inline static hipError_t  hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
 {
-    return hipCUResultTohipError(cuCtxGetSharedMemConfig(pConfig));
+    return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig *)pConfig));
 }
 
 inline static hipError_t  hipCtxSynchronize ( void )
@@ -881,6 +886,16 @@ inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char *pciBusI
     return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId));
 }
 
+inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig *config)
+{
+    return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config));
+}
+ 
+inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config)
+{
+    return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config));
+}
+
 inline static hipError_t hipDeviceGetLimit(size_t *pValue, hipLimit_t limit)
 {
     return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit));
diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md
index 0e64fe9c6e..7d0301bc74 100644
--- a/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md
+++ b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md
@@ -27,10 +27,23 @@ We will be using the Simple Matrix Transpose application from the our very first
 
 ## asm() Assembler statement
 
-We insert the GCN isa into the kernel using asm() Assembler statement. In the same sourcecode, we used for MatrixTranspose. We'll add the following:
+In the same sourcecode, we used for MatrixTranspose. We'll add the following:
 
 `  asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));                    `
 
+GCN ISA In-line assembly, is supported. For example:
+
+```
+asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
+```
+
+We insert the GCN isa into the kernel using `asm()` Assembler statement.
+`volatile` keyword is used so that the optimizers must not change the number of volatile operations or change their order of execution relative to other volatile operations.
+`v_mac_f32_e32` is the GCN instruction, for more information please refer - [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/)
+Index for the respective operand in the ordered fashion is provided by `%` followed by position in the list of operands
+`"v"` is the constraint code (for target-specific AMDGPU) for 32-bit VGPR register, for more info please refer - [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list)
+Output Constraints are specified by an `"="` prefix as shown above ("=v"). This indicate that assemby will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint).
+
 ## How to build and run:
 Use the make command and execute it using ./exe
 Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp
index 5e297465ec..445e74cbeb 100644
--- a/projects/hip/src/hip_hcc.cpp
+++ b/projects/hip/src/hip_hcc.cpp
@@ -88,7 +88,11 @@ int HIP_HOST_COHERENT = 1;
 
 int HIP_SYNC_HOST_ALLOC = 1;
 
+
 int HIP_INIT_ALLOC=-1;
+int HIP_SYNC_STREAM_WAIT = 0;
+int HIP_FORCE_NULL_STREAM=0;
+
 
 
 #if (__hcc_workweek__ >= 17300)
@@ -1250,6 +1254,9 @@ void HipReadEnv()
     READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations.  May help stability");
     READ_ENV_I(release, HIP_INIT_ALLOC, 0, "If not -1, initialize allocated memory to specified byte");
     READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions");
+    READ_ENV_I(release, HIP_FORCE_NULL_STREAM,  0, "Force all stream allocations to secretly return the null stream");
+
+    READ_ENV_I(release, HIP_SYNC_STREAM_WAIT, 0, "hipStreamWaitEvent will synchronize to host");
 
 
     READ_ENV_I(release, HIP_HOST_COHERENT, 0, "If set, all host memory will be allocated as fine-grained system memory.  This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact.");
diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h
index 159a447081..4b7e533a4c 100644
--- a/projects/hip/src/hip_hcc_internal.h
+++ b/projects/hip/src/hip_hcc_internal.h
@@ -65,9 +65,12 @@ extern int HIP_HIDDEN_FREE_MEM;
 //---
 // Chicken bits for disabling functionality to work around potential issues:
 extern int HIP_SYNC_HOST_ALLOC;
+extern int HIP_SYNC_STREAM_WAIT;
 
 extern int HIP_SYNC_NULL_STREAM;
 extern int HIP_INIT_ALLOC;
+extern int HIP_FORCE_NULL_STREAM;
+
 
 // TODO - remove when this is standard behavior.
 extern int HCC_OPT_FLUSH;
diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp
index 51aeb01412..7dd6efd39c 100644
--- a/projects/hip/src/hip_stream.cpp
+++ b/projects/hip/src/hip_stream.cpp
@@ -38,21 +38,26 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
     hipError_t e = hipSuccess;
 
     if (ctx) {
-        hc::accelerator acc = ctx->getWriteableDevice()->_acc;
 
-        // TODO - se try-catch loop to detect memory exception?
-        //
-        //Note this is an execute_in_order queue, so all kernels submitted will atuomatically wait for prev to complete:
-        //This matches CUDA stream behavior:
+        if (HIP_FORCE_NULL_STREAM) {
+            *stream = 0; 
+        } else {
+            hc::accelerator acc = ctx->getWriteableDevice()->_acc;
 
-        {
-            // Obtain mutex access to the device critical data, release by destructor
-            LockedAccessor_CtxCrit_t  ctxCrit(ctx->criticalData());
+            // TODO - se try-catch loop to detect memory exception?
+            //
+            //Note this is an execute_in_order queue, so all kernels submitted will atuomatically wait for prev to complete:
+            //This matches CUDA stream behavior:
 
-            auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
+            {
+                // Obtain mutex access to the device critical data, release by destructor
+                LockedAccessor_CtxCrit_t  ctxCrit(ctx->criticalData());
 
-            ctxCrit->addStream(istream);
-            *stream = istream;
+                auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
+
+                ctxCrit->addStream(istream);
+                *stream = istream;
+            }
         }
 
         tprintf(DB_SYNC, "hipStreamCreate, %s\n", ToString(*stream).c_str());
@@ -93,18 +98,15 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
 
     } else if (event->_state != hipEventStatusUnitialized) {
 
-        if (stream != hipStreamNull) {
-
+        if (HIP_SYNC_STREAM_WAIT || (HIP_SYNC_NULL_STREAM && (stream == 0))) {
+            // conservative wait on host for the specified event to complete:
+            event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive);
+        } else {
+            stream = ihipSyncAndResolveStream(stream);
             // This will user create_blocking_marker to wait on the specified queue.
             stream->locked_streamWaitEvent(event);
-
-        } else {
-            // TODO-hcc Convert to use create_blocking_marker(...) functionality.
-            // Currently we have a super-conservative version of this - block on host, and drain the queue.
-            // This should create a barrier packet in the target queue.
-            // TODO-HIP_SYNC_NULL_STREAM
-            stream->locked_wait();
         }
+
     } // else event not recorded, return immediately and don't create marker.
 
     return ihipLogStatus(e);
@@ -122,15 +124,14 @@ hipError_t hipStreamQuery(hipStream_t stream)
         stream =  device->_defaultStream;
     }
 
-    int pendingOps = 0;
+    bool isEmpty = 0;
 
     {
         LockedAccessor_StreamCrit_t crit(stream->_criticalData);
-        pendingOps = crit->_av.get_pending_async_ops();
+        isEmpty = crit->_av.get_is_empty();
     }
 
-
-    hipError_t e = (pendingOps > 0) ? hipErrorNotReady : hipSuccess;
+    hipError_t e = isEmpty ? hipSuccess : hipErrorNotReady ;
 
     return ihipLogStatus(e);
 }
@@ -170,7 +171,9 @@ hipError_t hipStreamDestroy(hipStream_t stream)
 
     //--- Drain the stream:
     if (stream == NULL) {
-        e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream
+        if (!HIP_FORCE_NULL_STREAM) {
+            e = hipErrorInvalidResourceHandle; 
+        } 
     } else {
         stream->locked_wait();
 
diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
index 9bbd43828c..f5b1b79550 100644
--- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
+++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
@@ -446,9 +446,41 @@ int main(int argc, char *argv[])
 
 
     if (p_tests & 0x1000) {
-        printf ("==> Test 0x1000 try null stream\n"); 
-        hipStreamQuery(0/* try null stream*/);
+        printf ("==> Test 0x1000 simple null stream tests\n"); 
 
+        // try some null stream:
+        hipStreamQuery(0);
+
+
+        hipStream_t s1;
+        hipEvent_t e1;
+
+        {
+            // stream null waits on event in s1 stream:
+            HIPCHECK(hipStreamCreate(&s1));
+            HIPCHECK(hipEventCreate(&e1));
+
+            HIPCHECK(hipEventRecord(e1, s1))
+
+            HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/));
+            
+            HIPCHECK(hipStreamDestroy(s1));
+            HIPCHECK(hipEventDestroy(e1));
+        }
+
+        {
+            // stream s1 waits on event in null stream:
+            HIPCHECK(hipStreamCreate(&s1));
+            HIPCHECK(hipEventCreate(&e1));
+
+            HIPCHECK(hipEventRecord(e1, hipStream_t(0)))
+
+            HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/));
+            
+            HIPCHECK(hipStreamDestroy(s1));
+            HIPCHECK(hipEventDestroy(e1));
+        }
+        
     }
 
 
@@ -471,8 +503,8 @@ int main(int argc, char *argv[])
     }
 
 
-    {
-        printf ("test: alternating memcpy/count-reverse followed by event\n");
+    if (p_tests & 0x4000 ) {
+        printf ("test: %x alternating memcpy/count-reverse followed by event\n", p_tests);
         RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0),  true);
         RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false),  true);
     }