diff --git a/projects/hip/docs/markdown/hip_kernel_language.md b/projects/hip/docs/markdown/hip_kernel_language.md index a4f4d5d47f..094d7531e8 100644 --- a/projects/hip/docs/markdown/hip_kernel_language.md +++ b/projects/hip/docs/markdown/hip_kernel_language.md @@ -699,8 +699,18 @@ for (int i=0; i<16; i++) ... ## In-Line Assembly -In-line assembly, including in-line PTX, in-line HSAIL and in-line GCN ISA, is not supported. Users who need these features should employ conditional compilation to provide different functionally equivalent implementations on each target platform. +GCN ISA in-line assembly is supported. For example: +``` +asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); +``` + +We insert the GCN ISA into the kernel using the `asm()` assembler statement. +The `volatile` keyword is used so that the optimizers do not change the number of volatile operations or change their order of execution relative to other volatile operations. +`v_mac_f32_e32` is the GCN instruction; for more information please refer to the [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/). +The index of each operand is given by `%` followed by its position in the ordered list of operands. +`"v"` is the constraint code (target-specific, for AMDGPU) for a 32-bit VGPR register; for more information please refer to the [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list). +Output constraints are specified by an `"="` prefix as shown above ("=v"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint). 
## C++ Support The following C++ features are not supported: diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index 1092e22c47..2386ea08cb 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -99,7 +99,7 @@ extern int HIP_TRACE_API; // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call. #if __HIP_DEVICE_COMPILE__ == 1 #undef assert - #define assert(COND) { if (COND) {} } + #define assert(COND) { if (!(COND)) {abort();} } /* Device-side assert: calls abort() when COND is false. COND is parenthesized so that compound arguments such as `a == b` are negated as a whole rather than expanding to `!a == b`. */ #endif diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime.h b/projects/hip/include/hip/nvcc_detail/hip_runtime.h index 8c08f3d151..fca9ab6e39 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime.h @@ -111,6 +111,8 @@ kernelName<<>>(__VA_ARGS__);\ #ifdef __HIP_DEVICE_COMPILE__ #define abort() {asm("trap;");} +#undef assert +#define assert(COND) { if (!(COND)) {abort();} } /* Device-side assert built on the abort() macro defined just above. COND is parenthesized so that compound arguments such as `a == b` are negated as a whole rather than expanding to `!a == b`. */ #endif #endif diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h index 502d205ac4..cd8b2c17a0 100644 --- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h @@ -105,7 +105,7 @@ typedef cudaIpcMemHandle_t hipIpcMemHandle_t; typedef cudaLimit hipLimit_t; typedef cudaFuncCache hipFuncCache_t; typedef CUcontext hipCtx_t; -typedef CUsharedconfig hipSharedMemConfig; +typedef cudaSharedMemConfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; typedef CUjit_option hipJitOption; typedef CUdevice hipDevice_t; @@ -124,6 +124,11 @@ typedef cudaArray hipArray; //typedef cudaChannelFormatDesc hipChannelFormatDesc; #define hipChannelFormatDesc cudaChannelFormatDesc +//adding code for hipmemSharedConfig +#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault +#define hipSharedMemBankSizeFourByte 
cudaSharedMemBankSizeFourByte +#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte + inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) { switch(cuError) { case cudaSuccess : return hipSuccess; @@ -833,12 +838,12 @@ inline static hipError_t hipCtxSetCacheConfig (hipFuncCache cacheConfig) inline static hipError_t hipCtxSetSharedMemConfig (hipSharedMemConfig config) { - return hipCUResultTohipError(cuCtxSetSharedMemConfig(config)); + return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config)); /* hipSharedMemConfig is now the runtime-API cudaSharedMemConfig type, so the value is cast to CUsharedconfig for the driver-API call */ } inline static hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { - return hipCUResultTohipError(cuCtxGetSharedMemConfig(pConfig)); + return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig *)pConfig)); /* NOTE(review): reinterprets the hipSharedMemConfig pointer as CUsharedconfig*; assumes both enums share size and enumerator values - confirm */ } inline static hipError_t hipCtxSynchronize ( void ) @@ -881,6 +886,16 @@ inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char *pciBusI return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId)); } +inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig *config) /* forwards to cudaDeviceGetSharedMemConfig and maps the CUDA error code to a HIP error code */ +{ + return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config)); +} + +inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) /* forwards to cudaDeviceSetSharedMemConfig and maps the CUDA error code to a HIP error code */ +{ + return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config)); +} + inline static hipError_t hipDeviceGetLimit(size_t *pValue, hipLimit_t limit) { return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit)); diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md index 0e64fe9c6e..7d0301bc74 100644 --- a/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md +++ b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md @@ -27,10 +27,23 @@ We will be using the Simple Matrix Transpose application from the our very first ## asm() Assembler statement -We insert the GCN isa into the kernel using asm() Assembler statement. 
In the same sourcecode, we used for MatrixTranspose. We'll add the following: +In the same source code that we used for MatrixTranspose, we'll add the following: ` asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); ` +GCN ISA in-line assembly is supported. For example: + +``` +asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); +``` + +We insert the GCN ISA into the kernel using the `asm()` assembler statement. +The `volatile` keyword is used so that the optimizers do not change the number of volatile operations or change their order of execution relative to other volatile operations. +`v_mac_f32_e32` is the GCN instruction; for more information please refer to the [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/). +The index of each operand is given by `%` followed by its position in the ordered list of operands. +`"v"` is the constraint code (target-specific, for AMDGPU) for a 32-bit VGPR register; for more information please refer to the [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list). +Output constraints are specified by an `"="` prefix as shown above ("=v"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint). + ## How to build and run: Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. 
diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 5e297465ec..445e74cbeb 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -88,7 +88,11 @@ int HIP_HOST_COHERENT = 1; int HIP_SYNC_HOST_ALLOC = 1; + int HIP_INIT_ALLOC=-1; +int HIP_SYNC_STREAM_WAIT = 0; /* when non-zero, hipStreamWaitEvent synchronizes to the host (see the READ_ENV_I description in HipReadEnv) */ +int HIP_FORCE_NULL_STREAM=0; /* when non-zero, stream allocations return the null stream (see the READ_ENV_I description in HipReadEnv) */ + #if (__hcc_workweek__ >= 17300) @@ -1250,6 +1254,9 @@ void HipReadEnv() READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); READ_ENV_I(release, HIP_INIT_ALLOC, 0, "If not -1, initialize allocated memory to specified byte"); READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); + READ_ENV_I(release, HIP_FORCE_NULL_STREAM, 0, "Force all stream allocations to secretly return the null stream"); + + READ_ENV_I(release, HIP_SYNC_STREAM_WAIT, 0, "hipStreamWaitEvent will synchronize to host"); READ_ENV_I(release, HIP_HOST_COHERENT, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h index 159a447081..4b7e533a4c 100644 --- a/projects/hip/src/hip_hcc_internal.h +++ b/projects/hip/src/hip_hcc_internal.h @@ -65,9 +65,12 @@ extern int HIP_HIDDEN_FREE_MEM; //--- // Chicken bits for disabling functionality to work around potential issues: extern int HIP_SYNC_HOST_ALLOC; +extern int HIP_SYNC_STREAM_WAIT; extern int HIP_SYNC_NULL_STREAM; extern int HIP_INIT_ALLOC; +extern int HIP_FORCE_NULL_STREAM; + // TODO - remove when this is standard behavior. 
extern int HCC_OPT_FLUSH; diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp index 51aeb01412..7dd6efd39c 100644 --- a/projects/hip/src/hip_stream.cpp +++ b/projects/hip/src/hip_stream.cpp @@ -38,21 +38,26 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags) hipError_t e = hipSuccess; if (ctx) { - hc::accelerator acc = ctx->getWriteableDevice()->_acc; - // TODO - se try-catch loop to detect memory exception? - // - //Note this is an execute_in_order queue, so all kernels submitted will atuomatically wait for prev to complete: - //This matches CUDA stream behavior: + if (HIP_FORCE_NULL_STREAM) { /* override: no ihipStream_t is allocated; the caller receives the null stream handle */ + *stream = 0; + } else { + hc::accelerator acc = ctx->getWriteableDevice()->_acc; - { - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); + // TODO - use try-catch loop to detect memory exception? + // + //Note this is an execute_in_order queue, so all kernels submitted will automatically wait for prev to complete: + //This matches CUDA stream behavior: - auto istream = new ihipStream_t(ctx, acc.create_view(), flags); + { + // Obtain mutex access to the device critical data, release by destructor + LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); - ctxCrit->addStream(istream); - *stream = istream; + auto istream = new ihipStream_t(ctx, acc.create_view(), flags); + + ctxCrit->addStream(istream); + *stream = istream; + } } tprintf(DB_SYNC, "hipStreamCreate, %s\n", ToString(*stream).c_str()); @@ -93,18 +98,15 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int } else if (event->_state != hipEventStatusUnitialized) { - if (stream != hipStreamNull) { - + if (HIP_SYNC_STREAM_WAIT || (HIP_SYNC_NULL_STREAM && (stream == 0))) { + // conservative wait on host for the specified event to complete: + event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? 
hc::hcWaitModeBlocked : hc::hcWaitModeActive); + } else { + stream = ihipSyncAndResolveStream(stream); /* NOTE(review): presumably resolves a null stream handle to a real stream object before it is dereferenced below - confirm against ihipSyncAndResolveStream */ // This will user create_blocking_marker to wait on the specified queue. stream->locked_streamWaitEvent(event); - - } else { - // TODO-hcc Convert to use create_blocking_marker(...) functionality. - // Currently we have a super-conservative version of this - block on host, and drain the queue. - // This should create a barrier packet in the target queue. - // TODO-HIP_SYNC_NULL_STREAM - stream->locked_wait(); } + } // else event not recorded, return immediately and don't create marker. return ihipLogStatus(e); @@ -122,15 +124,14 @@ hipError_t hipStreamQuery(hipStream_t stream) stream = device->_defaultStream; } - int pendingOps = 0; + bool isEmpty = 0; /* assigned inside the LockedAccessor_StreamCrit_t scope below; the query returns hipSuccess only when it is true, otherwise hipErrorNotReady */ { LockedAccessor_StreamCrit_t crit(stream->_criticalData); - pendingOps = crit->_av.get_pending_async_ops(); + isEmpty = crit->_av.get_is_empty(); } - - hipError_t e = (pendingOps > 0) ? hipErrorNotReady : hipSuccess; + hipError_t e = isEmpty ? 
hipSuccess : hipErrorNotReady ; return ihipLogStatus(e); } @@ -170,7 +171,9 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { - e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream + if (!HIP_FORCE_NULL_STREAM) { /* with HIP_FORCE_NULL_STREAM set, ihipStreamCreate returns the null handle, so destroying a null stream is not reported as an error in that mode */ + e = hipErrorInvalidResourceHandle; + } } else { stream->locked_wait(); diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 9bbd43828c..f5b1b79550 100644 --- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -446,9 +446,41 @@ int main(int argc, char *argv[]) if (p_tests & 0x1000) { - printf ("==> Test 0x1000 try null stream\n"); - hipStreamQuery(0/* try null stream*/); + printf ("==> Test 0x1000 simple null stream tests\n"); + // try some null stream: + hipStreamQuery(0); + + + hipStream_t s1; + hipEvent_t e1; + + { + // stream null waits on event in s1 stream: + HIPCHECK(hipStreamCreate(&s1)); + HIPCHECK(hipEventCreate(&e1)); + + HIPCHECK(hipEventRecord(e1, s1)) + + HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); + + HIPCHECK(hipStreamDestroy(s1)); + HIPCHECK(hipEventDestroy(e1)); + } + + { + // stream s1 waits on event in null stream: + HIPCHECK(hipStreamCreate(&s1)); + HIPCHECK(hipEventCreate(&e1)); + + HIPCHECK(hipEventRecord(e1, hipStream_t(0))) + + HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); + + HIPCHECK(hipStreamDestroy(s1)); + HIPCHECK(hipEventDestroy(e1)); + } + } @@ -471,8 +503,8 @@ int main(int argc, char *argv[]) } - { - printf ("test: alternating memcpy/count-reverse followed by event\n"); + if (p_tests & 0x4000 ) { /* NOTE(review): this guard also encloses the 0x8000 RUN_SYNC_TEST below, so 0x8000 cannot run unless 0x4000 is selected too - confirm that is intended */ + printf ("test: %x alternating memcpy/count-reverse followed by event\n", p_tests); RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); RUN_SYNC_TEST(0x8000, streamersDev0, 
sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); }