diff --git a/projects/hip/docs/markdown/hip_profiling.md b/projects/hip/docs/markdown/hip_profiling.md index db5d0fc425..21133100ec 100644 --- a/projects/hip/docs/markdown/hip_profiling.md +++ b/projects/hip/docs/markdown/hip_profiling.md @@ -325,9 +325,18 @@ Some key information from the trace above. Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization. These are not intended for production but can be useful diagnose synchronization problems in the application (or driver). -Some of the most useful chicken bits are described here: +Some of the most useful chicken bits are described here. These bits are supported on the ROCm path: + +HIP provides 3 environment variables in the HIP_*_BLOCKING family. These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization. These flags will have performance impact and are not intended for production use. + +- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch. Equivalent to setting CUDA_LAUNCH_BLOCKING. +- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names. The HIP runtime will wait on the host after one of the named kernels executes. This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality. There is no indication if kernel names are spelled incorrectly. One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING". +- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller. + +These options cause HCC to serialize. Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP. +- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize. +- HCC_SERIALIZE_COPY : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.0 -- HIP_LAUNCH_BLOCKING=1 : On ROCm, this flag waits on the host after each kernel launches and after each memory copy command. On CUDA, the waits are only enforced after each kernel launch. This is useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels and memory management code are correct, and any failures likely are causes by improper or missing synchronization. - HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines. - HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts. Can be useful to diagnose interrupt storm issues in the driver. - HSA_DISABLE_CACHE=1 : Disables the GPU L2 data cache. diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index b1877ed0b3..55a0485365 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -828,7 +828,7 @@ extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr); extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr); extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr); -extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp); +extern void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp); // Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be either size_t or dim3 types @@ -838,7 +838,7 @@ do {\ lp.dynamic_group_mem_bytes = _groupMemBytes; \ hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \ _kernelName (lp, ##__VA_ARGS__);\ - ihipPostLaunchKernel(trueStream, lp);\ + ihipPostLaunchKernel(#_kernelName, trueStream, lp);\ } while(0) diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 06402f9a67..abd260762f 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -61,6 +61,9 @@ const char *API_COLOR = KGRN; const char *API_COLOR_END = KNRM; int HIP_LAUNCH_BLOCKING = 0; +std::string HIP_LAUNCH_BLOCKING_KERNELS; +std::vector g_hipLaunchBlockingKernels; +int HIP_API_BLOCKING = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; @@ -81,6 +84,8 @@ int HIP_DENY_PEER_ACCESS = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; +int HIP_COHERENT_HOST_ALLOC = 0; + @@ -358,13 +363,24 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() //--- // Must be called after kernel finishes, this releases the lock on the stream so other commands can submit. -void ihipStream_t::lockclose_postKernelCommand(hc::accelerator_view *av) +void ihipStream_t::lockclose_postKernelCommand(const char * kernelName, hc::accelerator_view *av) { + bool blockThisKernel = false; - if (HIP_LAUNCH_BLOCKING) { + if (!g_hipLaunchBlockingKernels.empty()) { + std::string kernelNameString(kernelName); + for (auto o=g_hipLaunchBlockingKernels.begin(); o!=g_hipLaunchBlockingKernels.end(); o++) { + if ((*o == kernelNameString)) { + //printf ("force blocking for kernel %s\n", o->c_str()); + blockThisKernel = true; + } + } + } + + if (HIP_LAUNCH_BLOCKING || blockThisKernel) { // TODO - fix this so it goes through proper stream::wait() call.// direct wait OK since we know the stream is locked. av->wait(hc::hcWaitModeActive); - tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str()); + tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for kernel '%s' completion\n", ToString(this).c_str(), kernelName); } _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand. @@ -1243,7 +1259,15 @@ void ihipInit() //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading // TODO: In HIP/hcc, this variable blocks after both kernel commmands and data transfer. Maybe should be bit-mask for each command type? - READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); + READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP kernel launches 'host-synchronous', so they block until any kernel launches. Alias: CUDA_LAUNCH_BLOCKING." ); + READ_ENV_S(release, HIP_LAUNCH_BLOCKING_KERNELS, 0, "Comma-separated list of kernel names to make host-synchronous, so they block until completed."); + if (!HIP_LAUNCH_BLOCKING_KERNELS.empty()) { + tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels); + } + READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." ); + + + READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback); if ((HIP_DB & (1<lockclose_postKernelCommand(lp.av); + stream->lockclose_postKernelCommand(kernelName, lp.av); MARKER_END(); } @@ -1883,8 +1914,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes }; - if (HIP_LAUNCH_BLOCKING) { - tprintf(DB_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); + if (HIP_API_BLOCKING) { + tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for completion of hipMemcpyAsync(sz=%zu)\n", ToString(this).c_str(), sizeBytes); this->wait(crit); } diff --git a/projects/hip/src/hip_hcc.h b/projects/hip/src/hip_hcc.h index 0040263194..b01d41be14 100644 --- a/projects/hip/src/hip_hcc.h +++ b/projects/hip/src/hip_hcc.h @@ -45,6 +45,7 @@ extern const int release; // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for kernels? extern int HIP_LAUNCH_BLOCKING; +extern int HIP_API_BLOCKING; extern int HIP_PRINT_ENV; extern int HIP_PROFILE_API; @@ -56,6 +57,8 @@ extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creat extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */ extern int HIP_FORCE_P2P_HOST; +extern int HIP_COHERENT_HOST_ALLOC; + //--- // Chicken bits for disabling functionality to work around potential issues: @@ -156,11 +159,6 @@ extern const char *API_COLOR_END; #endif -// Compile code that force hipHostMalloc only allocates finegrained system memory. -#ifndef HIP_COHERENT_HOST_ALLOC -#define HIP_COHERENT_HOST_ALLOC 0 -#endif - // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. @@ -455,7 +453,7 @@ public: //--- // Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex. LockedAccessor_StreamCrit_t lockopen_preKernelCommand(); - void lockclose_postKernelCommand(hc::accelerator_view *av); + void lockclose_postKernelCommand(const char *kernelName, hc::accelerator_view *av); void locked_wait(bool assertQueueEmpty=false); diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp index 314890d167..4b91228032 100644 --- a/projects/hip/src/hip_memory.cpp +++ b/projects/hip/src/hip_memory.cpp @@ -157,20 +157,6 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) return ihipLogStatus(hip_status); } -void ihipReadSingleEnv(int *var_ptr, const char *var_name1, const char *description) -{ - char * env = getenv(var_name1); - - // Default is set when variable is initialized (at top of this file), so only override if we find - // an environment variable. - if (env) { - long int v = strtol(env, NULL, 0); - *var_ptr = (int) (v); - } - if (HIP_PRINT_ENV) { - printf ("%-30s = %2d : %s\n", var_name1, *var_ptr, description); - } -} hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { @@ -193,16 +179,12 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; - // Read from environment variable of HIP_COHERENT_HOST_ALLOC - int coherent_alloc=0; - ihipReadSingleEnv(&coherent_alloc, "HIP_COHERENT_HOST_ALLOC", "Flag to force allocate finegrained system memory"); - if (flags & ~supportedFlags) { hip_status = hipErrorInvalidValue; } else { auto device = ctx->getWriteableDevice(); - if(coherent_alloc){ + if(HIP_COHERENT_HOST_ALLOC){ // Force to allocate finedgrained system memory *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if(sizeBytes < 1 && (*ptr == NULL)){ @@ -853,13 +835,13 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } } - stream->lockclose_postKernelCommand(&crit->_av); + stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av); - if (HIP_LAUNCH_BLOCKING) { - tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset [stream:%p].\n", __func__, (void*)stream); + if (HIP_API_BLOCKING) { + tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str()); cf.wait(); - tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream); + //tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream); } } else { e = hipErrorInvalidValue; @@ -906,7 +888,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) // TODO - is hipMemset supposed to be async? cf.wait(); - stream->lockclose_postKernelCommand(&crit->_av); + stream->lockclose_postKernelCommand("hipMemset", &crit->_av); if (HIP_LAUNCH_BLOCKING) { diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp index f7ac35c77b..606d99f2fd 100644 --- a/projects/hip/src/hip_module.cpp +++ b/projects/hip/src/hip_module.cpp @@ -333,7 +333,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, #endif // USE_DISPATCH_HSA_KERNEL - ihipPostLaunchKernel(hStream, lp); + ihipPostLaunchKernel(f->_kernelName, hStream, lp); }