Add additional controls for forcing serialization and blocking.
Move HIP_COHERENT_HOST_ALLOC so it is read once at init time.
Add HIP_LAUNCH_BLOCKING_KERNELS, HIP_API_BLOCKING.
Update docs on debug and chicken bits.
Conflicts:
src/hip_hcc.cpp
[ROCm/hip commit: ff2f54c1bf]
This commit is contained in:
@@ -325,9 +325,18 @@ Some key information from the trace above.
|
||||
Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization.
|
||||
These are not intended for production but can be useful diagnose synchronization problems in the application (or driver).
|
||||
|
||||
Some of the most useful chicken bits are described here:
|
||||
Some of the most useful chicken bits are described here. These bits are supported on the ROCm path:
|
||||
|
||||
HIP provides 3 environment variables in the HIP_*_BLOCKING family. These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization. These flags will have performance impact and are not intended for production use.
|
||||
|
||||
- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch. Equivalent to setting CUDA_LAUNCH_BLOCKING.
|
||||
- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names. The HIP runtime will wait on the host after one of the named kernels executes. This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality. There is no indication if kernel names are spelled incorrectly. One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING".
|
||||
- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
|
||||
|
||||
These options cause HCC to serialize. Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.
|
||||
- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
|
||||
- HCC_SERIALIZE_COPY : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.0
|
||||
|
||||
- HIP_LAUNCH_BLOCKING=1 : On ROCm, this flag waits on the host after each kernel launches and after each memory copy command. On CUDA, the waits are only enforced after each kernel launch. This is useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels and memory management code are correct, and any failures likely are causes by improper or missing synchronization.
|
||||
- HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines.
|
||||
- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts. Can be useful to diagnose interrupt storm issues in the driver.
|
||||
- HSA_DISABLE_CACHE=1 : Disables the GPU L2 data cache.
|
||||
|
||||
@@ -828,7 +828,7 @@ extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
|
||||
extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
|
||||
extern void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp);
|
||||
|
||||
|
||||
// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be either size_t or dim3 types
|
||||
@@ -838,7 +838,7 @@ do {\
|
||||
lp.dynamic_group_mem_bytes = _groupMemBytes; \
|
||||
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
|
||||
_kernelName (lp, ##__VA_ARGS__);\
|
||||
ihipPostLaunchKernel(trueStream, lp);\
|
||||
ihipPostLaunchKernel(#_kernelName, trueStream, lp);\
|
||||
} while(0)
|
||||
|
||||
|
||||
|
||||
+41
-10
@@ -61,6 +61,9 @@ const char *API_COLOR = KGRN;
|
||||
const char *API_COLOR_END = KNRM;
|
||||
|
||||
int HIP_LAUNCH_BLOCKING = 0;
|
||||
std::string HIP_LAUNCH_BLOCKING_KERNELS;
|
||||
std::vector<std::string> g_hipLaunchBlockingKernels;
|
||||
int HIP_API_BLOCKING = 0;
|
||||
|
||||
int HIP_PRINT_ENV = 0;
|
||||
int HIP_TRACE_API= 0;
|
||||
@@ -81,6 +84,8 @@ int HIP_DENY_PEER_ACCESS = 0;
|
||||
// Force async copies to actually use the synchronous copy interface.
|
||||
int HIP_FORCE_SYNC_COPY = 0;
|
||||
|
||||
int HIP_COHERENT_HOST_ALLOC = 0;
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -358,13 +363,24 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
|
||||
|
||||
//---
|
||||
// Must be called after kernel finishes, this releases the lock on the stream so other commands can submit.
|
||||
void ihipStream_t::lockclose_postKernelCommand(hc::accelerator_view *av)
|
||||
void ihipStream_t::lockclose_postKernelCommand(const char * kernelName, hc::accelerator_view *av)
|
||||
{
|
||||
bool blockThisKernel = false;
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
if (!g_hipLaunchBlockingKernels.empty()) {
|
||||
std::string kernelNameString(kernelName);
|
||||
for (auto o=g_hipLaunchBlockingKernels.begin(); o!=g_hipLaunchBlockingKernels.end(); o++) {
|
||||
if ((*o == kernelNameString)) {
|
||||
//printf ("force blocking for kernel %s\n", o->c_str());
|
||||
blockThisKernel = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING || blockThisKernel) {
|
||||
// TODO - fix this so it goes through proper stream::wait() call.// direct wait OK since we know the stream is locked.
|
||||
av->wait(hc::hcWaitModeActive);
|
||||
tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str());
|
||||
tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for kernel '%s' completion\n", ToString(this).c_str(), kernelName);
|
||||
}
|
||||
|
||||
_criticalData.unlock(); // paired with lock from lockopen_preKernelCommand.
|
||||
@@ -1243,7 +1259,15 @@ void ihipInit()
|
||||
//-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading
|
||||
|
||||
// TODO: In HIP/hcc, this variable blocks after both kernel commmands and data transfer. Maybe should be bit-mask for each command type?
|
||||
READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
|
||||
READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP kernel launches 'host-synchronous', so they block until any kernel launches. Alias: CUDA_LAUNCH_BLOCKING." );
|
||||
READ_ENV_S(release, HIP_LAUNCH_BLOCKING_KERNELS, 0, "Comma-separated list of kernel names to make host-synchronous, so they block until completed.");
|
||||
if (!HIP_LAUNCH_BLOCKING_KERNELS.empty()) {
|
||||
tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels);
|
||||
}
|
||||
READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." );
|
||||
|
||||
|
||||
|
||||
READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
|
||||
if ((HIP_DB & (1<<DB_API)) && (HIP_TRACE_API == 0)) {
|
||||
// Set HIP_TRACE_API default before we read it, so it is printed correctly.
|
||||
@@ -1252,6 +1276,8 @@ void ihipInit()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes.");
|
||||
READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, "Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White");
|
||||
READ_ENV_I(release, HIP_PROFILE_API, 0, "Add HIP API markers to ATP file generated with CodeXL. 0x1=short API name, 0x2=full API name including args.");
|
||||
@@ -1262,10 +1288,14 @@ void ihipInit()
|
||||
|
||||
|
||||
READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application");
|
||||
READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer");
|
||||
READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies");
|
||||
READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer");
|
||||
READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies");
|
||||
|
||||
// TODO - review, can we remove this?
|
||||
READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced.");
|
||||
|
||||
READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact.");
|
||||
|
||||
// Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled.
|
||||
if (HIP_DB && !COMPILE_HIP_DB) {
|
||||
fprintf (stderr, "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?)\n", HIP_DB);
|
||||
@@ -1398,6 +1428,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
|
||||
|
||||
void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream)
|
||||
{
|
||||
|
||||
if (HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) {
|
||||
std::stringstream os_pre;
|
||||
std::stringstream os;
|
||||
@@ -1515,11 +1546,11 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
|
||||
//---
|
||||
//Called after kernel finishes execution.
|
||||
//This releases the lock on the stream.
|
||||
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp)
|
||||
void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp)
|
||||
{
|
||||
tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");
|
||||
|
||||
stream->lockclose_postKernelCommand(lp.av);
|
||||
stream->lockclose_postKernelCommand(kernelName, lp.av);
|
||||
MARKER_END();
|
||||
}
|
||||
|
||||
@@ -1883,8 +1914,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
|
||||
};
|
||||
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf(DB_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes);
|
||||
if (HIP_API_BLOCKING) {
|
||||
tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for completion of hipMemcpyAsync(sz=%zu)\n", ToString(this).c_str(), sizeBytes);
|
||||
this->wait(crit);
|
||||
}
|
||||
|
||||
|
||||
@@ -45,6 +45,7 @@ extern const int release;
|
||||
|
||||
// TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for kernels?
|
||||
extern int HIP_LAUNCH_BLOCKING;
|
||||
extern int HIP_API_BLOCKING;
|
||||
|
||||
extern int HIP_PRINT_ENV;
|
||||
extern int HIP_PROFILE_API;
|
||||
@@ -56,6 +57,8 @@ extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creat
|
||||
extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
|
||||
extern int HIP_FORCE_P2P_HOST;
|
||||
|
||||
extern int HIP_COHERENT_HOST_ALLOC;
|
||||
|
||||
|
||||
//---
|
||||
// Chicken bits for disabling functionality to work around potential issues:
|
||||
@@ -156,11 +159,6 @@ extern const char *API_COLOR_END;
|
||||
#endif
|
||||
|
||||
|
||||
// Compile code that force hipHostMalloc only allocates finegrained system memory.
|
||||
#ifndef HIP_COHERENT_HOST_ALLOC
|
||||
#define HIP_COHERENT_HOST_ALLOC 0
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary.
|
||||
@@ -455,7 +453,7 @@ public:
|
||||
//---
|
||||
// Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex.
|
||||
LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
|
||||
void lockclose_postKernelCommand(hc::accelerator_view *av);
|
||||
void lockclose_postKernelCommand(const char *kernelName, hc::accelerator_view *av);
|
||||
|
||||
|
||||
void locked_wait(bool assertQueueEmpty=false);
|
||||
|
||||
@@ -157,20 +157,6 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
return ihipLogStatus(hip_status);
|
||||
}
|
||||
|
||||
void ihipReadSingleEnv(int *var_ptr, const char *var_name1, const char *description)
|
||||
{
|
||||
char * env = getenv(var_name1);
|
||||
|
||||
// Default is set when variable is initialized (at top of this file), so only override if we find
|
||||
// an environment variable.
|
||||
if (env) {
|
||||
long int v = strtol(env, NULL, 0);
|
||||
*var_ptr = (int) (v);
|
||||
}
|
||||
if (HIP_PRINT_ENV) {
|
||||
printf ("%-30s = %2d : %s\n", var_name1, *var_ptr, description);
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
@@ -193,16 +179,12 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
|
||||
const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
|
||||
|
||||
// Read from environment variable of HIP_COHERENT_HOST_ALLOC
|
||||
int coherent_alloc=0;
|
||||
ihipReadSingleEnv(&coherent_alloc, "HIP_COHERENT_HOST_ALLOC", "Flag to force allocate finegrained system memory");
|
||||
|
||||
if (flags & ~supportedFlags) {
|
||||
hip_status = hipErrorInvalidValue;
|
||||
}
|
||||
else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
if(coherent_alloc){
|
||||
if(HIP_COHERENT_HOST_ALLOC){
|
||||
// Force to allocate finedgrained system memory
|
||||
*ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
|
||||
if(sizeBytes < 1 && (*ptr == NULL)){
|
||||
@@ -853,13 +835,13 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
}
|
||||
}
|
||||
|
||||
stream->lockclose_postKernelCommand(&crit->_av);
|
||||
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
|
||||
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset [stream:%p].\n", __func__, (void*)stream);
|
||||
if (HIP_API_BLOCKING) {
|
||||
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
|
||||
cf.wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream);
|
||||
//tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream);
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -906,7 +888,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
// TODO - is hipMemset supposed to be async?
|
||||
cf.wait();
|
||||
|
||||
stream->lockclose_postKernelCommand(&crit->_av);
|
||||
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
|
||||
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
|
||||
@@ -333,7 +333,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f,
|
||||
#endif // USE_DISPATCH_HSA_KERNEL
|
||||
|
||||
|
||||
ihipPostLaunchKernel(hStream, lp);
|
||||
ihipPostLaunchKernel(f->_kernelName, hStream, lp);
|
||||
|
||||
}
|
||||
|
||||
|
||||
Referens i nytt ärende
Block a user