From 46a6201be00a61d95f891a7f4c933e209b9b3b53 Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 4 Nov 2019 14:44:59 -0500
Subject: [PATCH] P4 to Git Change 2024454 by axie@axie-hip-rocm on 2019/11/04
14:38:31
SWDEV-198863 - Options for hip-clang-vdi path to provide the chicken bits, or functional equivalents to HCC_DB (phase 1)
1. The log macros is turned off for release build. So log functions has zero impact to release build.
2. The log macros have level, mask, condition control. So we can have more control to avoid log flooding.
I also adjusted some existing log to use new log functions.
1. To excercise and test the new log functions.
2. To improve performance slightly.
3. The change is mainly for HIP-ROCM, we can move more in next phases for PAL or ORCA.
4. I make these log feature unavailable for release build. We can revert to old log functions for release build in a case by case method.
Tests:
1. http://ocltc.amd.com:8111/viewModification.html?modId=128289&personal=true&tab=vcsModificationBuilds
http://ocltc.amd.com:8111/viewModification.html?modId=128358&personal=true&tab=vcsModificationBuilds
2. release build, run hip program, there is no log
3. fastdebug build, run hip program,
export LOG_LEVEL=3
export GPU_LOG_MASK=4294967295
There was a lot of logs.
4. fastdebug build, run hip program,
export LOG_LEVEL=2
export GPU_LOG_MASK=4294967295
There was no logs.
5. fastdebug build, run hip program,
export LOG_LEVEL=3
export GPU_LOG_MASK=4294967294
There was much less logs.
6. fastdebug build, run hip program,
export LOG_LEVEL=3
export GPU_LOG_MASK=47102
There was even much less logs. The logs was expected according to the mask.
7. Tested step 2 to 6 similarily in Windows and Linux
ReviewBoard: http://ocltc.amd.com/reviews/r/18215
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/hip/hip_internal.hpp#46 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_memory.cpp#82 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hip_stream.cpp#26 edit
... //depot/stg/opencl/drivers/opencl/api/hip/hiprtc_internal.hpp#2 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/comgrctx.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devkernel.cpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/devprogram.cpp#68 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#137 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#100 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#32 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/runtime.cpp#40 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/debug.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#323 edit
[ROCm/clr commit: 3f6e18bf6b5b85a79724f53a456d292c0a6ac38b]
---
.../clr/rocclr/runtime/device/comgrctx.cpp | 2 +-
.../clr/rocclr/runtime/device/devkernel.cpp | 4 +-
.../clr/rocclr/runtime/device/devprogram.cpp | 6 +--
.../rocclr/runtime/device/rocm/rocdevice.cpp | 14 ++---
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 4 +-
.../clr/rocclr/runtime/platform/command.cpp | 8 +++
.../rocclr/runtime/platform/commandqueue.cpp | 4 +-
.../clr/rocclr/runtime/platform/runtime.cpp | 3 ++
projects/clr/rocclr/runtime/utils/debug.hpp | 51 +++++++++++++++++++
projects/clr/rocclr/runtime/utils/flags.hpp | 2 +
10 files changed, 82 insertions(+), 16 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/comgrctx.cpp b/projects/clr/rocclr/runtime/device/comgrctx.cpp
index 384237b29e..72d59c557f 100644
--- a/projects/clr/rocclr/runtime/device/comgrctx.cpp
+++ b/projects/clr/rocclr/runtime/device/comgrctx.cpp
@@ -13,7 +13,7 @@ bool Comgr::is_ready_ = false;
bool Comgr::LoadLib() {
#if defined(COMGR_DYN_DLL)
- LogInfo("Loading COMGR library.");
+ ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Loading COMGR library.");
static const char* ComgrLibName =
LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so"),
WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so"));
diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp
index 5f592fdf2d..1664582d96 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.cpp
@@ -1731,7 +1731,7 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) {
} while (end != std::string::npos);
if (tokens.size() < 2) {
- LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
+ ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str());
continue;
}
@@ -1747,7 +1747,7 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) {
// ensure that we have the correct number of tokens
if (tokens.size() < end + 1 /*last token is the fmtString*/) {
- LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
+ ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str());
continue;
}
diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp
index 9945830ea8..79a8476f0f 100644
--- a/projects/clr/rocclr/runtime/device/devprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/devprogram.cpp
@@ -1398,7 +1398,7 @@ static void dumpCodeObject(const std::string& image) {
char fname[30];
static std::atomic index;
sprintf(fname, "_code_object%04d.o", index++);
- LogPrintfInfo("Code object saved in %s\n", fname);
+ ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname);
std::ofstream ofs;
ofs.open(fname, std::ios::binary);
ofs << image;
@@ -2905,7 +2905,7 @@ bool Program::createKernelMetadataMap() {
status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD);
if (status == AMD_COMGR_STATUS_SUCCESS) {
- LogInfo("Using Code Object V2.");
+ ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2.");
hasKernelMD = true;
codeObjectVer_ = 2;
}
@@ -2913,7 +2913,7 @@ bool Program::createKernelMetadataMap() {
status = amd::Comgr::metadata_lookup(metadata_, "amdhsa.kernels", &kernelsMD);
if (status == AMD_COMGR_STATUS_SUCCESS) {
- LogInfo("Using Code Object V3.");
+ ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3.");
hasKernelMD = true;
codeObjectVer_ = 3;
}
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index af4d6eec95..edd487097d 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -396,7 +396,7 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) {
}
bool Device::init() {
- LogInfo("Initializing HSA stack.");
+ ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");
// Initialize the compiler
if (!initCompiler(offlineDevice_)) {
@@ -469,7 +469,7 @@ bool Device::init() {
}
// If the AmdDeviceInfo for the HsaDevice Id could not be found return false
if (id == HSA_INVALID_DEVICE_ID) {
- LogPrintfWarning("Could not find a DeviceInfo entry for %d", deviceId);
+ ClPrint(amd::LOG_WARNING, amd::LOG_INIT, "Could not find a DeviceInfo entry for %d", deviceId);
continue;
}
roc_device->deviceInfo_ = DeviceInfo[id];
@@ -1835,8 +1835,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
- LogPrintfInfo("number of allocated hardware queues: %d, maximum: %d",
- queuePool_.size(), GPU_MAX_HW_QUEUES);
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
+ queuePool_.size(), GPU_MAX_HW_QUEUES);
// If we have reached the max number of queues, reuse an existing queue,
// choosing the one with the least number of users.
@@ -1846,7 +1846,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
[] (PoolRef A, PoolRef B) {
return A.second.refCount < B.second.refCount;
});
- LogPrintfInfo("selected queue with least refCount: %p (%d)",
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "selected queue with least refCount: %p (%d)",
lowest->first, lowest->second.refCount);
lowest->second.refCount++;
return lowest->first;
@@ -1870,7 +1870,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
return nullptr;
}
}
- LogPrintfInfo("created hardware queue %p with size %d",
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
queue, queue_size);
hsa_amd_profiling_set_profiler_enabled(queue, 1);
auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
@@ -1890,7 +1890,7 @@ void Device::releaseQueue(hsa_queue_t* queue) {
if (qInfo.refCount != 0) {
return;
}
- LogPrintfInfo("deleting hardware queue %p with refCount 0", queue);
+ ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
hsa_queue_destroy(queue);
queuePool_.erase(qIter);
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index fe18049bbe..f23cde6f16 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -300,7 +300,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
gpuMem->syncCacheFromHost(*this);
}
const void* globalAddress = *reinterpret_cast(params + desc.offset_);
- LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
+ ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
desc.typeName_.c_str(), desc.name_.c_str(),
globalAddress, gpuMem->getDeviceMemory(),
reinterpret_cast(gpuMem->getDeviceMemory()) + mem->getSize(),
@@ -2034,7 +2034,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
return false;
}
- LogPrintfInfo("[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str());
+ ClPrint(amd::LOG_INFO, amd::LOG_KERN, "[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str());
// Check if runtime has to setup hidden arguments
for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp
index fb87d99588..6611d7abdc 100644
--- a/projects/clr/rocclr/runtime/platform/command.cpp
+++ b/projects/clr/rocclr/runtime/platform/command.cpp
@@ -96,6 +96,8 @@ bool Event::setStatus(cl_int status, uint64_t timeStamp) {
}
if (status <= CL_COMPLETE) {
+ ClPrint(LOG_DEBUG, LOG_CMD, "command %p complete", &command());
+
// Before we notify the waiters that this event reached the CL_COMPLETE
// status, we release all the resources associated with this instance.
releaseResources();
@@ -160,12 +162,16 @@ bool Event::awaitCompletion() {
return false;
}
+ ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status_);
+
ScopedLock lock(lock_);
// Wait until the status becomes CL_COMPLETE or negative.
while (status_ > CL_COMPLETE) {
lock_.wait();
}
+
+ ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
}
return status_ == CL_COMPLETE;
@@ -180,6 +186,7 @@ bool Event::notifyCmdQueue() {
notified_.clear();
return false;
}
+ ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
command->enqueue();
command->release();
}
@@ -220,6 +227,7 @@ void Command::enqueue() {
if (IS_HIP) {
queue_->setLastQueuedCommand(this);
}
+ ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
queue_->append(*this);
queue_->flush();
if ((queue_->device().settings().waitCommand_ && (type_ != 0)) ||
diff --git a/projects/clr/rocclr/runtime/platform/commandqueue.cpp b/projects/clr/rocclr/runtime/platform/commandqueue.cpp
index 160c97b67e..9a0b0c0f24 100644
--- a/projects/clr/rocclr/runtime/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/runtime/platform/commandqueue.cpp
@@ -75,10 +75,11 @@ void HostQueue::finish() {
if (command == NULL) {
return;
}
-
+ ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
command->enqueue();
command->awaitCompletion();
command->release();
+ ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished");
}
void HostQueue::loop(device::VirtualDevice* virtualDevice) {
@@ -132,6 +133,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
continue;
}
+ ClPrint(LOG_DEBUG, LOG_CMD, "command is submitted: %p", command);
command->setStatus(CL_SUBMITTED);
// Submit to the device queue.
command->submit(*virtualDevice);
diff --git a/projects/clr/rocclr/runtime/platform/runtime.cpp b/projects/clr/rocclr/runtime/platform/runtime.cpp
index 0850d6cd24..afeb69626b 100644
--- a/projects/clr/rocclr/runtime/platform/runtime.cpp
+++ b/projects/clr/rocclr/runtime/platform/runtime.cpp
@@ -58,10 +58,12 @@ bool Runtime::init() {
if (!Flag::init() || !option::init() || !Device::init()
// Agent initializes last
|| !Agent::init()) {
+ ClPrint(LOG_ERROR, LOG_INIT, "Runtime initilization failed");
return false;
}
initialized_ = true;
+ ClTrace(LOG_DEBUG, LOG_INIT);
return true;
}
@@ -69,6 +71,7 @@ void Runtime::tearDown() {
if (!initialized_) {
return;
}
+ ClTrace(LOG_DEBUG, LOG_INIT);
Agent::tearDown();
Device::tearDown();
diff --git a/projects/clr/rocclr/runtime/utils/debug.hpp b/projects/clr/rocclr/runtime/utils/debug.hpp
index e89fe7c8c7..c3273505f8 100644
--- a/projects/clr/rocclr/runtime/utils/debug.hpp
+++ b/projects/clr/rocclr/runtime/utils/debug.hpp
@@ -14,6 +14,26 @@ namespace amd { /*@{*/
enum LogLevel { LOG_NONE = 0, LOG_ERROR = 1, LOG_WARNING = 2, LOG_INFO = 3, LOG_DEBUG = 4 };
+enum LogMask {
+ LOG_API = 0x00000001, //!< API call
+ LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
+ LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish
+ LOG_AQL = 0x00000008, //!< Decode and display AQL packets
+ LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents
+ LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool
+ LOG_LOCK = 0x00000040, //!< Locks and thread-safety code.
+ LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc.
+ LOG_COPY = 0x00000100, //!< Copy debug
+ LOG_COPY2 = 0x00000200, //!< Detailed copy debug
+ LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events.
+ LOG_INIT = 0x00000800, //!< Initialization and shutdown
+ LOG_MISC = 0x00001000, //!< misc debug, not yet classified
+ LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet
+ LOG_CODE = 0x00004000, //!< Show code creation debug
+ LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands
+ LOG_ALWAYS = 0xFFFFFFFF, //!< Log always even mask flag is zero
+};
+
//! \cond ignore
extern "C" void breakpoint();
//! \endcond
@@ -137,4 +157,35 @@ inline void warning(const char* msg) { amd::report_warning(msg); }
#define DebugInfoGuarantee(cond) LogGuarantee(cond, amd::LOG_INFO, "Warning")
+#ifndef NDEBUG
+#define CL_LOG
+#endif
+// You may define CL_LOG to enable following log functions even for release build
+#ifdef CL_LOG
+#define ClPrint(level, mask, format, ...) \
+ do { \
+ if (LOG_LEVEL >= level) { \
+ if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) { \
+ amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__); \
+ } \
+ } \
+ } while (false)
+
+#define ClCondPrint(level, mask, condition, format, ...) \
+ do { \
+ if (LOG_LEVEL >= level && (condition)) { \
+ if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) { \
+ amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__); \
+ } \
+ } \
+ } while (false)
+
+#define ClTrace(level, mask) ClPrint(level, mask, "%s", __func__)
+
+#else /*CL_LOG*/
+#define ClPrint(level, mask, format, ...) (void)(0)
+#define ClCondPrint(level, mask, condition, format, ...) (void)(0)
+#define ClTrace(level, mask) (void)(0)
+#endif /*CL_LOG*/
+
#endif /*DEBUG_HPP_*/
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index cfceb6cc2a..cd5c914e4c 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -10,6 +10,8 @@
\
release(int, LOG_LEVEL, 0, \
"The default log level") \
+release(uint, GPU_LOG_MASK, 0, \
+ "The mask to enable specific kinds of logs") \
debug(uint, DEBUG_GPU_FLAGS, 0, \
"The debug options for GPU device") \
release(uint, GPU_MAX_COMMAND_QUEUES, 300, \