diff --git a/projects/clr/rocclr/runtime/device/comgrctx.cpp b/projects/clr/rocclr/runtime/device/comgrctx.cpp index 384237b29e..72d59c557f 100644 --- a/projects/clr/rocclr/runtime/device/comgrctx.cpp +++ b/projects/clr/rocclr/runtime/device/comgrctx.cpp @@ -13,7 +13,7 @@ bool Comgr::is_ready_ = false; bool Comgr::LoadLib() { #if defined(COMGR_DYN_DLL) - LogInfo("Loading COMGR library."); + ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Loading COMGR library."); static const char* ComgrLibName = LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so"), WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so")); diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp index 5f592fdf2d..1664582d96 100644 --- a/projects/clr/rocclr/runtime/device/devkernel.cpp +++ b/projects/clr/rocclr/runtime/device/devkernel.cpp @@ -1731,7 +1731,7 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) { } while (end != std::string::npos); if (tokens.size() < 2) { - LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); + ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str()); continue; } @@ -1747,7 +1747,7 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) { // ensure that we have the correct number of tokens if (tokens.size() < end + 1 /*last token is the fmtString*/) { - LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); + ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str()); continue; } diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp index 9945830ea8..79a8476f0f 100644 --- a/projects/clr/rocclr/runtime/device/devprogram.cpp +++ b/projects/clr/rocclr/runtime/device/devprogram.cpp @@ -1398,7 +1398,7 @@ static void dumpCodeObject(const std::string& image) { char fname[30]; static std::atomic index; sprintf(fname, "_code_object%04d.o", index++); - LogPrintfInfo("Code object saved in %s\n", fname); + ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname); std::ofstream ofs; ofs.open(fname, std::ios::binary); ofs << image; @@ -2905,7 +2905,7 @@ bool Program::createKernelMetadataMap() { status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD); if (status == AMD_COMGR_STATUS_SUCCESS) { - LogInfo("Using Code Object V2."); + ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2."); hasKernelMD = true; codeObjectVer_ = 2; } @@ -2913,7 +2913,7 @@ bool Program::createKernelMetadataMap() { status = amd::Comgr::metadata_lookup(metadata_, "amdhsa.kernels", &kernelsMD); if (status == AMD_COMGR_STATUS_SUCCESS) { - LogInfo("Using Code Object V3."); + ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3."); hasKernelMD = true; codeObjectVer_ = 3; } diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp index af4d6eec95..edd487097d 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp @@ -396,7 +396,7 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) { } bool Device::init() { - LogInfo("Initializing HSA stack."); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack."); // Initialize the compiler if (!initCompiler(offlineDevice_)) { @@ -469,7 +469,7 @@ bool Device::init() { } // If the AmdDeviceInfo for the HsaDevice Id could not be found return false if (id == HSA_INVALID_DEVICE_ID) { - LogPrintfWarning("Could not find a DeviceInfo entry for %d", deviceId); + ClPrint(amd::LOG_WARNING, amd::LOG_INIT, "Could not find a DeviceInfo entry for %d", deviceId); continue; } roc_device->deviceInfo_ = DeviceInfo[id]; @@ -1835,8 +1835,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { assert(queuePool_.size() <= GPU_MAX_HW_QUEUES); - LogPrintfInfo("number of allocated hardware queues: %d, maximum: %d", - queuePool_.size(), GPU_MAX_HW_QUEUES); + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d", + queuePool_.size(), GPU_MAX_HW_QUEUES); // If we have reached the max number of queues, reuse an existing queue, // choosing the one with the least number of users. @@ -1846,7 +1846,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { [] (PoolRef A, PoolRef B) { return A.second.refCount < B.second.refCount; }); - LogPrintfInfo("selected queue with least refCount: %p (%d)", + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "selected queue with least refCount: %p (%d)", lowest->first, lowest->second.refCount); lowest->second.refCount++; return lowest->first; @@ -1870,7 +1870,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) { return nullptr; } } - LogPrintfInfo("created hardware queue %p with size %d", + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d", queue, queue_size); hsa_amd_profiling_set_profiler_enabled(queue, 1); auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo())); @@ -1890,7 +1890,7 @@ void Device::releaseQueue(hsa_queue_t* queue) { if (qInfo.refCount != 0) { return; } - LogPrintfInfo("deleting hardware queue %p with refCount 0", queue); + ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue); hsa_queue_destroy(queue); queuePool_.erase(qIter); diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index fe18049bbe..f23cde6f16 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -300,7 +300,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para gpuMem->syncCacheFromHost(*this); } const void* globalAddress = *reinterpret_cast(params + desc.offset_); - LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index, + ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress, gpuMem->getDeviceMemory(), reinterpret_cast
(gpuMem->getDeviceMemory()) + mem->getSize(), @@ -2034,7 +2034,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const return false; } - LogPrintfInfo("[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str()); + ClPrint(amd::LOG_INFO, amd::LOG_KERN, "[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str()); // Check if runtime has to setup hidden arguments for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp index fb87d99588..6611d7abdc 100644 --- a/projects/clr/rocclr/runtime/platform/command.cpp +++ b/projects/clr/rocclr/runtime/platform/command.cpp @@ -96,6 +96,8 @@ bool Event::setStatus(cl_int status, uint64_t timeStamp) { } if (status <= CL_COMPLETE) { + ClPrint(LOG_DEBUG, LOG_CMD, "command %p complete", &command()); + // Before we notify the waiters that this event reached the CL_COMPLETE // status, we release all the resources associated with this instance. releaseResources(); @@ -160,12 +162,16 @@ bool Event::awaitCompletion() { return false; } + ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status_); + ScopedLock lock(lock_); // Wait until the status becomes CL_COMPLETE or negative. while (status_ > CL_COMPLETE) { lock_.wait(); } + + ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this); } return status_ == CL_COMPLETE; @@ -180,6 +186,7 @@ bool Event::notifyCmdQueue() { notified_.clear(); return false; } + ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue); command->enqueue(); command->release(); } @@ -220,6 +227,7 @@ void Command::enqueue() { if (IS_HIP) { queue_->setLastQueuedCommand(this); } + ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this); queue_->append(*this); queue_->flush(); if ((queue_->device().settings().waitCommand_ && (type_ != 0)) || diff --git a/projects/clr/rocclr/runtime/platform/commandqueue.cpp b/projects/clr/rocclr/runtime/platform/commandqueue.cpp index 160c97b67e..9a0b0c0f24 100644 --- a/projects/clr/rocclr/runtime/platform/commandqueue.cpp +++ b/projects/clr/rocclr/runtime/platform/commandqueue.cpp @@ -75,10 +75,11 @@ void HostQueue::finish() { if (command == NULL) { return; } - + ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued"); command->enqueue(); command->awaitCompletion(); command->release(); + ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished"); } void HostQueue::loop(device::VirtualDevice* virtualDevice) { @@ -132,6 +133,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) { continue; } + ClPrint(LOG_DEBUG, LOG_CMD, "command is submitted: %p", command); command->setStatus(CL_SUBMITTED); // Submit to the device queue. command->submit(*virtualDevice); diff --git a/projects/clr/rocclr/runtime/platform/runtime.cpp b/projects/clr/rocclr/runtime/platform/runtime.cpp index 0850d6cd24..afeb69626b 100644 --- a/projects/clr/rocclr/runtime/platform/runtime.cpp +++ b/projects/clr/rocclr/runtime/platform/runtime.cpp @@ -58,10 +58,12 @@ bool Runtime::init() { if (!Flag::init() || !option::init() || !Device::init() // Agent initializes last || !Agent::init()) { + ClPrint(LOG_ERROR, LOG_INIT, "Runtime initilization failed"); return false; } initialized_ = true; + ClTrace(LOG_DEBUG, LOG_INIT); return true; } @@ -69,6 +71,7 @@ void Runtime::tearDown() { if (!initialized_) { return; } + ClTrace(LOG_DEBUG, LOG_INIT); Agent::tearDown(); Device::tearDown(); diff --git a/projects/clr/rocclr/runtime/utils/debug.hpp b/projects/clr/rocclr/runtime/utils/debug.hpp index e89fe7c8c7..c3273505f8 100644 --- a/projects/clr/rocclr/runtime/utils/debug.hpp +++ b/projects/clr/rocclr/runtime/utils/debug.hpp @@ -14,6 +14,26 @@ namespace amd { /*@{*/ enum LogLevel { LOG_NONE = 0, LOG_ERROR = 1, LOG_WARNING = 2, LOG_INFO = 3, LOG_DEBUG = 4 }; +enum LogMask { + LOG_API = 0x00000001, //!< API call + LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers + LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish + LOG_AQL = 0x00000008, //!< Decode and display AQL packets + LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents + LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool + LOG_LOCK = 0x00000040, //!< Locks and thread-safety code. + LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc. + LOG_COPY = 0x00000100, //!< Copy debug + LOG_COPY2 = 0x00000200, //!< Detailed copy debug + LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events. + LOG_INIT = 0x00000800, //!< Initialization and shutdown + LOG_MISC = 0x00001000, //!< misc debug, not yet classified + LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet + LOG_CODE = 0x00004000, //!< Show code creation debug + LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands + LOG_ALWAYS = 0xFFFFFFFF, //!< Log always even mask flag is zero +}; + //! \cond ignore extern "C" void breakpoint(); //! \endcond @@ -137,4 +157,35 @@ inline void warning(const char* msg) { amd::report_warning(msg); } #define DebugInfoGuarantee(cond) LogGuarantee(cond, amd::LOG_INFO, "Warning") +#ifndef NDEBUG +#define CL_LOG +#endif +// You may define CL_LOG to enable following log functions even for release build +#ifdef CL_LOG +#define ClPrint(level, mask, format, ...) \ + do { \ + if (LOG_LEVEL >= level) { \ + if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) { \ + amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__); \ + } \ + } \ + } while (false) + +#define ClCondPrint(level, mask, condition, format, ...) \ + do { \ + if (LOG_LEVEL >= level && (condition)) { \ + if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) { \ + amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__); \ + } \ + } \ + } while (false) + +#define ClTrace(level, mask) ClPrint(level, mask, "%s", __func__) + +#else /*CL_LOG*/ +#define ClPrint(level, mask, format, ...) (void)(0) +#define ClCondPrint(level, mask, condition, format, ...) (void)(0) +#define ClTrace(level, mask) (void)(0) +#endif /*CL_LOG*/ + #endif /*DEBUG_HPP_*/ diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index cfceb6cc2a..cd5c914e4c 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -10,6 +10,8 @@ \ release(int, LOG_LEVEL, 0, \ "The default log level") \ +release(uint, GPU_LOG_MASK, 0, \ + "The mask to enable specific kinds of logs") \ debug(uint, DEBUG_GPU_FLAGS, 0, \ "The debug options for GPU device") \ release(uint, GPU_MAX_COMMAND_QUEUES, 300, \