diff --git a/projects/clr/rocclr/runtime/device/comgrctx.cpp b/projects/clr/rocclr/runtime/device/comgrctx.cpp
index 384237b29e..72d59c557f 100644
--- a/projects/clr/rocclr/runtime/device/comgrctx.cpp
+++ b/projects/clr/rocclr/runtime/device/comgrctx.cpp
@@ -13,7 +13,7 @@ bool Comgr::is_ready_ = false;
 
 bool Comgr::LoadLib() {
 #if defined(COMGR_DYN_DLL)
-  LogInfo("Loading COMGR library.");
+  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Loading COMGR library.");
   static const char* ComgrLibName =
     LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so"),
                 WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so"));
diff --git a/projects/clr/rocclr/runtime/device/devkernel.cpp b/projects/clr/rocclr/runtime/device/devkernel.cpp
index 5f592fdf2d..1664582d96 100644
--- a/projects/clr/rocclr/runtime/device/devkernel.cpp
+++ b/projects/clr/rocclr/runtime/device/devkernel.cpp
@@ -1731,7 +1731,7 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
     } while (end != std::string::npos);
 
     if (tokens.size() < 2) {
-      LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
+      ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str());
       continue;
     }
 
@@ -1747,7 +1747,7 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
 
     // ensure that we have the correct number of tokens
     if (tokens.size() < end + 1 /*last token is the fmtString*/) {
-      LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str());
+      ClPrint(amd::LOG_WARNING, amd::LOG_KERN, "Invalid PrintInfo string: \"%s\"", str.c_str());
       continue;
     }
 
diff --git a/projects/clr/rocclr/runtime/device/devprogram.cpp b/projects/clr/rocclr/runtime/device/devprogram.cpp
index 9945830ea8..79a8476f0f 100644
--- a/projects/clr/rocclr/runtime/device/devprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/devprogram.cpp
@@ -1398,7 +1398,7 @@ static void dumpCodeObject(const std::string& image) {
     char fname[30];
     static std::atomic<int> index;
     sprintf(fname, "_code_object%04d.o", index++);
-    LogPrintfInfo("Code object saved in %s\n", fname);
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname);
     std::ofstream ofs;
     ofs.open(fname, std::ios::binary);
     ofs << image;
@@ -2905,7 +2905,7 @@ bool Program::createKernelMetadataMap() {
 
   status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD);
   if (status == AMD_COMGR_STATUS_SUCCESS) {
-    LogInfo("Using Code Object V2.");
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2.");
     hasKernelMD = true;
     codeObjectVer_ = 2;
   }
@@ -2913,7 +2913,7 @@ bool Program::createKernelMetadataMap() {
     status = amd::Comgr::metadata_lookup(metadata_, "amdhsa.kernels", &kernelsMD);
 
     if (status == AMD_COMGR_STATUS_SUCCESS) {
-      LogInfo("Using Code Object V3.");
+      ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3.");
       hasKernelMD = true;
       codeObjectVer_ = 3;
     }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index af4d6eec95..edd487097d 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -396,7 +396,7 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) {
 }
 
 bool Device::init() {
-  LogInfo("Initializing HSA stack.");
+  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");
 
   // Initialize the compiler
   if (!initCompiler(offlineDevice_)) {
@@ -469,7 +469,7 @@ bool Device::init() {
     }
     // If the AmdDeviceInfo for the HsaDevice Id could not be found return false
     if (id == HSA_INVALID_DEVICE_ID) {
-      LogPrintfWarning("Could not find a DeviceInfo entry for %d", deviceId);
+      ClPrint(amd::LOG_WARNING, amd::LOG_INIT, "Could not find a DeviceInfo entry for %d", deviceId);
       continue;
     }
     roc_device->deviceInfo_ = DeviceInfo[id];
@@ -1835,8 +1835,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
 
 hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
   assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
-  LogPrintfInfo("number of allocated hardware queues: %d, maximum: %d",
-                queuePool_.size(), GPU_MAX_HW_QUEUES);
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
+          queuePool_.size(), GPU_MAX_HW_QUEUES);
 
   // If we have reached the max number of queues, reuse an existing queue,
   // choosing the one with the least number of users.
@@ -1846,7 +1846,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
                                    [] (PoolRef A, PoolRef B) {
                                      return A.second.refCount < B.second.refCount;
                                    });
-    LogPrintfInfo("selected queue with least refCount: %p (%d)",
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "selected queue with least refCount: %p (%d)",
                   lowest->first, lowest->second.refCount);
     lowest->second.refCount++;
     return lowest->first;
@@ -1870,7 +1870,7 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
       return nullptr;
     }
   }
-  LogPrintfInfo("created hardware queue %p with size %d",
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
                 queue, queue_size);
   hsa_amd_profiling_set_profiler_enabled(queue, 1);
   auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
@@ -1890,7 +1890,7 @@ void Device::releaseQueue(hsa_queue_t* queue) {
   if (qInfo.refCount != 0) {
       return;
   }
-  LogPrintfInfo("deleting hardware queue %p with refCount 0", queue);
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
 
   hsa_queue_destroy(queue);
   queuePool_.erase(qIter);
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index fe18049bbe..f23cde6f16 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -300,7 +300,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
             gpuMem->syncCacheFromHost(*this);
           }
           const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
-          LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
+          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
             desc.typeName_.c_str(), desc.name_.c_str(),
             globalAddress, gpuMem->getDeviceMemory(),
             reinterpret_cast<address>(gpuMem->getDeviceMemory()) + mem->getSize(),
@@ -2034,7 +2034,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
       return false;
     }
 
-    LogPrintfInfo("[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str());
+    ClPrint(amd::LOG_INFO, amd::LOG_KERN, "[%zx]!\tShaderName : %s\n", std::this_thread::get_id(), gpuKernel.name().c_str());
 
     // Check if runtime has to setup hidden arguments
     for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
diff --git a/projects/clr/rocclr/runtime/platform/command.cpp b/projects/clr/rocclr/runtime/platform/command.cpp
index fb87d99588..6611d7abdc 100644
--- a/projects/clr/rocclr/runtime/platform/command.cpp
+++ b/projects/clr/rocclr/runtime/platform/command.cpp
@@ -96,6 +96,8 @@ bool Event::setStatus(cl_int status, uint64_t timeStamp) {
   }
 
   if (status <= CL_COMPLETE) {
+    ClPrint(LOG_DEBUG, LOG_CMD, "command %p complete", &command());
+
     // Before we notify the waiters that this event reached the CL_COMPLETE
     // status, we release all the resources associated with this instance.
     releaseResources();
@@ -160,12 +162,16 @@ bool Event::awaitCompletion() {
       return false;
     }
 
+    ClPrint(LOG_DEBUG, LOG_WAIT, "waiting for event %p to complete, current status %d", this, status_);
+
     ScopedLock lock(lock_);
 
     // Wait until the status becomes CL_COMPLETE or negative.
     while (status_ > CL_COMPLETE) {
       lock_.wait();
     }
+
+    ClPrint(LOG_DEBUG, LOG_WAIT, "event %p wait completed", this);
   }
 
   return status_ == CL_COMPLETE;
@@ -180,6 +186,7 @@ bool Event::notifyCmdQueue() {
       notified_.clear();
       return false;
     }
+    ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
     command->enqueue();
     command->release();
   }
@@ -220,6 +227,7 @@ void Command::enqueue() {
   if (IS_HIP) {
     queue_->setLastQueuedCommand(this);
   }
+  ClPrint(LOG_DEBUG, LOG_CMD, "command is enqueued: %p", this);
   queue_->append(*this);
   queue_->flush();
   if ((queue_->device().settings().waitCommand_ && (type_ != 0)) ||
diff --git a/projects/clr/rocclr/runtime/platform/commandqueue.cpp b/projects/clr/rocclr/runtime/platform/commandqueue.cpp
index 160c97b67e..9a0b0c0f24 100644
--- a/projects/clr/rocclr/runtime/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/runtime/platform/commandqueue.cpp
@@ -75,10 +75,11 @@ void HostQueue::finish() {
   if (command == NULL) {
     return;
   }
-
+  ClPrint(LOG_DEBUG, LOG_CMD, "marker is queued");
   command->enqueue();
   command->awaitCompletion();
   command->release();
+  ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished");
 }
 
 void HostQueue::loop(device::VirtualDevice* virtualDevice) {
@@ -132,6 +133,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
       continue;
     }
 
+    ClPrint(LOG_DEBUG, LOG_CMD, "command is submitted: %p", command);
     command->setStatus(CL_SUBMITTED);
     // Submit to the device queue.
     command->submit(*virtualDevice);
diff --git a/projects/clr/rocclr/runtime/platform/runtime.cpp b/projects/clr/rocclr/runtime/platform/runtime.cpp
index 0850d6cd24..afeb69626b 100644
--- a/projects/clr/rocclr/runtime/platform/runtime.cpp
+++ b/projects/clr/rocclr/runtime/platform/runtime.cpp
@@ -58,10 +58,12 @@ bool Runtime::init() {
   if (!Flag::init() || !option::init() || !Device::init()
       // Agent initializes last
       || !Agent::init()) {
+    ClPrint(LOG_ERROR, LOG_INIT, "Runtime initilization failed");
     return false;
   }
 
   initialized_ = true;
+  ClTrace(LOG_DEBUG, LOG_INIT);
   return true;
 }
 
@@ -69,6 +71,7 @@ void Runtime::tearDown() {
   if (!initialized_) {
     return;
   }
+  ClTrace(LOG_DEBUG, LOG_INIT);
 
   Agent::tearDown();
   Device::tearDown();
diff --git a/projects/clr/rocclr/runtime/utils/debug.hpp b/projects/clr/rocclr/runtime/utils/debug.hpp
index e89fe7c8c7..c3273505f8 100644
--- a/projects/clr/rocclr/runtime/utils/debug.hpp
+++ b/projects/clr/rocclr/runtime/utils/debug.hpp
@@ -14,6 +14,26 @@ namespace amd { /*@{*/
 
 enum LogLevel { LOG_NONE = 0, LOG_ERROR = 1, LOG_WARNING = 2, LOG_INFO = 3, LOG_DEBUG = 4 };
 
+enum LogMask {
+  LOG_API       = 0x00000001, //!< API call
+  LOG_CMD       = 0x00000002, //!< Kernel and Copy Commands and Barriers
+  LOG_WAIT      = 0x00000004, //!< Synchronization and waiting for commands to finish
+  LOG_AQL       = 0x00000008, //!< Decode and display AQL packets
+  LOG_QUEUE     = 0x00000010, //!< Queue commands and queue contents
+  LOG_SIG       = 0x00000020, //!< Signal creation, allocation, pool
+  LOG_LOCK      = 0x00000040, //!< Locks and thread-safety code.
+  LOG_KERN      = 0x00000080, //!< kernel creations and arguments, etc.
+  LOG_COPY      = 0x00000100, //!< Copy debug
+  LOG_COPY2     = 0x00000200, //!< Detailed copy debug
+  LOG_RESOURCE  = 0x00000400, //!< Resource allocation, performance-impacting events.
+  LOG_INIT      = 0x00000800, //!< Initialization and shutdown
+  LOG_MISC      = 0x00001000, //!< misc debug, not yet classified
+  LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
+  LOG_CODE      = 0x00004000, //!< Show code creation debug
+  LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
+  LOG_ALWAYS    = 0xFFFFFFFF, //!< Log always even mask flag is zero
+};
+
 //! \cond ignore
 extern "C" void breakpoint();
 //! \endcond
@@ -137,4 +157,35 @@ inline void warning(const char* msg) { amd::report_warning(msg); }
 
 #define DebugInfoGuarantee(cond) LogGuarantee(cond, amd::LOG_INFO, "Warning")
 
+#ifndef NDEBUG
+#define CL_LOG
+#endif
+// You may define CL_LOG to enable following log functions even for release build
+#ifdef CL_LOG
+#define ClPrint(level, mask, format, ...)                                                          \
+  do {                                                                                             \
+    if (LOG_LEVEL >= level) {                                                                      \
+      if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) {                                        \
+        amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__);                         \
+      }                                                                                            \
+    }                                                                                              \
+  } while (false)
+
+#define ClCondPrint(level, mask, condition, format, ...)                                           \
+  do {                                                                                             \
+    if (LOG_LEVEL >= level && (condition)) {                                                       \
+      if (GPU_LOG_MASK & mask || mask == amd::LOG_ALWAYS) {                                        \
+        amd::log_printf(level, __FILE__, __LINE__, format, ##__VA_ARGS__);                         \
+      }                                                                                            \
+    }                                                                                              \
+  } while (false)
+
+#define ClTrace(level, mask) ClPrint(level, mask, "%s", __func__)
+
+#else /*CL_LOG*/
+#define ClPrint(level, mask, format, ...) (void)(0)
+#define ClCondPrint(level, mask, condition, format, ...) (void)(0)
+#define ClTrace(level, mask) (void)(0)
+#endif /*CL_LOG*/
+
 #endif /*DEBUG_HPP_*/
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index cfceb6cc2a..cd5c914e4c 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -10,6 +10,8 @@
                                                                               \
 release(int, LOG_LEVEL, 0,                                                    \
         "The default log level")                                              \
+release(uint, GPU_LOG_MASK, 0,                                                \
+        "The mask to enable specific kinds of logs")                          \
 debug(uint, DEBUG_GPU_FLAGS, 0,                                               \
         "The debug options for GPU device")                                   \
 release(uint, GPU_MAX_COMMAND_QUEUES, 300,                                    \