SWDEV-553126 - Improve logging (#835)

* Ability to mask COPY api usage in logs * Show total graph nodes in logs * Add another log level for detailed debug
2025-09-04 10:08:41 -07:00
commit c4537e8050
@@ -32,7 +32,7 @@ namespace hip {

 // ================================================================================================
 hip::Stream* Device::NullStream(bool wait) {
-  ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
  if (null_stream_ == nullptr) {
    amd::ScopedLock lock(lock_);
    if (null_stream_ == nullptr) {
@@ -188,7 +188,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre

  if (wait_null_stream) {
    if (null_stream_) {
-      ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
      waitForStream(null_stream_);
    }
  } else {
@@ -199,7 +199,8 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
          ((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
          // and it's not the current stream
          (active_stream != blocking_stream)) {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p",
+                active_stream);
        // Get the last valid command
        waitForStream(active_stream);
      }
@@ -399,7 +399,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned
  e->SetCaptureStream(stream);
  if ((stream != nullptr && stream != hipStreamLegacy) &&
      (s->GetCaptureStatus() == hipStreamCaptureStatusActive)) {
-    ClPrint(amd::LOG_INFO, amd::LOG_API,
+    ClPrint(amd::LOG_INFO, amd::LOG_CODE,
            "[hipGraph] Current capture node EventRecord on stream : %p, Event %p", stream, event);
    s->SetCaptureEvent(event);
    std::vector<hip::GraphNode*> lastCapturedNodes = s->GetLastCapturedNodes();
@@ -411,7 +411,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned
          reinterpret_cast<hip::GraphNode* const*>(s->GetLastCapturedNodes().data()),
          s->GetLastCapturedNodes().size(), false);
      if (status != hipSuccess) {
-        ClPrint(amd::LOG_ERROR, amd::LOG_API, "hipEventRecord add external event node failed");
+        ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "hipEventRecord add external event node failed");
        return status;
      }
      s->SetLastCapturedNode(node);
@@ -228,7 +228,7 @@ hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph

 hipError_t capturehipLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim,
                                  dim3& blockDim, void**& args, size_t& sharedMemBytes) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node LaunchKernel on stream : %p", stream);

  if (!hip::isValid(stream)) {
@@ -316,7 +316,7 @@ hipError_t capturehipExtModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f
                                           size_t& sharedMemBytes, void**& kernelParams,
                                           void**& extra, hipEvent_t& startEvent,
                                           hipEvent_t& stopEvent, uint32_t& flags) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node ExtModuleLaunchKernel on stream : %p", stream);
  return ihipExtLaunchKernel(stream, f, globalWorkSizeX / localWorkSizeX,
                             globalWorkSizeY / localWorkSizeY, globalWorkSizeZ / localWorkSizeZ,
@@ -329,7 +329,7 @@ hipError_t capturehipExtModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f
 hipError_t capturehipExtLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim,
                                     dim3& blockDim, void**& args, size_t& sharedMemBytes,
                                     hipEvent_t& startEvent, hipEvent_t& stopEvent, int& flags) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node ExtLaunchKernel on stream : %p", stream);
  return ihipExtLaunchKernel(
      stream, reinterpret_cast<hipFunction_t>(const_cast<void*>(hostFunction)), gridDim.x,
@@ -342,7 +342,7 @@ hipError_t capturehipModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f, u
                                        uint32_t& blockDimY, uint32_t& blockDimZ,
                                        uint32_t& sharedMemBytes, void**& kernelParams,
                                        void**& extra) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node ModuleLaunchKernel on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
@@ -372,7 +372,7 @@ hipError_t capturehipModuleLaunchCooperativeKernel(hipStream_t& stream, hipFunct
                                                   uint32_t& gridDimZ, uint32_t& blockDimX,
                                                   uint32_t& blockDimY, uint32_t& blockDimZ,
                                                   uint32_t& sharedMemBytes, void**& kernelParams) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node ModuleLaunchCooperativeKernel on stream : %p", stream);

  if (!hip::isValid(stream)) {
@@ -403,8 +403,8 @@ hipError_t capturehipModuleLaunchCooperativeKernel(hipStream_t& stream, hipFunct

 hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 blockDim,
                                 dim3 gridDim, unsigned int sharedMemBytes, void** extra) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node LaunchByPtr on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node LaunchByPtr on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -433,7 +433,7 @@ hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 b
 hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f, dim3& gridDim,
                                             dim3& blockDim, void**& kernelParams,
                                             uint32_t& sharedMemBytes) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node LaunchCooperativeKernel on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
@@ -462,8 +462,8 @@ hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f
 }

 hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*& p) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy3D on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memcpy3D on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -488,8 +488,8 @@ hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*&
 hipError_t capturehipMemcpy2DAsync(hipStream_t& stream, void*& dst, size_t& dpitch,
                                   const void*& src, size_t& spitch, size_t& width, size_t& height,
                                   hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy2D on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memcpy2D on stream : %p", stream);
  if (dst == nullptr || src == nullptr) {
    return hipErrorInvalidValue;
  }
@@ -536,7 +536,7 @@ hipError_t capturehipMemcpy2DFromArrayAsync(hipStream_t& stream, void*& dst, siz
                                            hipArray_const_t& src, size_t& wOffsetSrc,
                                            size_t& hOffsetSrc, size_t& width, size_t& height,
                                            hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node Memcpy2DFromArray on stream : %p", stream);

  // Skip zero-sized copies
@@ -577,7 +577,7 @@ hipError_t capturehipMemcpy2DFromArrayAsync(hipStream_t& stream, void*& dst, siz
 hipError_t capturehipMemcpy2DToArrayAsync(hipStream_t& stream, hipArray_t& dst, size_t& wOffset,
                                          size_t& hOffset, const void*& src, size_t& spitch,
                                          size_t& width, size_t& height, hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node Memcpy2DFromArray on stream : %p", stream);

  // Skip zero-sized copies
@@ -616,7 +616,7 @@ hipError_t capturehipMemcpy2DToArrayAsync(hipStream_t& stream, hipArray_t& dst,
 }

 hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D*& pCopy) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
@@ -691,7 +691,7 @@ hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D*

 hipError_t capturehipMemcpyAtoHAsync(hipStream_t& stream, void*& dstHost, hipArray_t& srcArray,
                                     size_t& srcOffset, size_t& ByteCount) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream);
  if (srcArray == nullptr || dstHost == nullptr) {
    return hipErrorInvalidValue;
@@ -720,7 +720,7 @@ hipError_t capturehipMemcpyAtoHAsync(hipStream_t& stream, void*& dstHost, hipArr

 hipError_t capturehipMemcpyHtoAAsync(hipStream_t& stream, hipArray_t& dstArray, size_t& dstOffset,
                                     const void*& srcHost, size_t& ByteCount) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream);
  if (dstArray == nullptr || srcHost == nullptr) {
    return hipErrorInvalidValue;
@@ -748,6 +748,8 @@ hipError_t capturehipMemcpyHtoAAsync(hipStream_t& stream, hipArray_t& dstArray,

 hipError_t capturehipMemcpy(hipStream_t stream, void* dst, const void* src, size_t sizeBytes,
                            hipMemcpyKind kind) {
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memcpy on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -770,8 +772,8 @@ hipError_t capturehipMemcpy(hipStream_t stream, void* dst, const void* src, size

 hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& src,
                                 size_t& sizeBytes, hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy1D on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memcpy1D on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -779,9 +781,10 @@ hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& s
 }

 hipError_t capturehipMemcpyHtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDevice,
-                                     const void*& srcHost, size_t& ByteCount, hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node MemcpyHtoD on stream : %p",
-          stream);
+                                     const void*& srcHost, size_t& ByteCount,
+                                     hipMemcpyKind& kind) {
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node MemcpyHtoD on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -791,18 +794,19 @@ hipError_t capturehipMemcpyHtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDev
 hipError_t capturehipMemcpyDtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDevice,
                                     hipDeviceptr_t& srcDevice, size_t& ByteCount,
                                     hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
-          "[hipGraph] Current capture node hipMemcpyDtoD on stream : %p", stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node MemcpyDtoD on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
  return capturehipMemcpy(stream, dstDevice, srcDevice, ByteCount, kind);
 }

-hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, hipDeviceptr_t& srcDevice,
-                                     size_t& ByteCount, hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
-          "[hipGraph] Current capture node hipMemcpyDtoH on stream : %p", stream);
+hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost,
+                                     hipDeviceptr_t& srcDevice, size_t& ByteCount,
+                                     hipMemcpyKind& kind) {
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node MemcpyDtoH on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -810,8 +814,9 @@ hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, hipDev
 }

 hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, const void*& symbol,
-                                           size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+                                           size_t& sizeBytes, size_t& offset,
+                                           hipMemcpyKind& kind) {
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node MemcpyFromSymbolNode on stream : %p", stream);

  if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice &&
@@ -847,7 +852,7 @@ hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, cons

 hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbol, const void*& src,
                                         size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
          "[hipGraph] Current capture node MemcpyToSymbolNode on stream : %p", stream);

  if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice &&
@@ -882,8 +887,8 @@ hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbo

 hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, size_t& valueSize,
                                 size_t& sizeBytes) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset1D on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memset1D on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -908,8 +913,8 @@ hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, si

 hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitch, int& value,
                                   size_t& width, size_t& height) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset2D on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memset2D on stream : %p", stream);
  hipMemsetParams memsetParams = {0};
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
@@ -934,8 +939,8 @@ hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitc

 hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDevPtr, int& value,
                                   hipExtent& extent) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset3D on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node Memset3D on stream : %p", stream);
  if (!hip::isValid(stream)) {
    return hipErrorContextIsDestroyed;
  }
@@ -966,8 +971,8 @@ hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDe
 }

 hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*& userData) {
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node host on stream : %p",
-          stream);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node HostFunction launch on stream : %p", stream);
  if (fn == nullptr) {
    return hipErrorInvalidValue;
  }
@@ -992,6 +997,8 @@ hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*&
 // ================================================================================================
 hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size,
                                 void** dev_ptr) {
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node MallocAsync on stream : %p", stream);
  auto s = reinterpret_cast<hip::Stream*>(stream);
  auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);

@@ -1029,6 +1036,8 @@ hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size

 // ================================================================================================
 hipError_t capturehipFreeAsync(hipStream_t stream, void* dev_ptr) {
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
+          "[hipGraph] Current capture node FreeAsync on stream : %p", stream);
  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
  auto mem_free_node = new hip::GraphMemFreeNode(dev_ptr);
  auto status =
@@ -122,7 +122,7 @@ bool Graph::isGraphValid(Graph* pGraph) {
 // ================================================================================================
 void Graph::AddNode(const Node& node) {
  vertices_.emplace_back(node);
-  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)",
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "[hipGraph] Add %s(%p)",
          GetGraphNodeTypeString(node->GetType()), node);
  node->SetParentGraph(this);
 }
@@ -140,7 +140,7 @@ std::vector<Node> Graph::GetRootNodes() const {
  for (auto entry : vertices_) {
    if (entry->GetInDegree() == 0) {
      roots.push_back(entry);
-      ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)",
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)",
              GetGraphNodeTypeString(entry->GetType()), entry);
    }
  }
@@ -703,6 +703,11 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) {
    repeatLaunch_ = true;
  }

+  ClPrint(amd::LOG_DEBUG, amd::LOG_CODE,
+          "GraphExec::Run max_streams: %d, "
+          "on device: %d, total number of nodes: %d",
+          max_streams_, launch_stream->DeviceId(), topoOrder_.size());
+
  if (max_streams_ == 1 && instantiateDeviceId_ == launch_stream->DeviceId()) {
    if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
      // If the graph has kernels that does device side allocation,  during packet capture, heap is
@@ -2327,7 +2327,7 @@ class GraphMemAllocNode final : public GraphNode {
                                     amd::Device::VmmAccess::kReadWrite);
      va_->retain();
      graph_->IncrementMemAllocNodeCount();  // Increment count of unreleased mem alloc nodes
-      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p",
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p",
              vaddr_sub_obj->getSvmPtr(),
              reinterpret_cast<char*>(vaddr_sub_obj->getSvmPtr()) + aligned_size, memory());
    }
@@ -2390,7 +2390,7 @@ class GraphMemAllocNode final : public GraphNode {
          // be executed again
          amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
        }
-        ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr);
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr);
      }
    }
    return error;
@@ -2405,7 +2405,7 @@ class GraphMemAllocNode final : public GraphNode {
        va_ = amd::MemObjMap::FindVirtualMemObj(node_params_.dptr);
        amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
      }
-      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr);
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr);
    }
    return node_params_.dptr;
  }
@@ -2470,7 +2470,7 @@ class GraphMemFreeNode : public GraphNode {
      }
      amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
      graph_->DecrementMemAllocNodeCount();  // Decrement count of unreleased memalloc nodes
-      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(),
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(),
              vaddr_sub_obj);
    }

@@ -2500,7 +2500,7 @@ class GraphMemFreeNode : public GraphNode {
            graph, stream->DeviceId(), *stream, amd::Command::EventWaitList{}, device_ptr_,
            amd::alignUp(va->getSize(), dev_info.virtualMemAllocGranularity_), nullptr);
        commands_.push_back(cmd);
-        ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_);
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_);
      }
    }
    return error;
@@ -476,7 +476,7 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig

  hip::Stream* eventStream = reinterpret_cast<hip::Stream*>(eventStreamHandle);
  if (eventStream != nullptr && eventStream->IsEventCaptured(event) == true) {
-    ClPrint(amd::LOG_INFO, amd::LOG_API,
+    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
            "[hipGraph] Current capture node StreamWaitEvent on stream : %p, Event %p", stream,
            event);
    if (waitStream == nullptr) {
@@ -39,7 +39,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
    LogError("Couldn't map device memory for host read");
    return false;
  }
-  ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy D2H, src=%p, dst=%p, size=%zu",
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Using host memcpy D2H, src=%p, dst=%p, size=%zu",
          (reinterpret_cast<const_address>(src) + origin[0]), dstHost, size[0]);
  // Copy memory
  std::memcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);
@@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory
    return false;
  }

-  ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy H2D, src=%p, dst=%p, size=%zu",
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Using host memcpy H2D, src=%p, dst=%p, size=%zu",
        srcHost, (reinterpret_cast<address>(dst) + origin[0]), size[0]);
  // Copy memory
  std::memcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);
@@ -294,7 +294,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM
    LogError("Couldn't map destination memory");
    return false;
  }
-  ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy for copyBuffer, src=%p, dst=%p, size=%zu",
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY,
+          "Using host memcpy for copyBuffer, src=%p, dst=%p, size=%zu",
          (reinterpret_cast<const_address>(src) + srcOrigin[0]),
          (reinterpret_cast<address>(dst) + dstOrigin[0]), size[0]);
  // Straight forward buffer copy
@@ -328,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
    return false;
  }

-  ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy for copyBufferRect, src=%p, "
-          "dst=%p, size=%zu",
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY,
+          "Using host memcpy for copyBufferRect, src=%p, dst=%p, size=%zu",
          (reinterpret_cast<const_address>(src) + srcRect.offset(0, 0, 0)),
          (reinterpret_cast<address>(dst) + dstRect.offset(0, 0, 0)), size[0]);

@@ -1049,7 +1049,7 @@ static void dumpCodeObject(const std::string& image) {
  char fname[30];
  static std::atomic<int> index;
  sprintf(fname, "_code_object%04d.o", index++);
-  ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Code object saved in %s\n", fname);
  std::ofstream ofs;
  ofs.open(fname, std::ios::binary);
  ofs << image;
@@ -2539,7 +2539,7 @@ bool Program::createKernelMetadataMap(void* binary, size_t binSize) {

  status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD);
  if (status == AMD_COMGR_STATUS_SUCCESS) {
-    ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2.");
+    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V2.");
    hasKernelMD = true;
    codeObjectVer_ = 2;
  } else {
@@ -2591,13 +2591,13 @@ bool Program::createKernelMetadataMap(void* binary, size_t binSize) {

    if (major_version == '1') {
      if (minor_version == '0') {
-        ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3.");
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V3.");
        codeObjectVer_ = 3;
      } else if (minor_version == '1') {
-        ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V4.");
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V4.");
        codeObjectVer_ = 4;
      } else if (minor_version == '2') {
-        ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V5.");
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V5.");
        codeObjectVer_ = 5;
      } else {
        ClPrint(amd::LOG_ERROR, amd::LOG_CODE,
@@ -310,7 +310,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
      hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

      // Copy memory line by line
-      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
              "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, "
              "completion_signal=0x%zx",
              dstMem.base, srcMem.base, (wait_events.size() != 0) ? wait_events[0].handle : 0,
@@ -335,7 +335,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
          size_t dstOffset = dstRect.offset(0, y, z);

          // Copy memory line by line
-          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
                  "HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx",
                  (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
          hsa_status_t status = hsa_amd_memory_async_copy(
@@ -509,7 +509,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c

  if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
    copyMask = gpu().getLastUsedSdmaEngine();
-    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
+    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
    copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
    if (copyMask == 0) {
      // Check SDMA engine status
@@ -539,7 +539,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
      // Copy on the first available free engine if ROCr returns a valid mask
      hsa_amd_sdma_engine_id_t copyEngine = static_cast<hsa_amd_sdma_engine_id_t>(copyMask);

-      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
              "HSA Copy copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
              "size=%ld, forceSDMA=%d, engineType=%d, wait_event=0x%zx, completion_signal=0x%zx",
              copyEngine, dst, src, size, forceSDMA, engine,
@@ -554,7 +554,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
  }

  if (engine == HwQueueEngine::Unknown || kUseRegularCopyApi) {
-    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
            "HSA Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, "
            "completion_signal=0x%zx, engineType=%d",
            dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle,
@@ -656,7 +656,8 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, bool enablePi
    if (pinnedMem != nullptr) {
      Memory* pinnedMemory = dev().getRocMemory(pinnedMem);
      address pinBuffer = pinnedMemory->getDeviceMemory();
-      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d", xferSize);
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d",
+              xferSize);
      buffState.copySize_ = xferSize;
      buffState.buffer_ = pinBuffer + partial1 + partial2;
      buffState.pinnedMem_ = pinnedMem;
@@ -666,7 +667,8 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, bool enablePi
  }
  // If Memory Pinning fails, failback to staging buffer
  xferSize = std::min(xferSize, StagingXferSize);
-  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d", xferSize);
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d",
+          xferSize);
  buffState.copySize_ = xferSize;
  buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
 }
@@ -711,7 +713,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
    }
    if (hostToDev) {                          // H2D Path
      if (outBuffer.pinnedMem_ == nullptr) {  // Copy to Staging Buffer
-        ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
                stagingBuffer, hostSrc + copyOffset, copysize);
        memcpy(stagingBuffer, hostSrc + copyOffset, copysize);
      }
@@ -731,7 +733,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
        if (outBuffer.pinnedMem_ == nullptr) {
          // Wait for current signal of previous rocr copy if its not pinned mem
          gpu().Barriers().WaitCurrent();
-          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
+          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
                  hostDst + copyOffset, stagingBuffer, copysize);
          memcpy(hostDst + copyOffset, stagingBuffer, copysize);
        }
@@ -1754,7 +1756,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
        // Wait for current signal of previous blit copy if its not pinned mem
        if (outBuffer.pinnedMem_ == nullptr) {
          gpu().Barriers().WaitCurrent();
-          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
+          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
                  (void*)(dstAddr + stagedCopyOffset), stagingBuffer, copySize);
          memcpy(dstAddr + stagedCopyOffset, stagingBuffer, copySize);
        }
@@ -1875,7 +1877,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
        copySize = outBuffer.copySize_;
        address currentDstAddr = dstAddr + stagedCopyOffset;
        if (outBuffer.pinnedMem_ == nullptr) {
-          ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
+          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
                  stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
          memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
        }
@@ -234,12 +234,13 @@ Device::~Device() {
      hsa_queue_t* queue = qIter->first;
      auto& qInfo = qIter->second;
      if (qInfo.hostcallBuffer_) {
-        ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hostcall buffer %p for hardware queue %p",
-                qInfo.hostcallBuffer_, qIter->first->base_address);
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE,
+                "Deleting hostcall buffer %p for hardware queue %p", qInfo.hostcallBuffer_,
+                qIter->first->base_address);
        amd::disableHostcalls(qInfo.hostcallBuffer_);
        context().svmFree(qInfo.hostcallBuffer_);
      }
-      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0",
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0",
              queue->base_address);
      qIter = it.erase(qIter);
      hsa_queue_destroy(queue);
@@ -1987,11 +1988,11 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
      break;
    case kUncachedAtomics:
      if (agentInfo->ext_fine_grain_pool.handle != 0) {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM,
                "Using extended fine grained access system memory pool");
        segment = agentInfo->ext_fine_grain_pool;
      } else {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
+        ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM,
                "Falling through on fine grained access system memory pool");
        segment = agentInfo->fine_grain_pool;
      }
@@ -2065,7 +2066,7 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
    LogPrintfError("get_mempolicy failed with error %ld", res);
    return ptr;
  }
-  ClPrint(amd::LOG_INFO, amd::LOG_RESOURCE,
+  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_RESOURCE,
          "get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu", mode,
          *nodeMask->maskp, cpuCount);

@@ -2805,7 +2806,7 @@ bool Device::IsHwEventReady(const amd::Event& event, bool wait, amd::SyncPolicy
  void* hw_event =
      (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
  if (hw_event == nullptr) {
-    ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
+    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_SIG, "No HW event");
    return false;
  } else if (wait) {
    // hipEventBlockingSync
@@ -122,7 +122,7 @@ void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& reg
  if (indirectMapCount_ == 1) {
    if (!allocateMapMemory(owner()->getSize())) {
      decIndMapCount();
-      DevLogPrintfError("Cannot allocate Map memory for size: %u \n", owner()->getSize());
+      DevLogPrintfError("Cannot allocate Map memory for size: %u", owner()->getSize());
      return nullptr;
    }
  } else {
@@ -180,7 +180,7 @@ void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, u
  if (!isHostMemDirectAccess() && !IsPersistentDirectMap()) {
    if (!vDev.blitMgr().readBuffer(*this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) {
      decIndMapCount();
-      DevLogError("Cannot read buffer \n");
+      DevLogError("Cannot read buffer");
      return nullptr;
    }
  }
@@ -192,7 +192,7 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
  if (!isHostMemDirectAccess() && !IsPersistentDirectMap()) {
    if (!vDev.blitMgr().writeBuffer(mapMemory_->getHostMem(), *this, amd::Coord3D(0),
                                    amd::Coord3D(size()), true)) {
-      LogError("[OCL] Fail sync the device memory on cpuUnmap");
+      LogError("Fail sync the device memory on cpuUnmap");
    }
    // Wait on CPU for the transfer
    static_cast<roc::VirtualGPU&>(vDev).releaseGpuMemoryFence();
@@ -624,7 +624,7 @@ Buffer::~Buffer() {
      // Detach the memory from HSA
      auto hsa_status = hsa_amd_ipc_memory_detach(owner()->getSvmPtr());
      if (hsa_status != HSA_STATUS_SUCCESS) {
-        LogPrintfError("HSA failed to detach memory with status: %d \n", hsa_status);
+        LogPrintfError("HSA failed to detach memory with status: %d", hsa_status);
      }
    }
  }
@@ -666,8 +666,8 @@ void Buffer::destroy() {
          }
        } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
          if (HSA_STATUS_SUCCESS != hsa_signal_destroy(signal_)) {
-            ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                    "[ROCClr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal destroy failed \n");
+            ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
+                    "hsa_signal_destroy failed");
          }
          deviceMemory_ = nullptr;
        } else {
@@ -684,7 +684,7 @@ void Buffer::destroy() {
        }
        // destroy system memory
        if (!(amd::Os::releaseMemory(deviceMemory_, size()))) {
-          ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n");
+          ClPrint(amd::LOG_ERROR, amd::LOG_MEM, "munmap failed");
        }
      }
    }
@@ -764,7 +764,7 @@ bool Buffer::create(bool alloc_local) {
            reinterpret_cast<const amd::IpcBuffer*>(owner())->Handle()),
        owner()->getSize(), ipc_agents_num, dev().IpcAgents(), &orig_dev_ptr);
    if (hsa_status != HSA_STATUS_SUCCESS) {
-      LogPrintfError("HSA failed to attach IPC memory with status: %d \n", hsa_status);
+      LogPrintfError("HSA failed to attach IPC memory with status: %d", hsa_status);
      return false;
    }
    owner()->setSvmPtr(orig_dev_ptr);
@@ -779,7 +779,7 @@ bool Buffer::create(bool alloc_local) {
      // if interprocess flag is set, then the memory is importable.
      if (!dev().ImportShareableHSAHandle(owner()->getSvmPtr(),
                                          &owner()->getUserData().hsa_handle)) {
-        LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x \n",
+        LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x",
                       owner()->getSvmPtr());
        return false;
      }
@@ -835,13 +835,13 @@ bool Buffer::create(bool alloc_local) {
          if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr,
                                                          HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal_)) {
            ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
-                    "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
+                    "hsa_amd_signal_create signal failed");
            return false;
          }
          volatile hsa_signal_value_t* signalValuePtr = nullptr;
          if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) {
            ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
-                    "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed");
+                    "hsa_amd_signal_value_pointer failed");
            return false;
          }

@@ -1043,7 +1043,7 @@ bool Buffer::ExportHandle(void* handle) const {
  auto hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, owner()->getSize(),
                                              reinterpret_cast<hsa_amd_ipc_memory_t*>(handle));
  if (hsa_status != HSA_STATUS_SUCCESS) {
-    LogPrintfError("Failed to create memory for IPC, failed with hsa_status: %d \n", hsa_status);
+    LogPrintfError("Failed to create memory for IPC, failed with hsa_status: %d", hsa_status);
    return false;
  }
  return true;
@@ -1304,7 +1304,7 @@ bool Image::create(bool alloc_local) {
                                                    permission_, &deviceImageInfo_);

  if (status != HSA_STATUS_SUCCESS) {
-    LogPrintfError("[OCL] Fail to allocate image memory, failed with hsa_status: %d \n", status);
+    LogPrintfError("Fail to allocate image memory, failed with hsa_status: %d", status);
    return false;
  }

@@ -773,7 +773,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
        mem = memories[index];
        const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
        if (mem == nullptr) {
-          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(),
+          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(),
                  desc.name_.c_str(), globalAddress);
          //! This condition is for SVM fine-grain
          if (dev().isFineGrainedSystem(true)) {
@@ -787,7 +787,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
          gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));

          const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
-          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i,
+          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i,
                  desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
                  gpuMem->getDeviceMemory(),
                  reinterpret_cast<address>(gpuMem->getDeviceMemory()) + mem->getSize());
@@ -874,10 +874,10 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
          if (desc.size_ > kMaxBytes) {
            bytes += "...";
          }
-          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i,
+          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i,
                  desc.typeName_.c_str(), desc.name_.c_str(), bytes.c_str(), desc.size_);
        } else {
-          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
+          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
                  desc.typeName_.c_str(), desc.name_.c_str(),
                  (desc.size_ == 1)   ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
                  : (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
@@ -1621,7 +1621,8 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
  } else {
    // Reset the signal for the barrier packet
    hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
-    ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Issue barrier to flush chunk %d", active_chunk_);
+    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN, "Issue barrier to flush chunk %d",
+            active_chunk_);
    // Currently don't skip wait signal check, because SDMA engine cna be used in staging copy
    constexpr bool kSkipSignal = false;
    // Dispatch a barrier packet into the queue
@@ -3489,7 +3490,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
                                             gpuKernel.KernargSegmentAlignment());
      command_->SetKernelName(gpuKernel.getDemangledName().c_str());
    } else {
-      ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN,
              "KernargSegmentByteSize = %lu "
              "KernargSegmentAlignment = %lu",
              gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment());
@@ -3544,11 +3545,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    // Validate privateMemSize is more than max allowed.
    size_t maxStackSize = dev().MaxStackSize();
    if (dispatchPacket.private_segment_size > maxStackSize) {
-      ClPrint(amd::LOG_INFO, amd::LOG_KERN,
+      ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
              "Scratch size (%u) exceeds max allowed (%zu) for kernel : %s",
              dispatchPacket.private_segment_size, maxStackSize,
              gpuKernel.getDemangledName().c_str());
-      LogError("Scratch size exceeds max allowed.");
      return false;
    }
  }
@@ -65,7 +65,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi
      if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                                    kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) {
        if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
-          ClPrint(amd::LOG_INFO, amd::LOG_SIG,
+          ClPrint(amd::LOG_ERROR, amd::LOG_SIG,
                  "Device not Stable, while waiting for Signal ="
                  "(0x%lx) for %d ns",
                  signal.handle, kTimeout100us);
@@ -79,7 +79,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi
    while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                                     kTimeout4Secs, wait_state) != 0) {
      if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
-        ClPrint(amd::LOG_INFO, amd::LOG_SIG,
+        ClPrint(amd::LOG_ERROR, amd::LOG_SIG,
                "Device not Stable, while waiting for Signal ="
                "(0x%lx) for %d ns",
                signal.handle, kTimeout4Secs);
@@ -160,12 +160,12 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
    }

    if (profilingInfo().enabled_) {
-      ClPrint(LOG_DEBUG, LOG_CMD, "Command %p complete (Wall: %ld, CPU: %ld, GPU: %ld us)",
+      ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command %p complete (Wall: %ld, CPU: %ld, GPU: %ld us)",
              &command(), ((profilingInfo().end_ - epoch) / 1000),
              ((profilingInfo().submitted_ - profilingInfo().queued_) / 1000),
              ((profilingInfo().end_ - profilingInfo().start_) / 1000));
    } else {
-      ClPrint(LOG_DEBUG, LOG_CMD, "Command %p complete", &command());
+      ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command %p complete", &command());
    }
    release();
  }
@@ -177,7 +177,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
 bool Event::resetStatus(int32_t status) {
  int32_t currentStatus = this->status();
  if (currentStatus != CL_COMPLETE) {
-    ClPrint(LOG_ERROR, LOG_CMD, "command is reset before complete current status :%d",
+    ClPrint(LOG_ERROR, LOG_CMD, "Command is reset before complete current status :%d",
            currentStatus);
  }
  if (!status_.compare_exchange_strong(currentStatus, status, std::memory_order_relaxed)) {
@@ -191,7 +191,7 @@ bool Event::resetStatus(int32_t status) {
 // ================================================================================================
 bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* data,
                        bool blocking) {
-  assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status");
+  assert(status >= CL_COMPLETE && status <= CL_QUEUED && "Invalid status");

  CallBackEntry* entry = new CallBackEntry(status, callback, data, blocking);
  if (entry == NULL) {
@@ -240,8 +240,8 @@ bool Event::awaitCompletion() {
      return false;
    }

-    ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d", this,
-            status());
+    ClPrint(LOG_DETAIL_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
+            this, status());
    auto* queue = command().queue();
    if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
      while (status() > CL_COMPLETE) {
@@ -255,7 +255,7 @@ bool Event::awaitCompletion() {
        lock_.wait();
      }
    }
-    ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
+    ClPrint(LOG_DETAIL_DEBUG, LOG_WAIT, "Event %p wait completed", this);
  }

  return status() == CL_COMPLETE;
@@ -353,7 +353,7 @@ void Command::enqueue() {
    Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
  }

-  ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p",
+  ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p",
          amd::activity_prof::getOclCommandKindString(this->type()), this, queue_);

  // Direct dispatch logic below will submit the command immediately, but the command status
@@ -75,7 +75,7 @@ bool HostQueue::terminate() {
        if (GetSubmissionBatch() != nullptr) {
          auto command = new Marker(*this, false);
          if (command != nullptr) {
-            ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish");
+            ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Marker queued to ensure finish");
            command->enqueue();
            lastCommand->release();
            lastCommand = command;
@@ -145,7 +145,7 @@ void HostQueue::finishCommand(Command* command) {
  if (command == nullptr) {
    command = getLastQueuedCommand(true);
    if (command != nullptr) {
-      ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
+      ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
      command->awaitCompletion();
      command->release();
    }
@@ -154,7 +154,7 @@ void HostQueue::finishCommand(Command* command) {
  // Check hardware event status for the specific command
  static constexpr bool kWaitCompletion = true;
  if (!device().IsHwEventReady(command->event(), kWaitCompletion)) {
-    ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
+    ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
    command->awaitCompletion();
  }
 }
@@ -181,7 +181,7 @@ void HostQueue::finish(bool cpu_wait) {
  }

  size_t batchSize = GetSubmissionBatchSize();
-  ClPrint(LOG_DEBUG, LOG_CMD,
+  ClPrint(LOG_DETAIL_DEBUG, LOG_CMD,
          "finish() called with batch size: %zu, cpu_wait: %d, "
          "fence dirty: %d",
          batchSize, cpu_wait, vdev()->isFenceDirty());
@@ -203,7 +203,7 @@ void HostQueue::finish(bool cpu_wait) {
  // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
  static constexpr bool kWaitCompletion = true;
  if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion, GetSyncPolicy())) {
-    ClPrint(LOG_DEBUG, LOG_CMD,
+    ClPrint(LOG_DETAIL_DEBUG, LOG_CMD,
            "No HW event or batch size is less than %zu, "
            "await command completion",
            minBatchSize);
@@ -258,14 +258,14 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
    // Process the command's event wait list.
    const Command::EventWaitList& events = command->eventWaitList();
    bool dependencyFailed = false;
-    ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) processing: %p ,events.size(): %d",
+    ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) processing: %p ,events.size(): %d",
            amd::activity_prof::getOclCommandKindString(command->type()), command, events.size());
    for (const auto& it : events) {
      // Only wait if the command is enqueued into another queue.
      if (it->command().queue() != this) {
        // Runtime has to flush the current batch only if the dependent wait is blocking
        if (it->command().status() != CL_COMPLETE) {
-          ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) %p awaiting event: %p",
+          ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) %p awaiting event: %p",
                  amd::activity_prof::getOclCommandKindString(command->type()), command, it);
          virtualDevice->flush(head, true);
          tail = head = NULL;
@@ -287,7 +287,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
      continue;
    }

-    ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p",
+    ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) submitted: %p",
            amd::activity_prof::getOclCommandKindString(command->type()), command);

    command->setStatus(CL_SUBMITTED);
@@ -80,7 +80,8 @@ bool Runtime::init() {
    ClPrint(LOG_ERROR, LOG_INIT, "Runtime initialization failed");
    return false;
  }
-  ClPrint(LOG_INFO, LOG_MISC, "ROCclr version: %s", ROCCLR_VERSION_GITHASH);
+
+  ClPrint(LOG_INFO, LOG_MISC && !amd::IS_HIP, "ROCclr version: %s", ROCCLR_VERSION_GITHASH);

  initialized_ = true;
  pid_ = amd::Os::getProcessId();
@@ -41,7 +41,8 @@ enum LogLevel {
  LOG_WARNING = 2,
  LOG_INFO = 3,
  LOG_DEBUG = 4,
-  LOG_EXTRA_DEBUG = 5
+  LOG_DETAIL_DEBUG = 5,
+  LOG_EXTRA_DEBUG = 6
 };

 enum LogMask {