diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index b82b787167..03b7d7938b 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -32,7 +32,7 @@ namespace hip { // ================================================================================================ hip::Stream* Device::NullStream(bool wait) { - ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait); if (null_stream_ == nullptr) { amd::ScopedLock lock(lock_); if (null_stream_ == nullptr) { @@ -188,7 +188,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre if (wait_null_stream) { if (null_stream_) { - ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_); waitForStream(null_stream_); } } else { @@ -199,7 +199,8 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre ((active_stream->Flags() & hipStreamNonBlocking) == 0) && // and it's not the current stream (active_stream != blocking_stream)) { - ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", + active_stream); // Get the last valid command waitForStream(active_stream); } diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index f4f8c10416..8b533712c8 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -399,7 +399,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned e->SetCaptureStream(stream); if ((stream != nullptr && stream != hipStreamLegacy) && (s->GetCaptureStatus() == hipStreamCaptureStatusActive)) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Current capture node EventRecord on stream : %p, Event %p", stream, event); s->SetCaptureEvent(event); std::vector lastCapturedNodes = s->GetLastCapturedNodes(); @@ -411,7 +411,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned reinterpret_cast(s->GetLastCapturedNodes().data()), s->GetLastCapturedNodes().size(), false); if (status != hipSuccess) { - ClPrint(amd::LOG_ERROR, amd::LOG_API, "hipEventRecord add external event node failed"); + ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "hipEventRecord add external event node failed"); return status; } s->SetLastCapturedNode(node); diff --git a/projects/clr/hipamd/src/hip_graph.cpp b/projects/clr/hipamd/src/hip_graph.cpp index a82fc8da0a..0b3ee071ea 100644 --- a/projects/clr/hipamd/src/hip_graph.cpp +++ b/projects/clr/hipamd/src/hip_graph.cpp @@ -228,7 +228,7 @@ hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph hipError_t capturehipLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim, dim3& blockDim, void**& args, size_t& sharedMemBytes) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node LaunchKernel on stream : %p", stream); if (!hip::isValid(stream)) { @@ -316,7 +316,7 @@ hipError_t capturehipExtModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f size_t& sharedMemBytes, void**& kernelParams, void**& extra, hipEvent_t& startEvent, hipEvent_t& stopEvent, uint32_t& flags) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node ExtModuleLaunchKernel on stream : %p", stream); return ihipExtLaunchKernel(stream, f, globalWorkSizeX / localWorkSizeX, globalWorkSizeY / localWorkSizeY, globalWorkSizeZ / localWorkSizeZ, @@ -329,7 +329,7 @@ hipError_t capturehipExtModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f hipError_t capturehipExtLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim, dim3& blockDim, void**& args, size_t& sharedMemBytes, hipEvent_t& startEvent, hipEvent_t& stopEvent, int& flags) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node ExtLaunchKernel on stream : %p", stream); return ihipExtLaunchKernel( stream, reinterpret_cast(const_cast(hostFunction)), gridDim.x, @@ -342,7 +342,7 @@ hipError_t capturehipModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f, u uint32_t& blockDimY, uint32_t& blockDimZ, uint32_t& sharedMemBytes, void**& kernelParams, void**& extra) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node ModuleLaunchKernel on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; @@ -372,7 +372,7 @@ hipError_t capturehipModuleLaunchCooperativeKernel(hipStream_t& stream, hipFunct uint32_t& gridDimZ, uint32_t& blockDimX, uint32_t& blockDimY, uint32_t& blockDimZ, uint32_t& sharedMemBytes, void**& kernelParams) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node ModuleLaunchCooperativeKernel on stream : %p", stream); if (!hip::isValid(stream)) { @@ -403,8 +403,8 @@ hipError_t capturehipModuleLaunchCooperativeKernel(hipStream_t& stream, hipFunct hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 blockDim, dim3 gridDim, unsigned int sharedMemBytes, void** extra) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node LaunchByPtr on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node LaunchByPtr on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -433,7 +433,7 @@ hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 b hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f, dim3& gridDim, dim3& blockDim, void**& kernelParams, uint32_t& sharedMemBytes) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node LaunchCooperativeKernel on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; @@ -462,8 +462,8 @@ hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f } hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*& p) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy3D on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memcpy3D on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -488,8 +488,8 @@ hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*& hipError_t capturehipMemcpy2DAsync(hipStream_t& stream, void*& dst, size_t& dpitch, const void*& src, size_t& spitch, size_t& width, size_t& height, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy2D on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memcpy2D on stream : %p", stream); if (dst == nullptr || src == nullptr) { return hipErrorInvalidValue; } @@ -536,7 +536,7 @@ hipError_t capturehipMemcpy2DFromArrayAsync(hipStream_t& stream, void*& dst, siz hipArray_const_t& src, size_t& wOffsetSrc, size_t& hOffsetSrc, size_t& width, size_t& height, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node Memcpy2DFromArray on stream : %p", stream); // Skip zero-sized copies @@ -577,7 +577,7 @@ hipError_t capturehipMemcpy2DFromArrayAsync(hipStream_t& stream, void*& dst, siz hipError_t capturehipMemcpy2DToArrayAsync(hipStream_t& stream, hipArray_t& dst, size_t& wOffset, size_t& hOffset, const void*& src, size_t& spitch, size_t& width, size_t& height, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node Memcpy2DFromArray on stream : %p", stream); // Skip zero-sized copies @@ -616,7 +616,7 @@ hipError_t capturehipMemcpy2DToArrayAsync(hipStream_t& stream, hipArray_t& dst, } hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D*& pCopy) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; @@ -691,7 +691,7 @@ hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D* hipError_t capturehipMemcpyAtoHAsync(hipStream_t& stream, void*& dstHost, hipArray_t& srcArray, size_t& srcOffset, size_t& ByteCount) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream); if (srcArray == nullptr || dstHost == nullptr) { return hipErrorInvalidValue; @@ -720,7 +720,7 @@ hipError_t capturehipMemcpyAtoHAsync(hipStream_t& stream, void*& dstHost, hipArr hipError_t capturehipMemcpyHtoAAsync(hipStream_t& stream, hipArray_t& dstArray, size_t& dstOffset, const void*& srcHost, size_t& ByteCount) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream); if (dstArray == nullptr || srcHost == nullptr) { return hipErrorInvalidValue; @@ -748,6 +748,8 @@ hipError_t capturehipMemcpyHtoAAsync(hipStream_t& stream, hipArray_t& dstArray, hipError_t capturehipMemcpy(hipStream_t stream, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memcpy on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -770,8 +772,8 @@ hipError_t capturehipMemcpy(hipStream_t stream, void* dst, const void* src, size hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& src, size_t& sizeBytes, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy1D on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memcpy1D on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -779,9 +781,10 @@ hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& s } hipError_t capturehipMemcpyHtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDevice, - const void*& srcHost, size_t& ByteCount, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node MemcpyHtoD on stream : %p", - stream); + const void*& srcHost, size_t& ByteCount, + hipMemcpyKind& kind) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node MemcpyHtoD on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -791,18 +794,19 @@ hipError_t capturehipMemcpyHtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDev hipError_t capturehipMemcpyDtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDevice, hipDeviceptr_t& srcDevice, size_t& ByteCount, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, - "[hipGraph] Current capture node hipMemcpyDtoD on stream : %p", stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node MemcpyDtoD on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } return capturehipMemcpy(stream, dstDevice, srcDevice, ByteCount, kind); } -hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, hipDeviceptr_t& srcDevice, - size_t& ByteCount, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, - "[hipGraph] Current capture node hipMemcpyDtoH on stream : %p", stream); +hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, + hipDeviceptr_t& srcDevice, size_t& ByteCount, + hipMemcpyKind& kind) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node MemcpyDtoH on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -810,8 +814,9 @@ hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, hipDev } hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, const void*& symbol, - size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + size_t& sizeBytes, size_t& offset, + hipMemcpyKind& kind) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node MemcpyFromSymbolNode on stream : %p", stream); if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && @@ -847,7 +852,7 @@ hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, cons hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbol, const void*& src, size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node MemcpyToSymbolNode on stream : %p", stream); if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && @@ -882,8 +887,8 @@ hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbo hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, size_t& valueSize, size_t& sizeBytes) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset1D on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memset1D on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -908,8 +913,8 @@ hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, si hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitch, int& value, size_t& width, size_t& height) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset2D on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memset2D on stream : %p", stream); hipMemsetParams memsetParams = {0}; if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; @@ -934,8 +939,8 @@ hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitc hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDevPtr, int& value, hipExtent& extent) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset3D on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node Memset3D on stream : %p", stream); if (!hip::isValid(stream)) { return hipErrorContextIsDestroyed; } @@ -966,8 +971,8 @@ hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDe } hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*& userData) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node host on stream : %p", - stream); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node HostFunction launch on stream : %p", stream); if (fn == nullptr) { return hipErrorInvalidValue; } @@ -992,6 +997,8 @@ hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*& // ================================================================================================ hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size, void** dev_ptr) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node MallocAsync on stream : %p", stream); auto s = reinterpret_cast(stream); auto mpool = reinterpret_cast(mem_pool); @@ -1029,6 +1036,8 @@ hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size // ================================================================================================ hipError_t capturehipFreeAsync(hipStream_t stream, void* dev_ptr) { + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, + "[hipGraph] Current capture node FreeAsync on stream : %p", stream); hip::Stream* s = reinterpret_cast(stream); auto mem_free_node = new hip::GraphMemFreeNode(dev_ptr); auto status = diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index 4fbd7d32b7..4a7e735b00 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -122,7 +122,7 @@ bool Graph::isGraphValid(Graph* pGraph) { // ================================================================================================ void Graph::AddNode(const Node& node) { vertices_.emplace_back(node); - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "[hipGraph] Add %s(%p)", GetGraphNodeTypeString(node->GetType()), node); node->SetParentGraph(this); } @@ -140,7 +140,7 @@ std::vector Graph::GetRootNodes() const { for (auto entry : vertices_) { if (entry->GetInDegree() == 0) { roots.push_back(entry); - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)", GetGraphNodeTypeString(entry->GetType()), entry); } } @@ -703,6 +703,11 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) { repeatLaunch_ = true; } + ClPrint(amd::LOG_DEBUG, amd::LOG_CODE, + "GraphExec::Run max_streams: %d, " + "on device: %d, total number of nodes: %d", + max_streams_, launch_stream->DeviceId(), topoOrder_.size()); + if (max_streams_ == 1 && instantiateDeviceId_ == launch_stream->DeviceId()) { if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { // If the graph has kernels that does device side allocation, during packet capture, heap is diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp index 5d2b6f6a14..2b6fafe699 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.hpp +++ b/projects/clr/hipamd/src/hip_graph_internal.hpp @@ -2327,7 +2327,7 @@ class GraphMemAllocNode final : public GraphNode { amd::Device::VmmAccess::kReadWrite); va_->retain(); graph_->IncrementMemAllocNodeCount(); // Increment count of unreleased mem alloc nodes - ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p", vaddr_sub_obj->getSvmPtr(), reinterpret_cast(vaddr_sub_obj->getSvmPtr()) + aligned_size, memory()); } @@ -2390,7 +2390,7 @@ class GraphMemAllocNode final : public GraphNode { // be executed again amd::MemObjMap::AddMemObj(node_params_.dptr, va_); } - ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr); } } return error; @@ -2405,7 +2405,7 @@ class GraphMemAllocNode final : public GraphNode { va_ = amd::MemObjMap::FindVirtualMemObj(node_params_.dptr); amd::MemObjMap::AddMemObj(node_params_.dptr, va_); } - ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr); } return node_params_.dptr; } @@ -2470,7 +2470,7 @@ class GraphMemFreeNode : public GraphNode { } amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj); graph_->DecrementMemAllocNodeCount(); // Decrement count of unreleased memalloc nodes - ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(), + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(), vaddr_sub_obj); } @@ -2500,7 +2500,7 @@ class GraphMemFreeNode : public GraphNode { graph, stream->DeviceId(), *stream, amd::Command::EventWaitList{}, device_ptr_, amd::alignUp(va->getSize(), dev_info.virtualMemAllocGranularity_), nullptr); commands_.push_back(cmd); - ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_); } } return error; diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index 5b825d0d4f..37c19bc7fc 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -476,7 +476,7 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig hip::Stream* eventStream = reinterpret_cast(eventStreamHandle); if (eventStream != nullptr && eventStream->IsEventCaptured(event) == true) { - ClPrint(amd::LOG_INFO, amd::LOG_API, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API, "[hipGraph] Current capture node StreamWaitEvent on stream : %p, Event %p", stream, event); if (waitStream == nullptr) { diff --git a/projects/clr/rocclr/device/blit.cpp b/projects/clr/rocclr/device/blit.cpp index 8873cf2e4f..d58ae48395 100644 --- a/projects/clr/rocclr/device/blit.cpp +++ b/projects/clr/rocclr/device/blit.cpp @@ -39,7 +39,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, LogError("Couldn't map device memory for host read"); return false; } - ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy D2H, src=%p, dst=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Using host memcpy D2H, src=%p, dst=%p, size=%zu", (reinterpret_cast(src) + origin[0]), dstHost, size[0]); // Copy memory std::memcpy(dstHost, reinterpret_cast(src) + origin[0], size[0]); @@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory return false; } - ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy H2D, src=%p, dst=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Using host memcpy H2D, src=%p, dst=%p, size=%zu", srcHost, (reinterpret_cast
(dst) + origin[0]), size[0]); // Copy memory std::memcpy(reinterpret_cast
(dst) + origin[0], srcHost, size[0]); @@ -294,7 +294,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM LogError("Couldn't map destination memory"); return false; } - ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy for copyBuffer, src=%p, dst=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, + "Using host memcpy for copyBuffer, src=%p, dst=%p, size=%zu", (reinterpret_cast(src) + srcOrigin[0]), (reinterpret_cast
(dst) + dstOrigin[0]), size[0]); // Straight forward buffer copy @@ -328,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& return false; } - ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy for copyBufferRect, src=%p, " - "dst=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, + "Using host memcpy for copyBufferRect, src=%p, dst=%p, size=%zu", (reinterpret_cast(src) + srcRect.offset(0, 0, 0)), (reinterpret_cast
(dst) + dstRect.offset(0, 0, 0)), size[0]); diff --git a/projects/clr/rocclr/device/devprogram.cpp b/projects/clr/rocclr/device/devprogram.cpp index a33b09b9c6..76d487bf55 100644 --- a/projects/clr/rocclr/device/devprogram.cpp +++ b/projects/clr/rocclr/device/devprogram.cpp @@ -1049,7 +1049,7 @@ static void dumpCodeObject(const std::string& image) { char fname[30]; static std::atomic index; sprintf(fname, "_code_object%04d.o", index++); - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Code object saved in %s\n", fname); std::ofstream ofs; ofs.open(fname, std::ios::binary); ofs << image; @@ -2539,7 +2539,7 @@ bool Program::createKernelMetadataMap(void* binary, size_t binSize) { status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD); if (status == AMD_COMGR_STATUS_SUCCESS) { - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2."); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V2."); hasKernelMD = true; codeObjectVer_ = 2; } else { @@ -2591,13 +2591,13 @@ bool Program::createKernelMetadataMap(void* binary, size_t binSize) { if (major_version == '1') { if (minor_version == '0') { - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3."); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V3."); codeObjectVer_ = 3; } else if (minor_version == '1') { - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V4."); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V4."); codeObjectVer_ = 4; } else if (minor_version == '2') { - ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V5."); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V5."); codeObjectVer_ = 5; } else { ClPrint(amd::LOG_ERROR, amd::LOG_CODE, diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index 1b7b627d78..842d322554 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -310,7 +310,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Copy memory line by line - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2, "HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, " "completion_signal=0x%zx", dstMem.base, srcMem.base, (wait_events.size() != 0) ? wait_events[0].handle : 0, @@ -335,7 +335,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d size_t dstOffset = dstRect.offset(0, y, z); // Copy memory line by line - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2, "HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx", (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle); hsa_status_t status = hsa_amd_memory_async_copy( @@ -509,7 +509,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) { copyMask = gpu().getLastUsedSdmaEngine(); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask); copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_); if (copyMask == 0) { // Check SDMA engine status @@ -539,7 +539,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c // Copy on the first available free engine if ROCr returns a valid mask hsa_amd_sdma_engine_id_t copyEngine = static_cast(copyMask); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2, "HSA Copy copy_engine=0x%x, dst=0x%zx, src=0x%zx, " "size=%ld, forceSDMA=%d, engineType=%d, wait_event=0x%zx, completion_signal=0x%zx", copyEngine, dst, src, size, forceSDMA, engine, @@ -554,7 +554,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c } if (engine == HwQueueEngine::Unknown || kUseRegularCopyApi) { - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2, "HSA Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, " "completion_signal=0x%zx, engineType=%d", dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle, @@ -656,7 +656,8 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, bool enablePi if (pinnedMem != nullptr) { Memory* pinnedMemory = dev().getRocMemory(pinnedMem); address pinBuffer = pinnedMemory->getDeviceMemory(); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d", xferSize); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d", + xferSize); buffState.copySize_ = xferSize; buffState.buffer_ = pinBuffer + partial1 + partial2; buffState.pinnedMem_ = pinnedMem; @@ -666,7 +667,8 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, bool enablePi } // If Memory Pinning fails, failback to staging buffer xferSize = std::min(xferSize, StagingXferSize); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d", xferSize); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d", + xferSize); buffState.copySize_ = xferSize; buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize)); } @@ -711,7 +713,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs } if (hostToDev) { // H2D Path if (outBuffer.pinnedMem_ == nullptr) { // Copy to Staging Buffer - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu", stagingBuffer, hostSrc + copyOffset, copysize); memcpy(stagingBuffer, hostSrc + copyOffset, copysize); } @@ -731,7 +733,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs if (outBuffer.pinnedMem_ == nullptr) { // Wait for current signal of previous rocr copy if its not pinned mem gpu().Barriers().WaitCurrent(); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", hostDst + copyOffset, stagingBuffer, copysize); memcpy(hostDst + copyOffset, stagingBuffer, copysize); } @@ -1754,7 +1756,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, // Wait for current signal of previous blit copy if its not pinned mem if (outBuffer.pinnedMem_ == nullptr) { gpu().Barriers().WaitCurrent(); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu", (void*)(dstAddr + stagedCopyOffset), stagingBuffer, copySize); memcpy(dstAddr + stagedCopyOffset, stagingBuffer, copySize); } @@ -1875,7 +1877,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo copySize = outBuffer.copySize_; address currentDstAddr = dstAddr + stagedCopyOffset; if (outBuffer.pinnedMem_ == nullptr) { - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu", stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize); memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize); } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 4acf5adc12..104f40f76e 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -234,12 +234,13 @@ Device::~Device() { hsa_queue_t* queue = qIter->first; auto& qInfo = qIter->second; if (qInfo.hostcallBuffer_) { - ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hostcall buffer %p for hardware queue %p", - qInfo.hostcallBuffer_, qIter->first->base_address); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE, + "Deleting hostcall buffer %p for hardware queue %p", qInfo.hostcallBuffer_, + qIter->first->base_address); amd::disableHostcalls(qInfo.hostcallBuffer_); context().svmFree(qInfo.hostcallBuffer_); } - ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0", + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0", queue->base_address); qIter = it.erase(qIter); hsa_queue_destroy(queue); @@ -1987,11 +1988,11 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg, break; case kUncachedAtomics: if (agentInfo->ext_fine_grain_pool.handle != 0) { - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM, "Using extended fine grained access system memory pool"); segment = agentInfo->ext_fine_grain_pool; } else { - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM, "Falling through on fine grained access system memory pool"); segment = agentInfo->fine_grain_pool; } @@ -2065,7 +2066,7 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg LogPrintfError("get_mempolicy failed with error %ld", res); return ptr; } - ClPrint(amd::LOG_INFO, amd::LOG_RESOURCE, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_RESOURCE, "get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu", mode, *nodeMask->maskp, cpuCount); @@ -2805,7 +2806,7 @@ bool Device::IsHwEventReady(const amd::Event& event, bool wait, amd::SyncPolicy void* hw_event = (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent(); if (hw_event == nullptr) { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event"); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_SIG, "No HW event"); return false; } else if (wait) { // hipEventBlockingSync diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index 6b52211161..d337f92985 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -122,7 +122,7 @@ void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& reg if (indirectMapCount_ == 1) { if (!allocateMapMemory(owner()->getSize())) { decIndMapCount(); - DevLogPrintfError("Cannot allocate Map memory for size: %u \n", owner()->getSize()); + DevLogPrintfError("Cannot allocate Map memory for size: %u", owner()->getSize()); return nullptr; } } else { @@ -180,7 +180,7 @@ void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, u if (!isHostMemDirectAccess() && !IsPersistentDirectMap()) { if (!vDev.blitMgr().readBuffer(*this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) { decIndMapCount(); - DevLogError("Cannot read buffer \n"); + DevLogError("Cannot read buffer"); return nullptr; } } @@ -192,7 +192,7 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) { if (!isHostMemDirectAccess() && !IsPersistentDirectMap()) { if (!vDev.blitMgr().writeBuffer(mapMemory_->getHostMem(), *this, amd::Coord3D(0), amd::Coord3D(size()), true)) { - LogError("[OCL] Fail sync the device memory on cpuUnmap"); + LogError("Fail sync the device memory on cpuUnmap"); } // Wait on CPU for the transfer static_cast(vDev).releaseGpuMemoryFence(); @@ -624,7 +624,7 @@ Buffer::~Buffer() { // Detach the memory from HSA auto hsa_status = hsa_amd_ipc_memory_detach(owner()->getSvmPtr()); if (hsa_status != HSA_STATUS_SUCCESS) { - LogPrintfError("HSA failed to detach memory with status: %d \n", hsa_status); + LogPrintfError("HSA failed to detach memory with status: %d", hsa_status); } } } @@ -666,8 +666,8 @@ void Buffer::destroy() { } } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) { if (HSA_STATUS_SUCCESS != hsa_signal_destroy(signal_)) { - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, - "[ROCClr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal destroy failed \n"); + ClPrint(amd::LOG_ERROR, amd::LOG_MEM, + "hsa_signal_destroy failed"); } deviceMemory_ = nullptr; } else { @@ -684,7 +684,7 @@ void Buffer::destroy() { } // destroy system memory if (!(amd::Os::releaseMemory(deviceMemory_, size()))) { - ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n"); + ClPrint(amd::LOG_ERROR, amd::LOG_MEM, "munmap failed"); } } } @@ -764,7 +764,7 @@ bool Buffer::create(bool alloc_local) { reinterpret_cast(owner())->Handle()), owner()->getSize(), ipc_agents_num, dev().IpcAgents(), &orig_dev_ptr); if (hsa_status != HSA_STATUS_SUCCESS) { - LogPrintfError("HSA failed to attach IPC memory with status: %d \n", hsa_status); + LogPrintfError("HSA failed to attach IPC memory with status: %d", hsa_status); return false; } owner()->setSvmPtr(orig_dev_ptr); @@ -779,7 +779,7 @@ bool Buffer::create(bool alloc_local) { // if interprocess flag is set, then the memory is importable. if (!dev().ImportShareableHSAHandle(owner()->getSvmPtr(), &owner()->getUserData().hsa_handle)) { - LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x \n", + LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x", owner()->getSvmPtr()); return false; } @@ -835,13 +835,13 @@ bool Buffer::create(bool alloc_local) { if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal_)) { ClPrint(amd::LOG_ERROR, amd::LOG_MEM, - "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed"); + "hsa_amd_signal_create signal failed"); return false; } volatile hsa_signal_value_t* signalValuePtr = nullptr; if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) { ClPrint(amd::LOG_ERROR, amd::LOG_MEM, - "[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed"); + "hsa_amd_signal_value_pointer failed"); return false; } @@ -1043,7 +1043,7 @@ bool Buffer::ExportHandle(void* handle) const { auto hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, owner()->getSize(), reinterpret_cast(handle)); if (hsa_status != HSA_STATUS_SUCCESS) { - LogPrintfError("Failed to create memory for IPC, failed with hsa_status: %d \n", hsa_status); + LogPrintfError("Failed to create memory for IPC, failed with hsa_status: %d", hsa_status); return false; } return true; @@ -1304,7 +1304,7 @@ bool Image::create(bool alloc_local) { permission_, &deviceImageInfo_); if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("[OCL] Fail to allocate image memory, failed with hsa_status: %d \n", status); + LogPrintfError("Fail to allocate image memory, failed with hsa_status: %d", status); return false; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 1266643e7f..a9d1d58762 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -773,7 +773,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para mem = memories[index]; const void* globalAddress = *reinterpret_cast(params + desc.offset_); if (mem == nullptr) { - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(), + ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress); //! This condition is for SVM fine-grain if (dev().isFineGrainedSystem(true)) { @@ -787,7 +787,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para gpuMem = static_cast(mem->getDeviceMemory(dev())); const void* globalAddress = *reinterpret_cast(params + desc.offset_); - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i, + ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i, desc.typeName_.c_str(), desc.name_.c_str(), globalAddress, gpuMem->getDeviceMemory(), reinterpret_cast
(gpuMem->getDeviceMemory()) + mem->getSize()); @@ -874,10 +874,10 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para if (desc.size_ > kMaxBytes) { bytes += "..."; } - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i, + ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i, desc.typeName_.c_str(), desc.name_.c_str(), bytes.c_str(), desc.size_); } else { - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i, + ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i, desc.typeName_.c_str(), desc.name_.c_str(), (desc.size_ == 1) ? *reinterpret_cast(srcArgPtr) : (desc.size_ == 2) ? *reinterpret_cast(srcArgPtr) @@ -1621,7 +1621,8 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) { } else { // Reset the signal for the barrier packet hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne); - ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Issue barrier to flush chunk %d", active_chunk_); + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN, "Issue barrier to flush chunk %d", + active_chunk_); // Currently don't skip wait signal check, because SDMA engine cna be used in staging copy constexpr bool kSkipSignal = false; // Dispatch a barrier packet into the queue @@ -3489,7 +3490,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const gpuKernel.KernargSegmentAlignment()); command_->SetKernelName(gpuKernel.getDemangledName().c_str()); } else { - ClPrint(amd::LOG_INFO, amd::LOG_KERN, + ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN, "KernargSegmentByteSize = %lu " "KernargSegmentAlignment = %lu", gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment()); @@ -3544,11 +3545,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const // Validate privateMemSize is more than max allowed. size_t maxStackSize = dev().MaxStackSize(); if (dispatchPacket.private_segment_size > maxStackSize) { - ClPrint(amd::LOG_INFO, amd::LOG_KERN, + ClPrint(amd::LOG_ERROR, amd::LOG_KERN, "Scratch size (%u) exceeds max allowed (%zu) for kernel : %s", dispatchPacket.private_segment_size, maxStackSize, gpuKernel.getDemangledName().c_str()); - LogError("Scratch size exceeds max allowed."); return false; } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 3c0ddb2b6c..f12116e425 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -65,7 +65,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) { if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, + ClPrint(amd::LOG_ERROR, amd::LOG_SIG, "Device not Stable, while waiting for Signal =" "(0x%lx) for %d ns", signal.handle, kTimeout100us); @@ -79,7 +79,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, kTimeout4Secs, wait_state) != 0) { if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) { - ClPrint(amd::LOG_INFO, amd::LOG_SIG, + ClPrint(amd::LOG_ERROR, amd::LOG_SIG, "Device not Stable, while waiting for Signal =" "(0x%lx) for %d ns", signal.handle, kTimeout4Secs); diff --git a/projects/clr/rocclr/platform/command.cpp b/projects/clr/rocclr/platform/command.cpp index c52b3e1e19..10297bac8b 100644 --- a/projects/clr/rocclr/platform/command.cpp +++ b/projects/clr/rocclr/platform/command.cpp @@ -160,12 +160,12 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) { } if (profilingInfo().enabled_) { - ClPrint(LOG_DEBUG, LOG_CMD, "Command %p complete (Wall: %ld, CPU: %ld, GPU: %ld us)", + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command %p complete (Wall: %ld, CPU: %ld, GPU: %ld us)", &command(), ((profilingInfo().end_ - epoch) / 1000), ((profilingInfo().submitted_ - profilingInfo().queued_) / 1000), ((profilingInfo().end_ - profilingInfo().start_) / 1000)); } else { - ClPrint(LOG_DEBUG, LOG_CMD, "Command %p complete", &command()); + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command %p complete", &command()); } release(); } @@ -177,7 +177,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) { bool Event::resetStatus(int32_t status) { int32_t currentStatus = this->status(); if (currentStatus != CL_COMPLETE) { - ClPrint(LOG_ERROR, LOG_CMD, "command is reset before complete current status :%d", + ClPrint(LOG_ERROR, LOG_CMD, "Command is reset before complete current status :%d", currentStatus); } if (!status_.compare_exchange_strong(currentStatus, status, std::memory_order_relaxed)) { @@ -191,7 +191,7 @@ bool Event::resetStatus(int32_t status) { // ================================================================================================ bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* data, bool blocking) { - assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status"); + assert(status >= CL_COMPLETE && status <= CL_QUEUED && "Invalid status"); CallBackEntry* entry = new CallBackEntry(status, callback, data, blocking); if (entry == NULL) { @@ -240,8 +240,8 @@ bool Event::awaitCompletion() { return false; } - ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d", this, - status()); + ClPrint(LOG_DETAIL_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d", + this, status()); auto* queue = command().queue(); if ((queue != nullptr) && queue->vdev()->ActiveWait()) { while (status() > CL_COMPLETE) { @@ -255,7 +255,7 @@ bool Event::awaitCompletion() { lock_.wait(); } } - ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this); + ClPrint(LOG_DETAIL_DEBUG, LOG_WAIT, "Event %p wait completed", this); } return status() == CL_COMPLETE; @@ -353,7 +353,7 @@ void Command::enqueue() { Agent::postEventCreate(as_cl(static_cast(this)), type_); } - ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p", + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p", amd::activity_prof::getOclCommandKindString(this->type()), this, queue_); // Direct dispatch logic below will submit the command immediately, but the command status diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp index 203b26d0d3..b965c9fad1 100644 --- a/projects/clr/rocclr/platform/commandqueue.cpp +++ b/projects/clr/rocclr/platform/commandqueue.cpp @@ -75,7 +75,7 @@ bool HostQueue::terminate() { if (GetSubmissionBatch() != nullptr) { auto command = new Marker(*this, false); if (command != nullptr) { - ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish"); + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Marker queued to ensure finish"); command->enqueue(); lastCommand->release(); lastCommand = command; @@ -145,7 +145,7 @@ void HostQueue::finishCommand(Command* command) { if (command == nullptr) { command = getLastQueuedCommand(true); if (command != nullptr) { - ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host"); + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No command, awaiting complete status on host"); command->awaitCompletion(); command->release(); } @@ -154,7 +154,7 @@ void HostQueue::finishCommand(Command* command) { // Check hardware event status for the specific command static constexpr bool kWaitCompletion = true; if (!device().IsHwEventReady(command->event(), kWaitCompletion)) { - ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host"); + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host"); command->awaitCompletion(); } } @@ -181,7 +181,7 @@ void HostQueue::finish(bool cpu_wait) { } size_t batchSize = GetSubmissionBatchSize(); - ClPrint(LOG_DEBUG, LOG_CMD, + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "finish() called with batch size: %zu, cpu_wait: %d, " "fence dirty: %d", batchSize, cpu_wait, vdev()->isFenceDirty()); @@ -203,7 +203,7 @@ void HostQueue::finish(bool cpu_wait) { // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status static constexpr bool kWaitCompletion = true; if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion, GetSyncPolicy())) { - ClPrint(LOG_DEBUG, LOG_CMD, + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No HW event or batch size is less than %zu, " "await command completion", minBatchSize); @@ -258,14 +258,14 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) { // Process the command's event wait list. const Command::EventWaitList& events = command->eventWaitList(); bool dependencyFailed = false; - ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) processing: %p ,events.size(): %d", + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) processing: %p ,events.size(): %d", amd::activity_prof::getOclCommandKindString(command->type()), command, events.size()); for (const auto& it : events) { // Only wait if the command is enqueued into another queue. if (it->command().queue() != this) { // Runtime has to flush the current batch only if the dependent wait is blocking if (it->command().status() != CL_COMPLETE) { - ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) %p awaiting event: %p", + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) %p awaiting event: %p", amd::activity_prof::getOclCommandKindString(command->type()), command, it); virtualDevice->flush(head, true); tail = head = NULL; @@ -287,7 +287,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) { continue; } - ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p", + ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) submitted: %p", amd::activity_prof::getOclCommandKindString(command->type()), command); command->setStatus(CL_SUBMITTED); diff --git a/projects/clr/rocclr/platform/runtime.cpp b/projects/clr/rocclr/platform/runtime.cpp index 52a6c5cfd6..4ad1852e34 100644 --- a/projects/clr/rocclr/platform/runtime.cpp +++ b/projects/clr/rocclr/platform/runtime.cpp @@ -80,7 +80,8 @@ bool Runtime::init() { ClPrint(LOG_ERROR, LOG_INIT, "Runtime initialization failed"); return false; } - ClPrint(LOG_INFO, LOG_MISC, "ROCclr version: %s", ROCCLR_VERSION_GITHASH); + + ClPrint(LOG_INFO, LOG_MISC && !amd::IS_HIP, "ROCclr version: %s", ROCCLR_VERSION_GITHASH); initialized_ = true; pid_ = amd::Os::getProcessId(); diff --git a/projects/clr/rocclr/utils/debug.hpp b/projects/clr/rocclr/utils/debug.hpp index ad9e92eb6b..be5ffc4517 100644 --- a/projects/clr/rocclr/utils/debug.hpp +++ b/projects/clr/rocclr/utils/debug.hpp @@ -41,7 +41,8 @@ enum LogLevel { LOG_WARNING = 2, LOG_INFO = 3, LOG_DEBUG = 4, - LOG_EXTRA_DEBUG = 5 + LOG_DETAIL_DEBUG = 5, + LOG_EXTRA_DEBUG = 6 }; enum LogMask {