SWDEV-553126 - Improve logging (#835)
* Ability to mask COPY api usage in logs * Show total graph nodes in logs * Add another log level for detailed debug
This commit is contained in:
@@ -32,7 +32,7 @@ namespace hip {
|
||||
|
||||
// ================================================================================================
|
||||
hip::Stream* Device::NullStream(bool wait) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "NullStream %p, wait %d", null_stream_, wait);
|
||||
if (null_stream_ == nullptr) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
if (null_stream_ == nullptr) {
|
||||
@@ -188,7 +188,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
|
||||
|
||||
if (wait_null_stream) {
|
||||
if (null_stream_) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "Waiting on nullstream %p", null_stream_);
|
||||
waitForStream(null_stream_);
|
||||
}
|
||||
} else {
|
||||
@@ -199,7 +199,8 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
|
||||
((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
|
||||
// and it's not the current stream
|
||||
(active_stream != blocking_stream)) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p",
|
||||
active_stream);
|
||||
// Get the last valid command
|
||||
waitForStream(active_stream);
|
||||
}
|
||||
|
||||
@@ -399,7 +399,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned
|
||||
e->SetCaptureStream(stream);
|
||||
if ((stream != nullptr && stream != hipStreamLegacy) &&
|
||||
(s->GetCaptureStatus() == hipStreamCaptureStatusActive)) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE,
|
||||
"[hipGraph] Current capture node EventRecord on stream : %p, Event %p", stream, event);
|
||||
s->SetCaptureEvent(event);
|
||||
std::vector<hip::GraphNode*> lastCapturedNodes = s->GetLastCapturedNodes();
|
||||
@@ -411,7 +411,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned
|
||||
reinterpret_cast<hip::GraphNode* const*>(s->GetLastCapturedNodes().data()),
|
||||
s->GetLastCapturedNodes().size(), false);
|
||||
if (status != hipSuccess) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_API, "hipEventRecord add external event node failed");
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "hipEventRecord add external event node failed");
|
||||
return status;
|
||||
}
|
||||
s->SetLastCapturedNode(node);
|
||||
|
||||
@@ -228,7 +228,7 @@ hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph
|
||||
|
||||
hipError_t capturehipLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim,
|
||||
dim3& blockDim, void**& args, size_t& sharedMemBytes) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node LaunchKernel on stream : %p", stream);
|
||||
|
||||
if (!hip::isValid(stream)) {
|
||||
@@ -316,7 +316,7 @@ hipError_t capturehipExtModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f
|
||||
size_t& sharedMemBytes, void**& kernelParams,
|
||||
void**& extra, hipEvent_t& startEvent,
|
||||
hipEvent_t& stopEvent, uint32_t& flags) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node ExtModuleLaunchKernel on stream : %p", stream);
|
||||
return ihipExtLaunchKernel(stream, f, globalWorkSizeX / localWorkSizeX,
|
||||
globalWorkSizeY / localWorkSizeY, globalWorkSizeZ / localWorkSizeZ,
|
||||
@@ -329,7 +329,7 @@ hipError_t capturehipExtModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f
|
||||
hipError_t capturehipExtLaunchKernel(hipStream_t& stream, const void*& hostFunction, dim3& gridDim,
|
||||
dim3& blockDim, void**& args, size_t& sharedMemBytes,
|
||||
hipEvent_t& startEvent, hipEvent_t& stopEvent, int& flags) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node ExtLaunchKernel on stream : %p", stream);
|
||||
return ihipExtLaunchKernel(
|
||||
stream, reinterpret_cast<hipFunction_t>(const_cast<void*>(hostFunction)), gridDim.x,
|
||||
@@ -342,7 +342,7 @@ hipError_t capturehipModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f, u
|
||||
uint32_t& blockDimY, uint32_t& blockDimZ,
|
||||
uint32_t& sharedMemBytes, void**& kernelParams,
|
||||
void**& extra) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node ModuleLaunchKernel on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
@@ -372,7 +372,7 @@ hipError_t capturehipModuleLaunchCooperativeKernel(hipStream_t& stream, hipFunct
|
||||
uint32_t& gridDimZ, uint32_t& blockDimX,
|
||||
uint32_t& blockDimY, uint32_t& blockDimZ,
|
||||
uint32_t& sharedMemBytes, void**& kernelParams) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node ModuleLaunchCooperativeKernel on stream : %p", stream);
|
||||
|
||||
if (!hip::isValid(stream)) {
|
||||
@@ -403,8 +403,8 @@ hipError_t capturehipModuleLaunchCooperativeKernel(hipStream_t& stream, hipFunct
|
||||
|
||||
hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 blockDim,
|
||||
dim3 gridDim, unsigned int sharedMemBytes, void** extra) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node LaunchByPtr on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node LaunchByPtr on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -433,7 +433,7 @@ hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 b
|
||||
hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f, dim3& gridDim,
|
||||
dim3& blockDim, void**& kernelParams,
|
||||
uint32_t& sharedMemBytes) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node LaunchCooperativeKernel on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
@@ -462,8 +462,8 @@ hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f
|
||||
}
|
||||
|
||||
hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*& p) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy3D on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memcpy3D on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -488,8 +488,8 @@ hipError_t capturehipMemcpy3DAsync(hipStream_t& stream, const hipMemcpy3DParms*&
|
||||
hipError_t capturehipMemcpy2DAsync(hipStream_t& stream, void*& dst, size_t& dpitch,
|
||||
const void*& src, size_t& spitch, size_t& width, size_t& height,
|
||||
hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy2D on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memcpy2D on stream : %p", stream);
|
||||
if (dst == nullptr || src == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
@@ -536,7 +536,7 @@ hipError_t capturehipMemcpy2DFromArrayAsync(hipStream_t& stream, void*& dst, siz
|
||||
hipArray_const_t& src, size_t& wOffsetSrc,
|
||||
size_t& hOffsetSrc, size_t& width, size_t& height,
|
||||
hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memcpy2DFromArray on stream : %p", stream);
|
||||
|
||||
// Skip zero-sized copies
|
||||
@@ -577,7 +577,7 @@ hipError_t capturehipMemcpy2DFromArrayAsync(hipStream_t& stream, void*& dst, siz
|
||||
hipError_t capturehipMemcpy2DToArrayAsync(hipStream_t& stream, hipArray_t& dst, size_t& wOffset,
|
||||
size_t& hOffset, const void*& src, size_t& spitch,
|
||||
size_t& width, size_t& height, hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memcpy2DFromArray on stream : %p", stream);
|
||||
|
||||
// Skip zero-sized copies
|
||||
@@ -616,7 +616,7 @@ hipError_t capturehipMemcpy2DToArrayAsync(hipStream_t& stream, hipArray_t& dst,
|
||||
}
|
||||
|
||||
hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D*& pCopy) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
@@ -691,7 +691,7 @@ hipError_t capturehipMemcpyParam2DAsync(hipStream_t& stream, const hip_Memcpy2D*
|
||||
|
||||
hipError_t capturehipMemcpyAtoHAsync(hipStream_t& stream, void*& dstHost, hipArray_t& srcArray,
|
||||
size_t& srcOffset, size_t& ByteCount) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream);
|
||||
if (srcArray == nullptr || dstHost == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -720,7 +720,7 @@ hipError_t capturehipMemcpyAtoHAsync(hipStream_t& stream, void*& dstHost, hipArr
|
||||
|
||||
hipError_t capturehipMemcpyHtoAAsync(hipStream_t& stream, hipArray_t& dstArray, size_t& dstOffset,
|
||||
const void*& srcHost, size_t& ByteCount) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyParam2D on stream : %p", stream);
|
||||
if (dstArray == nullptr || srcHost == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -748,6 +748,8 @@ hipError_t capturehipMemcpyHtoAAsync(hipStream_t& stream, hipArray_t& dstArray,
|
||||
|
||||
hipError_t capturehipMemcpy(hipStream_t stream, void* dst, const void* src, size_t sizeBytes,
|
||||
hipMemcpyKind kind) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memcpy on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -770,8 +772,8 @@ hipError_t capturehipMemcpy(hipStream_t stream, void* dst, const void* src, size
|
||||
|
||||
hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& src,
|
||||
size_t& sizeBytes, hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memcpy1D on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memcpy1D on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -779,9 +781,10 @@ hipError_t capturehipMemcpyAsync(hipStream_t& stream, void*& dst, const void*& s
|
||||
}
|
||||
|
||||
hipError_t capturehipMemcpyHtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDevice,
|
||||
const void*& srcHost, size_t& ByteCount, hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node MemcpyHtoD on stream : %p",
|
||||
stream);
|
||||
const void*& srcHost, size_t& ByteCount,
|
||||
hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyHtoD on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -791,18 +794,19 @@ hipError_t capturehipMemcpyHtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDev
|
||||
hipError_t capturehipMemcpyDtoDAsync(hipStream_t& stream, hipDeviceptr_t& dstDevice,
|
||||
hipDeviceptr_t& srcDevice, size_t& ByteCount,
|
||||
hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
"[hipGraph] Current capture node hipMemcpyDtoD on stream : %p", stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyDtoD on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
return capturehipMemcpy(stream, dstDevice, srcDevice, ByteCount, kind);
|
||||
}
|
||||
|
||||
hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, hipDeviceptr_t& srcDevice,
|
||||
size_t& ByteCount, hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
"[hipGraph] Current capture node hipMemcpyDtoH on stream : %p", stream);
|
||||
hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost,
|
||||
hipDeviceptr_t& srcDevice, size_t& ByteCount,
|
||||
hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyDtoH on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -810,8 +814,9 @@ hipError_t capturehipMemcpyDtoHAsync(hipStream_t& stream, void*& dstHost, hipDev
|
||||
}
|
||||
|
||||
hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, const void*& symbol,
|
||||
size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
size_t& sizeBytes, size_t& offset,
|
||||
hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyFromSymbolNode on stream : %p", stream);
|
||||
|
||||
if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice &&
|
||||
@@ -847,7 +852,7 @@ hipError_t capturehipMemcpyFromSymbolAsync(hipStream_t& stream, void*& dst, cons
|
||||
|
||||
hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbol, const void*& src,
|
||||
size_t& sizeBytes, size_t& offset, hipMemcpyKind& kind) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MemcpyToSymbolNode on stream : %p", stream);
|
||||
|
||||
if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice &&
|
||||
@@ -882,8 +887,8 @@ hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbo
|
||||
|
||||
hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, size_t& valueSize,
|
||||
size_t& sizeBytes) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset1D on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memset1D on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -908,8 +913,8 @@ hipError_t capturehipMemsetAsync(hipStream_t& stream, void*& dst, int& value, si
|
||||
|
||||
hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitch, int& value,
|
||||
size_t& width, size_t& height) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset2D on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memset2D on stream : %p", stream);
|
||||
hipMemsetParams memsetParams = {0};
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
@@ -934,8 +939,8 @@ hipError_t capturehipMemset2DAsync(hipStream_t& stream, void*& dst, size_t& pitc
|
||||
|
||||
hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDevPtr, int& value,
|
||||
hipExtent& extent) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node Memset3D on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node Memset3D on stream : %p", stream);
|
||||
if (!hip::isValid(stream)) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
@@ -966,8 +971,8 @@ hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDe
|
||||
}
|
||||
|
||||
hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*& userData) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Current capture node host on stream : %p",
|
||||
stream);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node HostFunction launch on stream : %p", stream);
|
||||
if (fn == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
@@ -992,6 +997,8 @@ hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*&
|
||||
// ================================================================================================
|
||||
hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size,
|
||||
void** dev_ptr) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node MallocAsync on stream : %p", stream);
|
||||
auto s = reinterpret_cast<hip::Stream*>(stream);
|
||||
auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
|
||||
|
||||
@@ -1029,6 +1036,8 @@ hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t capturehipFreeAsync(hipStream_t stream, void* dev_ptr) {
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node FreeAsync on stream : %p", stream);
|
||||
hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
|
||||
auto mem_free_node = new hip::GraphMemFreeNode(dev_ptr);
|
||||
auto status =
|
||||
|
||||
@@ -122,7 +122,7 @@ bool Graph::isGraphValid(Graph* pGraph) {
|
||||
// ================================================================================================
|
||||
void Graph::AddNode(const Node& node) {
|
||||
vertices_.emplace_back(node);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Add %s(%p)",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "[hipGraph] Add %s(%p)",
|
||||
GetGraphNodeTypeString(node->GetType()), node);
|
||||
node->SetParentGraph(this);
|
||||
}
|
||||
@@ -140,7 +140,7 @@ std::vector<Node> Graph::GetRootNodes() const {
|
||||
for (auto entry : vertices_) {
|
||||
if (entry->GetInDegree() == 0) {
|
||||
roots.push_back(entry);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "[hipGraph] Root node: %s(%p)",
|
||||
GetGraphNodeTypeString(entry->GetType()), entry);
|
||||
}
|
||||
}
|
||||
@@ -703,6 +703,11 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) {
|
||||
repeatLaunch_ = true;
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_CODE,
|
||||
"GraphExec::Run max_streams: %d, "
|
||||
"on device: %d, total number of nodes: %d",
|
||||
max_streams_, launch_stream->DeviceId(), topoOrder_.size());
|
||||
|
||||
if (max_streams_ == 1 && instantiateDeviceId_ == launch_stream->DeviceId()) {
|
||||
if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) {
|
||||
// If the graph has kernels that does device side allocation, during packet capture, heap is
|
||||
|
||||
@@ -2327,7 +2327,7 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
amd::Device::VmmAccess::kReadWrite);
|
||||
va_->retain();
|
||||
graph_->IncrementMemAllocNodeCount(); // Increment count of unreleased mem alloc nodes
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p",
|
||||
vaddr_sub_obj->getSvmPtr(),
|
||||
reinterpret_cast<char*>(vaddr_sub_obj->getSvmPtr()) + aligned_size, memory());
|
||||
}
|
||||
@@ -2390,7 +2390,7 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
// be executed again
|
||||
amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr);
|
||||
}
|
||||
}
|
||||
return error;
|
||||
@@ -2405,7 +2405,7 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
va_ = amd::MemObjMap::FindVirtualMemObj(node_params_.dptr);
|
||||
amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr);
|
||||
}
|
||||
return node_params_.dptr;
|
||||
}
|
||||
@@ -2470,7 +2470,7 @@ class GraphMemFreeNode : public GraphNode {
|
||||
}
|
||||
amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
|
||||
graph_->DecrementMemAllocNodeCount(); // Decrement count of unreleased memalloc nodes
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(),
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(),
|
||||
vaddr_sub_obj);
|
||||
}
|
||||
|
||||
@@ -2500,7 +2500,7 @@ class GraphMemFreeNode : public GraphNode {
|
||||
graph, stream->DeviceId(), *stream, amd::Command::EventWaitList{}, device_ptr_,
|
||||
amd::alignUp(va->getSize(), dev_info.virtualMemAllocGranularity_), nullptr);
|
||||
commands_.push_back(cmd);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_);
|
||||
}
|
||||
}
|
||||
return error;
|
||||
|
||||
@@ -476,7 +476,7 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig
|
||||
|
||||
hip::Stream* eventStream = reinterpret_cast<hip::Stream*>(eventStreamHandle);
|
||||
if (eventStream != nullptr && eventStream->IsEventCaptured(event) == true) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_API,
|
||||
"[hipGraph] Current capture node StreamWaitEvent on stream : %p, Event %p", stream,
|
||||
event);
|
||||
if (waitStream == nullptr) {
|
||||
|
||||
@@ -39,7 +39,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
LogError("Couldn't map device memory for host read");
|
||||
return false;
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy D2H, src=%p, dst=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Using host memcpy D2H, src=%p, dst=%p, size=%zu",
|
||||
(reinterpret_cast<const_address>(src) + origin[0]), dstHost, size[0]);
|
||||
// Copy memory
|
||||
std::memcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);
|
||||
@@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory
|
||||
return false;
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy H2D, src=%p, dst=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Using host memcpy H2D, src=%p, dst=%p, size=%zu",
|
||||
srcHost, (reinterpret_cast<address>(dst) + origin[0]), size[0]);
|
||||
// Copy memory
|
||||
std::memcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);
|
||||
@@ -294,7 +294,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM
|
||||
LogError("Couldn't map destination memory");
|
||||
return false;
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy for copyBuffer, src=%p, dst=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY,
|
||||
"Using host memcpy for copyBuffer, src=%p, dst=%p, size=%zu",
|
||||
(reinterpret_cast<const_address>(src) + srcOrigin[0]),
|
||||
(reinterpret_cast<address>(dst) + dstOrigin[0]), size[0]);
|
||||
// Straight forward buffer copy
|
||||
@@ -328,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
|
||||
return false;
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_COPY, "Using host memcpy for copyBufferRect, src=%p, "
|
||||
"dst=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY,
|
||||
"Using host memcpy for copyBufferRect, src=%p, dst=%p, size=%zu",
|
||||
(reinterpret_cast<const_address>(src) + srcRect.offset(0, 0, 0)),
|
||||
(reinterpret_cast<address>(dst) + dstRect.offset(0, 0, 0)), size[0]);
|
||||
|
||||
|
||||
@@ -1049,7 +1049,7 @@ static void dumpCodeObject(const std::string& image) {
|
||||
char fname[30];
|
||||
static std::atomic<int> index;
|
||||
sprintf(fname, "_code_object%04d.o", index++);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Code object saved in %s\n", fname);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Code object saved in %s\n", fname);
|
||||
std::ofstream ofs;
|
||||
ofs.open(fname, std::ios::binary);
|
||||
ofs << image;
|
||||
@@ -2539,7 +2539,7 @@ bool Program::createKernelMetadataMap(void* binary, size_t binSize) {
|
||||
|
||||
status = amd::Comgr::metadata_lookup(metadata_, "Kernels", &kernelsMD);
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V2.");
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V2.");
|
||||
hasKernelMD = true;
|
||||
codeObjectVer_ = 2;
|
||||
} else {
|
||||
@@ -2591,13 +2591,13 @@ bool Program::createKernelMetadataMap(void* binary, size_t binSize) {
|
||||
|
||||
if (major_version == '1') {
|
||||
if (minor_version == '0') {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V3.");
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V3.");
|
||||
codeObjectVer_ = 3;
|
||||
} else if (minor_version == '1') {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V4.");
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V4.");
|
||||
codeObjectVer_ = 4;
|
||||
} else if (minor_version == '2') {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_CODE, "Using Code Object V5.");
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_CODE, "Using Code Object V5.");
|
||||
codeObjectVer_ = 5;
|
||||
} else {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_CODE,
|
||||
|
||||
@@ -310,7 +310,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
// Copy memory line by line
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
|
||||
"HSA Async Copy Rect dst=0x%zx, src=0x%zx, wait_event=0x%zx, "
|
||||
"completion_signal=0x%zx",
|
||||
dstMem.base, srcMem.base, (wait_events.size() != 0) ? wait_events[0].handle : 0,
|
||||
@@ -335,7 +335,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
|
||||
size_t dstOffset = dstRect.offset(0, y, z);
|
||||
|
||||
// Copy memory line by line
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
|
||||
"HSA Async Copy wait_event=0x%zx, completion_signal=0x%zx",
|
||||
(wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);
|
||||
hsa_status_t status = hsa_amd_memory_async_copy(
|
||||
@@ -509,7 +509,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
|
||||
|
||||
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
|
||||
copyMask = gpu().getLastUsedSdmaEngine();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
|
||||
copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
|
||||
if (copyMask == 0) {
|
||||
// Check SDMA engine status
|
||||
@@ -539,7 +539,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
|
||||
// Copy on the first available free engine if ROCr returns a valid mask
|
||||
hsa_amd_sdma_engine_id_t copyEngine = static_cast<hsa_amd_sdma_engine_id_t>(copyMask);
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
|
||||
"HSA Copy copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
|
||||
"size=%ld, forceSDMA=%d, engineType=%d, wait_event=0x%zx, completion_signal=0x%zx",
|
||||
copyEngine, dst, src, size, forceSDMA, engine,
|
||||
@@ -554,7 +554,7 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
|
||||
}
|
||||
|
||||
if (engine == HwQueueEngine::Unknown || kUseRegularCopyApi) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
|
||||
"HSA Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, "
|
||||
"completion_signal=0x%zx, engineType=%d",
|
||||
dst, src, size, (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle,
|
||||
@@ -656,7 +656,8 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, bool enablePi
|
||||
if (pinnedMem != nullptr) {
|
||||
Memory* pinnedMemory = dev().getRocMemory(pinnedMem);
|
||||
address pinBuffer = pinnedMemory->getDeviceMemory();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d", xferSize);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "HSA Copy Using Pinned resource size %d",
|
||||
xferSize);
|
||||
buffState.copySize_ = xferSize;
|
||||
buffState.buffer_ = pinBuffer + partial1 + partial2;
|
||||
buffState.pinnedMem_ = pinnedMem;
|
||||
@@ -666,7 +667,8 @@ void DmaBlitManager::getBuffer(const_address hostMem, size_t size, bool enablePi
|
||||
}
|
||||
// If Memory Pinning fails, failback to staging buffer
|
||||
xferSize = std::min(xferSize, StagingXferSize);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d", xferSize);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "HSA Copy Using Staging resource size %d",
|
||||
xferSize);
|
||||
buffState.copySize_ = xferSize;
|
||||
buffState.buffer_ = gpu().Staging().Acquire(std::min(xferSize, StagingXferSize));
|
||||
}
|
||||
@@ -711,7 +713,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
|
||||
}
|
||||
if (hostToDev) { // H2D Path
|
||||
if (outBuffer.pinnedMem_ == nullptr) { // Copy to Staging Buffer
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
|
||||
stagingBuffer, hostSrc + copyOffset, copysize);
|
||||
memcpy(stagingBuffer, hostSrc + copyOffset, copysize);
|
||||
}
|
||||
@@ -731,7 +733,7 @@ bool DmaBlitManager::hsaCopyStagedOrPinned(const_address hostSrc, address hostDs
|
||||
if (outBuffer.pinnedMem_ == nullptr) {
|
||||
// Wait for current signal of previous rocr copy if its not pinned mem
|
||||
gpu().Barriers().WaitCurrent();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
|
||||
hostDst + copyOffset, stagingBuffer, copysize);
|
||||
memcpy(hostDst + copyOffset, stagingBuffer, copysize);
|
||||
}
|
||||
@@ -1754,7 +1756,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
// Wait for current signal of previous blit copy if its not pinned mem
|
||||
if (outBuffer.pinnedMem_ == nullptr) {
|
||||
gpu().Barriers().WaitCurrent();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy host dst=%p, stg buf=%p, size=%zu",
|
||||
(void*)(dstAddr + stagedCopyOffset), stagingBuffer, copySize);
|
||||
memcpy(dstAddr + stagedCopyOffset, stagingBuffer, copySize);
|
||||
}
|
||||
@@ -1875,7 +1877,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
copySize = outBuffer.copySize_;
|
||||
address currentDstAddr = dstAddr + stagedCopyOffset;
|
||||
if (outBuffer.pinnedMem_ == nullptr) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "memcpy stg buf=%p, host src=%p, size=%zu",
|
||||
stagingBuffer, (void*)(srcAddr + stagedCopyOffset), copySize);
|
||||
memcpy(stagingBuffer, srcAddr + stagedCopyOffset, copySize);
|
||||
}
|
||||
|
||||
@@ -234,12 +234,13 @@ Device::~Device() {
|
||||
hsa_queue_t* queue = qIter->first;
|
||||
auto& qInfo = qIter->second;
|
||||
if (qInfo.hostcallBuffer_) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hostcall buffer %p for hardware queue %p",
|
||||
qInfo.hostcallBuffer_, qIter->first->base_address);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE,
|
||||
"Deleting hostcall buffer %p for hardware queue %p", qInfo.hostcallBuffer_,
|
||||
qIter->first->base_address);
|
||||
amd::disableHostcalls(qInfo.hostcallBuffer_);
|
||||
context().svmFree(qInfo.hostcallBuffer_);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0",
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_QUEUE, "Deleting hardware queue %p with refCount 0",
|
||||
queue->base_address);
|
||||
qIter = it.erase(qIter);
|
||||
hsa_queue_destroy(queue);
|
||||
@@ -1987,11 +1988,11 @@ hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
|
||||
break;
|
||||
case kUncachedAtomics:
|
||||
if (agentInfo->ext_fine_grain_pool.handle != 0) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM,
|
||||
"Using extended fine grained access system memory pool");
|
||||
segment = agentInfo->ext_fine_grain_pool;
|
||||
} else {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_MEM,
|
||||
"Falling through on fine grained access system memory pool");
|
||||
segment = agentInfo->fine_grain_pool;
|
||||
}
|
||||
@@ -2065,7 +2066,7 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
|
||||
LogPrintfError("get_mempolicy failed with error %ld", res);
|
||||
return ptr;
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_RESOURCE,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_RESOURCE,
|
||||
"get_mempolicy() succeed with mode %d, nodeMask 0x%lx, cpuCount %zu", mode,
|
||||
*nodeMask->maskp, cpuCount);
|
||||
|
||||
@@ -2805,7 +2806,7 @@ bool Device::IsHwEventReady(const amd::Event& event, bool wait, amd::SyncPolicy
|
||||
void* hw_event =
|
||||
(event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
|
||||
if (hw_event == nullptr) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_SIG, "No HW event");
|
||||
return false;
|
||||
} else if (wait) {
|
||||
// hipEventBlockingSync
|
||||
|
||||
@@ -122,7 +122,7 @@ void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& reg
|
||||
if (indirectMapCount_ == 1) {
|
||||
if (!allocateMapMemory(owner()->getSize())) {
|
||||
decIndMapCount();
|
||||
DevLogPrintfError("Cannot allocate Map memory for size: %u \n", owner()->getSize());
|
||||
DevLogPrintfError("Cannot allocate Map memory for size: %u", owner()->getSize());
|
||||
return nullptr;
|
||||
}
|
||||
} else {
|
||||
@@ -180,7 +180,7 @@ void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, u
|
||||
if (!isHostMemDirectAccess() && !IsPersistentDirectMap()) {
|
||||
if (!vDev.blitMgr().readBuffer(*this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) {
|
||||
decIndMapCount();
|
||||
DevLogError("Cannot read buffer \n");
|
||||
DevLogError("Cannot read buffer");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@@ -192,7 +192,7 @@ void Memory::cpuUnmap(device::VirtualDevice& vDev) {
|
||||
if (!isHostMemDirectAccess() && !IsPersistentDirectMap()) {
|
||||
if (!vDev.blitMgr().writeBuffer(mapMemory_->getHostMem(), *this, amd::Coord3D(0),
|
||||
amd::Coord3D(size()), true)) {
|
||||
LogError("[OCL] Fail sync the device memory on cpuUnmap");
|
||||
LogError("Fail sync the device memory on cpuUnmap");
|
||||
}
|
||||
// Wait on CPU for the transfer
|
||||
static_cast<roc::VirtualGPU&>(vDev).releaseGpuMemoryFence();
|
||||
@@ -624,7 +624,7 @@ Buffer::~Buffer() {
|
||||
// Detach the memory from HSA
|
||||
auto hsa_status = hsa_amd_ipc_memory_detach(owner()->getSvmPtr());
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("HSA failed to detach memory with status: %d \n", hsa_status);
|
||||
LogPrintfError("HSA failed to detach memory with status: %d", hsa_status);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -666,8 +666,8 @@ void Buffer::destroy() {
|
||||
}
|
||||
} else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_destroy(signal_)) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
|
||||
"[ROCClr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal destroy failed \n");
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
|
||||
"hsa_signal_destroy failed");
|
||||
}
|
||||
deviceMemory_ = nullptr;
|
||||
} else {
|
||||
@@ -684,7 +684,7 @@ void Buffer::destroy() {
|
||||
}
|
||||
// destroy system memory
|
||||
if (!(amd::Os::releaseMemory(deviceMemory_, size()))) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "[ROCClr] munmap failed \n");
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_MEM, "munmap failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -764,7 +764,7 @@ bool Buffer::create(bool alloc_local) {
|
||||
reinterpret_cast<const amd::IpcBuffer*>(owner())->Handle()),
|
||||
owner()->getSize(), ipc_agents_num, dev().IpcAgents(), &orig_dev_ptr);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("HSA failed to attach IPC memory with status: %d \n", hsa_status);
|
||||
LogPrintfError("HSA failed to attach IPC memory with status: %d", hsa_status);
|
||||
return false;
|
||||
}
|
||||
owner()->setSvmPtr(orig_dev_ptr);
|
||||
@@ -779,7 +779,7 @@ bool Buffer::create(bool alloc_local) {
|
||||
// if interprocess flag is set, then the memory is importable.
|
||||
if (!dev().ImportShareableHSAHandle(owner()->getSvmPtr(),
|
||||
&owner()->getUserData().hsa_handle)) {
|
||||
LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x \n",
|
||||
LogPrintfError("Importing Shareable Memory failed with os_handle: 0x%x",
|
||||
owner()->getSvmPtr());
|
||||
return false;
|
||||
}
|
||||
@@ -835,13 +835,13 @@ bool Buffer::create(bool alloc_local) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr,
|
||||
HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal_)) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
|
||||
"[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
|
||||
"hsa_amd_signal_create signal failed");
|
||||
return false;
|
||||
}
|
||||
volatile hsa_signal_value_t* signalValuePtr = nullptr;
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
|
||||
"[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed");
|
||||
"hsa_amd_signal_value_pointer failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1043,7 +1043,7 @@ bool Buffer::ExportHandle(void* handle) const {
|
||||
auto hsa_status = hsa_amd_ipc_memory_create(orig_dev_ptr, owner()->getSize(),
|
||||
reinterpret_cast<hsa_amd_ipc_memory_t*>(handle));
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("Failed to create memory for IPC, failed with hsa_status: %d \n", hsa_status);
|
||||
LogPrintfError("Failed to create memory for IPC, failed with hsa_status: %d", hsa_status);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -1304,7 +1304,7 @@ bool Image::create(bool alloc_local) {
|
||||
permission_, &deviceImageInfo_);
|
||||
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogPrintfError("[OCL] Fail to allocate image memory, failed with hsa_status: %d \n", status);
|
||||
LogPrintfError("Fail to allocate image memory, failed with hsa_status: %d", status);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -773,7 +773,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
mem = memories[index];
|
||||
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
|
||||
if (mem == nullptr) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(),
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(),
|
||||
desc.name_.c_str(), globalAddress);
|
||||
//! This condition is for SVM fine-grain
|
||||
if (dev().isFineGrainedSystem(true)) {
|
||||
@@ -787,7 +787,7 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));
|
||||
|
||||
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i,
|
||||
desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
|
||||
gpuMem->getDeviceMemory(),
|
||||
reinterpret_cast<address>(gpuMem->getDeviceMemory()) + mem->getSize());
|
||||
@@ -874,10 +874,10 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
if (desc.size_ > kMaxBytes) {
|
||||
bytes += "...";
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i,
|
||||
desc.typeName_.c_str(), desc.name_.c_str(), bytes.c_str(), desc.size_);
|
||||
} else {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
|
||||
desc.typeName_.c_str(), desc.name_.c_str(),
|
||||
(desc.size_ == 1) ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
|
||||
: (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
|
||||
@@ -1621,7 +1621,8 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
|
||||
} else {
|
||||
// Reset the signal for the barrier packet
|
||||
hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Issue barrier to flush chunk %d", active_chunk_);
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN, "Issue barrier to flush chunk %d",
|
||||
active_chunk_);
|
||||
// Currently don't skip wait signal check, because SDMA engine cna be used in staging copy
|
||||
constexpr bool kSkipSignal = false;
|
||||
// Dispatch a barrier packet into the queue
|
||||
@@ -3489,7 +3490,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
gpuKernel.KernargSegmentAlignment());
|
||||
command_->SetKernelName(gpuKernel.getDemangledName().c_str());
|
||||
} else {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN,
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN,
|
||||
"KernargSegmentByteSize = %lu "
|
||||
"KernargSegmentAlignment = %lu",
|
||||
gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment());
|
||||
@@ -3544,11 +3545,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
// Validate privateMemSize is more than max allowed.
|
||||
size_t maxStackSize = dev().MaxStackSize();
|
||||
if (dispatchPacket.private_segment_size > maxStackSize) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN,
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
|
||||
"Scratch size (%u) exceeds max allowed (%zu) for kernel : %s",
|
||||
dispatchPacket.private_segment_size, maxStackSize,
|
||||
gpuKernel.getDemangledName().c_str());
|
||||
LogError("Scratch size exceeds max allowed.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,7 +65,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi
|
||||
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
|
||||
kTimeout100us, HSA_WAIT_STATE_ACTIVE) != 0) {
|
||||
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG,
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_SIG,
|
||||
"Device not Stable, while waiting for Signal ="
|
||||
"(0x%lx) for %d ns",
|
||||
signal.handle, kTimeout100us);
|
||||
@@ -79,7 +79,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool yi
|
||||
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
|
||||
kTimeout4Secs, wait_state) != 0) {
|
||||
if (HIP_SKIP_ABORT_ON_GPU_ERROR && amd::Device::IsGPUInError()) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG,
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_SIG,
|
||||
"Device not Stable, while waiting for Signal ="
|
||||
"(0x%lx) for %d ns",
|
||||
signal.handle, kTimeout4Secs);
|
||||
|
||||
@@ -160,12 +160,12 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
|
||||
}
|
||||
|
||||
if (profilingInfo().enabled_) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command %p complete (Wall: %ld, CPU: %ld, GPU: %ld us)",
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command %p complete (Wall: %ld, CPU: %ld, GPU: %ld us)",
|
||||
&command(), ((profilingInfo().end_ - epoch) / 1000),
|
||||
((profilingInfo().submitted_ - profilingInfo().queued_) / 1000),
|
||||
((profilingInfo().end_ - profilingInfo().start_) / 1000));
|
||||
} else {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command %p complete", &command());
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command %p complete", &command());
|
||||
}
|
||||
release();
|
||||
}
|
||||
@@ -177,7 +177,7 @@ bool Event::setStatus(int32_t status, uint64_t timeStamp) {
|
||||
bool Event::resetStatus(int32_t status) {
|
||||
int32_t currentStatus = this->status();
|
||||
if (currentStatus != CL_COMPLETE) {
|
||||
ClPrint(LOG_ERROR, LOG_CMD, "command is reset before complete current status :%d",
|
||||
ClPrint(LOG_ERROR, LOG_CMD, "Command is reset before complete current status :%d",
|
||||
currentStatus);
|
||||
}
|
||||
if (!status_.compare_exchange_strong(currentStatus, status, std::memory_order_relaxed)) {
|
||||
@@ -191,7 +191,7 @@ bool Event::resetStatus(int32_t status) {
|
||||
// ================================================================================================
|
||||
bool Event::setCallback(int32_t status, Event::CallBackFunction callback, void* data,
|
||||
bool blocking) {
|
||||
assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status");
|
||||
assert(status >= CL_COMPLETE && status <= CL_QUEUED && "Invalid status");
|
||||
|
||||
CallBackEntry* entry = new CallBackEntry(status, callback, data, blocking);
|
||||
if (entry == NULL) {
|
||||
@@ -240,8 +240,8 @@ bool Event::awaitCompletion() {
|
||||
return false;
|
||||
}
|
||||
|
||||
ClPrint(LOG_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d", this,
|
||||
status());
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_WAIT, "Waiting for event %p to complete, current status %d",
|
||||
this, status());
|
||||
auto* queue = command().queue();
|
||||
if ((queue != nullptr) && queue->vdev()->ActiveWait()) {
|
||||
while (status() > CL_COMPLETE) {
|
||||
@@ -255,7 +255,7 @@ bool Event::awaitCompletion() {
|
||||
lock_.wait();
|
||||
}
|
||||
}
|
||||
ClPrint(LOG_DEBUG, LOG_WAIT, "Event %p wait completed", this);
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_WAIT, "Event %p wait completed", this);
|
||||
}
|
||||
|
||||
return status() == CL_COMPLETE;
|
||||
@@ -353,7 +353,7 @@ void Command::enqueue() {
|
||||
Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
|
||||
}
|
||||
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p",
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p",
|
||||
amd::activity_prof::getOclCommandKindString(this->type()), this, queue_);
|
||||
|
||||
// Direct dispatch logic below will submit the command immediately, but the command status
|
||||
|
||||
@@ -75,7 +75,7 @@ bool HostQueue::terminate() {
|
||||
if (GetSubmissionBatch() != nullptr) {
|
||||
auto command = new Marker(*this, false);
|
||||
if (command != nullptr) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish");
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Marker queued to ensure finish");
|
||||
command->enqueue();
|
||||
lastCommand->release();
|
||||
lastCommand = command;
|
||||
@@ -145,7 +145,7 @@ void HostQueue::finishCommand(Command* command) {
|
||||
if (command == nullptr) {
|
||||
command = getLastQueuedCommand(true);
|
||||
if (command != nullptr) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No command, awaiting complete status on host");
|
||||
command->awaitCompletion();
|
||||
command->release();
|
||||
}
|
||||
@@ -154,7 +154,7 @@ void HostQueue::finishCommand(Command* command) {
|
||||
// Check hardware event status for the specific command
|
||||
static constexpr bool kWaitCompletion = true;
|
||||
if (!device().IsHwEventReady(command->event(), kWaitCompletion)) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "No HW event, awaiting complete status on host");
|
||||
command->awaitCompletion();
|
||||
}
|
||||
}
|
||||
@@ -181,7 +181,7 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
}
|
||||
|
||||
size_t batchSize = GetSubmissionBatchSize();
|
||||
ClPrint(LOG_DEBUG, LOG_CMD,
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD,
|
||||
"finish() called with batch size: %zu, cpu_wait: %d, "
|
||||
"fence dirty: %d",
|
||||
batchSize, cpu_wait, vdev()->isFenceDirty());
|
||||
@@ -203,7 +203,7 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
// Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
|
||||
static constexpr bool kWaitCompletion = true;
|
||||
if (cpu_wait || !device().IsHwEventReady(command->event(), kWaitCompletion, GetSyncPolicy())) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD,
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD,
|
||||
"No HW event or batch size is less than %zu, "
|
||||
"await command completion",
|
||||
minBatchSize);
|
||||
@@ -258,14 +258,14 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
|
||||
// Process the command's event wait list.
|
||||
const Command::EventWaitList& events = command->eventWaitList();
|
||||
bool dependencyFailed = false;
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) processing: %p ,events.size(): %d",
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) processing: %p ,events.size(): %d",
|
||||
amd::activity_prof::getOclCommandKindString(command->type()), command, events.size());
|
||||
for (const auto& it : events) {
|
||||
// Only wait if the command is enqueued into another queue.
|
||||
if (it->command().queue() != this) {
|
||||
// Runtime has to flush the current batch only if the dependent wait is blocking
|
||||
if (it->command().status() != CL_COMPLETE) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) %p awaiting event: %p",
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) %p awaiting event: %p",
|
||||
amd::activity_prof::getOclCommandKindString(command->type()), command, it);
|
||||
virtualDevice->flush(head, true);
|
||||
tail = head = NULL;
|
||||
@@ -287,7 +287,7 @@ void HostQueue::loop(device::VirtualDevice* virtualDevice) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) submitted: %p",
|
||||
ClPrint(LOG_DETAIL_DEBUG, LOG_CMD, "Command (%s) submitted: %p",
|
||||
amd::activity_prof::getOclCommandKindString(command->type()), command);
|
||||
|
||||
command->setStatus(CL_SUBMITTED);
|
||||
|
||||
@@ -80,7 +80,8 @@ bool Runtime::init() {
|
||||
ClPrint(LOG_ERROR, LOG_INIT, "Runtime initialization failed");
|
||||
return false;
|
||||
}
|
||||
ClPrint(LOG_INFO, LOG_MISC, "ROCclr version: %s", ROCCLR_VERSION_GITHASH);
|
||||
|
||||
ClPrint(LOG_INFO, LOG_MISC && !amd::IS_HIP, "ROCclr version: %s", ROCCLR_VERSION_GITHASH);
|
||||
|
||||
initialized_ = true;
|
||||
pid_ = amd::Os::getProcessId();
|
||||
|
||||
@@ -41,7 +41,8 @@ enum LogLevel {
|
||||
LOG_WARNING = 2,
|
||||
LOG_INFO = 3,
|
||||
LOG_DEBUG = 4,
|
||||
LOG_EXTRA_DEBUG = 5
|
||||
LOG_DETAIL_DEBUG = 5,
|
||||
LOG_EXTRA_DEBUG = 6
|
||||
};
|
||||
|
||||
enum LogMask {
|
||||
|
||||
Reference in New Issue
Block a user