From 994c9384e78447534be220f34394cd2e1cfa6590 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Thu, 3 Nov 2022 19:30:53 -0700 Subject: [PATCH] SWDEV-364052 - Print free memory on queue callback Change-Id: I7d15b6f6277326d5f3e9b784a3443f18ec6ba58a [ROCm/clr commit: 3e465c5ff811e19b7c36ef3fabbc4286df44dc3d] --- projects/clr/rocclr/device/rocm/rocdevice.cpp | 18 +++++++++++++++-- .../clr/rocclr/device/rocm/rocvirtual.cpp | 20 +++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index f9d7743d58..7830e6ce2d 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2670,8 +2670,22 @@ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { // Abort on device exceptions. const char* errorMsg = 0; hsa_status_string(status, &errorMsg); - ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, - "Device::callbackQueue aborting with error : %s code: 0x%x", errorMsg, status); + if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { + size_t global_available_mem = 0; + Device* dev = reinterpret_cast<Device*>(data); + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(dev->getBackendDevice(), + static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL), + &global_available_mem)) { + LogError("HSA_AMD_AGENT_INFO_MEMORY_AVAIL query failed."); + } + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, + "Callback: Queue %p Aborting with error : %s Code: 0x%x Available Free mem : %zu MB", + queue->base_address, errorMsg, status, global_available_mem/Mi); + } else { + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, + "Callback: Queue %p aborting with error : %s code: 0x%x", queue->base_address, + errorMsg, status); + } abort(); } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index c055da80a3..5454bab8cf 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ 
b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2516,8 +2516,24 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { // Abort on device exceptions. - ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, "VirtualGPU::callbackQueue aborting with status: 0x%x", - status); + const char* errorMsg = 0; + hsa_status_string(status, &errorMsg); + if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { + size_t global_available_mem = 0; + VirtualGPU* vgpu = reinterpret_cast<VirtualGPU*>(data); + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(vgpu->gpu_device(), + static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MEMORY_AVAIL), + &global_available_mem)) { + LogError("HSA_AMD_AGENT_INFO_MEMORY_AVAIL query failed."); + } + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, + "Callback: Queue %p Aborting with error : %s Code: 0x%x Available Free mem : %zu MB", + queue->base_address, errorMsg, status, global_available_mem/Mi); + } else { + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, + "Callback: Queue %p aborting with error : %s code: 0x%x", queue->base_address, + errorMsg, status); + } abort(); } }