From f06368fd041caf62d0c902c061520180dfe5facf Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Wed, 8 Nov 2023 17:31:42 -0800 Subject: [PATCH] SWDEV-301667 - Add error logging Change-Id: I814399dc0e7083bb7fb0ed8bf46dd96bdf664965 --- hipamd/src/hip_code_object.cpp | 9 ++------- rocclr/device/rocm/rocdevice.cpp | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/hipamd/src/hip_code_object.cpp b/hipamd/src/hip_code_object.cpp index e9b41d316b..bbc8fd0017 100644 --- a/hipamd/src/hip_code_object.cpp +++ b/hipamd/src/hip_code_object.cpp @@ -438,8 +438,6 @@ hipError_t CodeObject::ExtractCodeObjectFromFile( amd::Os::FileDesc fdesc, size_t fsize, const void** image, const std::vector& device_names, std::vector>& code_objs) { - hipError_t hip_error = hipSuccess; - if (!amd::Os::isValidFileDesc(fdesc)) { return hipErrorFileNotFound; } @@ -452,9 +450,7 @@ hipError_t CodeObject::ExtractCodeObjectFromFile( } // retrieve code_objs{binary_image, binary_size} for devices - hip_error = extractCodeObjectFromFatBinary(*image, device_names, code_objs); - - return hip_error; + return extractCodeObjectFromFatBinary(*image, device_names, code_objs); } // This will be moved to COMGR eventually @@ -534,14 +530,13 @@ hipError_t CodeObject::extractCodeObjectFromFatBinary( bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id); if (valid_co) { - LogPrintfError(" %s - [code object targetID is %s]", bundleEntryId.c_str(), + LogPrintfError(" %s - [Code object targetID is %s]", bundleEntryId.c_str(), co_triple_target_id.c_str()); } else { LogPrintfError(" %s - [Unsupported]", bundleEntryId.c_str()); } } - LogPrintfError("hipErrorNoBinaryForGpu: Unable to find code object for all current devices! - %d",hipErrorNoBinaryForGpu); return hipErrorNoBinaryForGpu; } } diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 4003401c0e..7f18e220bd 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -360,6 +360,7 @@ hsa_status_t Device::iterateAgentCallback(hsa_agent_t agent, void* data) { hsa_status_t stat = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &dev_type); if (stat != HSA_STATUS_SUCCESS) { + LogPrintfError("HSA_AGENT_INFO_DEVICE failed with %x", stat); return stat; } @@ -458,21 +459,25 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) { // ================================================================================================ bool Device::init() { ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack."); - + hsa_status_t status = HSA_STATUS_SUCCESS; // Initialize the compiler if (!initCompiler(offlineDevice_)) { + LogError("initCompiler failed."); return false; } - if (HSA_STATUS_SUCCESS != hsa_init()) { - LogError("hsa_init failed."); + status = hsa_init(); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("hsa_init failed with %x", status); return false; } hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(amd_loader_ext_table), &amd_loader_ext_table); - if (HSA_STATUS_SUCCESS != hsa_iterate_agents(iterateAgentCallback, nullptr)) { + status = hsa_iterate_agents(iterateAgentCallback, nullptr); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("hsa_iterate_agents failed with %x", status); return false; } @@ -513,6 +518,8 @@ bool Device::init() { gpu_agents_ = valid_agents; } + LogPrintfInfo("Enumerated GPU agents = %lu", gpu_agents_.size()); + for (auto agent : gpu_agents_) { std::unique_ptr roc_device(new Device(agent)); if (!roc_device) { @@ -568,6 +575,7 @@ bool Device::init() { // Create a dummy context for internal memory allocations on all reported devices glb_ctx_ = new amd::Context(devices, amd::Context::Info()); if (glb_ctx_ == nullptr) { + LogError("glb_ctx failed"); return false; } @@ -579,6 +587,7 @@ bool Device::init() { p2p_stage_ = buf; } else { delete buf; + LogError("p2p stg buffer alloc failed"); return false; } } @@ -589,6 +598,7 @@ bool Device::init() { *glb_ctx_, (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS), kMGInfoSizePerDevice * devices.size(), kMGInfoSizePerDevice)); if (mg_sync_ == nullptr) { + LogError("mgpu sync buffer alloc failed"); return false; } } @@ -973,6 +983,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) { if (data == nullptr) { + LogError("CpuMemoryPoolCallback invalid args"); return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -980,6 +991,7 @@ hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo hsa_status_t stat = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); if (stat != HSA_STATUS_SUCCESS) { + LogPrintfError("HSA_AMD_MEMORY_POOL_INFO_SEGMENT query failed with %x", stat); return stat; } AgentInfo* agentInfo = reinterpret_cast(data); @@ -990,6 +1002,7 @@ hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo stat = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); if (stat != HSA_STATUS_SUCCESS) { + LogPrintfError("HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS query failed with %x", stat); break; }