From 5ca2131ce037c1691893f9fe435ff93e637ca3ec Mon Sep 17 00:00:00 2001 From: ChingShihLi Date: Thu, 29 Feb 2024 15:15:53 +0800 Subject: [PATCH] SWDEV-413377 - Segfault for hipLaunchKernel with multi GPUs - With the runtime unbundler, no gfx device can load the fat binary if any device lacks an available code object. - Extract the available code objects to their corresponding gfx devices, so users can run ROCm on those ready devices without a segmentation fault. Change-Id: I9f14c65ecebf2d3c4b127a007cb434a3ae98c450 [ROCm/clr commit: 6723277ad498df18f96e9ae9878a6aec879fd0f0] --- projects/clr/hipamd/src/hip_fatbin.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/projects/clr/hipamd/src/hip_fatbin.cpp b/projects/clr/hipamd/src/hip_fatbin.cpp index ac8775597d..562f193861 100644 --- a/projects/clr/hipamd/src/hip_fatbin.cpp +++ b/projects/clr/hipamd/src/hip_fatbin.cpp @@ -340,6 +340,29 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector& devi } else { LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for ptr: 0x%x", image_); } + + // For the condition: unable to find code object for all devices, + // still extract available images to those devices owning them. + // This helps users to work with ROCm if there is any supported + // GFX on system. + for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) { + if (code_objs[dev_idx].first) { + // Calculate the offset wrt binary_image and the original image + size_t offset_l + = (reinterpret_cast
(const_cast(code_objs[dev_idx].first)) + - reinterpret_cast
(const_cast(image_))); + + fatbin_dev_info_[devices[dev_idx]->deviceId()] + = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l); + + fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ + = new amd::Program(*devices[dev_idx]->asContext()); + if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) { + break; + } + } + } + return hip_error; }