SWDEV-413377 - Segfault for hipLaunchKernel with multi GPUs

- With the runtime unbundler, no gfx device can load the fat binary
if any device in the system lacks an available code object.

- Extract the available code objects to their corresponding gfx
devices, so that users can work with ROCm on those ready devices
without a segmentation fault.

Change-Id: I9f14c65ecebf2d3c4b127a007cb434a3ae98c450


[ROCm/clr commit: 6723277ad4]
This commit is contained in:
ChingShihLi
2024-02-29 15:15:53 +08:00
committed by Ching-shih Li
parent 3cde9aacb2
commit 5ca2131ce0
+23
View File
@@ -340,6 +340,29 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devi
} else {
LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for ptr: 0x%x", image_);
}
// For the condition: unable to find code object for all devices,
// still extract available images to those devices owning them.
// This helps users to work with ROCm if there is any supported
// GFX on system.
for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
if (code_objs[dev_idx].first) {
// Calculate the offset wrt binary_image and the original image
size_t offset_l
= (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
- reinterpret_cast<address>(const_cast<void*>(image_)));
fatbin_dev_info_[devices[dev_idx]->deviceId()]
= new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);
fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
= new amd::Program(*devices[dev_idx]->asContext());
if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
break;
}
}
}
return hip_error;
}