Add kpack runtime integration for split device code artifacts (#2622)

Integrates rocm-kpack runtime library for loading device code from
external kpack archives at HIP initialization time.

Changes:
- Add kpack_params_ optional to FatBinaryInfo for HIPK metadata
- Parse HIPK magic (0x4B504948) in digestFatBinary to detect kpack'd binaries
- Add ExtractKpackBinary() to load code objects via kpack_load_code_object()
- Wire up the process-global kpack cache (created lazily on first use) in hip_fatbin.cpp
- Track kpack allocations for proper cleanup
- Support multi-TU binaries via bundle_index (co_index parameter)

The ROCM_KPACK_ENABLED cmake flag (default OFF) controls whether kpack support
is compiled in. When it is disabled, loading a HIPK binary fails with
hipErrorNotSupported.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Stella Laurenzo
2026-01-26 09:50:42 -08:00
committed by GitHub
vanhempi dbd26a88b4
commit e0dcba903e
7 muutettua tiedostoa jossa 214 lisäystä ja 1 poistoa
@@ -180,6 +180,19 @@ if(DISABLE_DIRECT_DISPATCH)
target_compile_definitions(amdhip64 PRIVATE DISABLE_DIRECT_DISPATCH)
endif()
# Optional rocm-kpack support for kpack split artifacts
# This option will be enabled permanently at a future point. Until then it gates
# the use of the rocm-kpack library for detecting ROCm multi-arch archives in
# distributions of ROCm so that the CLR can load them just as it would normal
# fat binaries. See the WIP repo: https://github.com/ROCm/rocm-kpack (which
# will be migrated to rocm-systems when ready).
# Defaults to OFF, so rocm-kpack is only looked up when explicitly requested.
option(ROCM_KPACK_ENABLED "Enable kpack runtime loading for split device code" OFF)
if(ROCM_KPACK_ENABLED)
# REQUIRED: configuration stops with an error if the package cannot be found.
find_package(rocm-kpack REQUIRED)
# PRIVATE: the define and the link dependency apply to amdhip64 itself only and
# are not propagated to targets that link against amdhip64.
target_compile_definitions(amdhip64 PRIVATE ROCM_KPACK_ENABLED=1)
target_link_libraries(amdhip64 PRIVATE rocm::rocm_kpack)
endif()
# Short-Term solution for pre-compiled headers for online compilation
# Enable pre compiled header
if(__HIP_ENABLE_PCH)
@@ -32,6 +32,7 @@ THE SOFTWARE.
#include <elf/elf.hpp>
#include "comgrctx.hpp"
#include "hip_comgr_helper.hpp"
#include "hip_platform.hpp"
namespace hip {
hipError_t ihipFree(void* ptr);
@@ -265,6 +266,39 @@ hipError_t StatCO::digestFatBinary(const void* data, FatBinaryInfo*& programs) {
return hipSuccess;
}
// Fat binary wrapper structure (matches hip_platform.cpp definition)
// Defined locally to keep kpack integration as implementation detail.
// The incoming `data` pointer is reinterpreted as this layout so the HIPK
// check below can read the header fields without pulling in the platform type.
struct __CudaFatBinaryWrapper {
unsigned int magic;    // compared against symbols::kHipkMagic to detect a kpack'd binary
unsigned int version;  // the HIPK path is only taken when this equals 1
void* binary;          // for HIPK wrappers: points to the msgpack metadata
void* dummy1; // reserved1: bundle index for multi-TU binaries
};
// Check if this is a kpack'd binary (HIPK magic)
const auto* wrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
if (wrapper->magic == symbols::kHipkMagic && wrapper->version == 1) {
// Discover binary path from the wrapper address using existing CLR utility
std::string binary_path;
size_t file_offset = 0;
if (!amd::Os::FindFileNameFromAddress(data, &binary_path, &file_offset)) {
LogError("Failed to discover binary path for kpack loading");
return hipErrorNoBinaryForGpu;
}
// Get bundle index from wrapper->dummy1 (reserved1 field)
// For multi-TU binaries, this identifies which bundle this wrapper corresponds to
uint64_t bundle_index = reinterpret_cast<uintptr_t>(wrapper->dummy1);
// wrapper->binary points to msgpack metadata
// ExtractKpackBinary will error if ROCM_KPACK_ENABLED=OFF
FatBinaryInfo* fatBinaryInfo = new FatBinaryInfo(
FatBinaryInfo::KpackParams{wrapper->binary, std::move(binary_path), bundle_index});
hipError_t err = fatBinaryInfo->ExtractKpackBinary(g_devices);
programs = fatBinaryInfo;
return err;
}
// Create a new fat binary object and extract the fat binary for all devices.
FatBinaryInfo* fatBinaryInfo = new FatBinaryInfo(nullptr, data);
hipError_t err = fatBinaryInfo->ExtractFatBinaryUsingCOMGR(g_devices);
@@ -287,6 +321,26 @@ FatBinaryInfo** StatCO::addFatBinary(const void* data, bool initialized, bool& s
return &modules_[data];
}
// Registers a kpack'd (HIPK) fat binary wrapper with the static code object table.
//
// hipk_metadata  Not read by this function; digestFatBinary re-derives the
//                metadata pointer from the wrapper itself.
// wrapper_addr   Address of the fat binary wrapper. It is the key into modules_
//                (same role as the data pointer on the normal fat binary path),
//                which lets digestFatBinary inspect the wrapper and see the
//                HIPK magic.
// initialized    false: registration only, the slot is left for digestFatBinary
//                to fill in later. true: the binary is digested right away.
// success        Out: true for deferred registration, otherwise whether
//                digestFatBinary returned hipSuccess.
//
// Returns the address of the modules_ slot for wrapper_addr in every case.
FatBinaryInfo** StatCO::addKpackBinary(const void* hipk_metadata, const void* wrapper_addr,
                                       bool initialized, bool& success) {
  amd::ScopedLock guard(sclock_);

  // Look the slot up once (creating it if absent) and reuse its address below.
  FatBinaryInfo** slot = &modules_[wrapper_addr];
  module_to_hostModule_.insert(std::make_pair(slot, wrapper_addr));

  if (initialized) {
    // Immediate loading: digestFatBinary performs the kpack detection itself.
    const hipError_t status = digestFatBinary(wrapper_addr, *slot);
    success = (status == hipSuccess);
  } else {
    // Deferred loading: nothing to do yet, so registration cannot fail here.
    success = true;
  }
  return slot;
}
hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
amd::ScopedLock lock(sclock_);
@@ -55,6 +55,10 @@ constexpr char kHipFatBinName_[] = "hipfatbin-";
constexpr char kOffloadKindHipv4_[] = "hipv4-"; // bundled code objects need the prefix
constexpr char kOffloadHipV4FatBinName_[] = "hipfatbin-hipv4-";
// Fat binary wrapper magic values.
// Note the two constants spell their tag in opposite byte orders:
//  - 0x48495046 reads "HIPF" from the most significant byte down, i.e. it is
//    laid out as 'F','P','I','H' in little-endian memory.
//  - 0x4B504948 is laid out as 'H','I','P','K' in little-endian memory.
constexpr uint32_t kHipfMagic = 0x48495046; // "HIPF" read MSB-first (normal fat binary)
constexpr uint32_t kHipkMagic = 0x4B504948; // "HIPK" little-endian (kpack'd binary)
// Clang Offload bundler description & Header in uncompressed mode.
struct ClangOffloadBundleInfo {
uint64_t offset;
@@ -154,6 +158,8 @@ class StatCO : public CodeObject {
// Add/Remove/Digest Fat Binaries passed to us from "__hipRegisterFatBinary"
FatBinaryInfo** addFatBinary(const void* data, bool initialized, bool& success);
FatBinaryInfo** addKpackBinary(const void* hipk_metadata, const void* wrapper_addr,
bool initialized, bool& success);
hipError_t removeFatBinary(FatBinaryInfo** module);
hipError_t digestFatBinary(const void* data, FatBinaryInfo*& programs);
void RemoveAllFatBinaries();
+108 -1
Näytä tiedosto
@@ -24,18 +24,38 @@ THE SOFTWARE.
#include "hip_fatbin.hpp"
#include "hip_global.hpp"
#include <unordered_map>
#include <mutex>
#include "hip_code_object.hpp"
#include "hip_platform.hpp"
#include "comgrctx.hpp"
#include "amd_hsa_elf.hpp"
#include "hip_comgr_helper.hpp"
#if ROCM_KPACK_ENABLED
#include <rocm_kpack/kpack.h>
#endif
namespace hip {
// Use ComgrUniqueHandle and type aliases from hip_comgr_helper.hpp
using comgr_helper::ComgrDataSetUniqueHandle;
using comgr_helper::ComgrActionInfoUniqueHandle;
using comgr_helper::ComgrDataUniqueHandle;
#if ROCM_KPACK_ENABLED
namespace {
// Process-global kpack cache handle used by the kpack loads in this file.
// It stays null until getHipKpackCache() has run the one-time initializer.
kpack_cache_t g_hipKpackCache = nullptr;
std::once_flag g_hipKpackCacheInitFlag;

// One-time initializer invoked through std::call_once.
// NOTE(review): whatever kpack_cache_create reports back is not inspected here,
// so a failed creation would leave the handle null - confirm that is acceptable.
void initHipKpackCache() {
  kpack_cache_create(&g_hipKpackCache);
}

// Returns the shared cache handle, creating the cache on the first call.
kpack_cache_t getHipKpackCache() {
  std::call_once(g_hipKpackCacheInitFlag, &initHipKpackCache);
  return g_hipKpackCache;
}
}  // namespace
#endif
FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
: foffset_(0), image_(image), image_mapped_(false), uri_(std::string()) {
if (fname != nullptr) {
@@ -47,6 +67,11 @@ FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
dev_programs_.resize(g_devices.size(), nullptr);
}
// Constructs the info object for a kpack'd (split device code) binary.
// Delegates to the (fname, image) constructor with the host binary path as
// fname and no in-memory image, then stores the kpack parameters.
// The delegating call reads binary_path before kpack_params is moved from.
FatBinaryInfo::FatBinaryInfo(KpackParams kpack_params)
    : FatBinaryInfo(kpack_params.binary_path.c_str(), nullptr) {
  kpack_params_.emplace(std::move(kpack_params));
}
FatBinaryInfo::~FatBinaryInfo() {
// Release per device fat bin info.
for (int dev_id = 0; dev_id < dev_programs_.size(); dev_id++) {
@@ -57,7 +82,16 @@ FatBinaryInfo::~FatBinaryInfo() {
}
// Release Code object allocations
for (const auto& i : code_obj_allocations_) {
delete[] reinterpret_cast<const char*>(i);
if (kpack_params_.has_value()) {
// Kpack-allocated code objects must be freed via kpack API
#if ROCM_KPACK_ENABLED
kpack_free_code_object(const_cast<void*>(i));
#else
guarantee(false, "Kpack code object but ROCM_KPACK_ENABLED=OFF");
#endif
} else {
delete[] reinterpret_cast<const char*>(i);
}
}
ReleaseImageAndFile();
}
@@ -640,6 +674,79 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Devi
return hip_status;
}
// Loads the device code object of a kpack'd (HIPK) binary from its external
// kpack archive and registers it as the program image for every device.
//
// This function is always defined but errors if ROCM_KPACK_ENABLED=OFF.
//
// devices  Devices to load for. Their ISA names (each followed by its generic
//          fallback from TargetToGeneric, when that is non-empty) form the
//          architecture priority list handed to kpack.
//
// Returns:
//   hipSuccess            the code object was loaded and added to all devices
//   hipErrorNotSupported  built without ROCM_KPACK_ENABLED
//   hipErrorInvalidValue  kpack_params_ unset, metadata null, or bundle index
//                         not representable in the 32-bit index kpack is given
//   hipErrorInvalidImage  kpack_load_code_object failed
//   otherwise             the first error returned by AddDevProgram
//
// TODO: Extract SPIR-V translation from ExtractFatBinaryUsingCOMGR and call
// it from both of these entry-points once we have enough testing in place
// to ensure this advanced case is functional.
hipError_t FatBinaryInfo::ExtractKpackBinary(const std::vector<hip::Device*>& devices) {
#if !ROCM_KPACK_ENABLED
  LogError("Kpack binary detected but ROCM_KPACK_ENABLED=OFF");
  return hipErrorNotSupported;
#else
  if (!kpack_params_.has_value()) {
    LogError("ExtractKpackBinary called but kpack_params_ not set");
    return hipErrorInvalidValue;
  }

  const auto& params = kpack_params_.value();

  if (params.metadata == nullptr) {
    LogError("HIPK metadata is null");
    return hipErrorInvalidValue;
  }

  // The bundle index is carried as 64 bits (it comes from a pointer-sized
  // wrapper field) but is passed to kpack as 32 bits. Reject values that
  // would not survive the narrowing instead of silently loading another bundle.
  const uint32_t co_index = static_cast<uint32_t>(params.bundle_index);
  if (static_cast<uint64_t>(co_index) != params.bundle_index) {
    LogError("HIPK bundle index does not fit in 32 bits");
    return hipErrorInvalidValue;
  }

  // Build architecture priority list from devices.
  // For each device, add native ISA first, then generic fallback.
  std::vector<std::string> arch_list;
  arch_list.reserve(devices.size() * 2);
  for (auto device : devices) {
    std::string device_name = device->devices()[0]->isa().isaName();
    arch_list.push_back(device_name);

    // Add generic fallback
    auto generic_name = TargetToGeneric(device_name);
    if (!generic_name.empty()) {
      arch_list.push_back(generic_name);
    }
  }

  // Convert to C-style array for kpack API. arch_list is not modified past
  // this point, so the c_str() pointers stay valid for the call below.
  std::vector<const char*> arch_ptrs;
  arch_ptrs.reserve(arch_list.size());
  for (const auto& arch : arch_list) {
    arch_ptrs.push_back(arch.c_str());
  }

  // Load code object from kpack archive
  void* code_object = nullptr;
  size_t code_object_size = 0;

  // binary_path is used to resolve relative paths to kpack archives.
  // bundle_index identifies which code object to load for multi-TU binaries.
  // The kernel_name (used for TOC lookup) is embedded in the HIPK metadata.
  kpack_error_t err =
      kpack_load_code_object(getHipKpackCache(), params.metadata, fname_.c_str(), co_index,
                             arch_ptrs.data(), arch_ptrs.size(), &code_object, &code_object_size);

  if (err != KPACK_SUCCESS) {
    LogPrintfError("kpack_load_code_object failed with error: %d", static_cast<int>(err));
    return hipErrorInvalidImage;
  }

  // Track the allocation before handing it to any device so that it is freed
  // exactly once, by the destructor, after the per-device programs are
  // released. Freeing it here on a partial failure could leave programs that
  // were already registered for earlier devices pointing at released memory.
  code_obj_allocations_.insert(code_object);

  // Add code object to all devices
  for (auto device : devices) {
    hipError_t hip_err = AddDevProgram(device, code_object, code_object_size, 0);
    if (hip_err != hipSuccess) {
      return hip_err;
    }
  }

  return hipSuccess;
#endif
}
hipError_t FatBinaryInfo::AddDevProgram(hip::Device* device, const void* binary_image,
size_t binary_size, size_t binary_offset) {
int devID = device->deviceId();
@@ -28,6 +28,8 @@ THE SOFTWARE.
#include "hip_internal.hpp"
#include "platform/program.hpp"
#include <optional>
// Forward declaration for Unique FD
struct UniqueFD;
@@ -36,10 +38,20 @@ namespace hip {
// Fat Binary Info
class FatBinaryInfo {
public:
// Parameters for kpack'd (split device code) binaries.
// Still an aggregate, so brace initialization in declaration order
// (metadata, binary_path, bundle_index) keeps working; the default member
// initializers only matter when a field is left unspecified, and ensure a
// default-initialized instance never carries indeterminate values.
struct KpackParams {
  const void* metadata = nullptr;  //!< Msgpack metadata from .rocm_kpack_ref section
  std::string binary_path;         //!< Path to the host binary
  uint64_t bundle_index = 0;       //!< Bundle index for multi-TU binaries (0-based)
};
FatBinaryInfo(const char* fname, const void* image);
// Constructor for kpack'd (split device code) binaries
explicit FatBinaryInfo(KpackParams kpack_params);
~FatBinaryInfo();
hipError_t ExtractFatBinaryUsingCOMGR(const std::vector<hip::Device*>& devices);
hipError_t ExtractKpackBinary(const std::vector<hip::Device*>& devices);
hipError_t AddDevProgram(hip::Device* device, const void* binary_image, size_t binary_size,
size_t binary_offset);
hipError_t BuildProgram(const int device_id);
@@ -84,6 +96,9 @@ class FatBinaryInfo {
// Only used for FBs where image is directly passed
std::string uri_; //!< Uniform resource indicator
// Kpack parameters for split device code binaries (nullopt for normal fat binaries)
std::optional<KpackParams> kpack_params_;
std::vector<amd::Program*> dev_programs_; //!< Program info per Device
std::shared_ptr<UniqueFD> ufd_; //!< Unique file descriptor
@@ -73,6 +73,17 @@ static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const c
void** __hipRegisterFatBinary(const void* data) {
const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
// Check for HIPK magic (kpack'd binary with external device code)
if (fbwrapper->magic == symbols::kHipkMagic && fbwrapper->version == 1) {
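// For HIPK binaries, fbwrapper->binary points to msgpack metadata.
// Route through addKpackBinary. With ROCM_KPACK_ENABLED=OFF the failure surfaces
// when the binary is digested: immediately if the platform is already
// initialized, otherwise later on the deferred digest (registration itself
// reports success in that case).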
bool success = false;
auto fat_binary_info = PlatformState::instance().addKpackBinary(fbwrapper->binary, data, success);
return success ? reinterpret_cast<void**>(fat_binary_info) : nullptr;
}
// Normal HIPF path
if (fbwrapper->magic != __hipFatMAGIC2 || fbwrapper->version != 1) {
LogPrintfError("Cannot Register fat binary. FatMagic: %u version: %u ", fbwrapper->magic,
fbwrapper->version);
@@ -1003,6 +1014,11 @@ hip::FatBinaryInfo** PlatformState::addFatBinary(const void* data, bool& success
return statCO_.addFatBinary(data, initialized_, success);
}
// Registers a kpack'd (HIPK) fat binary by forwarding to the static code
// object table, supplying this platform's current initialization state so the
// table can choose between immediate and deferred loading.
// `success` is filled in by the callee; the returned slot pointer is passed
// through unchanged.
hip::FatBinaryInfo** PlatformState::addKpackBinary(const void* hipk_metadata,
                                                   const void* wrapper_addr, bool& success) {
  hip::FatBinaryInfo** const slot =
      statCO_.addKpackBinary(hipk_metadata, wrapper_addr, initialized_, success);
  return slot;
}
hipError_t PlatformState::removeFatBinary(hip::FatBinaryInfo** module) {
return statCO_.removeFatBinary(module);
}
@@ -87,6 +87,8 @@ class PlatformState {
// Static Code Objects functions
hip::FatBinaryInfo** addFatBinary(const void* data, bool& success);
hip::FatBinaryInfo** addKpackBinary(const void* hipk_metadata, const void* wrapper_addr,
bool& success);
hipError_t removeFatBinary(hip::FatBinaryInfo** module);
hipError_t digestFatBinary(const void* data, hip::FatBinaryInfo*& programs);