Add kpack runtime integration for split device code artifacts (#2622)
Integrates rocm-kpack runtime library for loading device code from external kpack archives at HIP initialization time. Changes: - Add kpack_params_ optional to FatBinaryInfo for HIPK metadata - Parse HIPK magic (0x4B504948) in digestFatBinary to detect kpack'd binaries - Add ExtractKpackBinary() to load code objects via kpack_load_code_object() - Wire up kpack cache lifecycle in hip_global.cpp - Track kpack allocations for proper cleanup - Support multi-TU binaries via bundle_index (co_index parameter) The ROCM_KPACK_ENABLED cmake flag controls whether kpack support is compiled in. When disabled, HIPK binaries return hipErrorNotSupported. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <noreply@anthropic.com>
Bu işleme şunda yer alıyor:
işlemeyi yapan:
GitHub
ebeveyn
dbd26a88b4
işleme
e0dcba903e
@@ -180,6 +180,19 @@ if(DISABLE_DIRECT_DISPATCH)
|
||||
target_compile_definitions(amdhip64 PRIVATE DISABLE_DIRECT_DISPATCH)
|
||||
endif()
|
||||
|
||||
# Optional rocm-kpack support for kpack split artifacts
|
||||
# This option will be enabled permanently at a future point and gates the use
|
||||
# of the rocm-kpack library for detecting ROCm multi-arch archives in
|
||||
# distributions of ROCm so that the CLR can load them just as it would normal
|
||||
# fat binaries. See the WIP repo: https://github.com/ROCm/rocm-kpack (which
|
||||
# will be migrated to rocm-systems when ready).
|
||||
option(ROCM_KPACK_ENABLED "Enable kpack runtime loading for split device code" OFF)
|
||||
if(ROCM_KPACK_ENABLED)
|
||||
find_package(rocm-kpack REQUIRED)
|
||||
target_compile_definitions(amdhip64 PRIVATE ROCM_KPACK_ENABLED=1)
|
||||
target_link_libraries(amdhip64 PRIVATE rocm::rocm_kpack)
|
||||
endif()
|
||||
|
||||
# Short-Term solution for pre-compiled headers for online compilation
|
||||
# Enable pre compiled header
|
||||
if(__HIP_ENABLE_PCH)
|
||||
|
||||
@@ -32,6 +32,7 @@ THE SOFTWARE.
|
||||
#include <elf/elf.hpp>
|
||||
#include "comgrctx.hpp"
|
||||
#include "hip_comgr_helper.hpp"
|
||||
#include "hip_platform.hpp"
|
||||
|
||||
namespace hip {
|
||||
hipError_t ihipFree(void* ptr);
|
||||
@@ -265,6 +266,39 @@ hipError_t StatCO::digestFatBinary(const void* data, FatBinaryInfo*& programs) {
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
// Fat binary wrapper structure (matches hip_platform.cpp definition)
|
||||
// Defined locally to keep kpack integration as implementation detail
|
||||
struct __CudaFatBinaryWrapper {
|
||||
unsigned int magic;
|
||||
unsigned int version;
|
||||
void* binary;
|
||||
void* dummy1; // reserved1: bundle index for multi-TU binaries
|
||||
};
|
||||
|
||||
// Check if this is a kpack'd binary (HIPK magic)
|
||||
const auto* wrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
|
||||
if (wrapper->magic == symbols::kHipkMagic && wrapper->version == 1) {
|
||||
// Discover binary path from the wrapper address using existing CLR utility
|
||||
std::string binary_path;
|
||||
size_t file_offset = 0;
|
||||
if (!amd::Os::FindFileNameFromAddress(data, &binary_path, &file_offset)) {
|
||||
LogError("Failed to discover binary path for kpack loading");
|
||||
return hipErrorNoBinaryForGpu;
|
||||
}
|
||||
|
||||
// Get bundle index from wrapper->dummy1 (reserved1 field)
|
||||
// For multi-TU binaries, this identifies which bundle this wrapper corresponds to
|
||||
uint64_t bundle_index = reinterpret_cast<uintptr_t>(wrapper->dummy1);
|
||||
|
||||
// wrapper->binary points to msgpack metadata
|
||||
// ExtractKpackBinary will error if ROCM_KPACK_ENABLED=OFF
|
||||
FatBinaryInfo* fatBinaryInfo = new FatBinaryInfo(
|
||||
FatBinaryInfo::KpackParams{wrapper->binary, std::move(binary_path), bundle_index});
|
||||
hipError_t err = fatBinaryInfo->ExtractKpackBinary(g_devices);
|
||||
programs = fatBinaryInfo;
|
||||
return err;
|
||||
}
|
||||
|
||||
// Create a new fat binary object and extract the fat binary for all devices.
|
||||
FatBinaryInfo* fatBinaryInfo = new FatBinaryInfo(nullptr, data);
|
||||
hipError_t err = fatBinaryInfo->ExtractFatBinaryUsingCOMGR(g_devices);
|
||||
@@ -287,6 +321,26 @@ FatBinaryInfo** StatCO::addFatBinary(const void* data, bool initialized, bool& s
|
||||
return &modules_[data];
|
||||
}
|
||||
|
||||
FatBinaryInfo** StatCO::addKpackBinary(const void* hipk_metadata, const void* wrapper_addr,
|
||||
bool initialized, bool& success) {
|
||||
amd::ScopedLock lock(sclock_);
|
||||
|
||||
// Use wrapper_addr as the key (same as data pointer for normal path)
|
||||
// This allows digestFatBinary to access the wrapper and detect HIPK magic
|
||||
module_to_hostModule_.insert(std::make_pair(&modules_[wrapper_addr], wrapper_addr));
|
||||
|
||||
if (!initialized) {
|
||||
// Deferred loading: modules_[wrapper_addr] is nullptr, digestFatBinary will handle it later
|
||||
success = true;
|
||||
return &modules_[wrapper_addr];
|
||||
}
|
||||
|
||||
// Immediate loading: call digestFatBinary which handles kpack detection
|
||||
hipError_t err = digestFatBinary(wrapper_addr, modules_[wrapper_addr]);
|
||||
success = (err == hipSuccess);
|
||||
return &modules_[wrapper_addr];
|
||||
}
|
||||
|
||||
hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
|
||||
amd::ScopedLock lock(sclock_);
|
||||
|
||||
|
||||
@@ -55,6 +55,10 @@ constexpr char kHipFatBinName_[] = "hipfatbin-";
|
||||
constexpr char kOffloadKindHipv4_[] = "hipv4-"; // bundled code objects need the prefix
|
||||
constexpr char kOffloadHipV4FatBinName_[] = "hipfatbin-hipv4-";
|
||||
|
||||
// Fat binary wrapper magic values
|
||||
constexpr uint32_t kHipfMagic = 0x48495046; // "HIPF" little-endian (normal fat binary)
|
||||
constexpr uint32_t kHipkMagic = 0x4B504948; // "HIPK" little-endian (kpack'd binary)
|
||||
|
||||
// Clang Offload bundler description & Header in uncompressed mode.
|
||||
struct ClangOffloadBundleInfo {
|
||||
uint64_t offset;
|
||||
@@ -154,6 +158,8 @@ class StatCO : public CodeObject {
|
||||
|
||||
// Add/Remove/Digest Fat Binaries passed to us from "__hipRegisterFatBinary"
|
||||
FatBinaryInfo** addFatBinary(const void* data, bool initialized, bool& success);
|
||||
FatBinaryInfo** addKpackBinary(const void* hipk_metadata, const void* wrapper_addr,
|
||||
bool initialized, bool& success);
|
||||
hipError_t removeFatBinary(FatBinaryInfo** module);
|
||||
hipError_t digestFatBinary(const void* data, FatBinaryInfo*& programs);
|
||||
void RemoveAllFatBinaries();
|
||||
|
||||
@@ -24,18 +24,38 @@ THE SOFTWARE.
|
||||
#include "hip_fatbin.hpp"
|
||||
#include "hip_global.hpp"
|
||||
#include <unordered_map>
|
||||
#include <mutex>
|
||||
#include "hip_code_object.hpp"
|
||||
#include "hip_platform.hpp"
|
||||
#include "comgrctx.hpp"
|
||||
#include "amd_hsa_elf.hpp"
|
||||
#include "hip_comgr_helper.hpp"
|
||||
|
||||
#if ROCM_KPACK_ENABLED
|
||||
#include <rocm_kpack/kpack.h>
|
||||
#endif
|
||||
|
||||
namespace hip {
|
||||
// Use ComgrUniqueHandle and type aliases from hip_comgr_helper.hpp
|
||||
using comgr_helper::ComgrDataSetUniqueHandle;
|
||||
using comgr_helper::ComgrActionInfoUniqueHandle;
|
||||
using comgr_helper::ComgrDataUniqueHandle;
|
||||
|
||||
#if ROCM_KPACK_ENABLED
|
||||
namespace {
|
||||
// HIP process-global kpack cache - initialized on first use
|
||||
std::once_flag g_hipKpackCacheInitFlag;
|
||||
kpack_cache_t g_hipKpackCache = nullptr;
|
||||
|
||||
void initHipKpackCache() { kpack_cache_create(&g_hipKpackCache); }
|
||||
|
||||
kpack_cache_t getHipKpackCache() {
|
||||
std::call_once(g_hipKpackCacheInitFlag, initHipKpackCache);
|
||||
return g_hipKpackCache;
|
||||
}
|
||||
} // namespace
|
||||
#endif
|
||||
|
||||
FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
|
||||
: foffset_(0), image_(image), image_mapped_(false), uri_(std::string()) {
|
||||
if (fname != nullptr) {
|
||||
@@ -47,6 +67,11 @@ FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
|
||||
dev_programs_.resize(g_devices.size(), nullptr);
|
||||
}
|
||||
|
||||
FatBinaryInfo::FatBinaryInfo(KpackParams kpack_params)
|
||||
: FatBinaryInfo(kpack_params.binary_path.c_str(), nullptr) {
|
||||
kpack_params_ = std::move(kpack_params);
|
||||
}
|
||||
|
||||
FatBinaryInfo::~FatBinaryInfo() {
|
||||
// Release per device fat bin info.
|
||||
for (int dev_id = 0; dev_id < dev_programs_.size(); dev_id++) {
|
||||
@@ -57,7 +82,16 @@ FatBinaryInfo::~FatBinaryInfo() {
|
||||
}
|
||||
// Release Code object allocations
|
||||
for (const auto& i : code_obj_allocations_) {
|
||||
delete[] reinterpret_cast<const char*>(i);
|
||||
if (kpack_params_.has_value()) {
|
||||
// Kpack-allocated code objects must be freed via kpack API
|
||||
#if ROCM_KPACK_ENABLED
|
||||
kpack_free_code_object(const_cast<void*>(i));
|
||||
#else
|
||||
guarantee(false, "Kpack code object but ROCM_KPACK_ENABLED=OFF");
|
||||
#endif
|
||||
} else {
|
||||
delete[] reinterpret_cast<const char*>(i);
|
||||
}
|
||||
}
|
||||
ReleaseImageAndFile();
|
||||
}
|
||||
@@ -640,6 +674,79 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Devi
|
||||
return hip_status;
|
||||
}
|
||||
|
||||
// This function is always defined but errors if ROCM_KPACK_ENABLED=OFF
|
||||
// TODO: Extract SPIR-V translation from ExtractFatBinaryUsingCOMGR and call
|
||||
// it from both of these entry-points once we have enough testing in place
|
||||
// to ensure this advanced case is functional.
|
||||
hipError_t FatBinaryInfo::ExtractKpackBinary(const std::vector<hip::Device*>& devices) {
|
||||
#if !ROCM_KPACK_ENABLED
|
||||
LogError("Kpack binary detected but ROCM_KPACK_ENABLED=OFF");
|
||||
return hipErrorNotSupported;
|
||||
#else
|
||||
if (!kpack_params_.has_value()) {
|
||||
LogError("ExtractKpackBinary called but kpack_params_ not set");
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
const auto& params = kpack_params_.value();
|
||||
if (params.metadata == nullptr) {
|
||||
LogError("HIPK metadata is null");
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
// Build architecture priority list from devices
|
||||
// For each device, add native ISA first, then generic fallback
|
||||
std::vector<std::string> arch_list;
|
||||
for (auto device : devices) {
|
||||
std::string device_name = device->devices()[0]->isa().isaName();
|
||||
arch_list.push_back(device_name);
|
||||
|
||||
// Add generic fallback
|
||||
auto generic_name = TargetToGeneric(device_name);
|
||||
if (!generic_name.empty()) {
|
||||
arch_list.push_back(generic_name);
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to C-style array for kpack API
|
||||
std::vector<const char*> arch_ptrs;
|
||||
for (const auto& arch : arch_list) {
|
||||
arch_ptrs.push_back(arch.c_str());
|
||||
}
|
||||
|
||||
// Load code object from kpack archive
|
||||
void* code_object = nullptr;
|
||||
size_t code_object_size = 0;
|
||||
|
||||
// binary_path is used to resolve relative paths to kpack archives.
|
||||
// bundle_index identifies which code object to load for multi-TU binaries.
|
||||
// The kernel_name (used for TOC lookup) is embedded in the HIPK metadata.
|
||||
kpack_error_t err =
|
||||
kpack_load_code_object(getHipKpackCache(), params.metadata, fname_.c_str(),
|
||||
static_cast<uint32_t>(params.bundle_index),
|
||||
arch_ptrs.data(), arch_ptrs.size(), &code_object, &code_object_size);
|
||||
|
||||
if (err != KPACK_SUCCESS) {
|
||||
LogPrintfError("kpack_load_code_object failed with error: %d", err);
|
||||
return hipErrorInvalidImage;
|
||||
}
|
||||
|
||||
// Add code object to all devices
|
||||
for (auto device : devices) {
|
||||
hipError_t hip_err = AddDevProgram(device, code_object, code_object_size, 0);
|
||||
if (hip_err != hipSuccess) {
|
||||
kpack_free_code_object(code_object);
|
||||
return hip_err;
|
||||
}
|
||||
}
|
||||
|
||||
// Track allocation for cleanup in destructor
|
||||
code_obj_allocations_.insert(code_object);
|
||||
|
||||
return hipSuccess;
|
||||
#endif
|
||||
}
|
||||
|
||||
hipError_t FatBinaryInfo::AddDevProgram(hip::Device* device, const void* binary_image,
|
||||
size_t binary_size, size_t binary_offset) {
|
||||
int devID = device->deviceId();
|
||||
|
||||
@@ -28,6 +28,8 @@ THE SOFTWARE.
|
||||
#include "hip_internal.hpp"
|
||||
#include "platform/program.hpp"
|
||||
|
||||
#include <optional>
|
||||
|
||||
// Forward declaration for Unique FD
|
||||
struct UniqueFD;
|
||||
|
||||
@@ -36,10 +38,20 @@ namespace hip {
|
||||
// Fat Binary Info
|
||||
class FatBinaryInfo {
|
||||
public:
|
||||
// Parameters for kpack'd (split device code) binaries
|
||||
struct KpackParams {
|
||||
const void* metadata; //!< Msgpack metadata from .rocm_kpack_ref section
|
||||
std::string binary_path; //!< Path to the host binary
|
||||
uint64_t bundle_index; //!< Bundle index for multi-TU binaries (0-based)
|
||||
};
|
||||
|
||||
FatBinaryInfo(const char* fname, const void* image);
|
||||
// Constructor for kpack'd (split device code) binaries
|
||||
explicit FatBinaryInfo(KpackParams kpack_params);
|
||||
~FatBinaryInfo();
|
||||
|
||||
hipError_t ExtractFatBinaryUsingCOMGR(const std::vector<hip::Device*>& devices);
|
||||
hipError_t ExtractKpackBinary(const std::vector<hip::Device*>& devices);
|
||||
hipError_t AddDevProgram(hip::Device* device, const void* binary_image, size_t binary_size,
|
||||
size_t binary_offset);
|
||||
hipError_t BuildProgram(const int device_id);
|
||||
@@ -84,6 +96,9 @@ class FatBinaryInfo {
|
||||
// Only used for FBs where image is directly passed
|
||||
std::string uri_; //!< Uniform resource indicator
|
||||
|
||||
// Kpack parameters for split device code binaries (nullopt for normal fat binaries)
|
||||
std::optional<KpackParams> kpack_params_;
|
||||
|
||||
std::vector<amd::Program*> dev_programs_; //!< Program info per Device
|
||||
|
||||
std::shared_ptr<UniqueFD> ufd_; //!< Unique file descriptor
|
||||
|
||||
@@ -73,6 +73,17 @@ static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const c
|
||||
|
||||
void** __hipRegisterFatBinary(const void* data) {
|
||||
const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
|
||||
|
||||
// Check for HIPK magic (kpack'd binary with external device code)
|
||||
if (fbwrapper->magic == symbols::kHipkMagic && fbwrapper->version == 1) {
|
||||
// For HIPK binaries, fbwrapper->binary points to msgpack metadata
|
||||
// Route through addKpackBinary which will error if ROCM_KPACK_ENABLED=OFF
|
||||
bool success = false;
|
||||
auto fat_binary_info = PlatformState::instance().addKpackBinary(fbwrapper->binary, data, success);
|
||||
return success ? reinterpret_cast<void**>(fat_binary_info) : nullptr;
|
||||
}
|
||||
|
||||
// Normal HIPF path
|
||||
if (fbwrapper->magic != __hipFatMAGIC2 || fbwrapper->version != 1) {
|
||||
LogPrintfError("Cannot Register fat binary. FatMagic: %u version: %u ", fbwrapper->magic,
|
||||
fbwrapper->version);
|
||||
@@ -1003,6 +1014,11 @@ hip::FatBinaryInfo** PlatformState::addFatBinary(const void* data, bool& success
|
||||
return statCO_.addFatBinary(data, initialized_, success);
|
||||
}
|
||||
|
||||
hip::FatBinaryInfo** PlatformState::addKpackBinary(const void* hipk_metadata,
|
||||
const void* wrapper_addr, bool& success) {
|
||||
return statCO_.addKpackBinary(hipk_metadata, wrapper_addr, initialized_, success);
|
||||
}
|
||||
|
||||
hipError_t PlatformState::removeFatBinary(hip::FatBinaryInfo** module) {
|
||||
return statCO_.removeFatBinary(module);
|
||||
}
|
||||
|
||||
@@ -87,6 +87,8 @@ class PlatformState {
|
||||
|
||||
// Static Code Objects functions
|
||||
hip::FatBinaryInfo** addFatBinary(const void* data, bool& success);
|
||||
hip::FatBinaryInfo** addKpackBinary(const void* hipk_metadata, const void* wrapper_addr,
|
||||
bool& success);
|
||||
hipError_t removeFatBinary(hip::FatBinaryInfo** module);
|
||||
hipError_t digestFatBinary(const void* data, hip::FatBinaryInfo*& programs);
|
||||
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle