diff --git a/projects/clr/hipamd/api/hip/hip_context.cpp b/projects/clr/hipamd/api/hip/hip_context.cpp index 983dc9b13c..9603b938b6 100644 --- a/projects/clr/hipamd/api/hip/hip_context.cpp +++ b/projects/clr/hipamd/api/hip/hip_context.cpp @@ -25,16 +25,16 @@ THE SOFTWARE. #include "platform/runtime.hpp" #include "utils/versions.hpp" #include +#include thread_local amd::Context* g_context = nullptr; thread_local std::stack g_ctxtStack; std::vector g_devices; +std::once_flag g_ihipInitialized; -hipError_t hipInit(unsigned int flags) +void ihipInit() { - HIP_INIT_API(flags); - if (!amd::Runtime::initialized()) { amd::Runtime::init(); } @@ -44,18 +44,26 @@ hipError_t hipInit(unsigned int flags) for (unsigned int i=0; i device(1, devices[i]); amd::Context* context = new amd::Context(device, amd::Context::Info()); - if (!context) return hipErrorOutOfMemory; + if (!context) return; if (context && CL_SUCCESS != context->create(nullptr)) { context->release(); } else { g_devices.push_back(context); + g_context = context; } } +} + + +hipError_t hipInit(unsigned int flags) +{ + HIP_INIT_API(flags); return hipSuccess; } + hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) { HIP_INIT_API(ctx, flags, device); @@ -171,4 +179,4 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx) g_ctxtStack.push(g_context); return hipSuccess; -} \ No newline at end of file +} diff --git a/projects/clr/hipamd/api/hip/hip_internal.hpp b/projects/clr/hipamd/api/hip/hip_internal.hpp index 239538e613..3d334fa2ac 100644 --- a/projects/clr/hipamd/api/hip/hip_internal.hpp +++ b/projects/clr/hipamd/api/hip/hip_internal.hpp @@ -25,20 +25,26 @@ THE SOFTWARE. #include "cl_common.hpp" -#define HIP_INIT()\ +#include + +#define HIP_INIT() \ + std::call_once(g_ihipInitialized, ihipInit); + + +// This macro should be called at the beginning of every HIP API. +#define HIP_INIT_API(...) \ + HIP_INIT(); \ + \ amd::Thread* thread = amd::Thread::current(); \ if (!CL_CHECK_THREAD(thread)) { \ return hipErrorOutOfMemory; \ } - -// This macro should be called at the beginning of every HIP API. -#define HIP_INIT_API(...) \ - HIP_INIT() - +extern std::once_flag g_ihipInitialized; extern thread_local amd::Context* g_context; extern std::vector g_devices; -hipError_t ihipDeviceGetCount(int* count); +extern hipError_t ihipDeviceGetCount(int* count); +extern void ihipInit(); #endif // HIP_SRC_HIP_INTERNAL_H diff --git a/projects/clr/hipamd/api/hip/hip_platform.cpp b/projects/clr/hipamd/api/hip/hip_platform.cpp index 0cc6a3b1c2..db7939c9e7 100644 --- a/projects/clr/hipamd/api/hip/hip_platform.cpp +++ b/projects/clr/hipamd/api/hip/hip_platform.cpp @@ -44,7 +44,7 @@ struct __CudaFatBinaryHeader { unsigned long long int fatSize; }; -struct __CudaPartHeader{ +struct __CudaPartHeader { unsigned short type; unsigned short dummy1; unsigned int headerSize; @@ -54,31 +54,20 @@ struct __CudaPartHeader{ unsigned int subarch; }; -extern "C" hipModule_t __hipRegisterFatBinary(void* bundle) +static hipModule_t registerCudaFatBinary(const __CudaFatBinaryHeader* fbheader) { - if (!amd::Runtime::initialized()) { // FIXME: fix initialization - hipInit(0); - } + const __CudaPartHeader* pheader = reinterpret_cast( + reinterpret_cast(fbheader) + fbheader->headerSize); + const __CudaPartHeader* end = reinterpret_cast( + reinterpret_cast(pheader) + fbheader->fatSize); amd::Program* program = new amd::Program(*g_context); if (!program) return nullptr; - struct __CudaFatBinaryWrapper* fbwrapper = (struct __CudaFatBinaryWrapper*)bundle; - if (fbwrapper->magic != __cudaFatMAGIC2 || fbwrapper->version != 1) { - return nullptr; - } - struct __CudaFatBinaryHeader* fbheader = (struct __CudaFatBinaryHeader*)fbwrapper->binary; - if (fbheader->magic != __cudaFatMAGIC3 || fbheader->version != 1) { - return nullptr; - } - struct __CudaPartHeader* pheader = (struct __CudaPartHeader*)( - (uintptr_t)fbheader + fbheader->headerSize); - struct __CudaPartHeader* end = (struct __CudaPartHeader*)( - (uintptr_t)pheader + fbheader->fatSize); - while (pheader < end) { if (true/*pheader->subarch == match a device in the context*/) { - void *image = (void*)((uintptr_t)pheader + pheader->headerSize); + const void *image = reinterpret_cast( + reinterpret_cast(pheader) + pheader->headerSize); size_t size = pheader->partSize; if (CL_SUCCESS != program->addDeviceProgram(*g_context->devices()[0], image, size) || CL_SUCCESS != program->build(g_context->devices(), nullptr, nullptr, nullptr)) { @@ -86,13 +75,83 @@ extern "C" hipModule_t __hipRegisterFatBinary(void* bundle) } break; } - pheader = (struct __CudaPartHeader*)( - (uintptr_t)pheader + pheader->headerSize + pheader->partSize); + pheader = reinterpret_cast( + reinterpret_cast(pheader) + pheader->headerSize + pheader->partSize); } return reinterpret_cast(as_cl(program)); } +#define CLANG_OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__" +#define AMDGCN_AMDHSA_TRIPLE "openmp-amdgcn--amdhsa" + +struct __ClangOffloadBundleDesc { + uint64_t offset; + uint64_t size; + uint64_t tripleSize; + const char triple[1]; +}; + +struct __ClangOffloadBundleHeader { + const char magic[sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) - 1]; + uint64_t numBundles; + __ClangOffloadBundleDesc desc[1]; +}; + +static hipModule_t registerOffloadBundle(const __ClangOffloadBundleHeader* obheader) +{ + amd::Program* program = new amd::Program(*g_context); + if (!program) + return nullptr; + + const __ClangOffloadBundleDesc* desc = &obheader->desc[0]; + for (uint64_t i = 0; i < obheader->numBundles; ++i, + desc = reinterpret_cast( + reinterpret_cast(&desc->triple[0]) + desc->tripleSize)) { + + std::string triple(desc->triple, sizeof(AMDGCN_AMDHSA_TRIPLE) - 1); + if (triple.compare(AMDGCN_AMDHSA_TRIPLE)) + continue; + + std::string target(desc->triple + sizeof(AMDGCN_AMDHSA_TRIPLE), + desc->tripleSize - sizeof(AMDGCN_AMDHSA_TRIPLE)); + if (target.compare(g_context->devices()[0]->info().name_)) + continue; + + const void *image = reinterpret_cast( + reinterpret_cast(obheader) + desc->offset); + size_t size = desc->size; + + if (CL_SUCCESS == program->addDeviceProgram(*g_context->devices()[0], image, size) && + CL_SUCCESS == program->build(g_context->devices(), nullptr, nullptr, nullptr)) + break; + } + + return reinterpret_cast(as_cl(program)); +} + + +extern "C" hipModule_t __hipRegisterFatBinary(const void* data) +{ + HIP_INIT(); + + const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast(data); + if (fbwrapper->magic != __cudaFatMAGIC2 || fbwrapper->version != 1) { + return nullptr; + } + const __CudaFatBinaryHeader* fbheader = reinterpret_cast(fbwrapper->binary); + if (fbheader->magic == __cudaFatMAGIC3 && fbheader->version == 1) { + return registerCudaFatBinary(fbheader); + } + + std::string magic((char*)fbwrapper->binary, sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) - 1); + if (!magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR)) { + return registerOffloadBundle(reinterpret_cast(fbwrapper->binary)); + } + + return nullptr; +} + std::map g_functions; @@ -108,6 +167,8 @@ extern "C" void __hipRegisterFunction( dim3* gridDim, int* wSize) { + HIP_INIT(); + amd::Program* program = as_amd(reinterpret_cast(module)); const amd::Symbol* symbol = program->findSymbol(deviceName); @@ -130,12 +191,14 @@ extern "C" void __hipRegisterVar( int constant, int global) { + HIP_INIT(); } extern "C" void __hipUnregisterFatBinary( hipModule_t module ) { + HIP_INIT(); } dim3 g_gridDim; // FIXME: place in execution stack @@ -149,6 +212,8 @@ extern "C" hipError_t hipConfigureCall( size_t sharedMem, hipStream_t stream) { + HIP_INIT_API(gridDim, blockDim, sharedMem, stream); + // FIXME: should push and new entry on the execution stack g_gridDim = gridDim; @@ -166,6 +231,8 @@ extern "C" hipError_t hipSetupArgument( size_t size, size_t offset) { + HIP_INIT_API(arg, size, offset); + // FIXME: should modify the top of the execution stack ::memcpy(g_arguments + offset, arg, size); @@ -174,6 +241,8 @@ extern "C" hipError_t hipSetupArgument( extern "C" hipError_t hipLaunchByPtr(const void *hostFunction) { + HIP_INIT_API(hostFunction); + std::map::iterator it; if ((it = g_functions.find(hostFunction)) == g_functions.end()) return hipErrorUnknown;