Merge pull request #388 from lmoriche/master
Add support for clang offload bundles and <<<>>> kernel launch.
Šī revīzija ir iekļauta:
@@ -175,6 +175,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
|
||||
set(SOURCE_FILES_RUNTIME
|
||||
src/code_object_bundle.cpp
|
||||
src/hip_clang.cpp
|
||||
src/hip_hcc.cpp
|
||||
src/hip_context.cpp
|
||||
src/hip_device.cpp
|
||||
|
||||
@@ -534,19 +534,67 @@ typedef int hipLaunchParm;
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif /*__cplusplus*/
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ static __device__ __forceinline__
|
||||
|
||||
hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream);
|
||||
hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
|
||||
hipError_t hipLaunchByPtr(const void* func);
|
||||
extern "C" __device__ size_t __ockl_get_local_id(uint);
|
||||
__DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
|
||||
__DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
|
||||
__DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif /*__cplusplus*/
|
||||
extern "C" __device__ size_t __ockl_get_group_id(uint);
|
||||
__DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
|
||||
__DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
|
||||
__DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }
|
||||
|
||||
extern "C" __device__ size_t __ockl_get_local_size(uint);
|
||||
__DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
|
||||
__DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
|
||||
__DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }
|
||||
|
||||
extern "C" __device__ size_t __ockl_get_num_groups(uint);
|
||||
__DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
|
||||
__DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
|
||||
__DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
|
||||
|
||||
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
|
||||
__declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \
|
||||
__DEVICE__ uint __get_##DIMENSION(void) { \
|
||||
return FUNCTION; \
|
||||
}
|
||||
|
||||
struct __hip_builtin_threadIdx_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
|
||||
};
|
||||
|
||||
struct __hip_builtin_blockIdx_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
|
||||
};
|
||||
|
||||
struct __hip_builtin_blockDim_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
|
||||
};
|
||||
|
||||
struct __hip_builtin_gridDim_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
|
||||
};
|
||||
|
||||
#undef __HIP_DEVICE_BUILTIN
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
||||
extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
|
||||
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
|
||||
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
|
||||
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
|
||||
|
||||
#include <__clang_cuda_builtin_vars.h>
|
||||
|
||||
#define hipThreadIdx_x threadIdx.x
|
||||
#define hipThreadIdx_y threadIdx.y
|
||||
|
||||
@@ -2427,6 +2427,59 @@ hipError_t hipIpcCloseMemHandle(void* devPtr);
|
||||
// hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags);
|
||||
|
||||
|
||||
/**
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
* @defgroup Clang Launch API to support the triple-chevron syntax
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Configure a kernel launch.
|
||||
*
|
||||
* @param [in] gridDim grid dimension specified as multiple of blockDim.
|
||||
* @param [in] blockDim block dimensions specified in work-items
|
||||
* @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
|
||||
* kernel can access this with HIP_DYNAMIC_SHARED.
|
||||
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
|
||||
* default stream is used with associated synchronization rules.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*
|
||||
*/
|
||||
hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Set a kernel argument.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*
|
||||
* @param [in] arg Pointer the argument in host memory.
|
||||
* @param [in] size Size of the argument.
|
||||
* @param [in] offset Offset of the argument on the argument stack.
|
||||
*
|
||||
*/
|
||||
hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Launch a kernel.
|
||||
*
|
||||
* @param [in] func Kernel to launch.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*
|
||||
*/
|
||||
hipError_t hipLaunchByPtr(const void* func);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "c" */
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "trace_helper.h"
|
||||
|
||||
constexpr unsigned __cudaFatMAGIC2 = 0x466243b1;
|
||||
|
||||
#define CLANG_OFFLOAD_BUNDLER_MAGIC "__CLANG_OFFLOAD_BUNDLE__"
|
||||
#define AMDGCN_AMDHSA_TRIPLE "openmp-amdgcn--amdhsa"
|
||||
|
||||
struct __ClangOffloadBundleDesc {
|
||||
uint64_t offset;
|
||||
uint64_t size;
|
||||
uint64_t tripleSize;
|
||||
const char triple[1];
|
||||
};
|
||||
|
||||
struct __ClangOffloadBundleHeader {
|
||||
const char magic[sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1];
|
||||
uint64_t numBundles;
|
||||
__ClangOffloadBundleDesc desc[1];
|
||||
};
|
||||
|
||||
struct __CudaFatBinaryWrapper {
|
||||
unsigned int magic;
|
||||
unsigned int version;
|
||||
__ClangOffloadBundleHeader* binary;
|
||||
void* unused;
|
||||
};
|
||||
|
||||
|
||||
extern "C" std::unordered_map<std::string, hipModule_t>*
|
||||
__hipRegisterFatBinary(const void* data)
|
||||
{
|
||||
HIP_INIT();
|
||||
|
||||
const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast<const __CudaFatBinaryWrapper*>(data);
|
||||
if (fbwrapper->magic != __cudaFatMAGIC2 || fbwrapper->version != 1) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const __ClangOffloadBundleHeader* header = fbwrapper->binary;
|
||||
std::string magic(reinterpret_cast<const char*>(header), sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1);
|
||||
if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto modules = new std::unordered_map<std::string, hipModule_t>{};
|
||||
if (!modules) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const __ClangOffloadBundleDesc* desc = &header->desc[0];
|
||||
for (uint64_t i = 0; i < header->numBundles; ++i,
|
||||
desc = reinterpret_cast<const __ClangOffloadBundleDesc*>(
|
||||
reinterpret_cast<uintptr_t>(&desc->triple[0]) + desc->tripleSize)) {
|
||||
|
||||
std::string triple{&desc->triple[0], sizeof(AMDGCN_AMDHSA_TRIPLE) - 1};
|
||||
if (triple.compare(AMDGCN_AMDHSA_TRIPLE))
|
||||
continue;
|
||||
|
||||
hipModule_t module;
|
||||
if (hipSuccess == hipModuleLoadData(&module, reinterpret_cast<const void*>(
|
||||
reinterpret_cast<uintptr_t>(header) + desc->offset))) {
|
||||
modules->emplace(std::string{&desc->triple[sizeof(AMDGCN_AMDHSA_TRIPLE)],
|
||||
desc->tripleSize - sizeof(AMDGCN_AMDHSA_TRIPLE)},
|
||||
module);
|
||||
}
|
||||
}
|
||||
|
||||
return modules;
|
||||
}
|
||||
|
||||
std::map<const void*, std::vector<hipFunction_t>> g_functions;
|
||||
|
||||
extern "C" void __hipRegisterFunction(
|
||||
std::unordered_map<std::string, hipModule_t>* modules,
|
||||
const void* hostFunction,
|
||||
char* deviceFunction,
|
||||
const char* deviceName,
|
||||
unsigned int threadLimit,
|
||||
uint3* tid,
|
||||
uint3* bid,
|
||||
dim3* blockDim,
|
||||
dim3* gridDim,
|
||||
int* wSize)
|
||||
{
|
||||
std::vector<hipFunction_t> functions{g_deviceCnt};
|
||||
|
||||
for (auto&& it : *modules) {
|
||||
hipFunction_t function;
|
||||
if (hipSuccess != hipModuleGetFunction(&function, it.second, deviceName)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) {
|
||||
char name[64] = {};
|
||||
hsa_agent_get_info(g_allAgents[deviceId + 1], HSA_AGENT_INFO_NAME, name);
|
||||
if (!it.first.compare(name)) {
|
||||
functions[deviceId] = function;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
g_functions.insert(std::make_pair(hostFunction, std::move(functions)));
|
||||
}
|
||||
|
||||
extern "C" void __hipRegisterVar(
|
||||
hipModule_t module,
|
||||
char* hostVar,
|
||||
char* deviceVar,
|
||||
const char* deviceName,
|
||||
int ext,
|
||||
int size,
|
||||
int constant,
|
||||
int global)
|
||||
{
|
||||
}
|
||||
|
||||
extern "C" void __hipUnregisterFatBinary(std::unordered_map<std::string, hipModule_t>* modules)
|
||||
{
|
||||
for (auto&& it : *modules) {
|
||||
delete it.second;
|
||||
}
|
||||
delete modules;
|
||||
}
|
||||
|
||||
hipError_t hipConfigureCall(
|
||||
dim3 gridDim,
|
||||
dim3 blockDim,
|
||||
size_t sharedMem,
|
||||
hipStream_t stream)
|
||||
{
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
|
||||
crit->_execStack.push(ihipExec_t{gridDim, blockDim, sharedMem, stream});
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipSetupArgument(
|
||||
const void *arg,
|
||||
size_t size,
|
||||
size_t offset)
|
||||
{
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
auto& arguments = crit->_execStack.top()._arguments;
|
||||
|
||||
if (arguments.size() < offset + size) {
|
||||
arguments.resize(offset + size);
|
||||
}
|
||||
|
||||
::memcpy(&arguments[offset], arg, size);
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipLaunchByPtr(const void *hostFunction)
|
||||
{
|
||||
ihipExec_t exec;
|
||||
{
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
exec = std::move(crit->_execStack.top());
|
||||
crit->_execStack.pop();
|
||||
}
|
||||
|
||||
int deviceId;
|
||||
if (exec._hStream) {
|
||||
deviceId = exec._hStream->getDevice()->_deviceId;
|
||||
}
|
||||
else if (ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
|
||||
deviceId = ihipGetTlsDefaultCtx()->getDevice()->_deviceId;
|
||||
}
|
||||
else {
|
||||
deviceId = 0;
|
||||
}
|
||||
|
||||
decltype(g_functions)::iterator it;
|
||||
if ((it = g_functions.find(hostFunction)) == g_functions.end())
|
||||
return hipErrorUnknown;
|
||||
|
||||
size_t size = exec._arguments.size();
|
||||
void *extra[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, &exec._arguments[0],
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
return hipModuleLaunchKernel(it->second[deviceId],
|
||||
exec._gridDim.x, exec._gridDim.y, exec._gridDim.z,
|
||||
exec._blockDim.x, exec._blockDim.y, exec._blockDim.z,
|
||||
exec._sharedMem, exec._hStream, nullptr, extra);
|
||||
}
|
||||
|
||||
@@ -789,6 +789,16 @@ class ihipDevice_t {
|
||||
//=============================================================================
|
||||
|
||||
|
||||
//---
|
||||
//
|
||||
struct ihipExec_t {
|
||||
dim3 _gridDim;
|
||||
dim3 _blockDim;
|
||||
size_t _sharedMem;
|
||||
hipStream_t _hStream;
|
||||
std::vector<char> _arguments;
|
||||
};
|
||||
|
||||
//=============================================================================
|
||||
// class ihipCtxCriticalBase_t
|
||||
template <typename MUTEX_TYPE>
|
||||
@@ -827,6 +837,8 @@ class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
|
||||
|
||||
// TODO - move private
|
||||
std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
|
||||
//--- Execution stack:
|
||||
std::stack<ihipExec_t> _execStack; // Execution stack for this device.
|
||||
|
||||
friend class LockedAccessor<ihipCtxCriticalBase_t>;
|
||||
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user