diff --git a/CMakeLists.txt b/CMakeLists.txt index 1df9455eee..4e4a295c35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,7 @@ if(HIP_PLATFORM STREQUAL "hcc") set(SOURCE_FILES_RUNTIME src/code_object_bundle.cpp + src/hip_clang.cpp src/hip_hcc.cpp src/hip_context.cpp src/hip_device.cpp diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index c5ad715bba..92f06e9174 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -534,19 +534,67 @@ typedef int hipLaunchParm; #include -#if defined(__cplusplus) -extern "C" { -#endif /*__cplusplus*/ +#pragma push_macro("__DEVICE__") +#define __DEVICE__ static __device__ __forceinline__ -hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream); -hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset); -hipError_t hipLaunchByPtr(const void* func); +extern "C" __device__ size_t __ockl_get_local_id(uint); +__DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); } +__DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); } +__DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); } -#if defined(__cplusplus) -} -#endif /*__cplusplus*/ +extern "C" __device__ size_t __ockl_get_group_id(uint); +__DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); } +__DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); } +__DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); } + +extern "C" __device__ size_t __ockl_get_local_size(uint); +__DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); } +__DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); } +__DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); } + +extern "C" __device__ size_t __ockl_get_num_groups(uint); +__DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); } +__DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); } +__DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); } + +#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \ + __declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \ + __DEVICE__ uint __get_##DIMENSION(void) { \ + return FUNCTION; \ + } + +struct __hip_builtin_threadIdx_t { + __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x()); + __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y()); + __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z()); +}; + +struct __hip_builtin_blockIdx_t { + __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x()); + __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y()); + __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z()); +}; + +struct __hip_builtin_blockDim_t { + __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x()); + __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y()); + __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z()); +}; + +struct __hip_builtin_gridDim_t { + __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x()); + __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y()); + __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z()); +}; + +#undef __HIP_DEVICE_BUILTIN +#pragma pop_macro("__DEVICE__") + +extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx; +extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx; +extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim; +extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; -#include <__clang_cuda_builtin_vars.h> #define hipThreadIdx_x threadIdx.x #define hipThreadIdx_y threadIdx.y diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 7eea69186e..78d9e5386c 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -2427,6 +2427,59 @@ hipError_t hipIpcCloseMemHandle(void* devPtr); // hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags); +/** + *------------------------------------------------------------------------------------------------- + *------------------------------------------------------------------------------------------------- + * @defgroup Clang Launch API to support the triple-chevron syntax + * @{ + */ + +/** + * @brief Configure a kernel launch. + * + * @param [in] gridDim grid dimension specified as multiple of blockDim. + * @param [in] blockDim block dimensions specified in work-items + * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The + * kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the + * default stream is used with associated synchronization rules. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + * + */ +hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream); + + +/** + * @brief Set a kernel argument. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + * + * @param [in] arg Pointer the argument in host memory. + * @param [in] size Size of the argument. + * @param [in] offset Offset of the argument on the argument stack. + * + */ +hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset); + + +/** + * @brief Launch a kernel. + * + * @param [in] func Kernel to launch. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + * + */ +hipError_t hipLaunchByPtr(const void* func); + + + +/** + * @} + */ + + #ifdef __cplusplus } /* extern "c" */ #endif diff --git a/src/hip_clang.cpp b/src/hip_clang.cpp new file mode 100644 index 0000000000..5c4da0b39d --- /dev/null +++ b/src/hip_clang.cpp @@ -0,0 +1,219 @@ +/* +Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#include "hip/hip_runtime.h" +#include "hip_hcc_internal.h" +#include "trace_helper.h" + +constexpr unsigned __cudaFatMAGIC2 = 0x466243b1; + +#define CLANG_OFFLOAD_BUNDLER_MAGIC "__CLANG_OFFLOAD_BUNDLE__" +#define AMDGCN_AMDHSA_TRIPLE "openmp-amdgcn--amdhsa" + +struct __ClangOffloadBundleDesc { + uint64_t offset; + uint64_t size; + uint64_t tripleSize; + const char triple[1]; +}; + +struct __ClangOffloadBundleHeader { + const char magic[sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1]; + uint64_t numBundles; + __ClangOffloadBundleDesc desc[1]; +}; + +struct __CudaFatBinaryWrapper { + unsigned int magic; + unsigned int version; + __ClangOffloadBundleHeader* binary; + void* unused; +}; + + +extern "C" std::unordered_map* +__hipRegisterFatBinary(const void* data) +{ + HIP_INIT(); + + const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast(data); + if (fbwrapper->magic != __cudaFatMAGIC2 || fbwrapper->version != 1) { + return nullptr; + } + + const __ClangOffloadBundleHeader* header = fbwrapper->binary; + std::string magic(reinterpret_cast(header), sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1); + if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC)) { + return nullptr; + } + + auto modules = new std::unordered_map{}; + if (!modules) { + return nullptr; + } + + const __ClangOffloadBundleDesc* desc = &header->desc[0]; + for (uint64_t i = 0; i < header->numBundles; ++i, + desc = reinterpret_cast( + reinterpret_cast(&desc->triple[0]) + desc->tripleSize)) { + + std::string triple{&desc->triple[0], sizeof(AMDGCN_AMDHSA_TRIPLE) - 1}; + if (triple.compare(AMDGCN_AMDHSA_TRIPLE)) + continue; + + hipModule_t module; + if (hipSuccess == hipModuleLoadData(&module, reinterpret_cast( + reinterpret_cast(header) + desc->offset))) { + modules->emplace(std::string{&desc->triple[sizeof(AMDGCN_AMDHSA_TRIPLE)], + desc->tripleSize - sizeof(AMDGCN_AMDHSA_TRIPLE)}, + module); + } + } + + return modules; +} + +std::map> g_functions; + +extern "C" void __hipRegisterFunction( + std::unordered_map* modules, + const void* hostFunction, + char* deviceFunction, + const char* deviceName, + unsigned int threadLimit, + uint3* tid, + uint3* bid, + dim3* blockDim, + dim3* gridDim, + int* wSize) +{ + std::vector functions{g_deviceCnt}; + + for (auto&& it : *modules) { + hipFunction_t function; + if (hipSuccess != hipModuleGetFunction(&function, it.second, deviceName)) { + continue; + } + + for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) { + char name[64] = {}; + hsa_agent_get_info(g_allAgents[deviceId + 1], HSA_AGENT_INFO_NAME, name); + if (!it.first.compare(name)) { + functions[deviceId] = function; + } + } + } + + g_functions.insert(std::make_pair(hostFunction, std::move(functions))); +} + +extern "C" void __hipRegisterVar( + hipModule_t module, + char* hostVar, + char* deviceVar, + const char* deviceName, + int ext, + int size, + int constant, + int global) +{ +} + +extern "C" void __hipUnregisterFatBinary(std::unordered_map* modules) +{ + for (auto&& it : *modules) { + delete it.second; + } + delete modules; +} + +hipError_t hipConfigureCall( + dim3 gridDim, + dim3 blockDim, + size_t sharedMem, + hipStream_t stream) +{ + auto ctx = ihipGetTlsDefaultCtx(); + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + + crit->_execStack.push(ihipExec_t{gridDim, blockDim, sharedMem, stream}); + return hipSuccess; +} + +hipError_t hipSetupArgument( + const void *arg, + size_t size, + size_t offset) +{ + auto ctx = ihipGetTlsDefaultCtx(); + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + auto& arguments = crit->_execStack.top()._arguments; + + if (arguments.size() < offset + size) { + arguments.resize(offset + size); + } + + ::memcpy(&arguments[offset], arg, size); + return hipSuccess; +} + +hipError_t hipLaunchByPtr(const void *hostFunction) +{ + ihipExec_t exec; + { + auto ctx = ihipGetTlsDefaultCtx(); + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + exec = std::move(crit->_execStack.top()); + crit->_execStack.pop(); + } + + int deviceId; + if (exec._hStream) { + deviceId = exec._hStream->getDevice()->_deviceId; + } + else if (ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { + deviceId = ihipGetTlsDefaultCtx()->getDevice()->_deviceId; + } + else { + deviceId = 0; + } + + decltype(g_functions)::iterator it; + if ((it = g_functions.find(hostFunction)) == g_functions.end()) + return hipErrorUnknown; + + size_t size = exec._arguments.size(); + void *extra[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &exec._arguments[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + return hipModuleLaunchKernel(it->second[deviceId], + exec._gridDim.x, exec._gridDim.y, exec._gridDim.z, + exec._blockDim.x, exec._blockDim.y, exec._blockDim.z, + exec._sharedMem, exec._hStream, nullptr, extra); +} + diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 0ef5dcaa2a..ce8041ef63 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -789,6 +789,16 @@ class ihipDevice_t { //============================================================================= +//--- +// +struct ihipExec_t { + dim3 _gridDim; + dim3 _blockDim; + size_t _sharedMem; + hipStream_t _hStream; + std::vector _arguments; +}; + //============================================================================= // class ihipCtxCriticalBase_t template @@ -827,6 +837,8 @@ class ihipCtxCriticalBase_t : LockedBase { // TODO - move private std::list _peers; // list of enabled peer devices. + //--- Execution stack: + std::stack _execStack; // Execution stack for this device. friend class LockedAccessor;