Initial integration with Alex' Generic Grid Launch

Change-Id: I559afb80e9e39ec0d119bb3bf3b85ef9e448caf6


[ROCm/clr commit: ad882222b0]
이 커밋은 다음에 포함됨:
pensun
2017-03-17 17:17:12 +00:00
커밋한 사람 Sun, Peng
부모 b4b87b8786
커밋 9c9a9bb330
10개의 변경된 파일376개의 추가작업 그리고 25개의 파일을 삭제
+11
파일 보기
@@ -0,0 +1,11 @@
//
// Created by alexv on 25/10/16.
//
#pragma once
namespace glo_tests // Documentation only.
{
#define requires(...)
#define FunctionalProcedure typename
}
+227
파일 보기
@@ -0,0 +1,227 @@
//
// Created by alexv on 25/10/16.
//
#pragma once
#include "concepts.hpp"
#include "helpers.hpp"
#include "hc.hpp"
#include "hcc_acc.h"
//#include <hip/hcc.h>
//#include <hip/hip_runtime.h>
#include <stdexcept>
#include <type_traits>
#include <utility>
namespace glo_tests
{
namespace
{
struct New_grid_launch_tag {};
struct Old_grid_launch_tag {};
}
template<FunctionalProcedure F, typename... Ts>
using is_new_grid_launch_t = typename std::conditional<
std::is_callable<F(Ts...)>{},
New_grid_launch_tag,
Old_grid_launch_tag>::type;
// TODO: - dispatch rank should be derived from the domain dimensions passed
// in, and not always assumed to be 3;
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
static
inline
void grid_launch_impl(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k,
Ts&&... args)
{
const auto d = hc::extent<3>{
num_blocks.z * dim_blocks.z,
num_blocks.y * dim_blocks.y,
num_blocks.x * dim_blocks.x}.tile_with_dynamic(
dim_blocks.z,
dim_blocks.y,
dim_blocks.x,
group_mem_bytes);
hc::accelerator_view* av = nullptr;
if (hipHccGetAcceleratorView(stream, &av) != HIP_SUCCESS) {
throw std::runtime_error{"Failed to retrieve accelerator_view!"};
}
hc::parallel_for_each(*av, d, [=](hc::tiled_index<3> idx) [[hc]] {
k(args...);
});
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
static
inline
void grid_launch_impl(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k,
Ts&&... args)
{
grid_launch_impl(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k),
hipLaunchParm{},
std::forward<Ts>(args)...);
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
static
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k,
Ts&& ... args)
{
grid_launch_impl(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k),
std::forward<Ts>(args)...);
}
template<FunctionalProcedure K, K* k, typename... Ts>
requires(Domain<K> == {Ts...})
static
inline
void grid_launch(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
Ts&&... args)
{
grid_launch(
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
[](decltype(std::decay_t<Ts>(args))... f_args) [[hc]] {
k(f_args...);
},
std::forward<Ts>(args)...);
}
template<FunctionalProcedure K, K* k, typename... Ts>
requires(Domain<K> == {Ts...})
static
inline
void grid_launch(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
Ts&&... args)
{
grid_launch<K, k>(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
hipLaunchParm{},
std::forward<Ts>(args)...);
}
template<FunctionalProcedure K, K* k, typename... Ts>
requires(Domain<K> == {Ts...})
static
inline
std::enable_if_t<std::is_function<K>::value> grid_launch(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
Ts&&... args)
{
grid_launch<K, k>(
is_new_grid_launch_t<K*, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::forward<Ts>(args)...);
}
template<typename, typename = void> struct Wrapper;
template<FunctionalProcedure K>
struct Wrapper<K, std::enable_if_t<!std::is_function<K>::value>> {
template<typename... Ts>
requires(Domain<K> == {Ts...})
void operator()(Ts&&... args) const
{
grid_launch_impl(
is_new_grid_launch_t<K, Ts...>{},
std::forward<Ts>(args)...);
}
};
template<FunctionalProcedure K>
struct Wrapper<K, std::enable_if_t<std::is_function<K>::value>> {
template<typename... Ts>
void operator()(Ts&&...) const {}
};
#warning "GGL hipLaunchKernel defined"
#define hipLaunchKernel( \
kernel_name, \
num_blocks, \
dim_blocks, \
group_mem_bytes, \
stream, \
...) \
{ \
using F = decltype(kernel_name); \
if (!std::is_function<F>::value) { \
glo_tests::Wrapper<F>{}( \
num_blocks, \
dim_blocks, \
group_mem_bytes, \
stream, \
kernel_name, \
##__VA_ARGS__); \
} \
else { \
glo_tests::grid_launch<F, &kernel_name>( \
num_blocks, \
dim_blocks, \
group_mem_bytes, \
stream, \
##__VA_ARGS__); \
} \
}
}
+96
파일 보기
@@ -0,0 +1,96 @@
//
// Created by alexv on 08/11/16.
//
#pragma once
#include <type_traits> // For std::conditional, std::decay, std::enable_if,
// std::false_type, std result_of and std::true_type.
#include <utility> // For std::declval.
namespace std
{
#if (__cplusplus < 201406L)
template<typename...>
using void_t = void;
#if (__cplusplus < 201402L)
template<bool cond, typename T = void>
using enable_if_t = typename enable_if<cond, T>::type;
template<bool cond, typename T, typename U>
using conditional_t = typename conditional<cond, T, U>::type;
template<typename T>
using decay_t = typename decay<T>::type;
template<FunctionalProcedure F, typename... Ts>
using result_of_t = typename result_of<F(Ts...)>::type;
template<
FunctionalProcedure F,
unsigned int n = 0u,
typename = void>
struct is_callable_impl : is_callable_impl<F, n + 1u> {};
// Pointer to member function, call through non-pointer.
template<FunctionalProcedure F, typename C, typename... Ts>
struct is_callable_impl<
F(C, Ts...),
0u,
void_t<decltype((declval<C>().*declval<F>())(declval<Ts>()...))>
> : true_type {
};
// Pointer to member function, call through pointer.
template<FunctionalProcedure F, typename C, typename... Ts>
struct is_callable_impl<
F(C, Ts...),
1u,
void_t<decltype(((*declval<C>()).*declval<F>())(declval<Ts>()...))>
> : std::true_type {
};
// Pointer to member data, call through non-pointer, no args.
template<FunctionalProcedure F, typename C>
struct is_callable_impl<
F(C),
2u,
void_t<decltype(declval<C>().*declval<F>())>
> : true_type {
};
// Pointer to member data, call through pointer, no args.
template<FunctionalProcedure F, typename C>
struct is_callable_impl<
F(C),
3u,
void_t<decltype(*declval<C>().*declval<F>())>
> : true_type {
};
// General call, n args.
template<FunctionalProcedure F, typename... Ts>
struct is_callable_impl<
F(Ts...),
4u,
void_t<decltype(declval<F>()(declval<Ts>()...))>
> : true_type {
};
// Not callable.
template<FunctionalProcedure F>
struct is_callable_impl<F, 5u> : false_type {};
template<typename Call>
struct is_callable : is_callable_impl<Call> {};
#else
template<typename, typename = void>
struct is_callable_impl : false_type {};
template<FunctionalProcedure F, typename... Ts>
struct is_callable_impl<
F(Ts...),
void_t<result_of_t<F(Ts...)>>> : true_type {};
template<typename F>
struct is_callable : is_callable_impl<F> {};
#endif
#endif
}
+2 -2
파일 보기
@@ -23,11 +23,11 @@ THE SOFTWARE.
#ifndef HIP_LDG_H
#define HIP_LDG_H
#if __HCC__
#if defined __HCC__
#if __hcc_workweek__ >= 16164
#include "hip_vector_types.h"
#include "host_defines.h"
#warning "LDG header included"
__device__ char __ldg(const char* );
__device__ char2 __ldg(const char2* );
+28 -18
파일 보기
@@ -32,7 +32,6 @@ THE SOFTWARE.
//---
// Top part of file can be compiled with any compiler
//#include <cstring>
#if __cplusplus
#include <cmath>
@@ -40,7 +39,8 @@ THE SOFTWARE.
#include <math.h>
#include <string.h>
#include <stddef.h>
#endif
#endif//__cplusplus
// Define NVCC_COMPAT for CUDA compatibility
#define NVCC_COMPAT
#define CUDA_SUCCESS hipSuccess
@@ -58,20 +58,30 @@ THE SOFTWARE.
//---
// Remainder of this file only compiles with HCC
#ifdef __HCC__
#if defined __HCC__
#include <grid_launch.h>
#if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20)
// Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
//TODO-HCC-GL - change this to typedef.
//typedef grid_launch_parm hipLaunchParm ;
struct EmptyLaunchParm{};
#ifndef GENERIC_GRID_LAUNCH
#define hipLaunchParm grid_launch_parm
#else
#define hipLaunchParm EmptyLaunchParm
#endif //GENERIC_GRID_LAUNCH
#if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || defined GENERIC_GRID_LAUNCH
#else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
#error (HCC must support GRID_LAUNCH_20)
#endif
#endif //GRID_LAUNCH_VERSION
#endif //HCC
#if defined GENERIC_GRID_LAUNCH && defined __HCC__
#include "grid_launch_v2.hpp"
#endif//GENERIC_GRID_LAUNCH
extern int HIP_TRACE_API;
//TODO-HCC-GL - change this to typedef.
//typedef grid_launch_parm hipLaunchParm ;
#define hipLaunchParm grid_launch_parm
#ifdef __cplusplus
//#include <hip/hcc_detail/hip_texture.h>
#include <hip/hcc_detail/hip_ldg.h>
@@ -266,7 +276,7 @@ __device__ float __shfl(float input, int lane, int width);
__device__ float __shfl_up(float input, unsigned int lane_delta, int width);
__device__ float __shfl_down(float input, unsigned int lane_delta, int width);
__device__ float __shfl_xor(float input, int lane_mask, int width);
#endif
#endif //__cplusplus
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
@@ -278,7 +288,7 @@ __device__ float __hip_ds_swizzlef(float src, int pattern);
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
#endif
#endif //__HIP_ARCH_GFX803__ == 1
__host__ __device__ int min(int arg1, int arg2);
__host__ __device__ int max(int arg1, int arg2);
@@ -409,14 +419,15 @@ static inline __device__ void* memset(void* ptr, int val, size_t size)
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) #X
#ifdef __HCC_CPP__
#if defined __HCC_CPP__
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
extern void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp);
#ifndef GENERIC_GRID_LAUNCH
#warning "Original hipLaunchKernel defined"
// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be either size_t or dim3 types
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
do {\
@@ -426,13 +437,13 @@ do {\
_kernelName (lp, ##__VA_ARGS__);\
ihipPostLaunchKernel(#_kernelName, trueStream, lp);\
} while(0)
#endif //GENERIC_GRID_LAUNCH
#elif defined (__HCC_C__)
//TODO - develop C interface.
#endif
#endif //__HCC_CPP__
/**
* extern __shared__
@@ -446,7 +457,6 @@ do {\
#define HIP_DYNAMIC_SHARED_ATTRIBUTE ADDRESS_SPACE_3
#endif // __HCC__
/**
@@ -470,4 +480,4 @@ do {\
#endif
#endif//HIP_HCC_DETAIL_RUNTIME_H
+2 -1
파일 보기
@@ -27,7 +27,8 @@ THE SOFTWARE.
* @file hcc_detail/hip_runtime_api.h
* @brief Contains C function APIs for HIP runtime. This file does not use any HCC builtin or special language extensions (-hc mode) ; those functions in hip_runtime.h.
*/
// guard for grid_launch_v2
#define GENERIC_GRID_LAUNCH
#include <stdint.h>
#include <stddef.h>
#include <iostream>
+7
파일 보기
@@ -37,7 +37,14 @@ THE SOFTWARE.
#define __host__ __attribute__((cpu))
#define __device__ __attribute__((hc))
#warning "HOST DEFINE header included"
#ifndef GENERIC_GRID_LAUNCH
#warning "original global define reached"
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
#else
#warning "GGL global define reached"
#define __global__ [[hc]] __attribute__((weak))
#endif //GENERIC_GRID_LAUNCH
#define __noinline__ __attribute__((noinline))
#define __forceinline__ __attribute__((always_inline))
+1 -1
파일 보기
@@ -83,7 +83,7 @@ int main(int argc, char *argv[])
const unsigned threadsPerBlock = 256;
printf ("info: launch 'vector_square' kernel\n");
hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
hipLaunchKernel(vector_square<float>, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N);
printf ("info: copy Device2Host\n");
CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-1
파일 보기
@@ -1556,7 +1556,6 @@ void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launc
MARKER_END();
}
//=================================================================================================
// HIP API Implementation
//
+2 -2
파일 보기
@@ -28,7 +28,6 @@ THE SOFTWARE.
#include <iostream>
#include <iomanip>
#include <string>
//---
// Helper functions to convert HIP function arguments into strings.
// Handles POD data types as well as enumerations (ie hipMemcpyKind).
@@ -71,7 +70,7 @@ inline std::string ToString(hipEvent_t v)
ss << v;
return ss.str();
};
#ifndef GENERIC_GRID_LAUNCH
// hipStream_t
template <>
inline std::string ToString(hipStream_t v)
@@ -85,6 +84,7 @@ inline std::string ToString(hipStream_t v)
return ss.str();
};
#endif //GENERIC_GRID_LAUNCH
// hipMemcpyKind specialization
template <>