1. Split hip_ir.ll to hip_hc.ll and hip_hc_gfx803.ll
 a. hip_hc.ll contains arch generic ir implementations
 b. hip_hc_gfx803.ll contains gfx803 (fiji, polaris) specific ir
2. HIPCC can now parse --amdgpu-target=*.
 a. Usage: hipcc --amdgpu-target=gfx803 --amdgpu-target=gfx701
 b. TODO: Convert to --amdgpu-target=gfx803,gfx701
3. With LLC in HCC able to generate native f16 isa, removed inline half asm math ops
4. Fixed threadfence and threadfence_block to use functions in rocdl

Change-Id: Ic9a9e3e04139b0d75d2c2a263c030ca77adc1019


[ROCm/clr commit: 60ec83c683]
Этот коммит содержится в:
Aditya Atluri
2017-02-08 12:04:05 -06:00
родитель 2cef50ac30
Коммит 14dcdff8f4
7 изменённых файлов: 134 добавлений и 160 удалений
+1 -1
Просмотреть файл
@@ -218,7 +218,7 @@ add_custom_target(doc COMMAND HIP_PATH=${CMAKE_CURRENT_SOURCE_DIR} doxygen ${CMA
# Install hip_hcc if platform is hcc
if(HIP_PLATFORM STREQUAL "hcc")
install(TARGETS hip_hcc_static hip_hcc hip_device DESTINATION lib)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_ir.ll DESTINATION lib)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_hc.ll ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_hc_gfx803.ll DESTINATION lib)
# Install .hipInfo
install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib)
+38 -9
Просмотреть файл
@@ -70,6 +70,11 @@ if ($verbose & 0x2) {
# set if user explicitly requests -stdlib=libc++. (else we default to libstdc++ for better interop with g++):
$setStdLib = 0; # TODO - set to 0
$target_gfx701 = 0;
$target_gfx801 = 0;
$target_gfx802 = 0;
$target_gfx803 = 0;
if ($HIP_PLATFORM eq "hcc") {
$HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa";
@@ -129,18 +134,25 @@ if ($HIP_PLATFORM eq "hcc") {
}
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n";
# Handle ROCm target platform
if ($ROCM_TARGET eq "fiji") {
$HIPLDFLAGS .= " --amdgpu-target=gfx803";
}
if ($ROCM_TARGET eq "carrizo") {
$HIPLDFLAGS .= " --amdgpu-target=gfx801";
}
if ($ROCM_TARGET eq "hawaii") {
if ($target_gfx701 eq 1) {
$HIPLDFLAGS .= " --amdgpu-target=gfx701";
}
if ($ROCM_TARGET eq "polaris") {
if ($target_gfx801 eq 1) {
$HIPLDFLAGS .= " --amdgpu-target=gfx801";
}
if ($target_gfx802 eq 1) {
$HIPLDFLAGS .= " --amdgpu-target=gfx802";
}
if ($target_gfx803 eq 1) {
$HIPLDFLAGS .= " --amdgpu-target=gfx803";
$ENV{HIP_HC_IR_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
}
if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0)
{
$HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803";
}
# Add trace marker library:
@@ -222,7 +234,8 @@ if($HIP_PLATFORM eq "hcc"){
if(($HIP_PLATFORM eq "hcc")){
$EXPORT_LL=" ";
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n";
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n";
$ENV{HIP_HC_IR_FILE}="";
}
if($HIP_PLATFORM eq "nvcc"){
@@ -261,6 +274,22 @@ foreach $arg (@ARGV)
$HIPCXXFLAGS .= " -stdlib=libc++";
$setStdLib = 1;
}
if($arg eq '--amdgpu-target=gfx701')
{
$target_gfx701 = 1;
}
if($arg eq '--amdgpu-target=gfx801')
{
$target_gfx801 = 1;
}
if($arg eq '--amdgpu-target=gfx802')
{
$target_gfx802 = 1;
}
if($arg eq '--amdgpu-target=gfx803')
{
$target_gfx803 = 1;
}
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
{
$HIPCXXFLAGS .= " -stdlib=libstdc++";
+14 -59
Просмотреть файл
@@ -39,16 +39,16 @@ typedef struct __attribute__((aligned(4))){
/*
Half Arithmetic Functions
*/
__device__ static __half __hadd(const __half a, const __half b);
__device__ static __half __hadd_sat(__half a, __half b);
__device__ static __half __hfma(__half a, __half b, __half c);
__device__ static __half __hfma_sat(__half a, __half b, __half c);
__device__ static __half __hmul(__half a, __half b);
__device__ static __half __hmul_sat(__half a, __half b);
__device__ static __half __hneg(__half a);
__device__ static __half __hsub(__half a, __half b);
__device__ static __half __hsub_sat(__half a, __half b);
__device__ static __half hdiv(__half a, __half b);
__device__ __half __hadd(const __half a, const __half b);
__device__ __half __hadd_sat(__half a, __half b);
__device__ __half __hfma(__half a, __half b, __half c);
__device__ __half __hfma_sat(__half a, __half b, __half c);
__device__ __half __hmul(__half a, __half b);
__device__ __half __hmul_sat(__half a, __half b);
__device__ __half __hneg(__half a);
__device__ __half __hsub(__half a, __half b);
__device__ __half __hsub_sat(__half a, __half b);
__device__ __half hdiv(__half a, __half b);
/*
Half2 Arithmetic Functions
@@ -211,11 +211,6 @@ __device__ __half __ushort2half_ru(unsigned short int i);
__device__ __half __ushort2half_rz(unsigned short int i);
__device__ __half __ushort_as_half(const unsigned short int i);
extern "C" __half __hip_hc_ir_hadd_half(__half, __half);
extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half);
extern "C" __half __hip_hc_ir_hmul_half(__half, __half);
extern "C" __half __hip_hc_ir_hsub_half(__half, __half);
extern "C" int __hip_hc_ir_hadd2_int(int, int);
extern "C" int __hip_hc_ir_hfma2_int(int, int, int);
extern "C" int __hip_hc_ir_hmul2_int(int, int);
@@ -244,46 +239,6 @@ extern "C" int __hip_hc_ir_h2sin_int(int);
extern "C" int __hip_hc_ir_h2sqrt_int(int);
extern "C" int __hip_hc_ir_h2trunc_int(int);
__device__ static inline __half __hadd(const __half a, const __half b) {
return __hip_hc_ir_hadd_half(a, b);
}
__device__ static inline __half __hadd_sat(__half a, __half b) {
return __hip_hc_ir_hadd_half(a, b);
}
__device__ static inline __half __hfma(__half a, __half b, __half c) {
return __hip_hc_ir_hfma_half(a, b, c);
}
__device__ static inline __half __hfma_sat(__half a, __half b, __half c) {
return __hip_hc_ir_hfma_half(a, b, c);
}
__device__ static inline __half __hmul(__half a, __half b) {
return __hip_hc_ir_hmul_half(a, b);
}
__device__ static inline __half __hmul_sat(__half a, __half b) {
return __hip_hc_ir_hmul_half(a, b);
}
__device__ static inline __half __hneg(__half a) {
return -a;
}
__device__ static inline __half __hsub(__half a, __half b) {
return __hip_hc_ir_hsub_half(a, b);
}
__device__ static inline __half __hsub_sat(__half a, __half b) {
return __hip_hc_ir_hsub_half(a, b);
}
__device__ static inline __half hdiv(__half a, __half b) {
return a/b;
}
/*
Half2 Arithmetic Functions
*/
@@ -360,11 +315,11 @@ __device__ static inline __half hcos(const __half h) {
}
__device__ static inline __half hexp(const __half h) {
return __hip_hc_ir_hexp2_half(__hip_hc_ir_hmul_half(h, 1.442694));
return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694));
}
__device__ static inline __half hexp10(const __half h) {
return __hip_hc_ir_hexp2_half(__hip_hc_ir_hmul_half(h, 3.3219281));
return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281));
}
__device__ static inline __half hexp2(const __half h) {
@@ -376,11 +331,11 @@ __device__ static inline __half hfloor(const __half h) {
}
__device__ static inline __half hlog(const __half h) {
return __hip_hc_ir_hmul_half(__hip_hc_ir_hlog2_half(h), 0.693147);
return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147);
}
__device__ static inline __half hlog10(const __half h) {
return __hip_hc_ir_hmul_half(__hip_hc_ir_hlog2_half(h), 0.301029);
return __hmul(__hip_hc_ir_hlog2_half(h), 0.301029);
}
__device__ static inline __half hlog2(const __half h) {
+10 -2
Просмотреть файл
@@ -277,6 +277,10 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
* On AMD platforms, the threadfence* routines are currently empty stubs.
*/
extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm("__llvm_fence_sc_wg");
/**
* @brief threadfence_block makes writes visible to threads running in same block.
*
@@ -287,7 +291,9 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
* @warning __threadfence_block is a stub and map to no-op.
*/
// __device__ void __threadfence_block(void);
extern "C" __device__ void __threadfence_block(void);
__device__ static inline void __threadfence_block(void) {
return __hip_hc_threadfence_block();
}
/**
* @brief threadfence makes wirtes visible to other threads running on same GPU.
@@ -299,7 +305,9 @@ extern "C" __device__ void __threadfence_block(void);
* @warning __threadfence is a stub and map to no-op, application should set "export HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
*/
// __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
extern "C" __device__ void __threadfence(void);
__device__ static inline void __threadfence(void) {
return __hip_hc_threadfence();
}
/**
* @brief threadfence_system makes writes to pinned system memory visible on host CPU.
+41
Просмотреть файл
@@ -32,6 +32,47 @@ struct hipHalfHolder{
#define HINF 65504
static struct hipHalfHolder __hInfValue = {HINF};
__device__ __half __hadd(__half a, __half b) {
return a + b;
}
__device__ __half __hadd_sat(__half a, __half b) {
return a + b;
}
__device__ __half __hfma(__half a, __half b, __half c) {
return a * b + c;
}
__device__ __half __hfma_sat(__half a, __half b, __half c) {
return a * b + c;
}
__device__ __half __hmul(__half a, __half b) {
return a * b;
}
__device__ __half __hmul_sat(__half a, __half b) {
return a * b;
}
__device__ __half __hneg(__half a) {
return -a;
}
__device__ __half __hsub(__half a, __half b) {
return a - b;
}
__device__ __half __hsub_sat(__half a, __half b) {
return a - b;
}
__device__ __half hdiv(__half a, __half b) {
return a / b;
}
/*
Half comparision Functions
*/
+30
Просмотреть файл
@@ -0,0 +1,30 @@
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--amdhsa"
define i32 @__hip_hc_ir_mul24_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_i32_i24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_umul24_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_u32_u24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_mulhi_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_hi_i32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_umulhi_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_hi_u32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_usad_int(i32 %a, i32 %b, i32 %c) #1 {
%1 = tail call i32 asm sideeffect "v_sad_u32 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
ret i32 %1
}
attributes #1 = { alwaysinline nounwind }
-89
Просмотреть файл
@@ -2,65 +2,6 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:
target triple = "amdgcn--amdhsa"
define void @__threadfence() #1 {
fence syncscope(2) seq_cst
ret void
}
define void @__threadfence_block() #1 {
fence syncscope(3) seq_cst
ret void
}
; Lightning does not support inline asm for 16-bit data types
; So, bitcast half to short and then extend to 32bit i32
; After inline asm, convert back to half
define half @__hip_hc_ir_hadd_half(half %a, half %b) #1 {
%1 = bitcast half %a to i16
%2 = bitcast half %b to i16
%3 = zext i16 %1 to i32
%4 = zext i16 %2 to i32
%5 = tail call i32 asm "v_add_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
%6 = trunc i32 %5 to i16
%7 = bitcast i16 %6 to half
ret half %7
}
define half @__hip_hc_ir_hsub_half(half %a, half %b) #1 {
%1 = bitcast half %a to i16
%2 = bitcast half %b to i16
%3 = zext i16 %1 to i32
%4 = zext i16 %2 to i32
%5 = tail call i32 asm "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
%6 = trunc i32 %5 to i16
%7 = bitcast i16 %6 to half
ret half %7
}
define half @__hip_hc_ir_hmul_half(half %a, half %b) #1 {
%1 = bitcast half %a to i16
%2 = bitcast half %b to i16
%3 = zext i16 %1 to i32
%4 = zext i16 %2 to i32
%5 = tail call i32 asm "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
%6 = trunc i32 %5 to i16
%7 = bitcast i16 %6 to half
ret half %7
}
define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 {
%1 = bitcast half %a to i16
%2 = bitcast half %b to i16
%3 = bitcast half %c to i16
%4 = zext i16 %1 to i32
%5 = zext i16 %2 to i32
%6 = zext i16 %3 to i32
%7 = tail call i32 asm "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %4, i32 %5, i32 %6)
%8 = trunc i32 %7 to i16
%9 = bitcast i16 %8 to half
ret half %9
}
define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
@@ -146,34 +87,4 @@ define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 {
ret i32 %1
}
define i32 @__hip_hc_ir_mul24_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_i32_i24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_umul24_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_u32_u24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_mulhi_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_hi_i32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_umulhi_int(i32 %a, i32 %b) #1 {
%1 = tail call i32 asm sideeffect "v_mul_hi_u32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
ret i32 %1
}
define i32 @__hip_hc_ir_usad_int(i32 %a, i32 %b, i32 %c) #1 {
%1 = tail call i32 asm sideeffect "v_sad_u32 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
ret i32 %1
}
define i32 @__hip_hc_ir_sadu8_int(i32 %a, i32 %b, i32 %c) #1 {
%1 = tail call i32 asm sideeffect "v_sad_u8 $0, $1, $2 $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
ret i32 %1
}
attributes #1 = { alwaysinline nounwind }