Fixed Hawaii link issues
1. Split hip_ir.ll to hip_hc.ll and hip_hc_gfx803.ll
a. hip_hc.ll contains arch generic ir implementations
b. hip_hc_gfx803.ll contains gfx803 (fiji, polaris) specific ir
2. HIPCC can now parse --amdgpu-target=*.
a. Usage: hipcc --amdgpu-target=gfx803 --amdgpu-target=gfx701
b. TODO: Convert to --amdgpu-target=gfx803,gfx701
3. With LLC in HCC able to generate native f16 isa, removed inline half asm math ops
4. Fixed threadfence and threadfence_block to use functions in rocdl
Change-Id: Ic9a9e3e04139b0d75d2c2a263c030ca77adc1019
[ROCm/clr commit: 60ec83c683]
Этот коммит содержится в:
@@ -218,7 +218,7 @@ add_custom_target(doc COMMAND HIP_PATH=${CMAKE_CURRENT_SOURCE_DIR} doxygen ${CMA
|
||||
# Install hip_hcc if platform is hcc
|
||||
if(HIP_PLATFORM STREQUAL "hcc")
|
||||
install(TARGETS hip_hcc_static hip_hcc hip_device DESTINATION lib)
|
||||
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_ir.ll DESTINATION lib)
|
||||
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_hc.ll ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_hc_gfx803.ll DESTINATION lib)
|
||||
|
||||
# Install .hipInfo
|
||||
install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib)
|
||||
|
||||
@@ -70,6 +70,11 @@ if ($verbose & 0x2) {
|
||||
# set if user explicitly requests -stdlib=libc++. (else we default to libstdc++ for better interop with g++):
|
||||
$setStdLib = 0; # TODO - set to 0
|
||||
|
||||
$target_gfx701 = 0;
|
||||
$target_gfx801 = 0;
|
||||
$target_gfx802 = 0;
|
||||
$target_gfx803 = 0;
|
||||
|
||||
if ($HIP_PLATFORM eq "hcc") {
|
||||
$HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa";
|
||||
|
||||
@@ -129,18 +134,25 @@ if ($HIP_PLATFORM eq "hcc") {
|
||||
}
|
||||
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";
|
||||
|
||||
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n";
|
||||
|
||||
# Handle ROCm target platform
|
||||
if ($ROCM_TARGET eq "fiji") {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx803";
|
||||
}
|
||||
if ($ROCM_TARGET eq "carrizo") {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx801";
|
||||
}
|
||||
if ($ROCM_TARGET eq "hawaii") {
|
||||
if ($target_gfx701 eq 1) {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx701";
|
||||
}
|
||||
if ($ROCM_TARGET eq "polaris") {
|
||||
if ($target_gfx801 eq 1) {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx801";
|
||||
}
|
||||
if ($target_gfx802 eq 1) {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx802";
|
||||
}
|
||||
if ($target_gfx803 eq 1) {
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx803";
|
||||
$ENV{HIP_HC_IR_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
|
||||
}
|
||||
if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0)
|
||||
{
|
||||
$HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803";
|
||||
}
|
||||
|
||||
# Add trace marker library:
|
||||
@@ -222,7 +234,8 @@ if($HIP_PLATFORM eq "hcc"){
|
||||
|
||||
if(($HIP_PLATFORM eq "hcc")){
|
||||
$EXPORT_LL=" ";
|
||||
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n";
|
||||
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n";
|
||||
$ENV{HIP_HC_IR_FILE}="";
|
||||
}
|
||||
|
||||
if($HIP_PLATFORM eq "nvcc"){
|
||||
@@ -261,6 +274,22 @@ foreach $arg (@ARGV)
|
||||
$HIPCXXFLAGS .= " -stdlib=libc++";
|
||||
$setStdLib = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx701')
|
||||
{
|
||||
$target_gfx701 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx801')
|
||||
{
|
||||
$target_gfx801 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx802')
|
||||
{
|
||||
$target_gfx802 = 1;
|
||||
}
|
||||
if($arg eq '--amdgpu-target=gfx803')
|
||||
{
|
||||
$target_gfx803 = 1;
|
||||
}
|
||||
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
|
||||
{
|
||||
$HIPCXXFLAGS .= " -stdlib=libstdc++";
|
||||
|
||||
@@ -39,16 +39,16 @@ typedef struct __attribute__((aligned(4))){
|
||||
/*
|
||||
Half Arithmetic Functions
|
||||
*/
|
||||
__device__ static __half __hadd(const __half a, const __half b);
|
||||
__device__ static __half __hadd_sat(__half a, __half b);
|
||||
__device__ static __half __hfma(__half a, __half b, __half c);
|
||||
__device__ static __half __hfma_sat(__half a, __half b, __half c);
|
||||
__device__ static __half __hmul(__half a, __half b);
|
||||
__device__ static __half __hmul_sat(__half a, __half b);
|
||||
__device__ static __half __hneg(__half a);
|
||||
__device__ static __half __hsub(__half a, __half b);
|
||||
__device__ static __half __hsub_sat(__half a, __half b);
|
||||
__device__ static __half hdiv(__half a, __half b);
|
||||
__device__ __half __hadd(const __half a, const __half b);
|
||||
__device__ __half __hadd_sat(__half a, __half b);
|
||||
__device__ __half __hfma(__half a, __half b, __half c);
|
||||
__device__ __half __hfma_sat(__half a, __half b, __half c);
|
||||
__device__ __half __hmul(__half a, __half b);
|
||||
__device__ __half __hmul_sat(__half a, __half b);
|
||||
__device__ __half __hneg(__half a);
|
||||
__device__ __half __hsub(__half a, __half b);
|
||||
__device__ __half __hsub_sat(__half a, __half b);
|
||||
__device__ __half hdiv(__half a, __half b);
|
||||
|
||||
/*
|
||||
Half2 Arithmetic Functions
|
||||
@@ -211,11 +211,6 @@ __device__ __half __ushort2half_ru(unsigned short int i);
|
||||
__device__ __half __ushort2half_rz(unsigned short int i);
|
||||
__device__ __half __ushort_as_half(const unsigned short int i);
|
||||
|
||||
extern "C" __half __hip_hc_ir_hadd_half(__half, __half);
|
||||
extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half);
|
||||
extern "C" __half __hip_hc_ir_hmul_half(__half, __half);
|
||||
extern "C" __half __hip_hc_ir_hsub_half(__half, __half);
|
||||
|
||||
extern "C" int __hip_hc_ir_hadd2_int(int, int);
|
||||
extern "C" int __hip_hc_ir_hfma2_int(int, int, int);
|
||||
extern "C" int __hip_hc_ir_hmul2_int(int, int);
|
||||
@@ -244,46 +239,6 @@ extern "C" int __hip_hc_ir_h2sin_int(int);
|
||||
extern "C" int __hip_hc_ir_h2sqrt_int(int);
|
||||
extern "C" int __hip_hc_ir_h2trunc_int(int);
|
||||
|
||||
__device__ static inline __half __hadd(const __half a, const __half b) {
|
||||
return __hip_hc_ir_hadd_half(a, b);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hadd_sat(__half a, __half b) {
|
||||
return __hip_hc_ir_hadd_half(a, b);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hfma(__half a, __half b, __half c) {
|
||||
return __hip_hc_ir_hfma_half(a, b, c);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hfma_sat(__half a, __half b, __half c) {
|
||||
return __hip_hc_ir_hfma_half(a, b, c);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hmul(__half a, __half b) {
|
||||
return __hip_hc_ir_hmul_half(a, b);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hmul_sat(__half a, __half b) {
|
||||
return __hip_hc_ir_hmul_half(a, b);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hneg(__half a) {
|
||||
return -a;
|
||||
}
|
||||
|
||||
__device__ static inline __half __hsub(__half a, __half b) {
|
||||
return __hip_hc_ir_hsub_half(a, b);
|
||||
}
|
||||
|
||||
__device__ static inline __half __hsub_sat(__half a, __half b) {
|
||||
return __hip_hc_ir_hsub_half(a, b);
|
||||
}
|
||||
|
||||
__device__ static inline __half hdiv(__half a, __half b) {
|
||||
return a/b;
|
||||
}
|
||||
|
||||
/*
|
||||
Half2 Arithmetic Functions
|
||||
*/
|
||||
@@ -360,11 +315,11 @@ __device__ static inline __half hcos(const __half h) {
|
||||
}
|
||||
|
||||
__device__ static inline __half hexp(const __half h) {
|
||||
return __hip_hc_ir_hexp2_half(__hip_hc_ir_hmul_half(h, 1.442694));
|
||||
return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694));
|
||||
}
|
||||
|
||||
__device__ static inline __half hexp10(const __half h) {
|
||||
return __hip_hc_ir_hexp2_half(__hip_hc_ir_hmul_half(h, 3.3219281));
|
||||
return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281));
|
||||
}
|
||||
|
||||
__device__ static inline __half hexp2(const __half h) {
|
||||
@@ -376,11 +331,11 @@ __device__ static inline __half hfloor(const __half h) {
|
||||
}
|
||||
|
||||
__device__ static inline __half hlog(const __half h) {
|
||||
return __hip_hc_ir_hmul_half(__hip_hc_ir_hlog2_half(h), 0.693147);
|
||||
return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147);
|
||||
}
|
||||
|
||||
__device__ static inline __half hlog10(const __half h) {
|
||||
return __hip_hc_ir_hmul_half(__hip_hc_ir_hlog2_half(h), 0.301029);
|
||||
return __hmul(__hip_hc_ir_hlog2_half(h), 0.301029);
|
||||
}
|
||||
|
||||
__device__ static inline __half hlog2(const __half h) {
|
||||
|
||||
@@ -277,6 +277,10 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
|
||||
* On AMD platforms, the threadfence* routines are currently empty stubs.
|
||||
*/
|
||||
|
||||
extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
|
||||
extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm("__llvm_fence_sc_wg");
|
||||
|
||||
|
||||
/**
|
||||
* @brief threadfence_block makes writes visible to threads running in same block.
|
||||
*
|
||||
@@ -287,7 +291,9 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
|
||||
* @warning __threadfence_block is a stub and map to no-op.
|
||||
*/
|
||||
// __device__ void __threadfence_block(void);
|
||||
extern "C" __device__ void __threadfence_block(void);
|
||||
__device__ static inline void __threadfence_block(void) {
|
||||
return __hip_hc_threadfence_block();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief threadfence makes wirtes visible to other threads running on same GPU.
|
||||
@@ -299,7 +305,9 @@ extern "C" __device__ void __threadfence_block(void);
|
||||
* @warning __threadfence is a stub and map to no-op, application should set "export HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
|
||||
*/
|
||||
// __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
|
||||
extern "C" __device__ void __threadfence(void);
|
||||
__device__ static inline void __threadfence(void) {
|
||||
return __hip_hc_threadfence();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief threadfence_system makes writes to pinned system memory visible on host CPU.
|
||||
|
||||
@@ -32,6 +32,47 @@ struct hipHalfHolder{
|
||||
#define HINF 65504
|
||||
|
||||
static struct hipHalfHolder __hInfValue = {HINF};
|
||||
|
||||
__device__ __half __hadd(__half a, __half b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
__device__ __half __hadd_sat(__half a, __half b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
__device__ __half __hfma(__half a, __half b, __half c) {
|
||||
return a * b + c;
|
||||
}
|
||||
|
||||
__device__ __half __hfma_sat(__half a, __half b, __half c) {
|
||||
return a * b + c;
|
||||
}
|
||||
|
||||
__device__ __half __hmul(__half a, __half b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
__device__ __half __hmul_sat(__half a, __half b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
__device__ __half __hneg(__half a) {
|
||||
return -a;
|
||||
}
|
||||
|
||||
__device__ __half __hsub(__half a, __half b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
__device__ __half __hsub_sat(__half a, __half b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
__device__ __half hdiv(__half a, __half b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
/*
|
||||
Half comparision Functions
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
|
||||
target triple = "amdgcn--amdhsa"
|
||||
|
||||
define i32 @__hip_hc_ir_mul24_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_i32_i24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_umul24_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_u32_u24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_mulhi_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_hi_i32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_umulhi_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_hi_u32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_usad_int(i32 %a, i32 %b, i32 %c) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_sad_u32 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
attributes #1 = { alwaysinline nounwind }
|
||||
|
||||
@@ -2,65 +2,6 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:
|
||||
target triple = "amdgcn--amdhsa"
|
||||
|
||||
|
||||
define void @__threadfence() #1 {
|
||||
fence syncscope(2) seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__threadfence_block() #1 {
|
||||
fence syncscope(3) seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
; Lightning does not support inline asm for 16-bit data types
|
||||
; So, bitcast half to short and then extend to 32bit i32
|
||||
; After inline asm, convert back to half
|
||||
define half @__hip_hc_ir_hadd_half(half %a, half %b) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = zext i16 %1 to i32
|
||||
%4 = zext i16 %2 to i32
|
||||
%5 = tail call i32 asm "v_add_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
|
||||
%6 = trunc i32 %5 to i16
|
||||
%7 = bitcast i16 %6 to half
|
||||
ret half %7
|
||||
}
|
||||
|
||||
define half @__hip_hc_ir_hsub_half(half %a, half %b) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = zext i16 %1 to i32
|
||||
%4 = zext i16 %2 to i32
|
||||
%5 = tail call i32 asm "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
|
||||
%6 = trunc i32 %5 to i16
|
||||
%7 = bitcast i16 %6 to half
|
||||
ret half %7
|
||||
}
|
||||
|
||||
define half @__hip_hc_ir_hmul_half(half %a, half %b) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = zext i16 %1 to i32
|
||||
%4 = zext i16 %2 to i32
|
||||
%5 = tail call i32 asm "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
|
||||
%6 = trunc i32 %5 to i16
|
||||
%7 = bitcast i16 %6 to half
|
||||
ret half %7
|
||||
}
|
||||
|
||||
define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = bitcast half %c to i16
|
||||
%4 = zext i16 %1 to i32
|
||||
%5 = zext i16 %2 to i32
|
||||
%6 = zext i16 %3 to i32
|
||||
%7 = tail call i32 asm "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %4, i32 %5, i32 %6)
|
||||
%8 = trunc i32 %7 to i16
|
||||
%9 = bitcast i16 %8 to half
|
||||
ret half %9
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
|
||||
@@ -146,34 +87,4 @@ define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 {
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_mul24_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_i32_i24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_umul24_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_u32_u24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_mulhi_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_hi_i32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_umulhi_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_hi_u32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_usad_int(i32 %a, i32 %b, i32 %c) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_sad_u32 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_sadu8_int(i32 %a, i32 %b, i32 %c) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_sad_u8 $0, $1, $2 $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
attributes #1 = { alwaysinline nounwind }
|
||||
Ссылка в новой задаче
Block a user