diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index 53663e44b3..9c469afbc1 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -218,7 +218,7 @@ add_custom_target(doc COMMAND HIP_PATH=${CMAKE_CURRENT_SOURCE_DIR} doxygen ${CMA # Install hip_hcc if platform is hcc if(HIP_PLATFORM STREQUAL "hcc") install(TARGETS hip_hcc_static hip_hcc hip_device DESTINATION lib) - install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_ir.ll DESTINATION lib) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_hc.ll ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_hc_gfx803.ll DESTINATION lib) # Install .hipInfo install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib) diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index e43131501c..777a1b3f9d 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -70,6 +70,11 @@ if ($verbose & 0x2) { # set if user explicitly requests -stdlib=libc++. (else we default to libstdc++ for better interop with g++): $setStdLib = 0; # TODO - set to 0 +$target_gfx701 = 0; +$target_gfx801 = 0; +$target_gfx802 = 0; +$target_gfx803 = 0; + if ($HIP_PLATFORM eq "hcc") { $HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa"; @@ -129,18 +134,25 @@ if ($HIP_PLATFORM eq "hcc") { } $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt"; + $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n"; + # Handle ROCm target platform - if ($ROCM_TARGET eq "fiji") { - $HIPLDFLAGS .= " --amdgpu-target=gfx803"; - } - if ($ROCM_TARGET eq "carrizo") { - $HIPLDFLAGS .= " --amdgpu-target=gfx801"; - } - if ($ROCM_TARGET eq "hawaii") { + if ($target_gfx701 eq 1) { $HIPLDFLAGS .= " --amdgpu-target=gfx701"; } - if ($ROCM_TARGET eq "polaris") { + if ($target_gfx801 eq 1) { + $HIPLDFLAGS .= " --amdgpu-target=gfx801"; + } + if ($target_gfx802 eq 1) { + $HIPLDFLAGS .= " --amdgpu-target=gfx802"; + } + if ($target_gfx803 eq 1) { $HIPLDFLAGS .= " --amdgpu-target=gfx803"; + $ENV{HIP_HC_IR_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; + } + if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0) + { + $HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803"; } # Add trace marker library: @@ -222,7 +234,8 @@ if($HIP_PLATFORM eq "hcc"){ if(($HIP_PLATFORM eq "hcc")){ $EXPORT_LL=" "; - $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n"; + $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_hc.ll\n"; + $ENV{HIP_HC_IR_FILE}=""; } if($HIP_PLATFORM eq "nvcc"){ @@ -261,6 +274,22 @@ foreach $arg (@ARGV) $HIPCXXFLAGS .= " -stdlib=libc++"; $setStdLib = 1; } + if($arg eq '--amdgpu-target=gfx701') + { + $target_gfx701 = 1; + } + if($arg eq '--amdgpu-target=gfx801') + { + $target_gfx801 = 1; + } + if($arg eq '--amdgpu-target=gfx802') + { + $target_gfx802 = 1; + } + if($arg eq '--amdgpu-target=gfx803') + { + $target_gfx803 = 1; + } if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0)) { $HIPCXXFLAGS .= " -stdlib=libstdc++"; diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h index 67d1fe4e06..2c7c23440c 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -39,16 +39,16 @@ typedef struct __attribute__((aligned(4))){ /* Half Arithmetic Functions */ -__device__ static __half __hadd(const __half a, const __half b); -__device__ static __half __hadd_sat(__half a, __half b); -__device__ static __half __hfma(__half a, __half b, __half c); -__device__ static __half __hfma_sat(__half a, __half b, __half c); -__device__ static __half __hmul(__half a, __half b); -__device__ static __half __hmul_sat(__half a, __half b); -__device__ static __half __hneg(__half a); -__device__ static __half __hsub(__half a, __half b); -__device__ static __half __hsub_sat(__half a, __half b); -__device__ static __half hdiv(__half a, __half b); +__device__ __half __hadd(const __half a, const __half b); +__device__ __half __hadd_sat(__half a, __half b); +__device__ __half __hfma(__half a, __half b, __half c); +__device__ __half __hfma_sat(__half a, __half b, __half c); +__device__ __half __hmul(__half a, __half b); +__device__ __half __hmul_sat(__half a, __half b); +__device__ __half __hneg(__half a); +__device__ __half __hsub(__half a, __half b); +__device__ __half __hsub_sat(__half a, __half b); +__device__ __half hdiv(__half a, __half b); /* Half2 Arithmetic Functions @@ -211,11 +211,6 @@ __device__ __half __ushort2half_ru(unsigned short int i); __device__ __half __ushort2half_rz(unsigned short int i); __device__ __half __ushort_as_half(const unsigned short int i); -extern "C" __half __hip_hc_ir_hadd_half(__half, __half); -extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half); -extern "C" __half __hip_hc_ir_hmul_half(__half, __half); -extern "C" __half __hip_hc_ir_hsub_half(__half, __half); - extern "C" int __hip_hc_ir_hadd2_int(int, int); extern "C" int __hip_hc_ir_hfma2_int(int, int, int); extern "C" int __hip_hc_ir_hmul2_int(int, int); @@ -244,46 +239,6 @@ extern "C" int __hip_hc_ir_h2sin_int(int); extern "C" int __hip_hc_ir_h2sqrt_int(int); extern "C" int __hip_hc_ir_h2trunc_int(int); -__device__ static inline __half __hadd(const __half a, const __half b) { - return __hip_hc_ir_hadd_half(a, b); -} - -__device__ static inline __half __hadd_sat(__half a, __half b) { - return __hip_hc_ir_hadd_half(a, b); -} - -__device__ static inline __half __hfma(__half a, __half b, __half c) { - return __hip_hc_ir_hfma_half(a, b, c); -} - -__device__ static inline __half __hfma_sat(__half a, __half b, __half c) { - return __hip_hc_ir_hfma_half(a, b, c); -} - -__device__ static inline __half __hmul(__half a, __half b) { - return __hip_hc_ir_hmul_half(a, b); -} - -__device__ static inline __half __hmul_sat(__half a, __half b) { - return __hip_hc_ir_hmul_half(a, b); -} - -__device__ static inline __half __hneg(__half a) { - return -a; -} - -__device__ static inline __half __hsub(__half a, __half b) { - return __hip_hc_ir_hsub_half(a, b); -} - -__device__ static inline __half __hsub_sat(__half a, __half b) { - return __hip_hc_ir_hsub_half(a, b); -} - -__device__ static inline __half hdiv(__half a, __half b) { - return a/b; -} - /* Half2 Arithmetic Functions */ @@ -360,11 +315,11 @@ __device__ static inline __half hcos(const __half h) { } __device__ static inline __half hexp(const __half h) { - return __hip_hc_ir_hexp2_half(__hip_hc_ir_hmul_half(h, 1.442694)); + return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694)); } __device__ static inline __half hexp10(const __half h) { - return __hip_hc_ir_hexp2_half(__hip_hc_ir_hmul_half(h, 3.3219281)); + return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281)); } __device__ static inline __half hexp2(const __half h) { @@ -376,11 +331,11 @@ __device__ static inline __half hfloor(const __half h) { } __device__ static inline __half hlog(const __half h) { - return __hip_hc_ir_hmul_half(__hip_hc_ir_hlog2_half(h), 0.693147); + return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147); } __device__ static inline __half hlog10(const __half h) { - return __hip_hc_ir_hmul_half(__hip_hc_ir_hlog2_half(h), 0.301029); + return __hmul(__hip_hc_ir_hlog2_half(h), 0.301029); } __device__ static inline __half hlog2(const __half h) { diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index aa6fe06337..98c3ada969 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -277,6 +277,10 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr(); * On AMD platforms, the threadfence* routines are currently empty stubs. */ +extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev"); +extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm("__llvm_fence_sc_wg"); + + /** * @brief threadfence_block makes writes visible to threads running in same block. * @@ -287,7 +291,9 @@ __device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr(); * @warning __threadfence_block is a stub and map to no-op. */ // __device__ void __threadfence_block(void); -extern "C" __device__ void __threadfence_block(void); +__device__ static inline void __threadfence_block(void) { + return __hip_hc_threadfence_block(); +} /** * @brief threadfence makes wirtes visible to other threads running on same GPU. @@ -299,7 +305,9 @@ extern "C" __device__ void __threadfence_block(void); * @warning __threadfence is a stub and map to no-op, application should set "export HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches. */ // __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional"))); -extern "C" __device__ void __threadfence(void); +__device__ static inline void __threadfence(void) { + return __hip_hc_threadfence(); +} /** * @brief threadfence_system makes writes to pinned system memory visible on host CPU. diff --git a/projects/clr/hipamd/src/hip_fp16.cpp b/projects/clr/hipamd/src/hip_fp16.cpp index ac79ddba08..b306a9d3de 100644 --- a/projects/clr/hipamd/src/hip_fp16.cpp +++ b/projects/clr/hipamd/src/hip_fp16.cpp @@ -32,6 +32,47 @@ struct hipHalfHolder{ #define HINF 65504 static struct hipHalfHolder __hInfValue = {HINF}; + +__device__ __half __hadd(__half a, __half b) { + return a + b; +} + +__device__ __half __hadd_sat(__half a, __half b) { + return a + b; +} + +__device__ __half __hfma(__half a, __half b, __half c) { + return a * b + c; +} + +__device__ __half __hfma_sat(__half a, __half b, __half c) { + return a * b + c; +} + +__device__ __half __hmul(__half a, __half b) { + return a * b; +} + +__device__ __half __hmul_sat(__half a, __half b) { + return a * b; +} + +__device__ __half __hneg(__half a) { + return -a; +} + +__device__ __half __hsub(__half a, __half b) { + return a - b; +} + +__device__ __half __hsub_sat(__half a, __half b) { + return a - b; +} + +__device__ __half hdiv(__half a, __half b) { + return a / b; +} + /* Half comparision Functions */ diff --git a/projects/clr/hipamd/src/hip_hc.ll b/projects/clr/hipamd/src/hip_hc.ll new file mode 100644 index 0000000000..aba9205912 --- /dev/null +++ b/projects/clr/hipamd/src/hip_hc.ll @@ -0,0 +1,30 @@ +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn--amdhsa" + +define i32 @__hip_hc_ir_mul24_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_mul_i32_i24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + ret i32 %1 +} + +define i32 @__hip_hc_ir_umul24_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_mul_u32_u24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + ret i32 %1 +} + +define i32 @__hip_hc_ir_mulhi_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_mul_hi_i32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + ret i32 %1 +} + +define i32 @__hip_hc_ir_umulhi_int(i32 %a, i32 %b) #1 { + %1 = tail call i32 asm sideeffect "v_mul_hi_u32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) + ret i32 %1 +} + +define i32 @__hip_hc_ir_usad_int(i32 %a, i32 %b, i32 %c) #1 { + %1 = tail call i32 asm sideeffect "v_sad_u32 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +attributes #1 = { alwaysinline nounwind } + diff --git a/projects/clr/hipamd/src/hip_ir.ll b/projects/clr/hipamd/src/hip_hc_gfx803.ll similarity index 62% rename from projects/clr/hipamd/src/hip_ir.ll rename to projects/clr/hipamd/src/hip_hc_gfx803.ll index 5a14266086..0080fc7d81 100644 --- a/projects/clr/hipamd/src/hip_ir.ll +++ b/projects/clr/hipamd/src/hip_hc_gfx803.ll @@ -2,65 +2,6 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64: target triple = "amdgcn--amdhsa" -define void @__threadfence() #1 { - fence syncscope(2) seq_cst - ret void -} - -define void @__threadfence_block() #1 { - fence syncscope(3) seq_cst - ret void -} - -; Lightning does not support inline asm for 16-bit data types -; So, bitcast half to short and then extend to 32bit i32 -; After inline asm, convert back to half -define half @__hip_hc_ir_hadd_half(half %a, half %b) #1 { - %1 = bitcast half %a to i16 - %2 = bitcast half %b to i16 - %3 = zext i16 %1 to i32 - %4 = zext i16 %2 to i32 - %5 = tail call i32 asm "v_add_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4) - %6 = trunc i32 %5 to i16 - %7 = bitcast i16 %6 to half - ret half %7 -} - -define half @__hip_hc_ir_hsub_half(half %a, half %b) #1 { - %1 = bitcast half %a to i16 - %2 = bitcast half %b to i16 - %3 = zext i16 %1 to i32 - %4 = zext i16 %2 to i32 - %5 = tail call i32 asm "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4) - %6 = trunc i32 %5 to i16 - %7 = bitcast i16 %6 to half - ret half %7 -} - -define half @__hip_hc_ir_hmul_half(half %a, half %b) #1 { - %1 = bitcast half %a to i16 - %2 = bitcast half %b to i16 - %3 = zext i16 %1 to i32 - %4 = zext i16 %2 to i32 - %5 = tail call i32 asm "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4) - %6 = trunc i32 %5 to i16 - %7 = bitcast i16 %6 to half - ret half %7 -} - -define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 { - %1 = bitcast half %a to i16 - %2 = bitcast half %b to i16 - %3 = bitcast half %c to i16 - %4 = zext i16 %1 to i32 - %5 = zext i16 %2 to i32 - %6 = zext i16 %3 to i32 - %7 = tail call i32 asm "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %4, i32 %5, i32 %6) - %8 = trunc i32 %7 to i16 - %9 = bitcast i16 %8 to half - ret half %9 -} - define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 { %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) @@ -146,34 +87,4 @@ define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 { ret i32 %1 } -define i32 @__hip_hc_ir_mul24_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_i32_i24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - ret i32 %1 -} - -define i32 @__hip_hc_ir_umul24_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_u32_u24 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - ret i32 %1 -} - -define i32 @__hip_hc_ir_mulhi_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_hi_i32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - ret i32 %1 -} - -define i32 @__hip_hc_ir_umulhi_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_hi_u32 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - ret i32 %1 -} - -define i32 @__hip_hc_ir_usad_int(i32 %a, i32 %b, i32 %c) #1 { - %1 = tail call i32 asm sideeffect "v_sad_u32 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) - ret i32 %1 -} - -define i32 @__hip_hc_ir_sadu8_int(i32 %a, i32 %b, i32 %c) #1 { - %1 = tail call i32 asm sideeffect "v_sad_u8 $0, $1, $2 $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) - ret i32 %1 -} - attributes #1 = { alwaysinline nounwind }