diff --git a/hipamd/.gitignore b/hipamd/.gitignore index 64cdd493a6..fe07943cad 100644 --- a/hipamd/.gitignore +++ b/hipamd/.gitignore @@ -10,7 +10,6 @@ bin/hipInfo bin/hipBusBandwidth bin/hipDispatchLatency bin/hipify-clang -include/hip/hip_version.h tags samples/1_Utils/hipInfo/hipInfo samples/1_Utils/hipBusBandwidth/hipBusBandwidth diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index fdf019b9da..16c3f11edf 100755 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -234,7 +234,7 @@ endif (NOT CPACK_SET_DESTDIR) # Generate profiling API macros/structures header if(HIP_PLATFORM STREQUAL "hcc") if(USE_PROF_API EQUAL 1) -set(PROF_API_STR "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hcc_detail/hip_prof_str.h") +set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/hcc_detail/hip_prof_str.h") set(PROF_API_HDR "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hcc_detail/hip_runtime_api.h") set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src") set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py") @@ -315,6 +315,7 @@ endif() message(STATUS "\nHSA runtime in: " ${HSA_PATH}) # Build hip_hcc if platform is hcc if(HIP_PLATFORM STREQUAL "hcc") + include_directories(${PROJECT_BINARY_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/include) set(HIP_HCC_BUILD_FLAGS) @@ -425,7 +426,8 @@ set(_versionInfoHeader #define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n #endif\n ") -file(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hip_version.h" ${_versionInfoHeader}) +file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader}) +include_directories(${PROJECT_BINARY_DIR}/include) # Build doxygen documentation find_program(DOXYGEN_EXE doxygen) @@ -457,7 +459,11 @@ install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin) execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE INSTALL_SOURCE) if(NOT ${INSTALL_SOURCE} EQUAL 0) - install(DIRECTORY src DESTINATION .) + if(HIP_RUNTIME STREQUAL "HCC") + install(DIRECTORY src DESTINATION .) + elseif(HIP_RUNTIME STREQUAL "ROCclr") + install(DIRECTORY rocclr DESTINATION .) + endif() install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS) install(DIRECTORY include DESTINATION .) install(DIRECTORY cmake DESTINATION .) @@ -468,7 +474,9 @@ endif() install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/hip/hcc_detail DESTINATION include/hip FILES_MATCHING PATTERN "*.h*") - +install(DIRECTORY ${PROJECT_BINARY_DIR}/include/hip + DESTINATION include + FILES_MATCHING PATTERN "*.h*") ############################# # hip-config diff --git a/hipamd/bin/extractkernel b/hipamd/bin/extractkernel index 81760f50de..d12645a996 100755 --- a/hipamd/bin/extractkernel +++ b/hipamd/bin/extractkernel @@ -34,26 +34,35 @@ defined $options{i} || die("input not specified"); $input_file = $options{i}; (-f $input_file) || die("can't find $input_file"); +# derive HIP_PATH via env var or use parent directory of extractkernel +my $HIP_PATH=$ENV{'HIP_PATH'} // dirname(Cwd::abs_path("$0/../")); +my $HIP_COMPILER = `$HIP_PATH/bin/hipconfig --compiler`; +my $ROCM_PATH = `$HIP_PATH/bin/hipconfig --rocmpath`; +my $HIP_CLANG_PATH = `$HIP_PATH/bin/hipconfig --hipclangpath`; + # look for llvm-objdump and clang-offload-bundler my $tools_path_prefix; my $llvm_objdump; my $clang_offload_bundler; -if (defined $ENV{'HCC_HOME'}) { - $tools_path_prefix = File::Spec->catfile($ENV{'HCC_HOME'}, "bin"); - $llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump"); - $clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler"); +if (defined $HIP_COMPILER and $HIP_COMPILER eq "clang"){ + # Search the path with respect to HIP_CLANG_PATH + $tools_path_prefix = $HIP_CLANG_PATH; } else { - $tools_path_prefix = dirname(realpath($0)); - $llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump"); - $clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler"); - if (!(-f $llvm_objdump)) { - $tools_path_prefix = realpath($tools_path_prefix."/../../hcc/bin"); - $llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump"); - $clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler"); + if (defined $HIP_COMPILER and $HIP_COMPILER eq "hcc") { + # Search the path with respect to HCC_HOME if it is set, else search in ROCM_PATH + if (defined $ENV{'HCC_HOME'}) { + $tools_path_prefix = File::Spec->catfile($ENV{'HCC_HOME'}, "bin"); + } + else { + $tools_path_prefix = realpath($ROCM_PATH."/hcc/bin"); + } } } +# Find llvm-objdump and clang-offload-bundler in the path set above +$llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump"); +$clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler"); if (!(-f $llvm_objdump)) { $llvm_objdump = which("llvm-objdump"); diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 4759c5319e..fd88e63c33 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -26,7 +26,7 @@ use Cwd 'abs_path'; # script's abs_path). Used on AMD platforms only. # HSA_PATH : Path to HSA dir (defaults to ../../hsa relative to abs_path # of this script). Used on AMD platforms only. -# HIP_ROCclr_HOME : Path to HIP/ROCclr directory. Used on AMD platforms only. +# HIP_ROCCLR_HOME : Path to HIP/ROCclr directory. Used on AMD platforms only. # HIP_CLANG_PATH : Path to HIP-Clang (default to ../../llvm/bin relative to this # script's abs_path). Used on AMD platforms only. @@ -82,15 +82,15 @@ if (-e "$HIP_PATH/../.info/version") { } else { $ROCM_PATH=$ENV{'ROCM_PATH'} // "/opt/rocm"; } -$HIP_ROCclr_HOME=$ENV{'HIP_ROCclr_HOME'}; +$HIP_ROCCLR_HOME=$ENV{'HIP_ROCCLR_HOME'}; $HIP_LIB_PATH=$ENV{'HIP_LIB_PATH'}; $HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'}; $DEVICE_LIB_PATH=$ENV{'DEVICE_LIB_PATH'}; $HIP_CLANG_HCC_COMPAT_MODE=$ENV{'HIP_CLANG_HCC_COMPAT_MODE'}; # HCC compatibility mode $HIP_COMPILE_CXX_AS_HIP=$ENV{'HIP_COMPILE_CXX_AS_HIP'} // "1"; -if (defined $HIP_ROCclr_HOME) { - $HIP_INFO_PATH= "$HIP_ROCclr_HOME/lib/.hipInfo"; +if (defined $HIP_ROCCLR_HOME) { + $HIP_INFO_PATH= "$HIP_ROCCLR_HOME/lib/.hipInfo"; } else { $HIP_INFO_PATH= "$HIP_PATH/lib/.hipInfo"; # use actual file } @@ -123,35 +123,43 @@ sub delete_temp_dirs { #--- #HIP_PLATFORM controls whether to use hcc (AMD) or nvcc as the platform: -$HIP_PLATFORM= `$HIP_PATH/bin/hipconfig --platform` // "hcc"; -$HIP_VERSION= `$HIP_PATH/bin/hipconfig --version`; #HIP_COMPILER controls whether to use hcc, clang or nvcc for compilation: -$HIP_COMPILER= `$HIP_PATH/bin/hipconfig --compiler`; #HIP_RUNTIME controls whether to use HCC, ROCclr, or NVCC as the runtime: -$HIP_RUNTIME= `$HIP_PATH/bin/hipconfig --runtime`; +if ($isWindows) { + # Windows cannot run perl natively, so hipcc will explicitly call perl + $HIP_PLATFORM= `perl $HIP_PATH/bin/hipconfig --platform`; + $HIP_VERSION= `perl $HIP_PATH/bin/hipconfig --version`; + $HIP_COMPILER= `perl $HIP_PATH/bin/hipconfig --compiler`; + $HIP_RUNTIME= `perl $HIP_PATH/bin/hipconfig --runtime`; +} else { + $HIP_PLATFORM= `$HIP_PATH/bin/hipconfig --platform`; + $HIP_VERSION= `$HIP_PATH/bin/hipconfig --version`; + $HIP_COMPILER= `$HIP_PATH/bin/hipconfig --compiler`; + $HIP_RUNTIME= `$HIP_PATH/bin/hipconfig --runtime`; +} -# If using ROCclr runtime, need to find HIP_ROCclr_HOME -if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "ROCclr" and !defined $HIP_ROCclr_HOME) { +# If using ROCclr runtime, need to find HIP_ROCCLR_HOME +if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "ROCclr" and !defined $HIP_ROCCLR_HOME) { my $hipcc_dir = dirname($0); if (-e "$hipcc_dir/../lib/bitcode") { - $HIP_ROCclr_HOME = abs_path($hipcc_dir . "/.."); + $HIP_ROCCLR_HOME = abs_path($hipcc_dir . "/.."); } else { - $HIP_ROCclr_HOME = $HIP_PATH; # use HIP_PATH + $HIP_ROCCLR_HOME = $HIP_PATH; # use HIP_PATH } $HIPCXXFLAGS .= "-D__HIP_ROCclr__"; $HIPCFLAGS .= "-D__HIP_ROCclr__"; } -if (defined $HIP_ROCclr_HOME) { - if (!defined $HIP_CLANG_PATH and (-e "$HIP_ROCclr_HOME/bin/clang" or -e "$HIP_ROCclr_HOME/bin/clang.exe")) { - $HIP_CLANG_PATH = "$HIP_ROCclr_HOME/bin"; +if (defined $HIP_ROCCLR_HOME) { + if (!defined $HIP_CLANG_PATH and (-e "$HIP_ROCCLR_HOME/bin/clang" or -e "$HIP_ROCCLR_HOME/bin/clang.exe")) { + $HIP_CLANG_PATH = "$HIP_ROCCLR_HOME/bin"; } - if (!defined $DEVICE_LIB_PATH and -e "$HIP_ROCclr_HOME/lib/bitcode") { - $DEVICE_LIB_PATH = "$HIP_ROCclr_HOME/lib/bitcode"; + if (!defined $DEVICE_LIB_PATH and -e "$HIP_ROCCLR_HOME/lib/bitcode") { + $DEVICE_LIB_PATH = "$HIP_ROCCLR_HOME/lib/bitcode"; } - $HIP_INCLUDE_PATH = "$HIP_ROCclr_HOME/include"; + $HIP_INCLUDE_PATH = "$HIP_ROCCLR_HOME/include"; if (!defined $HIP_LIB_PATH) { - $HIP_LIB_PATH = "$HIP_ROCclr_HOME/lib"; + $HIP_LIB_PATH = "$HIP_ROCCLR_HOME/lib"; } } @@ -199,8 +207,8 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") { $HIP_LIB_PATH = "$HIP_PATH/lib"; } if ($verbose & 0x2) { - if (defined $HIP_ROCclr_HOME) { - print ("HIP_ROCclr_HOME=$HIP_ROCclr_HOME\n"); + if (defined $HIP_ROCCLR_HOME) { + print ("HIP_ROCCLR_HOME=$HIP_ROCCLR_HOME\n"); } print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n"); print ("HIP_CLANG_INCLUDE_PATH=$HIP_CLANG_INCLUDE_PATH\n"); @@ -393,6 +401,7 @@ if($HIP_PLATFORM eq "nvcc"){ } } +# TODO: convert toolArgs to an array rather than a string my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool my $optArg = ""; # -O args my $targetOpt = '--amdgpu-target='; @@ -402,7 +411,11 @@ my $prevArg = ""; # previous argument foreach $arg (@ARGV) { + # Save $arg, it can get changed in the loop. $trimarg = $arg; + # TODO: figure out why this space removal is wanted. + # TODO: If someone has gone to the effort of quoting the spaces to the shell + # TODO: why are we removing it here? $trimarg =~ s/^\s+|\s+$//g; # Remive whitespace my $swallowArg = 0; if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') { @@ -411,6 +424,7 @@ foreach $arg (@ARGV) } if ($skipOutputFile) { + # TODO: handle filename with shell metacharacters $toolArgs .= " $arg"; $prevArg = $arg; $skipOutputFile = 0; @@ -452,7 +466,7 @@ foreach $arg (@ARGV) $arg = "--cuda-device-only"; } - if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0)) + if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0) and $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'hcc') { $HIPCXXFLAGS .= $HCC_WA_FLAGS; $setStdLib = 1; @@ -659,6 +673,14 @@ foreach $arg (@ARGV) push (@inputs, $arg); #print "I: <$arg>\n"; } + # Produce a version of $arg where characters significant to the shell are + # quoted. One could quote everything of course but don't bother for + # common characters such as alphanumerics. + # Do the quoting here because sometimes the $arg is changed in the loop + # Important to have all of '-Xlinker' in the set of unquoted characters. + if (not $isWindows) { # Windows needs different quoting, ignore for now + $arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g; + } $toolArgs .= " $arg" unless $swallowArg; $prevArg = $arg; } @@ -805,6 +827,7 @@ if ($HIPCC_LINK_FLAGS_APPEND) { $HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND"; } +# TODO: convert CMD to an array rather than a string my $CMD="$HIPCC"; if ($needCFLAGS) { diff --git a/hipamd/bin/hipconfig b/hipamd/bin/hipconfig index ecd1449b2e..9b10bf7110 100755 --- a/hipamd/bin/hipconfig +++ b/hipamd/bin/hipconfig @@ -17,9 +17,11 @@ Getopt::Long::Configure ( qw{bundling no_ignore_case}); GetOptions( "help|h" => \$p_help ,"path|p" => \$p_path + ,"rocmpath|R" => \$p_rocmpath ,"compiler|c" => \$p_compiler ,"platform|P" => \$p_platform ,"runtime|r" => \$p_runtime + ,"hipclangpath|l" => \$p_hipclangpath ,"cpp_config|cxx_config|C" => \$p_cpp_config ,"full|f|info" => \$p_full, ,"version|v" => \$p_version, @@ -30,10 +32,12 @@ GetOptions( if ($p_help) { print "usage: hipconfig [OPTIONS]\n"; print " --path, -p : print HIP_PATH (use env var if set, else determine from hipconfig path)\n"; + print " --rocmpath, -R : print ROCM_PATH (use env var if set, else determine from hip path or /opt/rocm)\n"; print " --cpp_config, -C : print C++ compiler options\n"; print " --compiler, -c : print compiler (hcc or clang or nvcc)\n"; print " --platform, -P : print platform (hcc or nvcc)\n"; print " --runtime, -r : print runtime (HCC or ROCclr)\n"; + print " --hipclangpath, -l : print HIP_CLANG_PATH\n"; print " --full, -f : print full config\n"; print " --version, -v : print hip version\n"; print " --check : check configuration\n"; @@ -85,15 +89,33 @@ $CUDA_PATH=$ENV{'CUDA_PATH'} // '/usr/local/cuda'; $HCC_HOME=$ENV{'HCC_HOME'} // "$ROCM_PATH/hcc"; $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa"; $HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'} // "$ROCM_PATH/llvm/bin"; +# HIP_ROCCLR_HOME is used by Windows builds +$HIP_ROCCLR_HOME=$ENV{'HIP_ROCCLR_HOME'}; +if (defined $HIP_ROCCLR_HOME) { + $HIP_INFO_PATH= "$HIP_ROCCLR_HOME/lib/.hipInfo"; +} else { + $HIP_INFO_PATH= "$HIP_PATH/lib/.hipInfo"; # use actual file +} #--- #HIP_PLATFORM controls whether to use NVCC or HCC for compilation: $HIP_PLATFORM=$ENV{'HIP_PLATFORM'}; # Read .hipInfo my %hipInfo = (); -parse_config_file("$HIP_PATH/lib/.hipInfo", \%hipInfo); -$HIP_COMPILER = $hipInfo{'HIP_COMPILER'} // "hcc"; -$HIP_RUNTIME = $hipInfo{'HIP_RUNTIME'} // "HCC"; +parse_config_file("$HIP_INFO_PATH", \%hipInfo); +# Prioritize Env first, otherwise use the hipInfo config file +$HIP_COMPILER = $ENV{'HIP_COMPILER'} // $hipInfo{'HIP_COMPILER'} // "hcc"; +$HIP_RUNTIME = $ENV{'HIP_RUNTIME'} // $hipInfo{'HIP_RUNTIME'} // "HCC"; + +# If using ROCclr runtime, need to find HIP_ROCCLR_HOME +if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "ROCclr" and !defined $HIP_ROCCLR_HOME) { + my $hipconfig_dir = dirname($0); + if (-e "$hipconfig_dir/../lib/bitcode") { + $HIP_ROCCLR_HOME = abs_path($hipconfig_dir . "/.."); + } else { + $HIP_ROCCLR_HOME = $HIP_PATH; # use HIP_PATH + } +} if (not defined $HIP_PLATFORM) { if (can_run("$HCC_HOME/bin/hcc") or can_run("hcc")) { @@ -112,6 +134,10 @@ if ($HIP_COMPILER eq "hcc") { $CPP_CONFIG = " -D__HIP_PLATFORM_HCC__= -I$HIP_PATH/include -I$HCC_HOME/include -I$HSA_PATH/include"; } if ($HIP_COMPILER eq "clang") { + # Windows does not have clang at linux default path + if (defined $HIP_ROCCLR_HOME and (-e "$HIP_ROCCLR_HOME/bin/clang" or -e "$HIP_ROCCLR_HOME/bin/clang.exe")) { + $HIP_CLANG_PATH = "$HIP_ROCCLR_HOME/bin"; + } $HIP_CLANG_VERSION = `$HIP_CLANG_PATH/clang++ --version`; $HIP_CLANG_VERSION=~/.*clang version ([^ ]+).*/; $HIP_CLANG_VERSION=$1; @@ -139,6 +165,11 @@ if ($p_path) { $printed = 1; } +if ($p_rocmpath) { + print "$ROCM_PATH"; + $printed = 1; +} + if ($p_cpp_config) { print $CPP_CONFIG; $printed = 1; @@ -159,6 +190,13 @@ if ($p_runtime) { $printed = 1; } +if ($p_hipclangpath) { + if (defined $HIP_CLANG_PATH) { + print $HIP_CLANG_PATH; + } + $printed = 1; +} + if ($p_version) { print $HIP_VERSION; $printed = 1; @@ -168,6 +206,7 @@ if (!$printed or $p_full) { print "HIP version : ", $HIP_VERSION, "\n\n"; print "== hipconfig\n"; print "HIP_PATH : ", $HIP_PATH, "\n"; + print "ROCM_PATH : ", $ROCM_PATH, "\n"; print "HIP_COMPILER : ", $HIP_COMPILER, "\n"; print "HIP_PLATFORM : ", $HIP_PLATFORM, "\n"; print "HIP_RUNTIME : ", $HIP_RUNTIME, "\n"; diff --git a/hipamd/cmake/FindHIP.cmake b/hipamd/cmake/FindHIP.cmake index cc7f4af20c..498b5e4570 100644 --- a/hipamd/cmake/FindHIP.cmake +++ b/hipamd/cmake/FindHIP.cmake @@ -615,23 +615,25 @@ macro(HIP_ADD_EXECUTABLE hip_target) endif() if("${HIP_COMPILER}" STREQUAL "hcc") if("x${HCC_HOME}" STREQUAL "x") - if (DEFINED $ENV{ROCM_PATH}) - set(HCC_HOME "$ENV{ROCM_PATH}/hcc") - elseif( DEFINED $ENV{HIP_PATH}) - set(HCC_HOME "$ENV{HIP_PATH}/../hcc") + if (DEFINED ENV{ROCM_PATH}) + set(HCC_HOME "$ENV{ROCM_PATH}/hcc") + elseif(DEFINED ENV{HIP_PATH}) + set(HCC_HOME "$ENV{HIP_PATH}/../hcc") else() - set(HCC_HOME "/opt/rocm/hcc") + set(HCC_HOME "/opt/rocm/hcc") endif() endif() set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") elseif("${HIP_COMPILER}" STREQUAL "clang") - if("x${HIP_CLANG_PATH}" STREQUAL "x") - if (DEFINED $ENV{ROCM_PATH}) - set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin") - elseif( DEFINED $ENV{HIP_PATH}) - set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin") + if("x${HIP_CLANG_PATH}" STREQUAL "x") + if(DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH}) + elseif(DEFINED ENV{ROCM_PATH}) + set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin") + elseif(DEFINED ENV{HIP_PATH}) + set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin") else() - set(HIP_CLANG_PATH "/opt/rocm/llvm/bin") + set(HIP_CLANG_PATH "/opt/rocm/llvm/bin") endif() endif() set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o ") diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index eaee437cea..7bc0b97617 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -85,11 +85,11 @@ __device__ static inline unsigned int __ffsll(long long int input) { } __device__ static inline unsigned int __brev(unsigned int input) { - return __llvm_bitrev_b32(input); + return __builtin_bitreverse32(input); } __device__ static inline unsigned long long int __brevll(unsigned long long int input) { - return __llvm_bitrev_b64(input); + return __builtin_bitreverse64(input); } __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) { @@ -233,7 +233,10 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns return __ockl_sadd_u32(x, y, z); } -__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); } +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} /* HIP specific device functions @@ -241,25 +244,25 @@ HIP specific device functions __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; - tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i); + tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i); return tmp.u; } __device__ static inline float __hip_ds_bpermutef(int index, float src) { union { int i; unsigned u; float f; } tmp; tmp.f = src; - tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i); + tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i); return tmp.f; } __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; - tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i); + tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i); return tmp.u; } __device__ static inline float __hip_ds_permutef(int index, float src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; - tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i); + tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i); return tmp.u; } @@ -293,8 +296,8 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) { template __device__ static inline int __hip_move_dpp_N(int src) { - return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, - bound_ctrl); + return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, + bound_ctrl); } static constexpr int warpSize = 64; @@ -304,7 +307,7 @@ inline int __shfl(int var, int src_lane, int width = warpSize) { int self = __lane_id(); int index = src_lane + (self & ~(width-1)); - return __llvm_amdgcn_ds_bpermute(index<<2, var); + return __builtin_amdgcn_ds_bpermute(index<<2, var); } __device__ inline @@ -376,7 +379,7 @@ int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) { int self = __lane_id(); int index = self - lane_delta; index = (index < (self & ~(width-1)))?self:index; - return __llvm_amdgcn_ds_bpermute(index<<2, var); + return __builtin_amdgcn_ds_bpermute(index<<2, var); } __device__ inline @@ -446,7 +449,7 @@ int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) { int self = __lane_id(); int index = self + lane_delta; index = (int)((self&(width-1))+lane_delta) >= width?self:index; - return __llvm_amdgcn_ds_bpermute(index<<2, var); + return __builtin_amdgcn_ds_bpermute(index<<2, var); } __device__ inline @@ -516,7 +519,7 @@ int __shfl_xor(int var, int lane_mask, int width = warpSize) { int self = __lane_id(); int index = self^lane_mask; index = index >= ((self+width)&~(width-1))?self:index; - return __llvm_amdgcn_ds_bpermute(index<<2, var); + return __builtin_amdgcn_ds_bpermute(index<<2, var); } __device__ inline diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 7363f904ed..17c34b0ad5 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -506,6 +506,14 @@ hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config); */ hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig); +/** + * @brief Gets the flags set for current device + * + * @param [out] flags + * + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + */ +hipError_t hipGetDeviceFlags(unsigned *flags); /** * @brief The bank width of shared memory on current device is set diff --git a/hipamd/include/hip/hcc_detail/llvm_intrinsics.h b/hipamd/include/hip/hcc_detail/llvm_intrinsics.h index dc6fd05c52..330b3d91c2 100644 --- a/hipamd/include/hip/hcc_detail/llvm_intrinsics.h +++ b/hipamd/include/hip/hcc_detail/llvm_intrinsics.h @@ -31,40 +31,11 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -__device__ -__attribute__((convergent)) -ulong __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) __asm("llvm.amdgcn.icmp.i32"); - +// FIXME: These should all be removed and proper builtins used. __device__ unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize"); -__device__ -unsigned int __llvm_bitrev_b32(unsigned int src0) __asm("llvm.bitreverse.i32"); - -__device__ -uint64_t __llvm_bitrev_b64(uint64_t src0) __asm("llvm.bitreverse.i64"); - -extern -__device__ -__attribute__((const)) -unsigned int __mbcnt_lo(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.lo"); - -extern -__device__ -__attribute__((const)) -unsigned int __mbcnt_hi(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.hi"); - -__device__ -int __llvm_amdgcn_ds_bpermute(int index, int src) __asm("llvm.amdgcn.ds.bpermute"); - -__device__ -int __llvm_amdgcn_ds_permute(int index, int src) __asm("llvm.amdgcn.ds.permute"); - __device__ int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle"); -__device__ -int __llvm_amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, - bool bound_ctrl) __asm("llvm.amdgcn.mov.dpp.i32"); - #endif diff --git a/hipamd/include/hip/hcc_detail/math_functions.h b/hipamd/include/hip/hcc_detail/math_functions.h index 11985c3242..494685e261 100644 --- a/hipamd/include/hip/hcc_detail/math_functions.h +++ b/hipamd/include/hip/hcc_detail/math_functions.h @@ -1411,12 +1411,12 @@ float func(float x, int y) \ __DEF_FLOAT_FUN2I(scalbn) template -__DEVICE__ inline static T min(T arg1, T arg2) { +__DEVICE__ inline T min(T arg1, T arg2) { return (arg1 < arg2) ? arg1 : arg2; } template -__DEVICE__ inline static T max(T arg1, T arg2) { +__DEVICE__ inline T max(T arg1, T arg2) { return (arg1 > arg2) ? arg1 : arg2; } diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 3890028950..23b8a0619d 100644 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -1511,6 +1511,10 @@ inline static hipError_t hipProfilerStart() { return hipCUDAErrorTohipError(cuda inline static hipError_t hipProfilerStop() { return hipCUDAErrorTohipError(cudaProfilerStop()); } +inline static hipError_t hipGetDeviceFlags(unsigned int* flags) { + return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags)); +} + inline static hipError_t hipSetDeviceFlags(unsigned int flags) { return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags)); } diff --git a/hipamd/packaging/hip-base.txt b/hipamd/packaging/hip-base.txt index fc8becf84f..0923f0c8fd 100644 --- a/hipamd/packaging/hip-base.txt +++ b/hipamd/packaging/hip-base.txt @@ -3,6 +3,10 @@ project(hip_base) install(DIRECTORY @hip_SOURCE_DIR@/bin DESTINATION . USE_SOURCE_PERMISSIONS) install(DIRECTORY @hip_SOURCE_DIR@/include DESTINATION .) +install(FILES @PROJECT_BINARY_DIR@/include/hip/hcc_detail/hip_prof_str.h + DESTINATION include/hip/hcc_detail) +install(FILES @PROJECT_BINARY_DIR@/include/hip/hip_version.h + DESTINATION include/hip) install(FILES @PROJECT_BINARY_DIR@/.hipVersion DESTINATION bin) install(PROGRAMS @PROJECT_BINARY_DIR@/lpl DESTINATION bin) install(PROGRAMS @PROJECT_BINARY_DIR@/ca DESTINATION bin) diff --git a/hipamd/packaging/hip-hcc.postinst b/hipamd/packaging/hip-hcc.postinst index 2371b53e17..080c846f40 100755 --- a/hipamd/packaging/hip-hcc.postinst +++ b/hipamd/packaging/hip-hcc.postinst @@ -14,7 +14,7 @@ HIPDIR=$ROCMDIR/hip HIPLIBDIR=$ROCMDIR/hip/lib # Soft-link to library files -HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v [-/$]) +HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v "cmake\|[-/$]") mkdir -p $ROCMLIBDIR mkdir -p $ROCMLIBDIR/cmake pushd $ROCMLIBDIR diff --git a/hipamd/packaging/hip-rocclr.postinst b/hipamd/packaging/hip-rocclr.postinst index 2371b53e17..080c846f40 100755 --- a/hipamd/packaging/hip-rocclr.postinst +++ b/hipamd/packaging/hip-rocclr.postinst @@ -14,7 +14,7 @@ HIPDIR=$ROCMDIR/hip HIPLIBDIR=$ROCMDIR/hip/lib # Soft-link to library files -HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v [-/$]) +HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v "cmake\|[-/$]") mkdir -p $ROCMLIBDIR mkdir -p $ROCMLIBDIR/cmake pushd $ROCMLIBDIR diff --git a/hipamd/rocclr/CMakeLists.txt b/hipamd/rocclr/CMakeLists.txt index 5158b7935e..aaee0a4d0b 100644 --- a/hipamd/rocclr/CMakeLists.txt +++ b/hipamd/rocclr/CMakeLists.txt @@ -27,30 +27,44 @@ endif() set(USE_PROF_API "1") -if(NOT DEFINED LIBROCclr_STATIC_DIR) - find_path(LIBROCclr_STATIC_DIR - NAMES libamdrocclr_static.a - PATHS /opt/rocm/rocclr - PATH_SUFFIXES lib - ) +# FIXME: Make this required and remove the legacy handling below +set(save_rocclr_dir ${ROCclr_DIR}) +set(save_rocclr_static_dir ${LIBROCclr_STATIC_DIR}) + +find_package(ROCclr CONFIG + PATHS + /opt/rocm + /opt/rocm/rocclr) + +if (NOT ROCclr_FOUND) + if(NOT DEFINED LIBROCclr_STATIC_DIR) + find_path(LIBROCclr_STATIC_DIR + NAMES libamdrocclr_static.a + PATHS /opt/rocm/rocclr + PATH_SUFFIXES lib) + else() + set(LIBROCclr_STATIC_DIR ${save_rocclr_static_dir}) + endif() + + if(NOT DEFINED ROCclr_DIR) + find_path(ROCclr_DIR + NAMES top.hpp + PATH_SUFFIXES include + PATHS /opt/rocm/rocclr) + else() + set(ROCclr_DIR ${save_rocclr_dir}) + endif() + message("Found Static rocclr lib:${LIBROCclr_STATIC_DIR} and rocclr includes: ${ROCclr_DIR}") + include(${LIBROCclr_STATIC_DIR}/amdrocclr_staticTargets.cmake) endif() -if(NOT DEFINED ROCclr_DIR) - find_path(ROCclr_DIR - NAMES top.hpp - PATH_SUFFIXES include - PATHS /opt/rocm/rocclr - ) -endif() -message("Found Static rocclr lib:${LIBROCclr_STATIC_DIR} and rocclr includes: ${ROCclr_DIR}") set(PROF_API_HEADER_PATH ${ROCclr_DIR}/platform) ############################# # Profiling API support ############################# # Generate profiling API macros/structures header -# FIXME: This should not be writing to the source directory set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/hcc_detail/hip_prof_str.h") -set(PROF_API_HDR "${CMAKE_CURRENT_SOURCE_DIR}/../include/hip/hcc_detail/hip_runtime_api.h") +set(PROF_API_HDR "${PROJECT_SOURCE_DIR}/include/hip/hcc_detail/hip_runtime_api.h") set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}") set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py") set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt") @@ -66,25 +80,6 @@ add_custom_target(gen-prof-api-str-header ALL SOURCES ${PROF_API_HDR}) # Enable profiling API -if(USE_PROF_API EQUAL 1) - find_path(PROF_API_HEADER_DIR prof_protocol.h - HINTS - ${PROF_API_HEADER_PATH} - PATHS - /opt/rocm/roctracer - PATH_SUFFIXES - include/ext - ) - if(NOT PROF_API_HEADER_DIR) - MESSAGE(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=") - else() - add_definitions(-DUSE_PROF_API=1) - include_directories(${PROF_API_HEADER_DIR}) - MESSAGE(STATUS "Profiling API: ${PROF_API_HEADER_DIR}") - endif() -endif() - - if(NOT DEFINED ROCclr_DIR OR NOT DEFINED LIBOCL_STATIC_DIR OR NOT DEFINED LIBROCclr_STATIC_DIR ) # message(FATAL_ERROR "define ROCclr_DIR, LIBOCL_STATIC_DIR\n") @@ -92,35 +87,16 @@ endif() list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" ) set(CMAKE_MODULE_PATH${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") -include_directories(${ROCR_INCLUDES}) -if (DEFINED LLVM_INCLUDES AND NOT ${LLVM_INCLUDES} STREQUAL "") - message(STATUS "LLVM includes found ${LLVM_INCLUDES}") - include_directories(${LLVM_INCLUDES}) -endif() # if (DEFINED LLVM_INCLUDES AND NOT ${LLVM_INCLUDES} STREQUAL "") - -include_directories(${CMAKE_SOURCE_DIR}) -include_directories(${CMAKE_SOURCE_DIR}/include) -include_directories(${PROJECT_BINARY_DIR}/include) -include_directories(${CMAKE_SOURCE_DIR}/elfio) -include_directories(${CMAKE_SOURCE_DIR}/amdocl) -include_directories(${CMAKE_SOURCE_DIR}/include/hip/hcc_detail/elfio) -include_directories(${ROCclr_DIR}) -include_directories(${ROCclr_DIR}/include) -include_directories(${ROCclr_DIR}/compiler/lib) -include_directories(${ROCclr_DIR}/compiler/lib/include) -include_directories(${ROCclr_DIR}/elf/utils/common) -include_directories(${ROCclr_DIR}/elf/utils/libelf) add_definitions(-DUSE_COMGR_LIBRARY -DCOMGR_DYN_DLL) - find_package(amd_comgr REQUIRED CONFIG - PATHS - /opt/rocm/ - PATH_SUFFIXES - cmake/amd_comgr - lib/cmake/amd_comgr - ) - MESSAGE(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") -include_directories("$") +find_package(amd_comgr REQUIRED CONFIG + PATHS + /opt/rocm/ + PATH_SUFFIXES + cmake/amd_comgr + lib/cmake/amd_comgr) + +message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") add_definitions(-DBSD_LIBELF) @@ -147,6 +123,55 @@ add_library(hip64 OBJECT ) set_target_properties(hip64 PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(hip64 + PUBLIC + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_BINARY_DIR}/include + PRIVATE + ${CMAKE_SOURCE_DIR}/elfio + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/amdocl + ${PROJECT_SOURCE_DIR}/include/hip/hcc_detail/elfio + ${ROCclr_DIR} + ${ROCclr_DIR}/include + ${ROCclr_DIR}/compiler/lib + ${ROCclr_DIR}/compiler/lib/include + ${ROCclr_DIR}/elf/utils/common + ${ROCclr_DIR}/elf/utils/libelf + ${ROCR_INCLUDES} + $) +target_compile_definitions(hip64 + PRIVATE + $) + +if(ROCclr_FOUND) + target_include_directories(hip64 + PRIVATE + $) + target_compile_definitions(hip64 + PRIVATE + $) +endif() + + # Enable profiling API +if(USE_PROF_API EQUAL 1) + find_path(PROF_API_HEADER_DIR prof_protocol.h + HINTS + ${PROF_API_HEADER_PATH} + PATHS + /opt/rocm/roctracer + PATH_SUFFIXES + include/ext) + + if(NOT PROF_API_HEADER_DIR) + message(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=") + else() + target_compile_definitions(hip64 PUBLIC USE_PROF_API=1) + target_include_directories(hip64 PUBLIC ${PROF_API_HEADER_DIR}) + message(STATUS "Profiling API: ${PROF_API_HEADER_DIR}") + endif() +endif() + set_target_properties( hip64 PROPERTIES CXX_STANDARD 14 @@ -157,7 +182,6 @@ add_dependencies(hip64 gen-prof-api-str-header) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) -include(${LIBROCclr_STATIC_DIR}/amdrocclr_staticTargets.cmake) add_library(amdhip64 SHARED $ diff --git a/hipamd/rocclr/hip_device.cpp b/hipamd/rocclr/hip_device.cpp index 3476ac14fc..8695ef43e5 100644 --- a/hipamd/rocclr/hip_device.cpp +++ b/hipamd/rocclr/hip_device.cpp @@ -188,13 +188,13 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) deviceProps.arch.hasGlobalFloatAtomicExch = 1; deviceProps.arch.hasSharedInt32Atomics = 1; deviceProps.arch.hasSharedFloatAtomicExch = 1; - deviceProps.arch.hasFloatAtomicAdd = 0; + deviceProps.arch.hasFloatAtomicAdd = 1; deviceProps.arch.hasGlobalInt64Atomics = 1; deviceProps.arch.hasSharedInt64Atomics = 1; deviceProps.arch.hasDoubles = 1; - deviceProps.arch.hasWarpVote = 0; - deviceProps.arch.hasWarpBallot = 0; - deviceProps.arch.hasWarpShuffle = 0; + deviceProps.arch.hasWarpVote = 1; + deviceProps.arch.hasWarpBallot = 1; + deviceProps.arch.hasWarpShuffle = 1; deviceProps.arch.hasFunnelShift = 0; deviceProps.arch.hasThreadFenceSystem = 1; deviceProps.arch.hasSyncThreadsExt = 0; diff --git a/hipamd/rocclr/hip_device_runtime.cpp b/hipamd/rocclr/hip_device_runtime.cpp index 86a1590533..531f35c732 100644 --- a/hipamd/rocclr/hip_device_runtime.cpp +++ b/hipamd/rocclr/hip_device_runtime.cpp @@ -471,7 +471,12 @@ hipError_t hipGetDeviceCount ( int* count ) { } hipError_t hipGetDeviceFlags ( unsigned int* flags ) { - HIP_RETURN(hipErrorNotSupported); + HIP_INIT_API(hipGetDeviceFlags, flags); + if (flags == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + *flags = hip::getCurrentDevice()->getFlags(); + HIP_RETURN(hipSuccess); } hipError_t hipIpcGetEventHandle ( hipIpcEventHandle_t* handle, hipEvent_t event ) { @@ -531,7 +536,8 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) { default: break; } - + hip::getCurrentDevice()->setFlags(flags & hipDeviceScheduleMask); + HIP_RETURN(hipSuccess); } diff --git a/hipamd/rocclr/hip_event.cpp b/hipamd/rocclr/hip_event.cpp index f2738169a8..dddadd1bfd 100644 --- a/hipamd/rocclr/hip_event.cpp +++ b/hipamd/rocclr/hip_event.cpp @@ -91,7 +91,7 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) { } ms = static_cast(static_cast(eStop.event_->profilingInfo().end_ - - event_->profilingInfo().start_))/1000000.f; + event_->profilingInfo().end_))/1000000.f; return hipSuccess; } diff --git a/hipamd/rocclr/hip_hcc.def.in b/hipamd/rocclr/hip_hcc.def.in index 238d7fe02a..579608e685 100755 --- a/hipamd/rocclr/hip_hcc.def.in +++ b/hipamd/rocclr/hip_hcc.def.in @@ -149,6 +149,7 @@ hipPointerGetAttributes hipProfilerStart hipProfilerStop hipRuntimeGetVersion +hipGetDeviceFlags hipSetDevice hipSetDeviceFlags hipStreamAddCallback diff --git a/hipamd/rocclr/hip_hcc.map.in b/hipamd/rocclr/hip_hcc.map.in index f2491cd283..19da8a6991 100755 --- a/hipamd/rocclr/hip_hcc.map.in +++ b/hipamd/rocclr/hip_hcc.map.in @@ -149,6 +149,7 @@ global: hipProfilerStart; hipProfilerStop; hipRuntimeGetVersion; + hipGetDeviceFlags; hipSetDevice; hipSetDeviceFlags; hipStreamAddCallback; diff --git a/hipamd/rocclr/hip_internal.hpp b/hipamd/rocclr/hip_internal.hpp index 4a40018745..4cc0dadd8a 100755 --- a/hipamd/rocclr/hip_internal.hpp +++ b/hipamd/rocclr/hip_internal.hpp @@ -119,12 +119,14 @@ namespace hip { int deviceId_; /// ROCclr host queue for default streams Stream null_stream_; - //Maintain list of user enabled peers + /// Store device flags + unsigned int flags_; + /// Maintain list of user enabled peers std::list userEnabledPeers; public: Device(amd::Context* ctx, int devId): - context_(ctx), deviceId_(devId), null_stream_(this, amd::CommandQueue::Priority::Normal, 0, true) + context_(ctx), deviceId_(devId), null_stream_(this, amd::CommandQueue::Priority::Normal, 0, true), flags_(hipDeviceScheduleSpin) { assert(ctx != nullptr); } ~Device() {} @@ -152,6 +154,8 @@ namespace hip { return hipErrorPeerAccessNotEnabled; } } + unsigned int getFlags() const { return flags_; } + void setFlags(unsigned int flags) { flags_ = flags; } amd::HostQueue* NullStream(bool skip_alloc = false); }; @@ -208,6 +212,7 @@ public: void init(); std::vector>* addFatBinary(const void*data) { + amd::ScopedLock lock(lock_); if (initialized_) { digestFatBinary(data, modules_[data]); } @@ -215,6 +220,7 @@ public: } void removeFatBinary(std::vector>* module) { + amd::ScopedLock lock(lock_); for (auto& mod : modules_) { if (&mod.second == module) { modules_.erase(&mod); diff --git a/hipamd/rocclr/hip_memory.cpp b/hipamd/rocclr/hip_memory.cpp index 593513c98d..4bd75b8ffb 100755 --- a/hipamd/rocclr/hip_memory.cpp +++ b/hipamd/rocclr/hip_memory.cpp @@ -129,7 +129,10 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin amd::HostQueue* pQueue = &queue; if (queueDevice != dstMemory->getContext().devices()[0]) { pQueue = hip::getNullStream(dstMemory->getContext()); - waitList.push_back(queue.getLastQueuedCommand(true)); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } } command = new amd::WriteMemoryCommand(*pQueue, CL_COMMAND_WRITE_BUFFER, waitList, *dstMemory->asBuffer(), dOffset, sizeBytes, src); @@ -138,7 +141,10 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin amd::HostQueue* pQueue = &queue; if (queueDevice != srcMemory->getContext().devices()[0]) { pQueue = hip::getNullStream(srcMemory->getContext()); - waitList.push_back(queue.getLastQueuedCommand(true)); + amd::Command* cmd = queue.getLastQueuedCommand(true); + if (cmd != nullptr) { + waitList.push_back(cmd); + } } command = new amd::ReadMemoryCommand(*pQueue, CL_COMMAND_READ_BUFFER, waitList, *srcMemory->asBuffer(), sOffset, sizeBytes, dst); diff --git a/hipamd/rocclr/hip_module.cpp b/hipamd/rocclr/hip_module.cpp index db39b234b4..7cda6864c6 100755 --- a/hipamd/rocclr/hip_module.cpp +++ b/hipamd/rocclr/hip_module.cpp @@ -314,21 +314,19 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc switch(attrib) { case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: - *value = static_cast(wrkGrpInfo->localMemSize_ - - wrkGrpInfo->privateMemSize_); + *value = static_cast(wrkGrpInfo->localMemSize_); break; case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: - *value = static_cast(wrkGrpInfo->wavefrontPerSIMD_ - * wrkGrpInfo->wavefrontSize_); + *value = static_cast(wrkGrpInfo->size_); break; case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: *value = 0; break; case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: - *value = static_cast(wrkGrpInfo->localMemSize_); + *value = static_cast(wrkGrpInfo->privateMemSize_); break; case HIP_FUNC_ATTRIBUTE_NUM_REGS: - *value = static_cast(wrkGrpInfo->availableGPRs_); + *value = static_cast(wrkGrpInfo->usedVGPRs_); break; case HIP_FUNC_ATTRIBUTE_PTX_VERSION: *value = 30; // Defaults to 3.0 as HCC @@ -340,7 +338,7 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc *value = 0; break; case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: - *value = static_cast(wrkGrpInfo->availableLDSSize_); + *value = static_cast(wrkGrpInfo->availableLDSSize_ - wrkGrpInfo->localMemSize_); break; case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: *value = 0; diff --git a/hipamd/rocclr/hip_platform.cpp b/hipamd/rocclr/hip_platform.cpp index 11bd373550..8759ef47f1 100755 --- a/hipamd/rocclr/hip_platform.cpp +++ b/hipamd/rocclr/hip_platform.cpp @@ -122,10 +122,12 @@ hipError_t __hipExtractCodeObjectFromFatBinary(const void* data, num_code_objs++; } } - if (num_code_objs == devices.size()) + if (num_code_objs == devices.size()) { return hipSuccess; - else + } else { + DevLogError("hipErrorNoBinaryForGpu: Coudn't find binary for current devices!"); return hipErrorNoBinaryForGpu; + } } extern "C" std::vector>* __hipRegisterFatBinary(const void* data) @@ -189,6 +191,13 @@ void PlatformState::init() for (auto& it : vars_) { it.second.rvars.resize(g_devices.size()); } + if (!HIP_ENABLE_LAZY_KERNEL_LOADING) { + for (size_t i = 0; i < g_devices.size(); ++i) { + for (auto& it: functions_) { + getFunc(it.first, i); + } + } + } } bool PlatformState::unregisterFunc(hipModule_t hmod) { @@ -227,6 +236,11 @@ std::vector< std::pair >* PlatformState::unregisterVar(hipMod = reinterpret_cast *>(dvar.shadowVptr); delete tex_hptr; } + for (size_t dev = 0; dev < g_devices.size(); ++dev) { + if (dvar.rvars[dev].getdeviceptr()) { + amd::MemObjMap::RemoveMemObj(dvar.rvars[dev].getdeviceptr()); + } + } vars_.erase(it++); } else { ++it; @@ -309,11 +323,20 @@ bool ihipGetFuncAttributes(const char* func_name, amd::Program* program, hipFunc return false; } - const device::Kernel::WorkGroupInfo* wginfo = it->second->workGroupInfo(); - func_attr->localSizeBytes = wginfo->localMemSize_; - func_attr->sharedSizeBytes = wginfo->size_; - func_attr->maxThreadsPerBlock = wginfo->wavefrontSize_; - func_attr->numRegs = wginfo->usedVGPRs_; + const device::Kernel* kernel = it->second; + const device::Kernel::WorkGroupInfo* wginfo = kernel->workGroupInfo(); + func_attr->sharedSizeBytes = static_cast(wginfo->localMemSize_); + func_attr->binaryVersion = static_cast(kernel->signature().version()); + func_attr->cacheModeCA = 0; + func_attr->constSizeBytes = 0; + func_attr->localSizeBytes = wginfo->privateMemSize_; + func_attr->maxDynamicSharedSizeBytes = static_cast(wginfo->availableLDSSize_ + - wginfo->localMemSize_); + + func_attr->maxThreadsPerBlock = static_cast(wginfo->size_); + func_attr->numRegs = static_cast(wginfo->usedVGPRs_); + func_attr->preferredShmemCarveout = 0; + func_attr->ptxVersion = 30; return true; } diff --git a/hipamd/rocclr/hip_stream.cpp b/hipamd/rocclr/hip_stream.cpp index e4bf4fe192..3bd7d343f7 100644 --- a/hipamd/rocclr/hip_stream.cpp +++ b/hipamd/rocclr/hip_stream.cpp @@ -23,7 +23,7 @@ #include "hip_event.hpp" #include "thread/monitor.hpp" -static amd::Monitor streamSetLock("Guards global stream set"); +static amd::Monitor streamSetLock{"Guards global stream set"}; static std::unordered_set streamSet; // Internal structure for stream callback handler @@ -83,11 +83,11 @@ amd::HostQueue* Stream::asHostQueue(bool skip_alloc) { // ================================================================================================ void Stream::Destroy() { if (queue_ != nullptr) { - queue_->release(); - queue_ = nullptr; - amd::ScopedLock lock(streamSetLock); streamSet.erase(this); + + queue_->release(); + queue_ = nullptr; } delete this; } diff --git a/hipamd/tests/src/deviceLib/hip_mbcnt.cpp b/hipamd/tests/src/deviceLib/hip_mbcnt.cpp index cd4bfa5daa..2cb958f280 100644 --- a/hipamd/tests/src/deviceLib/hip_mbcnt.cpp +++ b/hipamd/tests/src/deviceLib/hip_mbcnt.cpp @@ -38,11 +38,11 @@ THE SOFTWARE. __global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) { int x = blockDim.x * blockIdx.x + threadIdx.x; - mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0); - mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0); + mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0); + mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0); lane_id[x] = __lane_id(); } - + using namespace std; int main() { diff --git a/hipamd/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp b/hipamd/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp index 102387cbe7..8e67044eb0 100644 --- a/hipamd/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp @@ -37,18 +37,13 @@ THE SOFTWARE. using namespace std::chrono; -const static uint NumOfLoopIterrations = 16 * 1024; -const static uint BufferSizeInDwords = 28672 * NumOfLoopIterrations; +const static uint BufferSizeInDwords = 256 * 1024 * 1024; const static uint numQueues = 4; const static uint numIter = 100; constexpr uint NumKernelArgs = 4; constexpr uint MaxGPUs = 8; #include -/* -namespace cg = cooperative_groups; -using namespace cooperative_groups; -*/ __global__ void test_gws(uint* buf, uint bufSize, long* tmpBuf, long* result) { @@ -126,11 +121,13 @@ int main() { size_t SIZE = copySizeInDwords * sizeof(uint); HIPCHECK(hipMalloc((void**)&dA[i], SIZE)); + HIPCHECK(hipMalloc((void**)&dB[i], 64 * deviceProp[i].multiProcessorCount * sizeof(long))); if (i == 0) { HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent)); } HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice)); HIPCHECK(hipStreamCreate(&stream[i])); + hipDeviceSynchronize(); } dim3 dimBlock; @@ -146,22 +143,22 @@ int main() { uint workgroups[3] = {64, 128, 256}; hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; - - system_clock::time_point start = system_clock::now(); - + std::time_t end_time; + double time = 0; for (uint set = 0; set < 3; ++set) { void* args[MaxGPUs * NumKernelArgs]; - std::cout << "---------- Test#" << set << "---------------\n"; + std::cout << "---------- Test#" << set << ", size: "<< BufferSizeInDwords << + " dwords ---------------\n"; for (int i = 0; i < nGpu; i++) { HIPCHECK(hipSetDevice(i)); dimBlock.x = workgroups[set]; HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); - - std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n"; + + std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << + " Num blocks per CU: " << numBlocks << "\n"; dimGrid.x = deviceProp[i].multiProcessorCount * std::min(numBlocks, 32); - HIPCHECK(hipMalloc((void**)&dB[i], dimGrid.x * sizeof(long))); args[i * NumKernelArgs] = (void*)&dA[i]; args[i * NumKernelArgs + 1] = (void*)©SizeInDwords; @@ -175,32 +172,34 @@ int main() { launchParamsList[i].stream = stream[i]; launchParamsList[i].args = &args[i * NumKernelArgs]; } - - hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0); - if (*dC != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) { - std::cout << "Data validation failed for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"; + system_clock::time_point start = system_clock::now(); + hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0); + system_clock::time_point end = system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + end_time = std::chrono::system_clock::to_time_t(end); + + time += elapsed_seconds.count(); + + size_t processedDwords = copySizeInDwords * nGpu; + if (*dC != (((long)(processedDwords) * (processedDwords - 1)) / 2)) { + std::cout << "Data validation failed ("<< *dC << " != " << + (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2) << + ") for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"; std::cout << "Test failed! \n"; } - for (int i = 0; i < nGpu; i++) { - hipFree(dB[i]); - } } - system_clock::time_point end = system_clock::now(); delete [] launchParamsList; - std::chrono::duration elapsed_seconds = end - start; - - std::time_t end_time = std::chrono::system_clock::to_time_t(end); - std::cout << "finished computation at " << std::ctime(&end_time) << - "elapsed time: " << elapsed_seconds.count() << "s\n"; + "elapsed time: " << time << "s\n"; hipSetDevice(0); hipFree(dC); for (int i = 0; i < nGpu; i++) { hipFree(dA[i]); + hipFree(dB[i]); HIPCHECK(hipStreamDestroy(stream[i])); } delete [] init;