Merge branch 'amd-master-next' into amd-npi-next

Change-Id: Ie692324c8cf6215bf6cf87de0d5e3aa1bdfd5ea1
This commit is contained in:
Vlad Sytchenko
2020-05-15 11:15:27 -04:00
27 zmienionych plików z 354 dodań i 220 usunięć
-1
Wyświetl plik
@@ -10,7 +10,6 @@ bin/hipInfo
bin/hipBusBandwidth
bin/hipDispatchLatency
bin/hipify-clang
include/hip/hip_version.h
tags
samples/1_Utils/hipInfo/hipInfo
samples/1_Utils/hipBusBandwidth/hipBusBandwidth
+12 -4
Wyświetl plik
@@ -234,7 +234,7 @@ endif (NOT CPACK_SET_DESTDIR)
# Generate profiling API macros/structures header
if(HIP_PLATFORM STREQUAL "hcc")
if(USE_PROF_API EQUAL 1)
set(PROF_API_STR "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hcc_detail/hip_prof_str.h")
set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/hcc_detail/hip_prof_str.h")
set(PROF_API_HDR "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hcc_detail/hip_runtime_api.h")
set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py")
@@ -315,6 +315,7 @@ endif()
message(STATUS "\nHSA runtime in: " ${HSA_PATH})
# Build hip_hcc if platform is hcc
if(HIP_PLATFORM STREQUAL "hcc")
include_directories(${PROJECT_BINARY_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/include)
set(HIP_HCC_BUILD_FLAGS)
@@ -425,7 +426,8 @@ set(_versionInfoHeader
#define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n
#endif\n
")
file(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
include_directories(${PROJECT_BINARY_DIR}/include)
# Build doxygen documentation
find_program(DOXYGEN_EXE doxygen)
@@ -457,7 +459,11 @@ install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin)
execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE INSTALL_SOURCE)
if(NOT ${INSTALL_SOURCE} EQUAL 0)
install(DIRECTORY src DESTINATION .)
if(HIP_RUNTIME STREQUAL "HCC")
install(DIRECTORY src DESTINATION .)
elseif(HIP_RUNTIME STREQUAL "ROCclr")
install(DIRECTORY rocclr DESTINATION .)
endif()
install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS)
install(DIRECTORY include DESTINATION .)
install(DIRECTORY cmake DESTINATION .)
@@ -468,7 +474,9 @@ endif()
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/hip/hcc_detail
DESTINATION include/hip
FILES_MATCHING PATTERN "*.h*")
install(DIRECTORY ${PROJECT_BINARY_DIR}/include/hip
DESTINATION include
FILES_MATCHING PATTERN "*.h*")
#############################
# hip-config
+20 -11
Wyświetl plik
@@ -34,26 +34,35 @@ defined $options{i} || die("input not specified");
$input_file = $options{i};
(-f $input_file) || die("can't find $input_file");
# derive HIP_PATH via env var or use parent directory of extractkernel
my $HIP_PATH=$ENV{'HIP_PATH'} // dirname(Cwd::abs_path("$0/../"));
my $HIP_COMPILER = `$HIP_PATH/bin/hipconfig --compiler`;
my $ROCM_PATH = `$HIP_PATH/bin/hipconfig --rocmpath`;
my $HIP_CLANG_PATH = `$HIP_PATH/bin/hipconfig --hipclangpath`;
# look for llvm-objdump and clang-offload-bundler
my $tools_path_prefix;
my $llvm_objdump;
my $clang_offload_bundler;
if (defined $ENV{'HCC_HOME'}) {
$tools_path_prefix = File::Spec->catfile($ENV{'HCC_HOME'}, "bin");
$llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump");
$clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler");
if (defined $HIP_COMPILER and $HIP_COMPILER eq "clang"){
# Search the path with respect to HIP_CLANG_PATH
$tools_path_prefix = $HIP_CLANG_PATH;
}
else {
$tools_path_prefix = dirname(realpath($0));
$llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump");
$clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler");
if (!(-f $llvm_objdump)) {
$tools_path_prefix = realpath($tools_path_prefix."/../../hcc/bin");
$llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump");
$clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler");
if (defined $HIP_COMPILER and $HIP_COMPILER eq "hcc") {
# Search the path with respect to HCC_HOME if it is set, else search in ROCM_PATH
if (defined $ENV{'HCC_HOME'}) {
$tools_path_prefix = File::Spec->catfile($ENV{'HCC_HOME'}, "bin");
}
else {
$tools_path_prefix = realpath($ROCM_PATH."/hcc/bin");
}
}
}
# Find llvm-objdump and clang-offload-bundler in the path set above
$llvm_objdump = File::Spec->catfile($tools_path_prefix, "llvm-objdump");
$clang_offload_bundler = File::Spec->catfile($tools_path_prefix, "clang-offload-bundler");
if (!(-f $llvm_objdump)) {
$llvm_objdump = which("llvm-objdump");
+45 -22
Wyświetl plik
@@ -26,7 +26,7 @@ use Cwd 'abs_path';
# script's abs_path). Used on AMD platforms only.
# HSA_PATH : Path to HSA dir (defaults to ../../hsa relative to abs_path
# of this script). Used on AMD platforms only.
# HIP_ROCclr_HOME : Path to HIP/ROCclr directory. Used on AMD platforms only.
# HIP_ROCCLR_HOME : Path to HIP/ROCclr directory. Used on AMD platforms only.
# HIP_CLANG_PATH : Path to HIP-Clang (default to ../../llvm/bin relative to this
# script's abs_path). Used on AMD platforms only.
@@ -82,15 +82,15 @@ if (-e "$HIP_PATH/../.info/version") {
} else {
$ROCM_PATH=$ENV{'ROCM_PATH'} // "/opt/rocm";
}
$HIP_ROCclr_HOME=$ENV{'HIP_ROCclr_HOME'};
$HIP_ROCCLR_HOME=$ENV{'HIP_ROCCLR_HOME'};
$HIP_LIB_PATH=$ENV{'HIP_LIB_PATH'};
$HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'};
$DEVICE_LIB_PATH=$ENV{'DEVICE_LIB_PATH'};
$HIP_CLANG_HCC_COMPAT_MODE=$ENV{'HIP_CLANG_HCC_COMPAT_MODE'}; # HCC compatibility mode
$HIP_COMPILE_CXX_AS_HIP=$ENV{'HIP_COMPILE_CXX_AS_HIP'} // "1";
if (defined $HIP_ROCclr_HOME) {
$HIP_INFO_PATH= "$HIP_ROCclr_HOME/lib/.hipInfo";
if (defined $HIP_ROCCLR_HOME) {
$HIP_INFO_PATH= "$HIP_ROCCLR_HOME/lib/.hipInfo";
} else {
$HIP_INFO_PATH= "$HIP_PATH/lib/.hipInfo"; # use actual file
}
@@ -123,35 +123,43 @@ sub delete_temp_dirs {
#---
#HIP_PLATFORM controls whether to use hcc (AMD) or nvcc as the platform:
$HIP_PLATFORM= `$HIP_PATH/bin/hipconfig --platform` // "hcc";
$HIP_VERSION= `$HIP_PATH/bin/hipconfig --version`;
#HIP_COMPILER controls whether to use hcc, clang or nvcc for compilation:
$HIP_COMPILER= `$HIP_PATH/bin/hipconfig --compiler`;
#HIP_RUNTIME controls whether to use HCC, ROCclr, or NVCC as the runtime:
$HIP_RUNTIME= `$HIP_PATH/bin/hipconfig --runtime`;
if ($isWindows) {
# Windows cannot run perl natively, so hipcc will explicitly call perl
$HIP_PLATFORM= `perl $HIP_PATH/bin/hipconfig --platform`;
$HIP_VERSION= `perl $HIP_PATH/bin/hipconfig --version`;
$HIP_COMPILER= `perl $HIP_PATH/bin/hipconfig --compiler`;
$HIP_RUNTIME= `perl $HIP_PATH/bin/hipconfig --runtime`;
} else {
$HIP_PLATFORM= `$HIP_PATH/bin/hipconfig --platform`;
$HIP_VERSION= `$HIP_PATH/bin/hipconfig --version`;
$HIP_COMPILER= `$HIP_PATH/bin/hipconfig --compiler`;
$HIP_RUNTIME= `$HIP_PATH/bin/hipconfig --runtime`;
}
# If using ROCclr runtime, need to find HIP_ROCclr_HOME
if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "ROCclr" and !defined $HIP_ROCclr_HOME) {
# If using ROCclr runtime, need to find HIP_ROCCLR_HOME
if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "ROCclr" and !defined $HIP_ROCCLR_HOME) {
my $hipcc_dir = dirname($0);
if (-e "$hipcc_dir/../lib/bitcode") {
$HIP_ROCclr_HOME = abs_path($hipcc_dir . "/..");
$HIP_ROCCLR_HOME = abs_path($hipcc_dir . "/..");
} else {
$HIP_ROCclr_HOME = $HIP_PATH; # use HIP_PATH
$HIP_ROCCLR_HOME = $HIP_PATH; # use HIP_PATH
}
$HIPCXXFLAGS .= "-D__HIP_ROCclr__";
$HIPCFLAGS .= "-D__HIP_ROCclr__";
}
if (defined $HIP_ROCclr_HOME) {
if (!defined $HIP_CLANG_PATH and (-e "$HIP_ROCclr_HOME/bin/clang" or -e "$HIP_ROCclr_HOME/bin/clang.exe")) {
$HIP_CLANG_PATH = "$HIP_ROCclr_HOME/bin";
if (defined $HIP_ROCCLR_HOME) {
if (!defined $HIP_CLANG_PATH and (-e "$HIP_ROCCLR_HOME/bin/clang" or -e "$HIP_ROCCLR_HOME/bin/clang.exe")) {
$HIP_CLANG_PATH = "$HIP_ROCCLR_HOME/bin";
}
if (!defined $DEVICE_LIB_PATH and -e "$HIP_ROCclr_HOME/lib/bitcode") {
$DEVICE_LIB_PATH = "$HIP_ROCclr_HOME/lib/bitcode";
if (!defined $DEVICE_LIB_PATH and -e "$HIP_ROCCLR_HOME/lib/bitcode") {
$DEVICE_LIB_PATH = "$HIP_ROCCLR_HOME/lib/bitcode";
}
$HIP_INCLUDE_PATH = "$HIP_ROCclr_HOME/include";
$HIP_INCLUDE_PATH = "$HIP_ROCCLR_HOME/include";
if (!defined $HIP_LIB_PATH) {
$HIP_LIB_PATH = "$HIP_ROCclr_HOME/lib";
$HIP_LIB_PATH = "$HIP_ROCCLR_HOME/lib";
}
}
@@ -199,8 +207,8 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") {
$HIP_LIB_PATH = "$HIP_PATH/lib";
}
if ($verbose & 0x2) {
if (defined $HIP_ROCclr_HOME) {
print ("HIP_ROCclr_HOME=$HIP_ROCclr_HOME\n");
if (defined $HIP_ROCCLR_HOME) {
print ("HIP_ROCCLR_HOME=$HIP_ROCCLR_HOME\n");
}
print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n");
print ("HIP_CLANG_INCLUDE_PATH=$HIP_CLANG_INCLUDE_PATH\n");
@@ -393,6 +401,7 @@ if($HIP_PLATFORM eq "nvcc"){
}
}
# TODO: convert toolArgs to an array rather than a string
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
my $optArg = ""; # -O args
my $targetOpt = '--amdgpu-target=';
@@ -402,7 +411,11 @@ my $prevArg = ""; # previous argument
foreach $arg (@ARGV)
{
# Save $arg, it can get changed in the loop.
$trimarg = $arg;
# TODO: figure out why this space removal is wanted.
# TODO: If someone has gone to the effort of quoting the spaces to the shell
# TODO: why are we removing it here?
$trimarg =~ s/^\s+|\s+$//g; # Remive whitespace
my $swallowArg = 0;
if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') {
@@ -411,6 +424,7 @@ foreach $arg (@ARGV)
}
if ($skipOutputFile) {
# TODO: handle filename with shell metacharacters
$toolArgs .= " $arg";
$prevArg = $arg;
$skipOutputFile = 0;
@@ -452,7 +466,7 @@ foreach $arg (@ARGV)
$arg = "--cuda-device-only";
}
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0) and $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'hcc')
{
$HIPCXXFLAGS .= $HCC_WA_FLAGS;
$setStdLib = 1;
@@ -659,6 +673,14 @@ foreach $arg (@ARGV)
push (@inputs, $arg);
#print "I: <$arg>\n";
}
# Produce a version of $arg where characters significant to the shell are
# quoted. One could quote everything of course but don't bother for
# common characters such as alphanumerics.
# Do the quoting here because sometimes the $arg is changed in the loop
# Important to have all of '-Xlinker' in the set of unquoted characters.
if (not $isWindows) { # Windows needs different quoting, ignore for now
$arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g;
}
$toolArgs .= " $arg" unless $swallowArg;
$prevArg = $arg;
}
@@ -805,6 +827,7 @@ if ($HIPCC_LINK_FLAGS_APPEND) {
$HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND";
}
# TODO: convert CMD to an array rather than a string
my $CMD="$HIPCC";
if ($needCFLAGS) {
+42 -3
Wyświetl plik
@@ -17,9 +17,11 @@ Getopt::Long::Configure ( qw{bundling no_ignore_case});
GetOptions(
"help|h" => \$p_help
,"path|p" => \$p_path
,"rocmpath|R" => \$p_rocmpath
,"compiler|c" => \$p_compiler
,"platform|P" => \$p_platform
,"runtime|r" => \$p_runtime
,"hipclangpath|l" => \$p_hipclangpath
,"cpp_config|cxx_config|C" => \$p_cpp_config
,"full|f|info" => \$p_full,
,"version|v" => \$p_version,
@@ -30,10 +32,12 @@ GetOptions(
if ($p_help) {
print "usage: hipconfig [OPTIONS]\n";
print " --path, -p : print HIP_PATH (use env var if set, else determine from hipconfig path)\n";
print " --rocmpath, -R : print ROCM_PATH (use env var if set, else determine from hip path or /opt/rocm)\n";
print " --cpp_config, -C : print C++ compiler options\n";
print " --compiler, -c : print compiler (hcc or clang or nvcc)\n";
print " --platform, -P : print platform (hcc or nvcc)\n";
print " --runtime, -r : print runtime (HCC or ROCclr)\n";
print " --hipclangpath, -l : print HIP_CLANG_PATH\n";
print " --full, -f : print full config\n";
print " --version, -v : print hip version\n";
print " --check : check configuration\n";
@@ -85,15 +89,33 @@ $CUDA_PATH=$ENV{'CUDA_PATH'} // '/usr/local/cuda';
$HCC_HOME=$ENV{'HCC_HOME'} // "$ROCM_PATH/hcc";
$HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
$HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'} // "$ROCM_PATH/llvm/bin";
# HIP_ROCCLR_HOME is used by Windows builds
$HIP_ROCCLR_HOME=$ENV{'HIP_ROCCLR_HOME'};
if (defined $HIP_ROCCLR_HOME) {
$HIP_INFO_PATH= "$HIP_ROCCLR_HOME/lib/.hipInfo";
} else {
$HIP_INFO_PATH= "$HIP_PATH/lib/.hipInfo"; # use actual file
}
#---
#HIP_PLATFORM controls whether to use NVCC or HCC for compilation:
$HIP_PLATFORM=$ENV{'HIP_PLATFORM'};
# Read .hipInfo
my %hipInfo = ();
parse_config_file("$HIP_PATH/lib/.hipInfo", \%hipInfo);
$HIP_COMPILER = $hipInfo{'HIP_COMPILER'} // "hcc";
$HIP_RUNTIME = $hipInfo{'HIP_RUNTIME'} // "HCC";
parse_config_file("$HIP_INFO_PATH", \%hipInfo);
# Prioritize Env first, otherwise use the hipInfo config file
$HIP_COMPILER = $ENV{'HIP_COMPILER'} // $hipInfo{'HIP_COMPILER'} // "hcc";
$HIP_RUNTIME = $ENV{'HIP_RUNTIME'} // $hipInfo{'HIP_RUNTIME'} // "HCC";
# If using ROCclr runtime, need to find HIP_ROCCLR_HOME
if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "ROCclr" and !defined $HIP_ROCCLR_HOME) {
my $hipconfig_dir = dirname($0);
if (-e "$hipconfig_dir/../lib/bitcode") {
$HIP_ROCCLR_HOME = abs_path($hipconfig_dir . "/..");
} else {
$HIP_ROCCLR_HOME = $HIP_PATH; # use HIP_PATH
}
}
if (not defined $HIP_PLATFORM) {
if (can_run("$HCC_HOME/bin/hcc") or can_run("hcc")) {
@@ -112,6 +134,10 @@ if ($HIP_COMPILER eq "hcc") {
$CPP_CONFIG = " -D__HIP_PLATFORM_HCC__= -I$HIP_PATH/include -I$HCC_HOME/include -I$HSA_PATH/include";
}
if ($HIP_COMPILER eq "clang") {
# Windows does not have clang at linux default path
if (defined $HIP_ROCCLR_HOME and (-e "$HIP_ROCCLR_HOME/bin/clang" or -e "$HIP_ROCCLR_HOME/bin/clang.exe")) {
$HIP_CLANG_PATH = "$HIP_ROCCLR_HOME/bin";
}
$HIP_CLANG_VERSION = `$HIP_CLANG_PATH/clang++ --version`;
$HIP_CLANG_VERSION=~/.*clang version ([^ ]+).*/;
$HIP_CLANG_VERSION=$1;
@@ -139,6 +165,11 @@ if ($p_path) {
$printed = 1;
}
if ($p_rocmpath) {
print "$ROCM_PATH";
$printed = 1;
}
if ($p_cpp_config) {
print $CPP_CONFIG;
$printed = 1;
@@ -159,6 +190,13 @@ if ($p_runtime) {
$printed = 1;
}
if ($p_hipclangpath) {
if (defined $HIP_CLANG_PATH) {
print $HIP_CLANG_PATH;
}
$printed = 1;
}
if ($p_version) {
print $HIP_VERSION;
$printed = 1;
@@ -168,6 +206,7 @@ if (!$printed or $p_full) {
print "HIP version : ", $HIP_VERSION, "\n\n";
print "== hipconfig\n";
print "HIP_PATH : ", $HIP_PATH, "\n";
print "ROCM_PATH : ", $ROCM_PATH, "\n";
print "HIP_COMPILER : ", $HIP_COMPILER, "\n";
print "HIP_PLATFORM : ", $HIP_PLATFORM, "\n";
print "HIP_RUNTIME : ", $HIP_RUNTIME, "\n";
+13 -11
Wyświetl plik
@@ -615,23 +615,25 @@ macro(HIP_ADD_EXECUTABLE hip_target)
endif()
if("${HIP_COMPILER}" STREQUAL "hcc")
if("x${HCC_HOME}" STREQUAL "x")
if (DEFINED $ENV{ROCM_PATH})
set(HCC_HOME "$ENV{ROCM_PATH}/hcc")
elseif( DEFINED $ENV{HIP_PATH})
set(HCC_HOME "$ENV{HIP_PATH}/../hcc")
if (DEFINED ENV{ROCM_PATH})
set(HCC_HOME "$ENV{ROCM_PATH}/hcc")
elseif(DEFINED ENV{HIP_PATH})
set(HCC_HOME "$ENV{HIP_PATH}/../hcc")
else()
set(HCC_HOME "/opt/rocm/hcc")
set(HCC_HOME "/opt/rocm/hcc")
endif()
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
elseif("${HIP_COMPILER}" STREQUAL "clang")
if("x${HIP_CLANG_PATH}" STREQUAL "x")
if (DEFINED $ENV{ROCM_PATH})
set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin")
elseif( DEFINED $ENV{HIP_PATH})
set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin")
if("x${HIP_CLANG_PATH}" STREQUAL "x")
if(DEFINED ENV{HIP_CLANG_PATH})
set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH})
elseif(DEFINED ENV{ROCM_PATH})
set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin")
elseif(DEFINED ENV{HIP_PATH})
set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin")
else()
set(HIP_CLANG_PATH "/opt/rocm/llvm/bin")
set(HIP_CLANG_PATH "/opt/rocm/llvm/bin")
endif()
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
@@ -85,11 +85,11 @@ __device__ static inline unsigned int __ffsll(long long int input) {
}
__device__ static inline unsigned int __brev(unsigned int input) {
return __llvm_bitrev_b32(input);
return __builtin_bitreverse32(input);
}
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
return __llvm_bitrev_b64(input);
return __builtin_bitreverse64(input);
}
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
@@ -233,7 +233,10 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns
return __ockl_sadd_u32(x, y, z);
}
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
__device__ static inline unsigned int __lane_id() {
return __builtin_amdgcn_mbcnt_hi(
-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
/*
HIP specific device functions
@@ -241,25 +244,25 @@ HIP specific device functions
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.f;
}
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_permutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.u;
}
@@ -293,8 +296,8 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
bound_ctrl);
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
bound_ctrl);
}
static constexpr int warpSize = 64;
@@ -304,7 +307,7 @@ inline
int __shfl(int var, int src_lane, int width = warpSize) {
int self = __lane_id();
int index = src_lane + (self & ~(width-1));
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -376,7 +379,7 @@ int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self - lane_delta;
index = (index < (self & ~(width-1)))?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -446,7 +449,7 @@ int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self + lane_delta;
index = (int)((self&(width-1))+lane_delta) >= width?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -516,7 +519,7 @@ int __shfl_xor(int var, int lane_mask, int width = warpSize) {
int self = __lane_id();
int index = self^lane_mask;
index = index >= ((self+width)&~(width-1))?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -506,6 +506,14 @@ hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);
*/
hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
/**
* @brief Gets the flags set for current device
*
* @param [out] flags
*
* @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
*/
hipError_t hipGetDeviceFlags(unsigned *flags);
/**
* @brief The bank width of shared memory on current device is set
@@ -31,40 +31,11 @@ THE SOFTWARE.
#include "hip/hcc_detail/host_defines.h"
__device__
__attribute__((convergent))
ulong __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) __asm("llvm.amdgcn.icmp.i32");
// FIXME: These should all be removed and proper builtins used.
__device__
unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
__device__
unsigned int __llvm_bitrev_b32(unsigned int src0) __asm("llvm.bitreverse.i32");
__device__
uint64_t __llvm_bitrev_b64(uint64_t src0) __asm("llvm.bitreverse.i64");
extern
__device__
__attribute__((const))
unsigned int __mbcnt_lo(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.lo");
extern
__device__
__attribute__((const))
unsigned int __mbcnt_hi(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.hi");
__device__
int __llvm_amdgcn_ds_bpermute(int index, int src) __asm("llvm.amdgcn.ds.bpermute");
__device__
int __llvm_amdgcn_ds_permute(int index, int src) __asm("llvm.amdgcn.ds.permute");
__device__
int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle");
__device__
int __llvm_amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask,
bool bound_ctrl) __asm("llvm.amdgcn.mov.dpp.i32");
#endif
@@ -1411,12 +1411,12 @@ float func(float x, int y) \
__DEF_FLOAT_FUN2I(scalbn)
template<class T>
__DEVICE__ inline static T min(T arg1, T arg2) {
__DEVICE__ inline T min(T arg1, T arg2) {
return (arg1 < arg2) ? arg1 : arg2;
}
template<class T>
__DEVICE__ inline static T max(T arg1, T arg2) {
__DEVICE__ inline T max(T arg1, T arg2) {
return (arg1 > arg2) ? arg1 : arg2;
}
@@ -1511,6 +1511,10 @@ inline static hipError_t hipProfilerStart() { return hipCUDAErrorTohipError(cuda
inline static hipError_t hipProfilerStop() { return hipCUDAErrorTohipError(cudaProfilerStop()); }
inline static hipError_t hipGetDeviceFlags(unsigned int* flags) {
return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags));
}
inline static hipError_t hipSetDeviceFlags(unsigned int flags) {
return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags));
}
+4
Wyświetl plik
@@ -3,6 +3,10 @@ project(hip_base)
install(DIRECTORY @hip_SOURCE_DIR@/bin DESTINATION . USE_SOURCE_PERMISSIONS)
install(DIRECTORY @hip_SOURCE_DIR@/include DESTINATION .)
install(FILES @PROJECT_BINARY_DIR@/include/hip/hcc_detail/hip_prof_str.h
DESTINATION include/hip/hcc_detail)
install(FILES @PROJECT_BINARY_DIR@/include/hip/hip_version.h
DESTINATION include/hip)
install(FILES @PROJECT_BINARY_DIR@/.hipVersion DESTINATION bin)
install(PROGRAMS @PROJECT_BINARY_DIR@/lpl DESTINATION bin)
install(PROGRAMS @PROJECT_BINARY_DIR@/ca DESTINATION bin)
+1 -1
Wyświetl plik
@@ -14,7 +14,7 @@ HIPDIR=$ROCMDIR/hip
HIPLIBDIR=$ROCMDIR/hip/lib
# Soft-link to library files
HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v [-/$])
HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v "cmake\|[-/$]")
mkdir -p $ROCMLIBDIR
mkdir -p $ROCMLIBDIR/cmake
pushd $ROCMLIBDIR
+1 -1
Wyświetl plik
@@ -14,7 +14,7 @@ HIPDIR=$ROCMDIR/hip
HIPLIBDIR=$ROCMDIR/hip/lib
# Soft-link to library files
HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v [-/$])
HIPLIBFILES=$(ls -A $HIPLIBDIR | grep -v "cmake\|[-/$]")
mkdir -p $ROCMLIBDIR
mkdir -p $ROCMLIBDIR/cmake
pushd $ROCMLIBDIR
+87 -63
Wyświetl plik
@@ -27,30 +27,44 @@ endif()
set(USE_PROF_API "1")
if(NOT DEFINED LIBROCclr_STATIC_DIR)
find_path(LIBROCclr_STATIC_DIR
NAMES libamdrocclr_static.a
PATHS /opt/rocm/rocclr
PATH_SUFFIXES lib
)
# FIXME: Make this required and remove the legacy handling below
set(save_rocclr_dir ${ROCclr_DIR})
set(save_rocclr_static_dir ${LIBROCclr_STATIC_DIR})
find_package(ROCclr CONFIG
PATHS
/opt/rocm
/opt/rocm/rocclr)
if (NOT ROCclr_FOUND)
if(NOT DEFINED LIBROCclr_STATIC_DIR)
find_path(LIBROCclr_STATIC_DIR
NAMES libamdrocclr_static.a
PATHS /opt/rocm/rocclr
PATH_SUFFIXES lib)
else()
set(LIBROCclr_STATIC_DIR ${save_rocclr_static_dir})
endif()
if(NOT DEFINED ROCclr_DIR)
find_path(ROCclr_DIR
NAMES top.hpp
PATH_SUFFIXES include
PATHS /opt/rocm/rocclr)
else()
set(ROCclr_DIR ${save_rocclr_dir})
endif()
message("Found Static rocclr lib:${LIBROCclr_STATIC_DIR} and rocclr includes: ${ROCclr_DIR}")
include(${LIBROCclr_STATIC_DIR}/amdrocclr_staticTargets.cmake)
endif()
if(NOT DEFINED ROCclr_DIR)
find_path(ROCclr_DIR
NAMES top.hpp
PATH_SUFFIXES include
PATHS /opt/rocm/rocclr
)
endif()
message("Found Static rocclr lib:${LIBROCclr_STATIC_DIR} and rocclr includes: ${ROCclr_DIR}")
set(PROF_API_HEADER_PATH ${ROCclr_DIR}/platform)
#############################
# Profiling API support
#############################
# Generate profiling API macros/structures header
# FIXME: This should not be writing to the source directory
set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/hcc_detail/hip_prof_str.h")
set(PROF_API_HDR "${CMAKE_CURRENT_SOURCE_DIR}/../include/hip/hcc_detail/hip_runtime_api.h")
set(PROF_API_HDR "${PROJECT_SOURCE_DIR}/include/hip/hcc_detail/hip_runtime_api.h")
set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}")
set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py")
set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt")
@@ -66,25 +80,6 @@ add_custom_target(gen-prof-api-str-header ALL
SOURCES ${PROF_API_HDR})
# Enable profiling API
if(USE_PROF_API EQUAL 1)
find_path(PROF_API_HEADER_DIR prof_protocol.h
HINTS
${PROF_API_HEADER_PATH}
PATHS
/opt/rocm/roctracer
PATH_SUFFIXES
include/ext
)
if(NOT PROF_API_HEADER_DIR)
MESSAGE(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=<path to prof_protocol.h header>")
else()
add_definitions(-DUSE_PROF_API=1)
include_directories(${PROF_API_HEADER_DIR})
MESSAGE(STATUS "Profiling API: ${PROF_API_HEADER_DIR}")
endif()
endif()
if(NOT DEFINED ROCclr_DIR OR NOT DEFINED LIBOCL_STATIC_DIR OR NOT DEFINED LIBROCclr_STATIC_DIR )
# message(FATAL_ERROR "define ROCclr_DIR, LIBOCL_STATIC_DIR\n")
@@ -92,35 +87,16 @@ endif()
list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" )
set(CMAKE_MODULE_PATH${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
include_directories(${ROCR_INCLUDES})
if (DEFINED LLVM_INCLUDES AND NOT ${LLVM_INCLUDES} STREQUAL "")
message(STATUS "LLVM includes found ${LLVM_INCLUDES}")
include_directories(${LLVM_INCLUDES})
endif() # if (DEFINED LLVM_INCLUDES AND NOT ${LLVM_INCLUDES} STREQUAL "")
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${PROJECT_BINARY_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/elfio)
include_directories(${CMAKE_SOURCE_DIR}/amdocl)
include_directories(${CMAKE_SOURCE_DIR}/include/hip/hcc_detail/elfio)
include_directories(${ROCclr_DIR})
include_directories(${ROCclr_DIR}/include)
include_directories(${ROCclr_DIR}/compiler/lib)
include_directories(${ROCclr_DIR}/compiler/lib/include)
include_directories(${ROCclr_DIR}/elf/utils/common)
include_directories(${ROCclr_DIR}/elf/utils/libelf)
add_definitions(-DUSE_COMGR_LIBRARY -DCOMGR_DYN_DLL)
find_package(amd_comgr REQUIRED CONFIG
PATHS
/opt/rocm/
PATH_SUFFIXES
cmake/amd_comgr
lib/cmake/amd_comgr
)
MESSAGE(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
include_directories("$<TARGET_PROPERTY:amd_comgr,INTERFACE_INCLUDE_DIRECTORIES>")
find_package(amd_comgr REQUIRED CONFIG
PATHS
/opt/rocm/
PATH_SUFFIXES
cmake/amd_comgr
lib/cmake/amd_comgr)
message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
add_definitions(-DBSD_LIBELF)
@@ -147,6 +123,55 @@ add_library(hip64 OBJECT
)
set_target_properties(hip64 PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(hip64
PUBLIC
${PROJECT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}/include
PRIVATE
${CMAKE_SOURCE_DIR}/elfio
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/amdocl
${PROJECT_SOURCE_DIR}/include/hip/hcc_detail/elfio
${ROCclr_DIR}
${ROCclr_DIR}/include
${ROCclr_DIR}/compiler/lib
${ROCclr_DIR}/compiler/lib/include
${ROCclr_DIR}/elf/utils/common
${ROCclr_DIR}/elf/utils/libelf
${ROCR_INCLUDES}
$<TARGET_PROPERTY:amd_comgr,INTERFACE_INCLUDE_DIRECTORIES>)
target_compile_definitions(hip64
PRIVATE
$<TARGET_PROPERTY:amd_comgr,INTERFACE_COMPILE_DEFINITIONS>)
if(ROCclr_FOUND)
target_include_directories(hip64
PRIVATE
$<TARGET_PROPERTY:amdrocclr_static,INTERFACE_INCLUDE_DIRECTORIES>)
target_compile_definitions(hip64
PRIVATE
$<TARGET_PROPERTY:amdrocclr_static,INTERFACE_COMPILE_DEFINITIONS>)
endif()
# Enable profiling API
if(USE_PROF_API EQUAL 1)
find_path(PROF_API_HEADER_DIR prof_protocol.h
HINTS
${PROF_API_HEADER_PATH}
PATHS
/opt/rocm/roctracer
PATH_SUFFIXES
include/ext)
if(NOT PROF_API_HEADER_DIR)
message(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=<path to prof_protocol.h header>")
else()
target_compile_definitions(hip64 PUBLIC USE_PROF_API=1)
target_include_directories(hip64 PUBLIC ${PROF_API_HEADER_DIR})
message(STATUS "Profiling API: ${PROF_API_HEADER_DIR}")
endif()
endif()
set_target_properties(
hip64 PROPERTIES
CXX_STANDARD 14
@@ -157,7 +182,6 @@ add_dependencies(hip64 gen-prof-api-str-header)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(${LIBROCclr_STATIC_DIR}/amdrocclr_staticTargets.cmake)
add_library(amdhip64 SHARED
$<TARGET_OBJECTS:hip64>
+4 -4
Wyświetl plik
@@ -188,13 +188,13 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
deviceProps.arch.hasGlobalFloatAtomicExch = 1;
deviceProps.arch.hasSharedInt32Atomics = 1;
deviceProps.arch.hasSharedFloatAtomicExch = 1;
deviceProps.arch.hasFloatAtomicAdd = 0;
deviceProps.arch.hasFloatAtomicAdd = 1;
deviceProps.arch.hasGlobalInt64Atomics = 1;
deviceProps.arch.hasSharedInt64Atomics = 1;
deviceProps.arch.hasDoubles = 1;
deviceProps.arch.hasWarpVote = 0;
deviceProps.arch.hasWarpBallot = 0;
deviceProps.arch.hasWarpShuffle = 0;
deviceProps.arch.hasWarpVote = 1;
deviceProps.arch.hasWarpBallot = 1;
deviceProps.arch.hasWarpShuffle = 1;
deviceProps.arch.hasFunnelShift = 0;
deviceProps.arch.hasThreadFenceSystem = 1;
deviceProps.arch.hasSyncThreadsExt = 0;
+8 -2
Wyświetl plik
@@ -471,7 +471,12 @@ hipError_t hipGetDeviceCount ( int* count ) {
}
hipError_t hipGetDeviceFlags ( unsigned int* flags ) {
HIP_RETURN(hipErrorNotSupported);
HIP_INIT_API(hipGetDeviceFlags, flags);
if (flags == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
}
*flags = hip::getCurrentDevice()->getFlags();
HIP_RETURN(hipSuccess);
}
hipError_t hipIpcGetEventHandle ( hipIpcEventHandle_t* handle, hipEvent_t event ) {
@@ -531,7 +536,8 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) {
default:
break;
}
hip::getCurrentDevice()->setFlags(flags & hipDeviceScheduleMask);
HIP_RETURN(hipSuccess);
}
+1 -1
Wyświetl plik
@@ -91,7 +91,7 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
}
ms = static_cast<float>(static_cast<int64_t>(eStop.event_->profilingInfo().end_ -
event_->profilingInfo().start_))/1000000.f;
event_->profilingInfo().end_))/1000000.f;
return hipSuccess;
}
+1
Wyświetl plik
@@ -149,6 +149,7 @@ hipPointerGetAttributes
hipProfilerStart
hipProfilerStop
hipRuntimeGetVersion
hipGetDeviceFlags
hipSetDevice
hipSetDeviceFlags
hipStreamAddCallback
+1
Wyświetl plik
@@ -149,6 +149,7 @@ global:
hipProfilerStart;
hipProfilerStop;
hipRuntimeGetVersion;
hipGetDeviceFlags;
hipSetDevice;
hipSetDeviceFlags;
hipStreamAddCallback;
+8 -2
Wyświetl plik
@@ -119,12 +119,14 @@ namespace hip {
int deviceId_;
/// ROCclr host queue for default streams
Stream null_stream_;
//Maintain list of user enabled peers
/// Store device flags
unsigned int flags_;
/// Maintain list of user enabled peers
std::list<int> userEnabledPeers;
public:
Device(amd::Context* ctx, int devId):
context_(ctx), deviceId_(devId), null_stream_(this, amd::CommandQueue::Priority::Normal, 0, true)
context_(ctx), deviceId_(devId), null_stream_(this, amd::CommandQueue::Priority::Normal, 0, true), flags_(hipDeviceScheduleSpin)
{ assert(ctx != nullptr); }
~Device() {}
@@ -152,6 +154,8 @@ namespace hip {
return hipErrorPeerAccessNotEnabled;
}
}
unsigned int getFlags() const { return flags_; }
void setFlags(unsigned int flags) { flags_ = flags; }
amd::HostQueue* NullStream(bool skip_alloc = false);
};
@@ -208,6 +212,7 @@ public:
void init();
std::vector<std::pair<hipModule_t, bool>>* addFatBinary(const void*data)
{
amd::ScopedLock lock(lock_);
if (initialized_) {
digestFatBinary(data, modules_[data]);
}
@@ -215,6 +220,7 @@ public:
}
void removeFatBinary(std::vector<std::pair<hipModule_t, bool>>* module)
{
amd::ScopedLock lock(lock_);
for (auto& mod : modules_) {
if (&mod.second == module) {
modules_.erase(&mod);
+8 -2
Wyświetl plik
@@ -129,7 +129,10 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
amd::HostQueue* pQueue = &queue;
if (queueDevice != dstMemory->getContext().devices()[0]) {
pQueue = hip::getNullStream(dstMemory->getContext());
waitList.push_back(queue.getLastQueuedCommand(true));
amd::Command* cmd = queue.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
command = new amd::WriteMemoryCommand(*pQueue, CL_COMMAND_WRITE_BUFFER, waitList,
*dstMemory->asBuffer(), dOffset, sizeBytes, src);
@@ -138,7 +141,10 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
amd::HostQueue* pQueue = &queue;
if (queueDevice != srcMemory->getContext().devices()[0]) {
pQueue = hip::getNullStream(srcMemory->getContext());
waitList.push_back(queue.getLastQueuedCommand(true));
amd::Command* cmd = queue.getLastQueuedCommand(true);
if (cmd != nullptr) {
waitList.push_back(cmd);
}
}
command = new amd::ReadMemoryCommand(*pQueue, CL_COMMAND_READ_BUFFER, waitList,
*srcMemory->asBuffer(), sOffset, sizeBytes, dst);
+5 -7
Wyświetl plik
@@ -314,21 +314,19 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc
switch(attrib) {
case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES:
*value = static_cast<int>(wrkGrpInfo->localMemSize_
- wrkGrpInfo->privateMemSize_);
*value = static_cast<int>(wrkGrpInfo->localMemSize_);
break;
case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK:
*value = static_cast<int>(wrkGrpInfo->wavefrontPerSIMD_
* wrkGrpInfo->wavefrontSize_);
*value = static_cast<int>(wrkGrpInfo->size_);
break;
case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES:
*value = 0;
break;
case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES:
*value = static_cast<int>(wrkGrpInfo->localMemSize_);
*value = static_cast<int>(wrkGrpInfo->privateMemSize_);
break;
case HIP_FUNC_ATTRIBUTE_NUM_REGS:
*value = static_cast<int>(wrkGrpInfo->availableGPRs_);
*value = static_cast<int>(wrkGrpInfo->usedVGPRs_);
break;
case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
*value = 30; // Defaults to 3.0 as HCC
@@ -340,7 +338,7 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc
*value = 0;
break;
case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
*value = static_cast<int>(wrkGrpInfo->availableLDSSize_);
*value = static_cast<int>(wrkGrpInfo->availableLDSSize_ - wrkGrpInfo->localMemSize_);
break;
case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
*value = 0;
+30 -7
Wyświetl plik
@@ -122,10 +122,12 @@ hipError_t __hipExtractCodeObjectFromFatBinary(const void* data,
num_code_objs++;
}
}
if (num_code_objs == devices.size())
if (num_code_objs == devices.size()) {
return hipSuccess;
else
} else {
DevLogError("hipErrorNoBinaryForGpu: Coudn't find binary for current devices!");
return hipErrorNoBinaryForGpu;
}
}
extern "C" std::vector<std::pair<hipModule_t, bool>>* __hipRegisterFatBinary(const void* data)
@@ -189,6 +191,13 @@ void PlatformState::init()
for (auto& it : vars_) {
it.second.rvars.resize(g_devices.size());
}
if (!HIP_ENABLE_LAZY_KERNEL_LOADING) {
for (size_t i = 0; i < g_devices.size(); ++i) {
for (auto& it: functions_) {
getFunc(it.first, i);
}
}
}
}
bool PlatformState::unregisterFunc(hipModule_t hmod) {
@@ -227,6 +236,11 @@ std::vector< std::pair<hipModule_t, bool> >* PlatformState::unregisterVar(hipMod
= reinterpret_cast<texture<float, hipTextureType1D, hipReadModeElementType> *>(dvar.shadowVptr);
delete tex_hptr;
}
for (size_t dev = 0; dev < g_devices.size(); ++dev) {
if (dvar.rvars[dev].getdeviceptr()) {
amd::MemObjMap::RemoveMemObj(dvar.rvars[dev].getdeviceptr());
}
}
vars_.erase(it++);
} else {
++it;
@@ -309,11 +323,20 @@ bool ihipGetFuncAttributes(const char* func_name, amd::Program* program, hipFunc
return false;
}
const device::Kernel::WorkGroupInfo* wginfo = it->second->workGroupInfo();
func_attr->localSizeBytes = wginfo->localMemSize_;
func_attr->sharedSizeBytes = wginfo->size_;
func_attr->maxThreadsPerBlock = wginfo->wavefrontSize_;
func_attr->numRegs = wginfo->usedVGPRs_;
const device::Kernel* kernel = it->second;
const device::Kernel::WorkGroupInfo* wginfo = kernel->workGroupInfo();
func_attr->sharedSizeBytes = static_cast<int>(wginfo->localMemSize_);
func_attr->binaryVersion = static_cast<int>(kernel->signature().version());
func_attr->cacheModeCA = 0;
func_attr->constSizeBytes = 0;
func_attr->localSizeBytes = wginfo->privateMemSize_;
func_attr->maxDynamicSharedSizeBytes = static_cast<int>(wginfo->availableLDSSize_
- wginfo->localMemSize_);
func_attr->maxThreadsPerBlock = static_cast<int>(wginfo->size_);
func_attr->numRegs = static_cast<int>(wginfo->usedVGPRs_);
func_attr->preferredShmemCarveout = 0;
func_attr->ptxVersion = 30;
return true;
}
+4 -4
Wyświetl plik
@@ -23,7 +23,7 @@
#include "hip_event.hpp"
#include "thread/monitor.hpp"
static amd::Monitor streamSetLock("Guards global stream set");
static amd::Monitor streamSetLock{"Guards global stream set"};
static std::unordered_set<hip::Stream*> streamSet;
// Internal structure for stream callback handler
@@ -83,11 +83,11 @@ amd::HostQueue* Stream::asHostQueue(bool skip_alloc) {
// ================================================================================================
void Stream::Destroy() {
if (queue_ != nullptr) {
queue_->release();
queue_ = nullptr;
amd::ScopedLock lock(streamSetLock);
streamSet.erase(this);
queue_->release();
queue_ = nullptr;
}
delete this;
}
@@ -38,11 +38,11 @@ THE SOFTWARE.
__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0);
mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0);
mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0);
mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0);
lane_id[x] = __lane_id();
}
using namespace std;
int main() {
@@ -37,18 +37,13 @@ THE SOFTWARE.
using namespace std::chrono;
const static uint NumOfLoopIterrations = 16 * 1024;
const static uint BufferSizeInDwords = 28672 * NumOfLoopIterrations;
const static uint BufferSizeInDwords = 256 * 1024 * 1024;
const static uint numQueues = 4;
const static uint numIter = 100;
constexpr uint NumKernelArgs = 4;
constexpr uint MaxGPUs = 8;
#include <stdio.h>
/*
namespace cg = cooperative_groups;
using namespace cooperative_groups;
*/
__global__ void test_gws(uint* buf, uint bufSize, long* tmpBuf, long* result)
{
@@ -126,11 +121,13 @@ int main() {
size_t SIZE = copySizeInDwords * sizeof(uint);
HIPCHECK(hipMalloc((void**)&dA[i], SIZE));
HIPCHECK(hipMalloc((void**)&dB[i], 64 * deviceProp[i].multiProcessorCount * sizeof(long)));
if (i == 0) {
HIPCHECK(hipHostMalloc((void**)&dC, (nGpu + 1) * sizeof(long), hipHostMallocCoherent));
}
HIPCHECK(hipMemcpy(dA[i], &init[i * copySizeInDwords] , SIZE, hipMemcpyHostToDevice));
HIPCHECK(hipStreamCreate(&stream[i]));
hipDeviceSynchronize();
}
dim3 dimBlock;
@@ -146,22 +143,22 @@ int main() {
uint workgroups[3] = {64, 128, 256};
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
system_clock::time_point start = system_clock::now();
std::time_t end_time;
double time = 0;
for (uint set = 0; set < 3; ++set) {
void* args[MaxGPUs * NumKernelArgs];
std::cout << "---------- Test#" << set << "---------------\n";
std::cout << "---------- Test#" << set << ", size: "<< BufferSizeInDwords <<
" dwords ---------------\n";
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
dimBlock.x = workgroups[set];
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
std::cout << "GPU(" << i << ") Block size: " << dimBlock.x << " Num blocks per CU: " << numBlocks << "\n";
std::cout << "GPU(" << i << ") Block size: " << dimBlock.x <<
" Num blocks per CU: " << numBlocks << "\n";
dimGrid.x = deviceProp[i].multiProcessorCount * std::min(numBlocks, 32);
HIPCHECK(hipMalloc((void**)&dB[i], dimGrid.x * sizeof(long)));
args[i * NumKernelArgs] = (void*)&dA[i];
args[i * NumKernelArgs + 1] = (void*)&copySizeInDwords;
@@ -175,32 +172,34 @@ int main() {
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0);
if (*dC != (((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2)) {
std::cout << "Data validation failed for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
system_clock::time_point start = system_clock::now();
hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0);
system_clock::time_point end = system_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
end_time = std::chrono::system_clock::to_time_t(end);
time += elapsed_seconds.count();
size_t processedDwords = copySizeInDwords * nGpu;
if (*dC != (((long)(processedDwords) * (processedDwords - 1)) / 2)) {
std::cout << "Data validation failed ("<< *dC << " != " <<
(((long)(BufferSizeInDwords) * (BufferSizeInDwords - 1)) / 2) <<
") for grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
std::cout << "Test failed! \n";
}
for (int i = 0; i < nGpu; i++) {
hipFree(dB[i]);
}
}
system_clock::time_point end = system_clock::now();
delete [] launchParamsList;
std::chrono::duration<double> elapsed_seconds = end - start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
std::cout << "finished computation at " << std::ctime(&end_time) <<
"elapsed time: " << elapsed_seconds.count() << "s\n";
"elapsed time: " << time << "s\n";
hipSetDevice(0);
hipFree(dC);
for (int i = 0; i < nGpu; i++) {
hipFree(dA[i]);
hipFree(dB[i]);
HIPCHECK(hipStreamDestroy(stream[i]));
}
delete [] init;