Merge branch 'amd-master' into privatestaging
Этот коммит содержится в:
@@ -34,7 +34,11 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
endif()
|
||||
endif()
|
||||
if(IS_ABSOLUTE ${HCC_HOME} AND EXISTS ${HCC_HOME} AND IS_DIRECTORY ${HCC_HOME})
|
||||
message(STATUS "Looking for HCC in: " ${HCC_HOME})
|
||||
execute_process(COMMAND ${HCC_HOME}/bin/hcc --version
|
||||
COMMAND cut -d\ -f9
|
||||
OUTPUT_VARIABLE HCC_VERSION
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
message(STATUS "Looking for HCC in: " ${HCC_HOME} ". Found version: " ${HCC_VERSION})
|
||||
else()
|
||||
message(FATAL_ERROR "Don't know where to find HCC. Please specify abolute path using -DHCC_HOME")
|
||||
endif()
|
||||
@@ -204,6 +208,17 @@ add_custom_target(pkg_hip_doc COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR})
|
||||
|
||||
# Package: all
|
||||
add_custom_target(package DEPENDS pkg_hip_base pkg_hip_hcc pkg_hip_nvcc pkg_hip_doc)
|
||||
# Package: hip_samples
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip_samples)
|
||||
configure_file(packaging/hip_samples.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
|
||||
add_custom_target(pkg_hip_samples COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND rm -rf *.deb *.rpm *.tar.gz
|
||||
COMMAND make package
|
||||
COMMAND cp *.deb ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.rpm ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR})
|
||||
|
||||
# Package: all
|
||||
add_custom_target(package DEPENDS pkg_hip_base pkg_hip_hcc pkg_hip_nvcc pkg_hip_doc pkg_hip_samples)
|
||||
|
||||
|
||||
@@ -11,9 +11,6 @@
|
||||
- [HCC Options](#hcc-options)
|
||||
- [Using HIP with the AMD Native-GCN compiler.](#using-hip-with-the-amd-native-gcn-compiler)
|
||||
- [Compiling CodeXL markers for HIP Functions](#compiling-codexl-markers-for-hip-functions)
|
||||
- [Using clang-hipify](#using-clang-hipify)
|
||||
- [Building](#building)
|
||||
- [Running and using clang-hipify](#running-and-using-clang-hipify)
|
||||
|
||||
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
|
||||
|
||||
@@ -147,43 +144,3 @@ HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp
|
||||
```
|
||||
|
||||
Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
|
||||
|
||||
|
||||
### Using clang-hipify
|
||||
|
||||
Clang-hipify is a clang-based tool which can automate the translation of CUDA source code into portable HIP C++.
|
||||
The clang-hipify tool can automatically add extra HIP arguments (notably the "hipLaunchParm" required at the
|
||||
beginning of every HIP kernel call). Clang-hipify has some additional dependencies explained below and
|
||||
can be built as a separate make step.
|
||||
|
||||
|
||||
#### Building
|
||||
|
||||
1. Download and unpack clang+llvm 3.8 binary package preqrequisite:
|
||||
```
|
||||
wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
||||
tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
||||
```
|
||||
|
||||
2. Enable build of clang-hipify and specify path to LLVM:
|
||||
Note LLVM_DIR must be a full absolute path (not relative) to the location extracted above. Here's an example assuming we
|
||||
extract the clang 3.8 package into ~/HIP-privatestaging/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/.
|
||||
```
|
||||
cd HIP-privatestaging
|
||||
mkdir build.clang-hipify
|
||||
cd build.clang-hipify
|
||||
cmake -DBUILD_CLANG_HIPIFY=1 -DLLVM_DIR=~/HIP-privatestaging/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ -DCMAKE_BUILD_TYPE=Release ..
|
||||
make
|
||||
make install
|
||||
```
|
||||
|
||||
#### Running and using clang-hipify
|
||||
clang-hipify performs an initial compile of the CUDA source code into a "symbol tree", and thus needs access to the appropriate header files:
|
||||
1. Download "deb(network)" variant of target installer from https://developer.nvidia.com/cuda-downloads. The commands below show how to download and install a recent version from the http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb.
|
||||
|
||||
```
|
||||
wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb
|
||||
sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb
|
||||
sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5
|
||||
```
|
||||
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
# Release notes
|
||||
|
||||
Since this is an early access release and we are still in development towards the production ready version Boltzmann Driver and runtime we recommend this release be used for research and early application development.
|
||||
|
||||
We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://gpuopen-professionalcompute-tools.github.io/HIP/bug.html) lists known bugs. Some of the key items we are working on:
|
||||
- Tuning built-in functions, including shfl.
|
||||
- Performance optimization.
|
||||
@@ -10,20 +8,22 @@ We have attempted to document known bugs and limitations - in particular the [HI
|
||||
Stay tuned - the work for many of these features is already in-flight.
|
||||
|
||||
===================================================================================================
|
||||
- clang-hipify : clang-based hipify tool. Improved parsing of source code, and automates
|
||||
Release:0.86.00
|
||||
Date: 2016.05.xx
|
||||
- Add clang-hipify : clang-based hipify tool. Improved parsing of source code, and automates
|
||||
creation of hipLaunchParm variable.
|
||||
- Memory register / unregister commands (hipHostRegister, hipHostUnregister)
|
||||
- Improve cross-linking support between G++ and HCC, in particular for interfaces that use
|
||||
- Implement memory register / unregister commands (hipHostRegister, hipHostUnregister)
|
||||
- Add cross-linking support between G++ and HCC, in particular for interfaces that use
|
||||
standard C++ libraries (ie std::vectors, std::strings). HIPCC now uses libstdc++ by default on the HCC
|
||||
compilation path.
|
||||
- More samples including GPUBurn and SHOC. See [HIP-Examples](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP-Examples)
|
||||
- More samples including gpu-burn, SHOC, nbody, rtm. See [HIP-Examples](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP-Examples)
|
||||
===================================================================================================
|
||||
|
||||
## Revision History:
|
||||
|
||||
===================================================================================================
|
||||
Release:0.84.00
|
||||
Date:
|
||||
Release:0.84.01
|
||||
Date: 2016.04.25
|
||||
- Refactor HIP make and install system:
|
||||
- Move to CMake. Refer to the installation section in README.md for details.
|
||||
- Split source into multiple modular .cpp and .h files.
|
||||
|
||||
@@ -84,7 +84,7 @@ push (@warn_whitelist, split(',',$warn_whitelist));
|
||||
|
||||
#---
|
||||
#Stats tracking code:
|
||||
@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "other");
|
||||
@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "extern_shared", "other");
|
||||
|
||||
|
||||
#---
|
||||
@@ -428,6 +428,34 @@ while (@ARGV) {
|
||||
$countKeywords += m/__global__/;
|
||||
$countKeywords += m/__shared__/;
|
||||
|
||||
#--------
|
||||
# CUDA extern __shared__ syntax
|
||||
# Note these only work if declaration is on a single line.
|
||||
{
|
||||
# match uses ? for <.*> which will be unitialized if this is not present in launch syntax.
|
||||
no warnings qw/uninitialized/;
|
||||
|
||||
my $k = 0;
|
||||
|
||||
# Match extern __shared__ type foo[]; syntax
|
||||
# Replace as HIP_DYNAMIC_SHARED() macro
|
||||
$k += s/extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;/HIP_DYNAMIC_SHARED($1 $2, $3)/g;
|
||||
|
||||
# test patterns for the regular expression above:
|
||||
#'extern __shared__ double foo[];'
|
||||
#'extern __shared__ unsigned int foo[];'
|
||||
#'extern volatile __shared__ double foo[];'
|
||||
#'extern volatile __shared__ unsigned int sdata[];'
|
||||
#'extern __shared__ volatile unsigned int sdata[];'
|
||||
#'extern __shared__ T s[];'
|
||||
#'extern __shared__ T::type s[];'
|
||||
#'extern __shared__ blah<T>::type s[];'
|
||||
#'extern __shared__ typename mapper<Float>::type s_data[];'
|
||||
#'extern __attribute__((used)) __shared__ typename mapper<Float>::type s_data[];'
|
||||
|
||||
$ft{'extern_shared'} += $k;
|
||||
}
|
||||
|
||||
#--------
|
||||
# CUDA Launch Syntax
|
||||
# Note these only work if launch is on a single line.
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
## Using hipify-clang
|
||||
|
||||
`hipify-clang` is a clang-based tool which can automate the translation of CUDA source code into portable HIP C++.
|
||||
The tool can automatically add extra HIP arguments (notably the "hipLaunchParm" required at the beginning of every HIP kernel call).
|
||||
`hipify-clang` has some additional dependencies explained below and can be built as a separate make step. The instructions below are specifically for **Ubuntu 14.04**
|
||||
|
||||
### Build and install
|
||||
|
||||
- Download and unpack clang+llvm 3.8 binary package preqrequisite.
|
||||
```shell
|
||||
wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
||||
tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
|
||||
```
|
||||
|
||||
- Enable build of hipify-clang and specify path to LLVM.
|
||||
|
||||
Note LLVM_DIR must be a full absolute path to the location extracted above. Here's an example assuming we extract the clang 3.8 package into ~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/
|
||||
```shell
|
||||
cd HIP
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DBUILD_CLANG_HIPIFY=1 -DLLVM_DIR=~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ -DCMAKE_BUILD_TYPE=Release ..
|
||||
make
|
||||
make install
|
||||
```
|
||||
|
||||
### Running and using hipify-clang
|
||||
|
||||
`hipify-clang` performs an initial compile of the CUDA source code into a "symbol tree", and thus needs access to the appropriate header files.
|
||||
|
||||
In the case when `hipify-clang` doesn't find cuda headers, it reports various errors about unknown keywords (e.g. '\__global\__'), API function names (e.g. 'cudaMalloc'), syntax (e.g. 'foo<<<1,n>>>(...)'), etc.
|
||||
|
||||
To install CUDA headers, download the "deb(network)" variant of the target installer from https://developer.nvidia.com/cuda-downloads. The commands below show how to download and install a recent version from http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb.
|
||||
```shell
|
||||
wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb
|
||||
sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb
|
||||
sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5
|
||||
```
|
||||
|
||||
#### Disclaimer
|
||||
|
||||
The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale.
|
||||
|
||||
AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
|
||||
|
||||
Copyright (c) 2014-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
@@ -769,27 +769,26 @@ private:
|
||||
} // end anonymous namespace
|
||||
|
||||
// Set up the command line options
|
||||
static cl::OptionCategory
|
||||
ToolTemplateCategory("CUDA to HIP source translator options");
|
||||
static cl::extrahelp MoreHelp("<source0> specify the path of source file\n\n");
|
||||
static cl::opt<std::string>
|
||||
InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
|
||||
|
||||
static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
|
||||
cl::value_desc("filename"),
|
||||
cl::cat(ToolTemplateCategory));
|
||||
cl::value_desc("filename"));
|
||||
|
||||
static cl::opt<bool>
|
||||
Inplace("inplace",
|
||||
cl::desc("Modify input file inplace, replacing input with hipified "
|
||||
"output, save backup in .prehip file. "),
|
||||
cl::value_desc("inplace"), cl::cat(ToolTemplateCategory));
|
||||
cl::value_desc("inplace"));
|
||||
|
||||
static cl::opt<bool>
|
||||
NoOutput("no-output",
|
||||
cl::desc("don't write any translated output to stdout"),
|
||||
cl::value_desc("no-output"), cl::cat(ToolTemplateCategory));
|
||||
cl::value_desc("no-output"));
|
||||
|
||||
static cl::opt<bool>
|
||||
PrintStats("print-stats", cl::desc("print the command-line, like a header"),
|
||||
cl::value_desc("print-stats"), cl::cat(ToolTemplateCategory));
|
||||
cl::value_desc("print-stats"));
|
||||
|
||||
int main(int argc, const char **argv) {
|
||||
|
||||
@@ -797,12 +796,13 @@ int main(int argc, const char **argv) {
|
||||
|
||||
int Result;
|
||||
|
||||
CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory,
|
||||
llvm::cl::Required);
|
||||
std::unique_ptr<CompilationDatabase> Compilations(
|
||||
new FixedCompilationDatabase(".",std::vector<std::string>()));
|
||||
cl::ParseCommandLineOptions(argc, argv);
|
||||
|
||||
std::string dst = OutputFilename;
|
||||
std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
|
||||
if (dst.empty()) {
|
||||
dst = fileSources[0];
|
||||
dst = InputFilename;
|
||||
if (!Inplace) {
|
||||
size_t pos = dst.rfind(".cu");
|
||||
if (pos != std::string::npos) {
|
||||
@@ -820,13 +820,13 @@ int main(int argc, const char **argv) {
|
||||
}
|
||||
|
||||
// copy source file since tooling makes changes "inplace"
|
||||
std::ifstream source(fileSources[0], std::ios::binary);
|
||||
std::ifstream source(InputFilename, std::ios::binary);
|
||||
std::ofstream dest(Inplace ? dst + ".prehip" : dst, std::ios::binary);
|
||||
dest << source.rdbuf();
|
||||
source.close();
|
||||
dest.close();
|
||||
|
||||
RefactoringTool Tool(OptionsParser.getCompilations(), dst);
|
||||
RefactoringTool Tool(*Compilations, dst);
|
||||
ast_matchers::MatchFinder Finder;
|
||||
Cuda2HipCallback Callback(&Tool.getReplacements(), &Finder);
|
||||
HipifyPPCallbacks PPCallbacks(&Tool.getReplacements());
|
||||
@@ -931,7 +931,7 @@ int main(int argc, const char **argv) {
|
||||
llvm::outs() << counterNames[i] << ':'
|
||||
<< Callback.countReps[i] + PPCallbacks.countReps[i] << ' ';
|
||||
}
|
||||
llvm::outs() << ") in \'" << fileSources[0] << "\'\n";
|
||||
llvm::outs() << ") in \'" << InputFilename << "\'\n";
|
||||
}
|
||||
return Result;
|
||||
}
|
||||
|
||||
+1
-1
@@ -3,7 +3,7 @@
|
||||
| **CUDA** | **HIP** | **CUDA description** |
|
||||
|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `cudaChooseDevice` | | Select compute-device which best matches criteria. |
|
||||
| `cudaDeviceGetAttribute` | | Returns information about the device. |
|
||||
| `cudaDeviceGetAttribute` | `hipDeviceGetAttribute` | Returns information about the device. |
|
||||
| `cudaDeviceGetByPCIBusId` | | Returns a handle to a compute device. |
|
||||
| `cudaDeviceGetCacheConfig` | `hipDeviceGetCacheConfig` | Returns the preferred cache configuration for the current device. |
|
||||
| `cudaDeviceGetLimit` | | Returns resource limits. |
|
||||
@@ -420,17 +420,7 @@ HIP provides the following built-in functions for reading a high-resolution time
|
||||
clock_t clock()
|
||||
long long int clock64()
|
||||
```
|
||||
|
||||
AMD devices employ a per-GPU timer that increments at a constant time interval regardless of any dynamic frequency changes. All compute units in the system share the timer.
|
||||
Nvidia devices implement the timer as a per-compute-unit clock that increments on every clock cycle.
|
||||
|
||||
To obtain the clock frequency, use the hipDeviceProp_t.clockInstructionRate field:
|
||||
|
||||
```
|
||||
hipGetDeviceProperties(&deviceProps, deviceId);
|
||||
// Compute time in ms--device_ticks is based on values reported from clock() device function
|
||||
float time = device_ticks / (float)deviceProps.clockInstructionRate;
|
||||
```
|
||||
Returns the value of counter that is incremented every clock cycle on device. Difference in values returned provides the cycles used.
|
||||
|
||||
## Atomic Functions
|
||||
|
||||
|
||||
@@ -376,6 +376,15 @@ __device__ unsigned int atomicXor(unsigned int* address,
|
||||
__device__ unsigned long long int atomicXor(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
//atomicInc()
|
||||
__device__ unsigned int atomicInc(unsigned int* address,
|
||||
unsigned int val);
|
||||
|
||||
|
||||
//atomicDec()
|
||||
__device__ unsigned int atomicDec(unsigned int* address,
|
||||
unsigned int val);
|
||||
|
||||
|
||||
// integer intrinsic function __poc __clz __ffs __brev
|
||||
__device__ unsigned int __popc( unsigned int input);
|
||||
@@ -421,6 +430,8 @@ __device__ float __shfl_xor(float input, int lane_mask, int width);
|
||||
__host__ __device__ int min(int arg1, int arg2);
|
||||
__host__ __device__ int max(int arg1, int arg2);
|
||||
|
||||
__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr();
|
||||
|
||||
//TODO - add a couple fast math operations here, the set here will grow :
|
||||
__device__ float __cosf(float x);
|
||||
__device__ float __expf(float x);
|
||||
@@ -547,6 +558,16 @@ do {\
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* extern __shared__
|
||||
*/
|
||||
|
||||
// Macro to replace extern __shared__ declarations
|
||||
// to local variable definitions
|
||||
#define HIP_DYNAMIC_SHARED(type, var) \
|
||||
__attribute__((address_space(3))) type* var = \
|
||||
(__attribute__((address_space(3))) type*)__get_dynamicgroupbaseptr(); \
|
||||
|
||||
#endif // __HCC__
|
||||
|
||||
|
||||
|
||||
@@ -95,6 +95,13 @@ kernelName<<<numblocks,numthreads,memperblock,streamId>>>(0, __VA_ARGS__);\
|
||||
#define hipGridDim_y gridDim.y
|
||||
#define hipGridDim_z gridDim.z
|
||||
|
||||
/**
|
||||
* extern __shared__
|
||||
*/
|
||||
|
||||
#define HIP_DYNAMIC_SHARED(type, var) \
|
||||
extern __shared__ type var[]; \
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -430,7 +430,10 @@ inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const v
|
||||
return hipCUDAErrorTohipError(cudaMemcpyPeerAsync ( dst, dstDevice, src, srcDevice, count, stream ));
|
||||
};
|
||||
|
||||
|
||||
inline static hipError_t hipSetDeviceFlags (unsigned int flags)
|
||||
{
|
||||
return hipCUDAErrorTohipError(cudaSetDeviceFlags( flags ));
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Исполняемый файл
+23
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
|
||||
function die {
|
||||
echo "${1-Died}." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
payload=$1
|
||||
script=$2
|
||||
[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!"
|
||||
tmp=__extract__$RANDOM
|
||||
|
||||
printf "#!/bin/bash
|
||||
samples_dir=\$1
|
||||
[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir
|
||||
mkdir -p \$samples_dir
|
||||
PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\`
|
||||
tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir
|
||||
echo \"HIP samples installed in \$samples_dir\"
|
||||
exit 0
|
||||
__PAYLOAD_BELOW__\n" > "$tmp"
|
||||
|
||||
cat "$tmp" "$payload" > "$script" && rm "$tmp"
|
||||
chmod +x "$script"
|
||||
@@ -24,12 +24,12 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= 0.10.16155-077b4c8-d49f384)")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= @HCC_VERSION@)")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
|
||||
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
|
||||
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = 0.10.16155-077b4c8-d49f384")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = @HCC_VERSION@")
|
||||
set(CPACK_SOURCE_GENERATOR "TGZ")
|
||||
include(CPack)
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
cmake_minimum_required(VERSION 2.8.3)
|
||||
project(hip_samples)
|
||||
|
||||
add_custom_target(create_installer_script ALL
|
||||
COMMAND tar cvzf ${PROJECT_BINARY_DIR}/samples.tgz --exclude='*.o' .
|
||||
COMMAND @hip_SOURCE_DIR@/packaging/create_hip_samples_installer.sh ${PROJECT_BINARY_DIR}/samples.tgz ${PROJECT_BINARY_DIR}/unpack_hip_samples.sh
|
||||
WORKING_DIRECTORY @hip_SOURCE_DIR@/samples)
|
||||
install(PROGRAMS unpack_hip_samples.sh DESTINATION bin)
|
||||
|
||||
#############################
|
||||
# Packaging steps
|
||||
#############################
|
||||
set(CPACK_SET_DESTDIR TRUE)
|
||||
set(CPACK_INSTALL_PREFIX "/opt/rocm/hip")
|
||||
set(CPACK_PACKAGE_NAME "hip_samples")
|
||||
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
|
||||
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
|
||||
set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
|
||||
set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
|
||||
set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
|
||||
set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
|
||||
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
|
||||
set(CPACK_GENERATOR "TGZ;DEB;RPM")
|
||||
set(CPACK_BINARY_DEB "ON")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_BINARY_RPM "ON")
|
||||
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}")
|
||||
set(CPACK_SOURCE_GENERATOR "TGZ")
|
||||
include(CPack)
|
||||
@@ -12,7 +12,7 @@ ifeq (${HIP_PLATFORM}, nvcc)
|
||||
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
|
||||
endif
|
||||
ifeq (${HIP_PLATFORM}, hcc)
|
||||
HIPCC_FLAGS =
|
||||
HIPCC_FLAGS = -stdlib=libc++
|
||||
endif
|
||||
|
||||
|
||||
|
||||
@@ -22,6 +22,9 @@ THE SOFTWARE.
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <hip_runtime.h>
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#include <hc.hpp>
|
||||
#endif
|
||||
|
||||
|
||||
#define CHECK(cmd) \
|
||||
|
||||
@@ -24,7 +24,7 @@ THE SOFTWARE.
|
||||
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
hipError_t error = cmd;\
|
||||
cudaError_t error = cmd;\
|
||||
if (error != cudaSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
|
||||
@@ -1,3 +1,22 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include"hip_runtime.h"
|
||||
#include<hc.hpp>
|
||||
#include<grid_launch.h>
|
||||
@@ -482,8 +501,8 @@ __device__ double trunc(double x)
|
||||
|
||||
const int warpSize = 64;
|
||||
|
||||
__device__ long long int clock64() { return (long long int)hc::__clock_u64(); };
|
||||
__device__ clock_t clock() { return (clock_t)hc::__clock_u64(); };
|
||||
__device__ long long int clock64() { return (long long int)hc::__cycle_u64(); };
|
||||
__device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); };
|
||||
|
||||
|
||||
//atomicAdd()
|
||||
@@ -638,7 +657,19 @@ __device__ unsigned long long int atomicXor(unsigned long long int* address,
|
||||
return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val);
|
||||
}
|
||||
|
||||
//atomicInc
|
||||
__device__ unsigned int atomicInc(unsigned int* address,
|
||||
unsigned int val)
|
||||
{
|
||||
return hc::__atomic_wrapinc(address,val);
|
||||
}
|
||||
|
||||
//atomicDec
|
||||
__device__ unsigned int atomicDec(unsigned int* address,
|
||||
unsigned int val)
|
||||
{
|
||||
return hc::__atomic_wrapdec(address,val);
|
||||
}
|
||||
|
||||
|
||||
__device__ unsigned int test__popc(unsigned int input)
|
||||
@@ -777,6 +808,11 @@ __host__ __device__ int max(int arg1, int arg2)
|
||||
return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));
|
||||
}
|
||||
|
||||
__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr()
|
||||
{
|
||||
return hc::get_dynamic_group_segment_base_pointer();
|
||||
}
|
||||
|
||||
|
||||
|
||||
//TODO - add a couple fast math operations here, the set here will grow :
|
||||
|
||||
@@ -32,6 +32,8 @@ THE SOFTWARE.
|
||||
#define tprintf(trace_level, ...)
|
||||
#endif
|
||||
|
||||
extern hsa_agent_t g_cpu_agent; // defined in hip_hcc.cpp
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) :
|
||||
_hsa_agent(hsaAgent),
|
||||
@@ -106,7 +108,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_
|
||||
|
||||
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
|
||||
|
||||
hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);
|
||||
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
@@ -169,7 +171,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte
|
||||
|
||||
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
|
||||
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);
|
||||
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
@@ -223,7 +225,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte
|
||||
|
||||
tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
|
||||
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _hsa_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
THROW_ERROR (hipErrorRuntimeMemory);
|
||||
}
|
||||
@@ -287,7 +289,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void*
|
||||
|
||||
tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
|
||||
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
|
||||
if (hsa_status != HSA_STATUS_SUCCESS) {
|
||||
THROW_ERROR (hipErrorRuntimeMemory);
|
||||
}
|
||||
@@ -315,7 +317,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void*
|
||||
|
||||
tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
|
||||
hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], dstAgent /*not used*/, theseBytes,
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent /*not used*/, theseBytes,
|
||||
hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex],
|
||||
_completion_signal2[bufferIndex]);
|
||||
|
||||
|
||||
@@ -100,7 +100,9 @@ endmacro()
|
||||
# Make a hip executable, using libc++
|
||||
macro (make_hip_executable_libcpp exe cpp)
|
||||
make_hip_executable( ${exe} ${cpp} ${ARGN} )
|
||||
set_source_files_properties (${cpp} i${ARGN} PROPERTIES COMPILE_FLAGS --stdlib=libc++ )
|
||||
if (${HIP_PLATFORM} STREQUAL "hcc")
|
||||
set_source_files_properties (${cpp} i${ARGN} PROPERTIES COMPILE_FLAGS --stdlib=libc++ )
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
macro (make_named_test exe testname )
|
||||
@@ -151,7 +153,9 @@ make_hip_executable (hip_popc hip_popc.cpp)
|
||||
make_hip_executable (hip_clz hip_clz.cpp)
|
||||
make_hip_executable (hip_brev hip_brev.cpp)
|
||||
make_hip_executable (hip_ffs hip_ffs.cpp)
|
||||
make_hip_executable (hip_test_ldg hip_test_ldg.cpp)
|
||||
if (${HIP_PLATFORM} STREQUAL "hcc")
|
||||
make_hip_executable (hip_test_ldg hip_test_ldg.cpp)
|
||||
endif()
|
||||
make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp)
|
||||
make_hip_executable (hipEnvVar hipEnvVar.cpp)
|
||||
make_hip_executable (hipEnvVarDriver hipEnvVarDriver.cpp)
|
||||
@@ -187,6 +191,7 @@ make_hip_executable (hipPeerToPeer_simple hipPeerToPeer_simple.cpp)
|
||||
make_hip_executable (hipMemcpyAll hipMemcpyAll.cpp)
|
||||
make_hip_executable (hipMultiThreadDevice hipMultiThreadDevice.cpp)
|
||||
make_hip_executable (hipTestMemcpyPin hipTestMemcpyPin.cpp)
|
||||
make_hip_executable (hipDynamicShared hipDynamicShared.cpp)
|
||||
|
||||
make_test(hip_ballot " " )
|
||||
make_test(hip_anyall " " )
|
||||
@@ -194,7 +199,6 @@ make_test(hip_popc " " )
|
||||
make_test(hip_brev " " )
|
||||
make_test(hip_clz " " )
|
||||
make_test(hip_ffs " " )
|
||||
make_test(hip_test_ldg " " )
|
||||
make_test(hipEventRecord --iterations 10)
|
||||
make_test(hipMemset " " )
|
||||
make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes.
|
||||
@@ -226,10 +230,8 @@ make_test(hipStreamL5 " ")
|
||||
make_test(hipRandomMemcpyAsync " ")
|
||||
#make_test(hipAPIStreamEnable " ")
|
||||
#make_test(hipAPIStreamDisable " ")
|
||||
make_test(hipMemoryAllocate " ")
|
||||
make_test(hipFuncSetDeviceFlags " ")
|
||||
make_test(hipFuncGetDevice " ")
|
||||
make_test(hipFuncSetDevice " ")
|
||||
make_test(hipFuncDeviceSynchronize " ")
|
||||
make_test(hipTestMemcpyPin " ")
|
||||
make_named_test (hipMultiThreadDevice "hipMultiThreadDevice-serial" --tests 0x1)
|
||||
@@ -243,4 +245,12 @@ if (${HIP_MULTI_GPU})
|
||||
|
||||
endif()
|
||||
|
||||
if (${HIP_PLATFORM} STREQUAL "hcc")
|
||||
make_test(hip_test_ldg " " )
|
||||
make_test(hipMemoryAllocate " ")
|
||||
make_test(hipFuncSetDevice " ")
|
||||
endif()
|
||||
|
||||
make_hipify_test(specialFunc.cu )
|
||||
|
||||
make_test(hipDynamicShared " ")
|
||||
|
||||
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_runtime.h>
|
||||
#include "test_common.h"
|
||||
|
||||
template<typename T>
|
||||
__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t numElements, size_t groupElements) {
|
||||
|
||||
// declare dynamic shared memory
|
||||
HIP_DYNAMIC_SHARED(T, sdata)
|
||||
|
||||
size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t tid = hipThreadIdx_x;
|
||||
|
||||
// initialize dynamic shared memory
|
||||
if (tid < groupElements) {
|
||||
sdata[tid] = static_cast<T>(tid);
|
||||
}
|
||||
|
||||
// prefix sum inside dynamic shared memory
|
||||
if (groupElements >= 512) {
|
||||
if (tid >= 256) { sdata[tid] += sdata[tid - 256]; } __syncthreads();
|
||||
}
|
||||
if (groupElements >= 256) {
|
||||
if (tid >= 128) { sdata[tid] += sdata[tid - 128]; } __syncthreads();
|
||||
}
|
||||
if (groupElements >= 128) {
|
||||
if (tid >= 64) { sdata[tid] += sdata[tid - 64]; } __syncthreads();
|
||||
}
|
||||
if (groupElements >= 64) { sdata[tid] += sdata[tid - 32]; } __syncthreads();
|
||||
if (groupElements >= 32) { sdata[tid] += sdata[tid - 16]; } __syncthreads();
|
||||
if (groupElements >= 16) { sdata[tid] += sdata[tid - 8]; } __syncthreads();
|
||||
if (groupElements >= 8) { sdata[tid] += sdata[tid - 4]; } __syncthreads();
|
||||
if (groupElements >= 4) { sdata[tid] += sdata[tid - 2]; } __syncthreads();
|
||||
if (groupElements >= 2) { sdata[tid] += sdata[tid - 1]; } __syncthreads();
|
||||
|
||||
C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements];
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void testExternShared(size_t N, size_t groupElements) {
|
||||
size_t Nbytes = N * sizeof(T);
|
||||
|
||||
T *A_d, *B_d, *C_d;
|
||||
T *A_h, *B_h, *C_h;
|
||||
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
//printf("blocksPerCU: %d\nthreadsPerBlock: %d\nN: %zu\n", blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
// calculate the amount of dynamic shared memory required
|
||||
size_t groupMemBytes = groupElements * sizeof(T);
|
||||
|
||||
// launch kernel with dynamic shared memory
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock), groupMemBytes, 0, A_d, B_d, C_d, N, groupElements);
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// verify
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
size_t tid = (i % groupElements);
|
||||
T sumFromSharedMemory = static_cast<T>(tid * (tid + 1) / 2);
|
||||
T expected = A_h[i] + B_h[i] + sumFromSharedMemory;
|
||||
if (C_h[i] != expected) {
|
||||
std::cout << std::fixed << std::setprecision(32);
|
||||
std::cout << "At " << i << std::endl;
|
||||
std::cout << " Computed:" << C_h[i] << std::endl;
|
||||
std::cout << " Expected:" << expected << std::endl;
|
||||
std::cout << sumFromSharedMemory << std::endl;
|
||||
std::cout << A_h[i] << std::endl;
|
||||
std::cout << B_h[i] << std::endl;
|
||||
|
||||
failed("Failed at index:%zu\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
//printf("info: set device to %d\n", p_gpuDevice);
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
|
||||
testExternShared<float>(1024, 4);
|
||||
testExternShared<float>(1024, 8);
|
||||
testExternShared<float>(1024, 16);
|
||||
testExternShared<float>(1024, 32);
|
||||
testExternShared<float>(1024, 64);
|
||||
|
||||
testExternShared<float>(65536, 4);
|
||||
testExternShared<float>(65536, 8);
|
||||
testExternShared<float>(65536, 16);
|
||||
testExternShared<float>(65536, 32);
|
||||
testExternShared<float>(65536, 64);
|
||||
|
||||
testExternShared<double>(1024, 4);
|
||||
testExternShared<double>(1024, 8);
|
||||
testExternShared<double>(1024, 16);
|
||||
testExternShared<double>(1024, 32);
|
||||
testExternShared<double>(1024, 64);
|
||||
|
||||
testExternShared<double>(65536, 4);
|
||||
testExternShared<double>(65536, 8);
|
||||
testExternShared<double>(65536, 16);
|
||||
testExternShared<double>(65536, 32);
|
||||
testExternShared<double>(65536, 64);
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -121,8 +121,7 @@ int computeGold(int *gpuData, const int len)
|
||||
|
||||
for (int i = 0; i < len; ++i)
|
||||
{
|
||||
//val = (val >= limit) ? 0 : val+1;
|
||||
val = val+1;
|
||||
val = (val >= limit) ? 0 : val+1;
|
||||
}
|
||||
|
||||
if (val != gpuData[5])
|
||||
@@ -136,8 +135,7 @@ int computeGold(int *gpuData, const int len)
|
||||
|
||||
for (int i = 0; i < len; ++i)
|
||||
{
|
||||
//val = ((val == 0) || (val > limit)) ? limit : val-1;
|
||||
val = val-1;
|
||||
val = ((val == 0) || (val > limit)) ? limit : val-1;
|
||||
}
|
||||
|
||||
if (val != gpuData[6])
|
||||
@@ -234,12 +232,10 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata)
|
||||
atomicMin(&g_odata[4], tid);
|
||||
|
||||
// Atomic increment (modulo 17+1)
|
||||
//atomicInc((unsigned int *)&g_odata[5], 17);
|
||||
//atomicInc((unsigned int *)&g_odata[5]);
|
||||
|
||||
atomicInc((unsigned int *)&g_odata[5], 17);
|
||||
|
||||
// Atomic decrement
|
||||
// atomicDec((unsigned int *)&g_odata[6], 137);
|
||||
//atomicDec((unsigned int *)&g_odata[6]);
|
||||
atomicDec((unsigned int *)&g_odata[6], 137);
|
||||
|
||||
// Atomic compare-and-swap
|
||||
atomicCAS(&g_odata[7], tid-1, tid);
|
||||
|
||||
Ссылка в новой задаче
Block a user