From d8bc30d79ab1ca6c66dc34c30551484e2e1899bf Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 6 May 2016 14:39:10 +0530 Subject: [PATCH 01/17] Add packaging scripts for creating hip_samples package Change-Id: I38205ce851ee2d8affce532c80c9a9fd4f66b345 --- hipamd/CMakeLists.txt | 15 +++++++-- .../packaging/create_hip_samples_installer.sh | 23 +++++++++++++ hipamd/packaging/hip_samples.txt | 32 +++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100755 hipamd/packaging/create_hip_samples_installer.sh create mode 100644 hipamd/packaging/hip_samples.txt diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 72f1609f99..f16e38ca49 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -204,6 +204,17 @@ add_custom_target(pkg_hip_doc COMMAND ${CMAKE_COMMAND} . COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR} WORKING_DIRECTORY ${BUILD_DIR}) -# Package: all -add_custom_target(package DEPENDS pkg_hip_base pkg_hip_hcc pkg_hip_nvcc pkg_hip_doc) +# Package: hip_samples +set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip_samples) +configure_file(packaging/hip_samples.txt ${BUILD_DIR}/CMakeLists.txt @ONLY) +add_custom_target(pkg_hip_samples COMMAND ${CMAKE_COMMAND} . + COMMAND rm -rf *.deb *.rpm *.tar.gz + COMMAND make package + COMMAND cp *.deb ${PROJECT_BINARY_DIR} + COMMAND cp *.rpm ${PROJECT_BINARY_DIR} + COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR} + WORKING_DIRECTORY ${BUILD_DIR}) + +# Package: all +add_custom_target(package DEPENDS pkg_hip_base pkg_hip_hcc pkg_hip_nvcc pkg_hip_doc pkg_hip_samples) diff --git a/hipamd/packaging/create_hip_samples_installer.sh b/hipamd/packaging/create_hip_samples_installer.sh new file mode 100755 index 0000000000..91789d2524 --- /dev/null +++ b/hipamd/packaging/create_hip_samples_installer.sh @@ -0,0 +1,23 @@ +#!/bin/bash +function die { + echo "${1-Died}." >&2 + exit 1 +} + +payload=$1 +script=$2 +[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!" +tmp=__extract__$RANDOM + +printf "#!/bin/bash +samples_dir=\$1 +[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir +mkdir -p \$samples_dir +PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\` +tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir +echo \"HIP samples installed in \$samples_dir\" +exit 0 +__PAYLOAD_BELOW__\n" > "$tmp" + +cat "$tmp" "$payload" > "$script" && rm "$tmp" +chmod +x "$script" diff --git a/hipamd/packaging/hip_samples.txt b/hipamd/packaging/hip_samples.txt new file mode 100644 index 0000000000..c3873bf05f --- /dev/null +++ b/hipamd/packaging/hip_samples.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 2.8.3) +project(hip_samples) + +add_custom_target(create_installer_script ALL + COMMAND tar cvzf ${PROJECT_BINARY_DIR}/samples.tgz --exclude='*.o' . + COMMAND @hip_SOURCE_DIR@/packaging/create_hip_samples_installer.sh ${PROJECT_BINARY_DIR}/samples.tgz ${PROJECT_BINARY_DIR}/unpack_hip_samples.sh + WORKING_DIRECTORY @hip_SOURCE_DIR@/samples) +install(PROGRAMS unpack_hip_samples.sh DESTINATION bin) + +############################# +# Packaging steps +############################# +set(CPACK_SET_DESTDIR TRUE) +set(CPACK_INSTALL_PREFIX "/opt/rocm/hip") +set(CPACK_PACKAGE_NAME "hip_samples") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]") +set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") +set(CPACK_PACKAGE_CONTACT "Maneesh Gupta ") +set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@) +set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@) +set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@) +set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@) +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) +set(CPACK_GENERATOR "TGZ;DEB;RPM") +set(CPACK_BINARY_DEB "ON") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION})") +set(CPACK_BINARY_RPM "ON") +set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") +set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") +set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_SOURCE_GENERATOR "TGZ") +include(CPack) From 67e2ee1efe0c471c283258d24395355eb4c14087 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 6 May 2016 10:51:06 -0500 Subject: [PATCH 02/17] Added copyright for device functions file Change-Id: I689345ae7428928b4d2d7cd37fbc561309db3256 --- hipamd/src/device_util.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index a5ad8db12a..1177ce8d06 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -1,3 +1,22 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + #include"hip_runtime.h" #include #include From 29c243a0a469e8d173a83815cf7cec16e12c114b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 6 May 2016 16:06:15 +0530 Subject: [PATCH 03/17] dynamically compute HCC version dependency for hip_hcc package Change-Id: I5eca934826f12ee002416b299fd562db0f60056c --- hipamd/CMakeLists.txt | 6 +++++- hipamd/packaging/hip_hcc.txt | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index f16e38ca49..76b103f576 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -34,7 +34,11 @@ if(HIP_PLATFORM STREQUAL "hcc") endif() endif() if(IS_ABSOLUTE ${HCC_HOME} AND EXISTS ${HCC_HOME} AND IS_DIRECTORY ${HCC_HOME}) - message(STATUS "Looking for HCC in: " ${HCC_HOME}) + execute_process(COMMAND ${HCC_HOME}/bin/hcc --version + COMMAND cut -d\ -f9 + OUTPUT_VARIABLE HCC_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + message(STATUS "Looking for HCC in: " ${HCC_HOME} ". Found version: " ${HCC_VERSION}) else() message(FATAL_ERROR "Don't know where to find HCC. Please specify abolute path using -DHCC_HOME") endif() diff --git a/hipamd/packaging/hip_hcc.txt b/hipamd/packaging/hip_hcc.txt index 0df19b325e..63637b1e0a 100644 --- a/hipamd/packaging/hip_hcc.txt +++ b/hipamd/packaging/hip_hcc.txt @@ -24,12 +24,12 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR} set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= 0.10.16155-077b4c8-d49f384)") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), hcc_lc (= @HCC_VERSION@)") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") -set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = 0.10.16155-077b4c8-d49f384") +set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, hcc_lc = @HCC_VERSION@") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) From 349b92f2a8afa4eb82ede2f95a3b9ddb881f3da2 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 12 May 2016 10:13:07 +0530 Subject: [PATCH 04/17] Fix square.cu to use cudaError_t instead of hipError_t Change-Id: If3314910d1c03122741d3e0a45e14a4412c473b3 --- hipamd/samples/0_Intro/square/square.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/samples/0_Intro/square/square.cu b/hipamd/samples/0_Intro/square/square.cu index 8b6980cd02..ec8ca12fbf 100644 --- a/hipamd/samples/0_Intro/square/square.cu +++ b/hipamd/samples/0_Intro/square/square.cu @@ -24,7 +24,7 @@ THE SOFTWARE. #define CHECK(cmd) \ {\ - hipError_t error = cmd;\ + cudaError_t error = cmd;\ if (error != cudaSuccess) { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ exit(EXIT_FAILURE);\ From 8c11c333e292ed45a148548703450c7dd29eb740 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 11 May 2016 12:51:11 +0530 Subject: [PATCH 05/17] Support for Atomic inc and dec in HIP Change-Id: I783e4917cece5cc379894f0d293382315fbfa8b0 --- hipamd/include/hcc_detail/hip_runtime.h | 9 +++++++++ hipamd/src/device_util.cpp | 12 ++++++++++++ hipamd/tests/src/hipSimpleAtomicsTest.cpp | 14 +++++--------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/hipamd/include/hcc_detail/hip_runtime.h b/hipamd/include/hcc_detail/hip_runtime.h index 3288996dce..0d70eaa2a4 100644 --- a/hipamd/include/hcc_detail/hip_runtime.h +++ b/hipamd/include/hcc_detail/hip_runtime.h @@ -376,6 +376,15 @@ __device__ unsigned int atomicXor(unsigned int* address, __device__ unsigned long long int atomicXor(unsigned long long int* address, unsigned long long int val); +//atomicInc() +__device__ unsigned int atomicInc(unsigned int* address, + unsigned int val); + + +//atomicDec() +__device__ unsigned int atomicDec(unsigned int* address, + unsigned int val); + // integer intrinsic function __poc __clz __ffs __brev __device__ unsigned int __popc( unsigned int input); diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 1177ce8d06..c01ad30ab5 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -657,7 +657,19 @@ __device__ unsigned long long int atomicXor(unsigned long long int* address, return (long long int)hc::atomic_fetch_xor((uint64_t*)address,(uint64_t)val); } +//atomicInc +__device__ int atomicInc(unsigned int* address, + unsigned int val) +{ + return hc::__atomic_wrapinc(address,val); +} +//atomicDec +__device__ int atomicDec(unsigned int* address, + unsigned int val) +{ + return hc::__atomic_wrapdec(address,val); +} __device__ unsigned int test__popc(unsigned int input) diff --git a/hipamd/tests/src/hipSimpleAtomicsTest.cpp b/hipamd/tests/src/hipSimpleAtomicsTest.cpp index 1be32f6679..d02252c7ea 100644 --- a/hipamd/tests/src/hipSimpleAtomicsTest.cpp +++ b/hipamd/tests/src/hipSimpleAtomicsTest.cpp @@ -121,8 +121,7 @@ int computeGold(int *gpuData, const int len) for (int i = 0; i < len; ++i) { - //val = (val >= limit) ? 0 : val+1; - val = val+1; + val = (val >= limit) ? 0 : val+1; } if (val != gpuData[5]) @@ -136,8 +135,7 @@ int computeGold(int *gpuData, const int len) for (int i = 0; i < len; ++i) { - //val = ((val == 0) || (val > limit)) ? limit : val-1; - val = val-1; + val = ((val == 0) || (val > limit)) ? limit : val-1; } if (val != gpuData[6]) @@ -234,12 +232,10 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata) atomicMin(&g_odata[4], tid); // Atomic increment (modulo 17+1) - //atomicInc((unsigned int *)&g_odata[5], 17); - //atomicInc((unsigned int *)&g_odata[5]); - + atomicInc((unsigned int *)&g_odata[5], 17); + // Atomic decrement - // atomicDec((unsigned int *)&g_odata[6], 137); - //atomicDec((unsigned int *)&g_odata[6]); + atomicDec((unsigned int *)&g_odata[6], 137); // Atomic compare-and-swap atomicCAS(&g_odata[7], tid-1, tid); From 98e04c7a832e5117fa546dd21030de50af9eb804 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 9 May 2016 13:49:01 -0500 Subject: [PATCH 06/17] Update release notes for 0.86 Change-Id: I60eb5691aec35fad714aac38deb3c4eccf7ae12a --- hipamd/RELEASE.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hipamd/RELEASE.md b/hipamd/RELEASE.md index 57b4b7a5de..343e028e9c 100644 --- a/hipamd/RELEASE.md +++ b/hipamd/RELEASE.md @@ -1,7 +1,5 @@ # Release notes -Since this is an early access release and we are still in development towards the production ready version Boltzmann Driver and runtime we recommend this release be used for research and early application development. - We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://gpuopen-professionalcompute-tools.github.io/HIP/bug.html) lists known bugs. Some of the key items we are working on: - Tuning built-in functions, including shfl. - Performance optimization. @@ -10,20 +8,22 @@ We have attempted to document known bugs and limitations - in particular the [HI Stay tuned - the work for many of these features is already in-flight. =================================================================================================== -- clang-hipify : clang-based hipify tool. Improved parsing of source code, and automates +Release:0.86.00 +Date: 2016.05.xx +- Add clang-hipify : clang-based hipify tool. Improved parsing of source code, and automates creation of hipLaunchParm variable. -- Memory register / unregister commands (hipHostRegister, hipHostUnregister) -- Improve cross-linking support between G++ and HCC, in particular for interfaces that use +- Implement memory register / unregister commands (hipHostRegister, hipHostUnregister) +- Add cross-linking support between G++ and HCC, in particular for interfaces that use standard C++ libraries (ie std::vectors, std::strings). HIPCC now uses libstdc++ by default on the HCC compilation path. -- More samples including GPUBurn and SHOC. See [HIP-Examples](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP-Examples) +- More samples including gpu-burn, SHOC, nbody, rtm. See [HIP-Examples](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP-Examples) =================================================================================================== ## Revision History: =================================================================================================== -Release:0.84.00 -Date: +Release:0.84.01 +Date: 2016.04.25 - Refactor HIP make and install system: - Move to CMake. Refer to the installation section in README.md for details. - Split source into multiple modular .cpp and .h files. From 429c26ea937566c43ee6c9a07ec34e8b96069cff Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 16 May 2016 10:42:13 +0530 Subject: [PATCH 07/17] Add misssing unsigned keyword to atomicInc and atomicDec Change-Id: I658479c4c7c409dba117152165229880aeb5ab9f --- hipamd/src/device_util.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index c01ad30ab5..2e9e9e4540 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -658,14 +658,14 @@ __device__ unsigned long long int atomicXor(unsigned long long int* address, } //atomicInc -__device__ int atomicInc(unsigned int* address, +__device__ unsigned int atomicInc(unsigned int* address, unsigned int val) { return hc::__atomic_wrapinc(address,val); } //atomicDec -__device__ int atomicDec(unsigned int* address, +__device__ unsigned int atomicDec(unsigned int* address, unsigned int val) { return hc::__atomic_wrapdec(address,val); From b73bdb7e07067d95b80d91ecc4d3e6c3d83188e0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 16 May 2016 15:19:07 -0500 Subject: [PATCH 08/17] move/update API list Change-Id: I90457b90bbcb2a3309bf65a94a25be69af38f261 --- .../markdown}/CUDA_Runtime_API_functions_supported_by_HIP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename hipamd/{ => docs/markdown}/CUDA_Runtime_API_functions_supported_by_HIP.md (99%) diff --git a/hipamd/CUDA_Runtime_API_functions_supported_by_HIP.md b/hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md similarity index 99% rename from hipamd/CUDA_Runtime_API_functions_supported_by_HIP.md rename to hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md index 3ebb80b79e..fc4f7c3fe5 100644 --- a/hipamd/CUDA_Runtime_API_functions_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md @@ -3,7 +3,7 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | `cudaChooseDevice` | | Select compute-device which best matches criteria. | -| `cudaDeviceGetAttribute` | | Returns information about the device. | +| `cudaDeviceGetAttribute` | `hipDeviceGetAttribute` | Returns information about the device. | | `cudaDeviceGetByPCIBusId` | | Returns a handle to a compute device. | | `cudaDeviceGetCacheConfig` | `hipDeviceGetCacheConfig` | Returns the preferred cache configuration for the current device. | | `cudaDeviceGetLimit` | | Returns resource limits. | From af65f415f04de0984d581f2cafabc994fc63b0ac Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 20 May 2016 10:15:33 +0530 Subject: [PATCH 09/17] Fix bit_extract sample Change-Id: I933f932bac26d9a9469d5d069973af166e11cbcd --- hipamd/samples/0_Intro/bit_extract/Makefile | 2 +- hipamd/samples/0_Intro/bit_extract/bit_extract.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hipamd/samples/0_Intro/bit_extract/Makefile b/hipamd/samples/0_Intro/bit_extract/Makefile index a01f60646e..0965ae7296 100644 --- a/hipamd/samples/0_Intro/bit_extract/Makefile +++ b/hipamd/samples/0_Intro/bit_extract/Makefile @@ -12,7 +12,7 @@ ifeq (${HIP_PLATFORM}, nvcc) HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif ifeq (${HIP_PLATFORM}, hcc) - HIPCC_FLAGS = + HIPCC_FLAGS = -stdlib=libc++ endif diff --git a/hipamd/samples/0_Intro/bit_extract/bit_extract.cpp b/hipamd/samples/0_Intro/bit_extract/bit_extract.cpp index 53302b7228..746e1012bd 100644 --- a/hipamd/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/hipamd/samples/0_Intro/bit_extract/bit_extract.cpp @@ -22,6 +22,9 @@ THE SOFTWARE. #include #include #include +#ifdef __HIP_PLATFORM_HCC__ +#include +#endif #define CHECK(cmd) \ From 02a6c1fbe06321398dc34873f95e0bb32ec55edc Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Fri, 20 May 2016 11:12:32 +0530 Subject: [PATCH 10/17] Update in clock function Change-Id: I5819aa62693dc3b9b5d7e39944d1e58aadc72027 --- hipamd/docs/markdown/hip_kernel_language.md | 12 +----------- hipamd/src/device_util.cpp | 4 ++-- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/hipamd/docs/markdown/hip_kernel_language.md b/hipamd/docs/markdown/hip_kernel_language.md index b54cf16613..e382ede4b3 100644 --- a/hipamd/docs/markdown/hip_kernel_language.md +++ b/hipamd/docs/markdown/hip_kernel_language.md @@ -420,17 +420,7 @@ HIP provides the following built-in functions for reading a high-resolution time clock_t clock() long long int clock64() ``` - -AMD devices employ a per-GPU timer that increments at a constant time interval regardless of any dynamic frequency changes. All compute units in the system share the timer. -Nvidia devices implement the timer as a per-compute-unit clock that increments on every clock cycle. - -To obtain the clock frequency, use the hipDeviceProp_t.clockInstructionRate field: - -``` -hipGetDeviceProperties(&deviceProps, deviceId); -// Compute time in ms--device_ticks is based on values reported from clock() device function -float time = device_ticks / (float)deviceProps.clockInstructionRate; -``` +Returns the value of counter that is incremented every clock cycle on device. Difference in values returned provides the cycles used. ## Atomic Functions diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 2e9e9e4540..3234408e50 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -501,8 +501,8 @@ __device__ double trunc(double x) const int warpSize = 64; -__device__ long long int clock64() { return (long long int)hc::__clock_u64(); }; -__device__ clock_t clock() { return (clock_t)hc::__clock_u64(); }; +__device__ long long int clock64() { return (long long int)hc::__cycle_u64(); }; +__device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); }; //atomicAdd() From 29e61560c0ca1130d7d52f64394244b6787dc60c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 20 May 2016 12:24:54 +0530 Subject: [PATCH 11/17] Link against libc++ only on hcc platform Change-Id: I93569a5b8a9910dca0c88408cbe54fbb32384fcf --- hipamd/tests/src/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hipamd/tests/src/CMakeLists.txt b/hipamd/tests/src/CMakeLists.txt index 3b0a86884c..cbf5860e8d 100644 --- a/hipamd/tests/src/CMakeLists.txt +++ b/hipamd/tests/src/CMakeLists.txt @@ -100,7 +100,9 @@ endmacro() # Make a hip executable, using libc++ macro (make_hip_executable_libcpp exe cpp) make_hip_executable( ${exe} ${cpp} ${ARGN} ) - set_source_files_properties (${cpp} i${ARGN} PROPERTIES COMPILE_FLAGS --stdlib=libc++ ) + if (${HIP_PLATFORM} STREQUAL "hcc") + set_source_files_properties (${cpp} i${ARGN} PROPERTIES COMPILE_FLAGS --stdlib=libc++ ) + endif() endmacro() macro (make_named_test exe testname ) From f7cf463cb42078f47a913e3e04a339b8966e99e6 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 20 May 2016 12:30:37 +0530 Subject: [PATCH 12/17] Disable failing/unsupported test cases on nvcc platform Change-Id: Iad00db3dd1663303c43eed81d26be76b3fb1ba85 --- hipamd/tests/src/CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hipamd/tests/src/CMakeLists.txt b/hipamd/tests/src/CMakeLists.txt index cbf5860e8d..111d945464 100644 --- a/hipamd/tests/src/CMakeLists.txt +++ b/hipamd/tests/src/CMakeLists.txt @@ -153,7 +153,9 @@ make_hip_executable (hip_popc hip_popc.cpp) make_hip_executable (hip_clz hip_clz.cpp) make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) -make_hip_executable (hip_test_ldg hip_test_ldg.cpp) +if (${HIP_PLATFORM} STREQUAL "hcc") + make_hip_executable (hip_test_ldg hip_test_ldg.cpp) +endif() make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) make_hip_executable (hipEnvVar hipEnvVar.cpp) make_hip_executable (hipEnvVarDriver hipEnvVarDriver.cpp) @@ -196,7 +198,6 @@ make_test(hip_popc " " ) make_test(hip_brev " " ) make_test(hip_clz " " ) make_test(hip_ffs " " ) -make_test(hip_test_ldg " " ) make_test(hipEventRecord --iterations 10) make_test(hipMemset " " ) make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes. @@ -228,10 +229,8 @@ make_test(hipStreamL5 " ") make_test(hipRandomMemcpyAsync " ") #make_test(hipAPIStreamEnable " ") #make_test(hipAPIStreamDisable " ") -make_test(hipMemoryAllocate " ") make_test(hipFuncSetDeviceFlags " ") make_test(hipFuncGetDevice " ") -make_test(hipFuncSetDevice " ") make_test(hipFuncDeviceSynchronize " ") make_test(hipTestMemcpyPin " ") make_named_test (hipMultiThreadDevice "hipMultiThreadDevice-serial" --tests 0x1) @@ -245,4 +244,10 @@ if (${HIP_MULTI_GPU}) endif() +if (${HIP_PLATFORM} STREQUAL "hcc") + make_test(hip_test_ldg " " ) + make_test(hipMemoryAllocate " ") + make_test(hipFuncSetDevice " ") +endif() + make_hipify_test(specialFunc.cu ) From a42b0f590943791a14d17ed80804b5d2fd6c0c12 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 20 May 2016 12:39:02 +0530 Subject: [PATCH 13/17] Add hipSetDeviceFlags implementation for NVCC Change-Id: Id19e965950b728cfe0a514e6cdf65a191d063c10 --- hipamd/include/nvcc_detail/hip_runtime_api.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hipamd/include/nvcc_detail/hip_runtime_api.h b/hipamd/include/nvcc_detail/hip_runtime_api.h index ef85b7fcd7..5434555668 100644 --- a/hipamd/include/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/nvcc_detail/hip_runtime_api.h @@ -430,7 +430,10 @@ inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const v return hipCUDAErrorTohipError(cudaMemcpyPeerAsync ( dst, dstDevice, src, srcDevice, count, stream )); }; - +inline static hipError_t hipSetDeviceFlags (unsigned int flags) +{ + return hipCUDAErrorTohipError(cudaSetDeviceFlags( flags )); +} #ifdef __cplusplus From 4efabb955b68e36593e85b6c7a96e4eb44a9b457 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 20 May 2016 14:06:27 +0530 Subject: [PATCH 14/17] Fix missing JSON database warning from hipify-clang Change-Id: I4aeff51556678266c1392a031deeb9ffff2386bc --- hipamd/clang-hipify/src/Cuda2Hip.cpp | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/hipamd/clang-hipify/src/Cuda2Hip.cpp b/hipamd/clang-hipify/src/Cuda2Hip.cpp index 1e29c65010..0a486a6596 100644 --- a/hipamd/clang-hipify/src/Cuda2Hip.cpp +++ b/hipamd/clang-hipify/src/Cuda2Hip.cpp @@ -769,27 +769,26 @@ private: } // end anonymous namespace // Set up the command line options -static cl::OptionCategory - ToolTemplateCategory("CUDA to HIP source translator options"); -static cl::extrahelp MoreHelp(" specify the path of source file\n\n"); +static cl::opt +InputFilename(cl::Positional, cl::desc(""), cl::init("-")); static cl::opt OutputFilename("o", cl::desc("Output filename"), - cl::value_desc("filename"), - cl::cat(ToolTemplateCategory)); + cl::value_desc("filename")); static cl::opt Inplace("inplace", cl::desc("Modify input file inplace, replacing input with hipified " "output, save backup in .prehip file. "), - cl::value_desc("inplace"), cl::cat(ToolTemplateCategory)); + cl::value_desc("inplace")); static cl::opt NoOutput("no-output", cl::desc("don't write any translated output to stdout"), - cl::value_desc("no-output"), cl::cat(ToolTemplateCategory)); + cl::value_desc("no-output")); + static cl::opt PrintStats("print-stats", cl::desc("print the command-line, like a header"), - cl::value_desc("print-stats"), cl::cat(ToolTemplateCategory)); + cl::value_desc("print-stats")); int main(int argc, const char **argv) { @@ -797,12 +796,13 @@ int main(int argc, const char **argv) { int Result; - CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, - llvm::cl::Required); + std::unique_ptr Compilations( + new FixedCompilationDatabase(".",std::vector())); + cl::ParseCommandLineOptions(argc, argv); + std::string dst = OutputFilename; - std::vector fileSources = OptionsParser.getSourcePathList(); if (dst.empty()) { - dst = fileSources[0]; + dst = InputFilename; if (!Inplace) { size_t pos = dst.rfind(".cu"); if (pos != std::string::npos) { @@ -820,13 +820,13 @@ int main(int argc, const char **argv) { } // copy source file since tooling makes changes "inplace" - std::ifstream source(fileSources[0], std::ios::binary); + std::ifstream source(InputFilename, std::ios::binary); std::ofstream dest(Inplace ? dst + ".prehip" : dst, std::ios::binary); dest << source.rdbuf(); source.close(); dest.close(); - RefactoringTool Tool(OptionsParser.getCompilations(), dst); + RefactoringTool Tool(*Compilations, dst); ast_matchers::MatchFinder Finder; Cuda2HipCallback Callback(&Tool.getReplacements(), &Finder); HipifyPPCallbacks PPCallbacks(&Tool.getReplacements()); @@ -931,7 +931,7 @@ int main(int argc, const char **argv) { llvm::outs() << counterNames[i] << ':' << Callback.countReps[i] + PPCallbacks.countReps[i] << ' '; } - llvm::outs() << ") in \'" << fileSources[0] << "\'\n"; + llvm::outs() << ") in \'" << InputFilename << "\'\n"; } return Result; } From be0d44bd741d8009fe964040af150bc7f44d3b21 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 24 May 2016 12:37:18 +0530 Subject: [PATCH 15/17] Move hipify-clang info to its own README Squashed commit of the following: commit bc44bcee461e46b0cf5cb9fe09213dca450b081a Author: Daniil Fukalov Date: Mon May 16 20:50:05 2016 +0300 added note about errors without CUDA sdk installed commit 5fd73ba90c0940bdc977737894362a99f4232b56 Author: Daniil Fukalov Date: Mon May 16 20:31:47 2016 +0300 move clang-hipify info to its own README commit 21d81a6d5acd3f093d77ac4d584e6f5bbe48f8cc Author: Daniil Fukalov Date: Mon May 16 20:30:00 2016 +0300 initial version Change-Id: I157294699a7be3d0bb38b2ee4a137a94280529c9 --- hipamd/INSTALL.md | 43 -------------------------------- hipamd/clang-hipify/README.md | 46 +++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 43 deletions(-) create mode 100644 hipamd/clang-hipify/README.md diff --git a/hipamd/INSTALL.md b/hipamd/INSTALL.md index dccbee2995..0b05a11f6b 100644 --- a/hipamd/INSTALL.md +++ b/hipamd/INSTALL.md @@ -11,9 +11,6 @@ - [HCC Options](#hcc-options) - [Using HIP with the AMD Native-GCN compiler.](#using-hip-with-the-amd-native-gcn-compiler) - [Compiling CodeXL markers for HIP Functions](#compiling-codexl-markers-for-hip-functions) - - [Using clang-hipify](#using-clang-hipify) - - [Building](#building) - - [Running and using clang-hipify](#running-and-using-clang-hipify) @@ -147,43 +144,3 @@ HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp ``` Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. - - -### Using clang-hipify - -Clang-hipify is a clang-based tool which can automate the translation of CUDA source code into portable HIP C++. -The clang-hipify tool can automatically add extra HIP arguments (notably the "hipLaunchParm" required at the -beginning of every HIP kernel call). Clang-hipify has some additional dependencies explained below and -can be built as a separate make step. - - -#### Building - -1. Download and unpack clang+llvm 3.8 binary package preqrequisite: -``` -wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz -tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz -``` - -2. Enable build of clang-hipify and specify path to LLVM: -Note LLVM_DIR must be a full absolute path (not relative) to the location extracted above. Here's an example assuming we -extract the clang 3.8 package into ~/HIP-privatestaging/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/. -``` -cd HIP-privatestaging -mkdir build.clang-hipify -cd build.clang-hipify -cmake -DBUILD_CLANG_HIPIFY=1 -DLLVM_DIR=~/HIP-privatestaging/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ -DCMAKE_BUILD_TYPE=Release .. -make -make install -``` - -#### Running and using clang-hipify -clang-hipify performs an initial compile of the CUDA source code into a "symbol tree", and thus needs access to the appropriate header files: - 1. Download "deb(network)" variant of target installer from https://developer.nvidia.com/cuda-downloads. The commands below show how to download and install a recent version from the http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb. - -``` -wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb -sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb -sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5 -``` - diff --git a/hipamd/clang-hipify/README.md b/hipamd/clang-hipify/README.md new file mode 100644 index 0000000000..6ea9e4a7a7 --- /dev/null +++ b/hipamd/clang-hipify/README.md @@ -0,0 +1,46 @@ +## Using hipify-clang + +`hipify-clang` is a clang-based tool which can automate the translation of CUDA source code into portable HIP C++. +The tool can automatically add extra HIP arguments (notably the "hipLaunchParm" required at the beginning of every HIP kernel call). +`hipify-clang` has some additional dependencies explained below and can be built as a separate make step. The instructions below are specifically for **Ubuntu 14.04** + +### Build and install + +- Download and unpack clang+llvm 3.8 binary package preqrequisite. +```shell +wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz +tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz +``` + +- Enable build of hipify-clang and specify path to LLVM. + +Note LLVM_DIR must be a full absolute path to the location extracted above. Here's an example assuming we extract the clang 3.8 package into ~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ +```shell +cd HIP +mkdir build +cd build +cmake -DBUILD_CLANG_HIPIFY=1 -DLLVM_DIR=~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ -DCMAKE_BUILD_TYPE=Release .. +make +make install +``` + +### Running and using hipify-clang + +`hipify-clang` performs an initial compile of the CUDA source code into a "symbol tree", and thus needs access to the appropriate header files. + +In the case when `hipify-clang` doesn't find cuda headers, it reports various errors about unknown keywords (e.g. '\__global\__'), API function names (e.g. 'cudaMalloc'), syntax (e.g. 'foo<<<1,n>>>(...)'), etc. + +To install CUDA headers, download the "deb(network)" variant of the target installer from https://developer.nvidia.com/cuda-downloads. The commands below show how to download and install a recent version from http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb. +```shell +wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb +sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb +sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5 +``` + +#### Disclaimer + +The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. + +AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. + +Copyright (c) 2014-2016 Advanced Micro Devices, Inc. All rights reserved. \ No newline at end of file From 65448e74ede2a06090e23c0cbcef7ae1ffbc001e Mon Sep 17 00:00:00 2001 From: Jack Chung Date: Mon, 23 May 2016 12:11:26 +0800 Subject: [PATCH 16/17] Squashed commit of the following: commit 9548493fa754b3bf5c31cbdc2211db1e73e8c07c Author: Jack Chung Date: Mon May 23 11:57:23 2016 +0800 Rename hipExternShared test to hipDynamicShared Change-Id: I180d9d539420fb69cfc121eceaa7db9da03483b2 commit 827081f8244a38f010789d556db0c4ff7b6422d8 Author: Jack Chung Date: Mon May 23 11:56:27 2016 +0800 Rename HIP_DECLARE_EXTERN_SHARED to HIP_DYNAMIC_SHARED Change-Id: I22362d179812ac547e0f11ba4e2bb999050e08ae commit 4c277228ed41af187739610fa17eab1fb144c947 Author: Jack Chung Date: Thu May 19 17:49:52 2016 +0800 Adopt new interface to get dynamic LDS in hc.hpp Change-Id: I47b433b714633a4c97df87c40a0b1d3386429a00 commit 5a36117d777064113a528dc47b42e8c8413baa97 Author: Jack Chung Date: Thu May 19 11:29:24 2016 +0800 Add test patterns for regular expression to match "extern __shared__" These test patterns should better be saved as an individual test case, but I'm not familiar with HIP test structures so I leave them as comments in hipify as of now. Change-Id: I7fee89c89b9e73de2133357a226ec0c769733531 commit 1b26284168c7f5339f63338fd0149bed5d994656 Author: Jack Chung Date: Thu May 19 11:25:23 2016 +0800 Add one HIP unit test to use HIP_DECLARE_EXTERN_SHARED Change-Id: I4d9907815920693a74ea9d575fe26e7c67636109 commit 77b816ee5972b13d829d5bbcf06fbfd07acea2af Author: Jack Chung Date: Wed May 18 19:18:59 2016 +0800 Adopt HIP_ prefix for DECLARE_EXTERN_SHARED macro Change-Id: I555ded16b449b67d2e20904013d86fe1ded6a2be commit ef0997939c3578a9ae11621bf21c0416f04d2622 Author: Jack Chung Date: Wed May 18 17:42:04 2016 +0800 Modify hipify to support converting extern __shared__ to DECLARE_EXTERN_SHARED macro Added regular expression to search & replace extern __shared__ declarations to DECLARE_EXTERN_SHARED macro. Limitation: - Won't work if "extern __shared__" is declared at global scope Sample Usages: extern __shared__ double foo[]; extern __shared__ unsigned int foo[]; extern volatile __shared__ double foo[]; extern volatile __shared__ unsigned int sdata[]; extern __shared__ volatile unsigned int sdata[]; extern __shared__ T s[]; extern __shared__ T::type s[]; extern __shared__ blah::type s[]; extern __shared__ typename mapper::type s_data[]; extern __attribute__((used)) __shared__ typename mapper::type s_data[]; Change-Id: I2be0b7039adeddb789f5a2b067d403a43fdc3e26 commit 93ff268724493aedfacdcd5a5aa9a100f4ebaed0 Author: Jack Chung Date: Wed May 18 15:13:09 2016 +0800 Introduce DECLARE_EXTERN_SHARED macro to encapsulate "extern __shared__" decls Change-Id: I93b2d37c763195b0ca9fd0afee78605a1e3272db commit cff9c95412de343cc6405158b5acc4f1029267ff Author: Jack Chung Date: Wed May 18 12:53:54 2016 +0800 Add __get_dynamic_groupbaseptr() to point to dynamic LDS Change-Id: I97b548d8a691488057617c551a8f331cad7afc77 Change-Id: I84e7875b76fa1f59e860e19c93bd4209cdd1fd2c --- hipamd/bin/hipify | 30 ++++- hipamd/include/hcc_detail/hip_runtime.h | 12 ++ hipamd/include/nvcc_detail/hip_runtime.h | 7 ++ hipamd/src/device_util.cpp | 5 + hipamd/tests/src/CMakeLists.txt | 3 + hipamd/tests/src/hipDynamicShared.cpp | 138 +++++++++++++++++++++++ 6 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 hipamd/tests/src/hipDynamicShared.cpp diff --git a/hipamd/bin/hipify b/hipamd/bin/hipify index af7c7edce7..ce934dff15 100755 --- a/hipamd/bin/hipify +++ b/hipamd/bin/hipify @@ -84,7 +84,7 @@ push (@warn_whitelist, split(',',$warn_whitelist)); #--- #Stats tracking code: -@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "other"); +@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "extern_shared", "other"); #--- @@ -428,6 +428,34 @@ while (@ARGV) { $countKeywords += m/__global__/; $countKeywords += m/__shared__/; + #-------- + # CUDA extern __shared__ syntax + # Note these only work if declaration is on a single line. + { + # match uses ? for <.*> which will be unitialized if this is not present in launch syntax. + no warnings qw/uninitialized/; + + my $k = 0; + + # Match extern __shared__ type foo[]; syntax + # Replace as HIP_DYNAMIC_SHARED() macro + $k += s/extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;/HIP_DYNAMIC_SHARED($1 $2, $3)/g; + + # test patterns for the regular expression above: + #'extern __shared__ double foo[];' + #'extern __shared__ unsigned int foo[];' + #'extern volatile __shared__ double foo[];' + #'extern volatile __shared__ unsigned int sdata[];' + #'extern __shared__ volatile unsigned int sdata[];' + #'extern __shared__ T s[];' + #'extern __shared__ T::type s[];' + #'extern __shared__ blah::type s[];' + #'extern __shared__ typename mapper::type s_data[];' + #'extern __attribute__((used)) __shared__ typename mapper::type s_data[];' + + $ft{'extern_shared'} += $k; + } + #-------- # CUDA Launch Syntax # Note these only work if launch is on a single line. diff --git a/hipamd/include/hcc_detail/hip_runtime.h b/hipamd/include/hcc_detail/hip_runtime.h index 0d70eaa2a4..eb518cc88d 100644 --- a/hipamd/include/hcc_detail/hip_runtime.h +++ b/hipamd/include/hcc_detail/hip_runtime.h @@ -430,6 +430,8 @@ __device__ float __shfl_xor(float input, int lane_mask, int width); __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); +__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr(); + //TODO - add a couple fast math operations here, the set here will grow : __device__ float __cosf(float x); __device__ float __expf(float x); @@ -556,6 +558,16 @@ do {\ #endif +/** + * extern __shared__ + */ + +// Macro to replace extern __shared__ declarations +// to local variable definitions +#define HIP_DYNAMIC_SHARED(type, var) \ + __attribute__((address_space(3))) type* var = \ + (__attribute__((address_space(3))) type*)__get_dynamicgroupbaseptr(); \ + #endif // __HCC__ diff --git a/hipamd/include/nvcc_detail/hip_runtime.h b/hipamd/include/nvcc_detail/hip_runtime.h index cb1253fdf1..06c6ffb9b3 100644 --- a/hipamd/include/nvcc_detail/hip_runtime.h +++ b/hipamd/include/nvcc_detail/hip_runtime.h @@ -95,6 +95,13 @@ kernelName<<>>(0, __VA_ARGS__);\ #define hipGridDim_y gridDim.y #define hipGridDim_z gridDim.z +/** + * extern __shared__ + */ + +#define HIP_DYNAMIC_SHARED(type, var) \ + extern __shared__ type var[]; \ + #endif diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 3234408e50..11686c030c 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -808,6 +808,11 @@ __host__ __device__ int max(int arg1, int arg2) return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } +__device__ __attribute__((address_space(3))) void* __get_dynamicgroupbaseptr() +{ + return hc::get_dynamic_group_segment_base_pointer(); +} + //TODO - add a couple fast math operations here, the set here will grow : diff --git a/hipamd/tests/src/CMakeLists.txt b/hipamd/tests/src/CMakeLists.txt index 111d945464..e0894df5fa 100644 --- a/hipamd/tests/src/CMakeLists.txt +++ b/hipamd/tests/src/CMakeLists.txt @@ -191,6 +191,7 @@ make_hip_executable (hipPeerToPeer_simple hipPeerToPeer_simple.cpp) make_hip_executable (hipMemcpyAll hipMemcpyAll.cpp) make_hip_executable (hipMultiThreadDevice hipMultiThreadDevice.cpp) make_hip_executable (hipTestMemcpyPin hipTestMemcpyPin.cpp) +make_hip_executable (hipDynamicShared hipDynamicShared.cpp) make_test(hip_ballot " " ) make_test(hip_anyall " " ) @@ -251,3 +252,5 @@ if (${HIP_PLATFORM} STREQUAL "hcc") endif() make_hipify_test(specialFunc.cu ) + +make_test(hipDynamicShared " ") diff --git a/hipamd/tests/src/hipDynamicShared.cpp b/hipamd/tests/src/hipDynamicShared.cpp new file mode 100644 index 0000000000..329529281c --- /dev/null +++ b/hipamd/tests/src/hipDynamicShared.cpp @@ -0,0 +1,138 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include "test_common.h" + +template +__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t numElements, size_t groupElements) { + + // declare dynamic shared memory + HIP_DYNAMIC_SHARED(T, sdata) + + size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t tid = hipThreadIdx_x; + + // initialize dynamic shared memory + if (tid < groupElements) { + sdata[tid] = static_cast(tid); + } + + // prefix sum inside dynamic shared memory + if (groupElements >= 512) { + if (tid >= 256) { sdata[tid] += sdata[tid - 256]; } __syncthreads(); + } + if (groupElements >= 256) { + if (tid >= 128) { sdata[tid] += sdata[tid - 128]; } __syncthreads(); + } + if (groupElements >= 128) { + if (tid >= 64) { sdata[tid] += sdata[tid - 64]; } __syncthreads(); + } + if (groupElements >= 64) { sdata[tid] += sdata[tid - 32]; } __syncthreads(); + if (groupElements >= 32) { sdata[tid] += sdata[tid - 16]; } __syncthreads(); + if (groupElements >= 16) { sdata[tid] += sdata[tid - 8]; } __syncthreads(); + if (groupElements >= 8) { sdata[tid] += sdata[tid - 4]; } __syncthreads(); + if (groupElements >= 4) { sdata[tid] += sdata[tid - 2]; } __syncthreads(); + if (groupElements >= 2) { sdata[tid] += sdata[tid - 1]; } __syncthreads(); + + C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements]; +} + +template +void testExternShared(size_t N, size_t groupElements) { + size_t Nbytes = N * sizeof(T); + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + //printf("blocksPerCU: %d\nthreadsPerBlock: %d\nN: %zu\n", blocksPerCU, threadsPerBlock, N); + + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + // calculate the amount of dynamic shared memory required + size_t groupMemBytes = groupElements * sizeof(T); + + // launch kernel with dynamic shared memory + hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel), dim3(blocks), dim3(threadsPerBlock), groupMemBytes, 0, A_d, B_d, C_d, N, groupElements); + + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + // verify + for (size_t i = 0; i < N; ++i) { + size_t tid = (i % groupElements); + T sumFromSharedMemory = static_cast(tid * (tid + 1) / 2); + T expected = A_h[i] + B_h[i] + sumFromSharedMemory; + if (C_h[i] != expected) { + std::cout << std::fixed << std::setprecision(32); + std::cout << "At " << i << std::endl; + std::cout << " Computed:" << C_h[i] << std::endl; + std::cout << " Expected:" << expected << std::endl; + std::cout << sumFromSharedMemory << std::endl; + std::cout << A_h[i] << std::endl; + std::cout << B_h[i] << std::endl; + + failed("Failed at index:%zu\n", i); + } + } + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); +} + +int main(int argc, char *argv[]) { + HipTest::parseStandardArguments(argc, argv, true); + + //printf("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + testExternShared(1024, 4); + testExternShared(1024, 8); + testExternShared(1024, 16); + testExternShared(1024, 32); + testExternShared(1024, 64); + + testExternShared(65536, 4); + testExternShared(65536, 8); + testExternShared(65536, 16); + testExternShared(65536, 32); + testExternShared(65536, 64); + + testExternShared(1024, 4); + testExternShared(1024, 8); + testExternShared(1024, 16); + testExternShared(1024, 32); + testExternShared(1024, 64); + + testExternShared(65536, 4); + testExternShared(65536, 8); + testExternShared(65536, 16); + testExternShared(65536, 32); + testExternShared(65536, 64); + + passed(); +} + From b35fa83c47a667aac40e7b0cf48812e320294e95 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 6 Jun 2016 12:42:44 +0530 Subject: [PATCH 17/17] Use cpu agent when using staging buffer Change-Id: I195a8137e86f2752681d6ba4dc7ba1b6f654e264 --- hipamd/src/staging_buffer.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hipamd/src/staging_buffer.cpp b/hipamd/src/staging_buffer.cpp index 109b3a1936..be5058e47b 100644 --- a/hipamd/src/staging_buffer.cpp +++ b/hipamd/src/staging_buffer.cpp @@ -32,6 +32,8 @@ THE SOFTWARE. #define tprintf(trace_level, ...) #endif +extern hsa_agent_t g_cpu_agent; // defined in hip_hcc.cpp + //------------------------------------------------------------------------------------------------- StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_region_t systemRegion, size_t bufferSize, int numBuffers) : _hsa_agent(hsaAgent), @@ -106,7 +108,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -169,7 +171,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -223,7 +225,7 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _hsa_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -287,7 +289,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], g_cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -315,7 +317,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], dstAgent /*not used*/, theseBytes, + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], g_cpu_agent /*not used*/, theseBytes, hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], _completion_signal2[bufferIndex]);