Re-sync with upstream.
This commit is contained in:
@@ -288,12 +288,22 @@ if(HIP_PLATFORM STREQUAL "hcc")
|
||||
install(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
if(HIP_COMPILER STREQUAL "hcc")
|
||||
configure_package_config_file(
|
||||
hip-config.cmake.in
|
||||
hip-config-hcc.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
|
||||
INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
|
||||
PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
|
||||
)
|
||||
elseif(HIP_COMPILER STREQUAL "clang")
|
||||
configure_package_config_file(
|
||||
hip-config-clang.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
|
||||
INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
|
||||
PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
|
||||
)
|
||||
endif()
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
|
||||
VERSION "${HIP_VERSION}"
|
||||
|
||||
Vendored
+39
-2
@@ -167,7 +167,8 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
|
||||
}
|
||||
|
||||
// Cap the maximum amount of testing, in case of hangs
|
||||
// Excluding hipPrintfKernel test from automation; variable fails on CI test machines
|
||||
// Excluding hipVectorTypes test from automation; due to regression from HCC commit 2367133
|
||||
// Excluding hipFloatMath test from automation; due to regression from ROCDL commit 2fc04e1
|
||||
timeout(time: 1, unit: 'HOURS')
|
||||
{
|
||||
stage("${platform} unit testing")
|
||||
@@ -177,7 +178,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
|
||||
cd ${build_dir_rel}
|
||||
make install -j\$(nproc)
|
||||
make build_tests -i -j\$(nproc)
|
||||
ctest -E hipVectorTypes
|
||||
ctest -E "(hipVectorTypes.tst|hipVectorTypesDevice.tst|hipFloatMath.tst)"
|
||||
"""
|
||||
// If unit tests output a junit or xunit file in the future, jenkins can parse that file
|
||||
// to display test results on the dashboard
|
||||
@@ -385,4 +386,40 @@ rocm_head:
|
||||
docker_clean_images( job_name, hip_image_name )
|
||||
*/
|
||||
}
|
||||
},
|
||||
cuda_9_x:
|
||||
{
|
||||
node('hip-cuda')
|
||||
{
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Block of string constants customizing behavior for cuda
|
||||
String nvcc_ver = 'nvcc-9.x'
|
||||
String from_image = 'ci_test_nodes/cuda-9.x/ubuntu-16.04:latest'
|
||||
String inside_args = '--runtime=nvidia';
|
||||
|
||||
// Checkout source code, dependencies and version files
|
||||
String source_hip_rel = checkout_and_version( nvcc_ver )
|
||||
|
||||
// Create/reuse a docker image that represents the hip build environment
|
||||
def hip_build_image = docker_build_image( nvcc_ver, 'hip', '', source_hip_rel, from_image )
|
||||
|
||||
// Print system information for the log
|
||||
hip_build_image.inside( inside_args )
|
||||
{
|
||||
sh """#!/usr/bin/env bash
|
||||
set -x
|
||||
nvidia-smi
|
||||
nvcc --version
|
||||
"""
|
||||
}
|
||||
|
||||
// Conctruct a binary directory path based on build config
|
||||
String build_hip_rel = build_directory_rel( build_config );
|
||||
|
||||
// Build hip inside of the build environment
|
||||
docker_build_inside_image( hip_build_image, inside_args, nvcc_ver, "-DHIP_NVCC_FLAGS=--Wno-deprecated-gpu-targets", build_config, source_hip_rel, build_hip_rel )
|
||||
|
||||
// Clean docker image
|
||||
docker_clean_images( 'hip', docker_build_image_name( ) )
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,9 +35,9 @@
|
||||
| enum |***`cublasAtomicsMode_t`*** | |
|
||||
| 0 |*`CUBLAS_ATOMICS_NOT_ALLOWED`* | |
|
||||
| 1 |*`CUBLAS_ATOMICS_ALLOWED`* | |
|
||||
| enum |***`cublasAtomicsMode_t`*** | |
|
||||
| -1 |*`CUBLAS_GEMM_DFALT`* | |
|
||||
| -1 |*`CUBLAS_GEMM_DEFAULT`* | |
|
||||
| enum |***`cublasGemmAlgo_t`*** |***`hipblasGemmAlgo_t`*** |
|
||||
| -1 |*`CUBLAS_GEMM_DFALT`* |*`HIPBLAS_GEMM_DEFAULT`* | 160 |
|
||||
| -1 |*`CUBLAS_GEMM_DEFAULT`* |*`HIPBLAS_GEMM_DEFAULT`* | 160 |
|
||||
| 0 |*`CUBLAS_GEMM_ALGO0`* | |
|
||||
| 1 |*`CUBLAS_GEMM_ALGO1`* | |
|
||||
| 2 |*`CUBLAS_GEMM_ALGO2`* | |
|
||||
@@ -391,7 +391,7 @@
|
||||
|`cublasZgemm3m` | |
|
||||
|`cublasHgemm` |`hipblasHgemm` |
|
||||
|`cublasSgemmEx` | |
|
||||
|`cublasGemmEx` | |
|
||||
|`cublasGemmEx` |`hipblasGemmEx` |
|
||||
|`cublasCgemmEx` | |
|
||||
|`cublasUint8gemmBias` | |
|
||||
|`cublasSsyrk` | |
|
||||
|
||||
@@ -93,10 +93,10 @@
|
||||
|
||||
## **7. Occupancy**
|
||||
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|-------------------------------|
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|-----------------------------------------------|
|
||||
| `cudaOccupancyMaxActiveBlocksPerMultiprocessor` | `hipOccupancyMaxActiveBlocksPerMultiprocessor`|
|
||||
| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | |
|
||||
| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | |
|
||||
|
||||
## **8. Execution Control [deprecated since 7.0]**
|
||||
|
||||
@@ -326,13 +326,13 @@
|
||||
|
||||
## **25. Texture Object Management**
|
||||
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|-------------------------------|
|
||||
| `cudaCreateTextureObject` |`hipCreateTextureObject` |
|
||||
| `cudaDestroyTextureObject` |`hipDestroyTextureObject` |
|
||||
| `cudaGetTextureObjectResourceDesc` |`hipGetTextureObjectResourceDesc` |
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|--------------------------------------|
|
||||
| `cudaCreateTextureObject` |`hipCreateTextureObject` |
|
||||
| `cudaDestroyTextureObject` |`hipDestroyTextureObject` |
|
||||
| `cudaGetTextureObjectResourceDesc` |`hipGetTextureObjectResourceDesc` |
|
||||
| `cudaGetTextureObjectResourceViewDesc` |`hipGetTextureObjectResourceViewDesc` |
|
||||
| `cudaGetTextureObjectTextureDesc` |`hipGetTextureObjectTextureDesc` |
|
||||
| `cudaGetTextureObjectTextureDesc` |`hipGetTextureObjectTextureDesc` |
|
||||
|
||||
## **26. Surface Object Management**
|
||||
|
||||
@@ -352,36 +352,36 @@
|
||||
## **28. C++ API Routines**
|
||||
*(7.0 contains, 7.5 doesn’t)*
|
||||
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|-------------------------------|
|
||||
| `cudaBindSurfaceToArray` | |
|
||||
| `cudaBindTexture` | `hipBindTexture` |
|
||||
| `cudaBindTexture2D` | |
|
||||
| `cudaBindTextureToArray` | |
|
||||
| `cudaBindTextureToMipmappedArray` | |
|
||||
| `cudaCreateChannelDesc` | `hipCreateChannelDesc` |
|
||||
| `cudaFuncGetAttributes` | |
|
||||
| `cudaFuncSetCacheConfig` | |
|
||||
| `cudaGetSymbolAddress` | |
|
||||
| `cudaGetSymbolSize` | |
|
||||
| `cudaGetTextureAlignmentOffset` | |
|
||||
| `cudaLaunch` | |
|
||||
| `cudaLaunchKernel` | |
|
||||
| `cudaMallocHost` | |
|
||||
| `cudaMallocManaged` | |
|
||||
| `cudaMemcpyFromSymbol` | |
|
||||
| `cudaMemcpyFromSymbolAsync` | |
|
||||
| `cudaMemcpyToSymbol` | |
|
||||
| `cudaMemcpyToSymbolAsync` | |
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|------------------------------------------------|
|
||||
| `cudaBindSurfaceToArray` | |
|
||||
| `cudaBindTexture` | `hipBindTexture` |
|
||||
| `cudaBindTexture2D` | |
|
||||
| `cudaBindTextureToArray` | |
|
||||
| `cudaBindTextureToMipmappedArray` | |
|
||||
| `cudaCreateChannelDesc` | `hipCreateChannelDesc` |
|
||||
| `cudaFuncGetAttributes` | |
|
||||
| `cudaFuncSetCacheConfig` | |
|
||||
| `cudaGetSymbolAddress` | |
|
||||
| `cudaGetSymbolSize` | |
|
||||
| `cudaGetTextureAlignmentOffset` | |
|
||||
| `cudaLaunch` | |
|
||||
| `cudaLaunchKernel` | |
|
||||
| `cudaMallocHost` | |
|
||||
| `cudaMallocManaged` | |
|
||||
| `cudaMemcpyFromSymbol` | |
|
||||
| `cudaMemcpyFromSymbolAsync` | |
|
||||
| `cudaMemcpyToSymbol` | |
|
||||
| `cudaMemcpyToSymbolAsync` | |
|
||||
| `cudaOccupancyMaxActiveBlocksPerMultiprocessor` | `hipOccupancyMaxActiveBlocksPerMultiprocessor` |
|
||||
| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | |
|
||||
| `cudaOccupancyMaxPotentialBlockSize` | `hipOccupancyMaxPotentialBlockSize` |
|
||||
| `cudaOccupancyMaxPotentialBlockSizeVariableSMem` | |
|
||||
| `cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags` | |
|
||||
| `cudaOccupancyMaxPotentialBlockSizeWithFlags` | |
|
||||
| `cudaSetupArgument` | |
|
||||
| `cudaStreamAttachMemAsync` | |
|
||||
| `cudaUnbindTexture` | `hipUnbindTexture` |
|
||||
| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | |
|
||||
| `cudaOccupancyMaxPotentialBlockSize` | `hipOccupancyMaxPotentialBlockSize` |
|
||||
| `cudaOccupancyMaxPotentialBlockSizeVariableSMem` | |
|
||||
| `cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags` | |
|
||||
| `cudaOccupancyMaxPotentialBlockSizeWithFlags` | |
|
||||
| `cudaSetupArgument` | |
|
||||
| `cudaStreamAttachMemAsync` | |
|
||||
| `cudaUnbindTexture` | `hipUnbindTexture` |
|
||||
|
||||
## **30. Profiler Control**
|
||||
|
||||
@@ -395,8 +395,8 @@
|
||||
|
||||
## **31. Data types**
|
||||
|
||||
| **type** | **CUDA** | **HIP** |
|
||||
|-------------:|-----------------------------------------------|------------------------------------------------------|
|
||||
| **type** | **CUDA** | **HIP** |**HIP value** (if differs) |
|
||||
|-------------:|-----------------------------------------------|------------------------------------------------------|---------------------------|
|
||||
| struct | `cudaChannelFormatDesc` | `hipChannelFormatDesc` |
|
||||
| struct | `cudaDeviceProp` | `hipDeviceProp_t` |
|
||||
| struct | `cudaExtent` | `hipExtent` |
|
||||
@@ -790,3 +790,19 @@
|
||||
| define | `cudaTextureType1DLayered` | `hipTextureType1DLayered` |
|
||||
| define | `cudaTextureType2DLayered` | `hipTextureType2DLayered` |
|
||||
| define | `cudaTextureTypeCubemapLayered` | `hipTextureTypeCubemapLayered` |
|
||||
| enum |***`cudaDataType_t`*** |***`hipblasDatatype_t`*** |
|
||||
| enum |***`cudaDataType`*** |***`hipblasDatatype_t`*** |
|
||||
| 2 |*`CUDA_R_16F`* |*`HIPBLAS_R_16F`* | 150 |
|
||||
| 6 |*`CUDA_C_16F`* |*`HIPBLAS_C_16F`* | 153 |
|
||||
| 0 |*`CUDA_R_32F`* |*`HIPBLAS_R_32F`* | 151 |
|
||||
| 4 |*`CUDA_C_32F`* |*`HIPBLAS_C_32F`* | 154 |
|
||||
| 1 |*`CUDA_R_64F`* |*`HIPBLAS_R_64F`* | 152 |
|
||||
| 5 |*`CUDA_C_64F`* |*`HIPBLAS_C_64F`* | 155 |
|
||||
| 3 |*`CUDA_R_8I`* | |
|
||||
| 7 |*`CUDA_C_8I`* | |
|
||||
| 8 |*`CUDA_R_8U`* | |
|
||||
| 9 |*`CUDA_C_8U`* | |
|
||||
| 10 |*`CUDA_R_32I`* | |
|
||||
| 11 |*`CUDA_C_32I`* | |
|
||||
| 12 |*`CUDA_R_32U`* | |
|
||||
| 13 |*`CUDA_C_32U`* | |
|
||||
|
||||
@@ -239,8 +239,8 @@
|
||||
|`cudnnSetTensor` |`hipdnnSetTensor` |
|
||||
|`cudnnScaleTensor` |`hipdnnScaleTensor` |
|
||||
|`cudnnCreateFilterDescriptor` |`hipdnnCreateFilterDescriptor` |
|
||||
|`cudnnSetFilter4dDescriptor` | |
|
||||
|`cudnnGetFilter4dDescriptor` | |
|
||||
|`cudnnSetFilter4dDescriptor` |`hipdnnSetFilter4dDescriptor` |
|
||||
|`cudnnGetFilter4dDescriptor` |`hipdnnGetFilter4dDescriptor` |
|
||||
|`cudnnSetFilterNdDescriptor` |`hipdnnSetFilterNdDescriptor` |
|
||||
|`cudnnGetFilterNdDescriptor` |`hipdnnGetFilterNdDescriptor` |
|
||||
|`cudnnDestroyFilterDescriptor` |`hipdnnDestroyFilterDescriptor` |
|
||||
|
||||
@@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m
|
||||
|
||||
Following environment variables can be used to control the transfer thresholds:
|
||||
|
||||
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
|
||||
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
|
||||
|
||||
- HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.
|
||||
|
||||
- HCC_D2H_PININPLACE_THRESHOLD - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
|
||||
|
||||
## Device-Side Malloc
|
||||
|
||||
hip-hcc and hip-clang supports device-side malloc and free. Users can allocate
|
||||
memory dynamically in a kernel. The allocated memory are in global address
|
||||
space, however, different threads get different memory allocations for the same
|
||||
call of malloc. The allocated memory can be accessed or freed by other threads
|
||||
or other kernels. It persists in the life time of the HIP program until it is
|
||||
freed.
|
||||
|
||||
The memory are allocated in pages. Users can define macro
|
||||
`__HIP_SIZE_OF_PAGE` for controlling the page size in bytes and macro
|
||||
`__HIP_NUM_PAGES` for controlling the total number of pages that can be
|
||||
allocated.
|
||||
@@ -48,9 +48,6 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
|
||||
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
|
||||
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
|
||||
|
||||
if(HIP_COMPILER STREQUAL "hcc")
|
||||
find_dependency(hcc)
|
||||
endif()
|
||||
include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
|
||||
|
||||
set( hip_LIBRARIES hip::host hip::device)
|
||||
@@ -0,0 +1,65 @@
|
||||
@PACKAGE_INIT@
|
||||
|
||||
include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND)
|
||||
if (NOT _CMakeFindDependencyMacro_FOUND)
|
||||
macro(find_dependency dep)
|
||||
if (NOT ${dep}_FOUND)
|
||||
set(cmake_fd_version)
|
||||
if (${ARGC} GREATER 1)
|
||||
set(cmake_fd_version ${ARGV1})
|
||||
endif()
|
||||
set(cmake_fd_exact_arg)
|
||||
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT)
|
||||
set(cmake_fd_exact_arg EXACT)
|
||||
endif()
|
||||
set(cmake_fd_quiet_arg)
|
||||
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
|
||||
set(cmake_fd_quiet_arg QUIET)
|
||||
endif()
|
||||
set(cmake_fd_required_arg)
|
||||
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
|
||||
set(cmake_fd_required_arg REQUIRED)
|
||||
endif()
|
||||
find_package(${dep} ${cmake_fd_version}
|
||||
${cmake_fd_exact_arg}
|
||||
${cmake_fd_quiet_arg}
|
||||
${cmake_fd_required_arg}
|
||||
)
|
||||
string(TOUPPER ${dep} cmake_dep_upper)
|
||||
if (NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.")
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False)
|
||||
return()
|
||||
endif()
|
||||
set(cmake_fd_version)
|
||||
set(cmake_fd_required_arg)
|
||||
set(cmake_fd_quiet_arg)
|
||||
set(cmake_fd_exact_arg)
|
||||
endif()
|
||||
endmacro()
|
||||
endif()
|
||||
|
||||
|
||||
set_and_check( hip_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" )
|
||||
set_and_check( hip_INCLUDE_DIRS "${hip_INCLUDE_DIR}" )
|
||||
set_and_check( hip_LIB_INSTALL_DIR "@PACKAGE_LIB_INSTALL_DIR@" )
|
||||
set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
|
||||
|
||||
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
|
||||
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
|
||||
|
||||
find_dependency(hcc)
|
||||
include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
|
||||
|
||||
set( hip_LIBRARIES hip::host hip::device)
|
||||
set( hip_LIBRARY ${hip_LIBRARIES})
|
||||
|
||||
set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR})
|
||||
set(HIP_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
|
||||
set(HIP_LIB_INSTALL_DIR ${hip_LIB_INSTALL_DIR})
|
||||
set(HIP_BIN_INSTALL_DIR ${hip_BIN_INSTALL_DIR})
|
||||
set(HIP_LIBRARIES ${hip_LIBRARIES})
|
||||
set(HIP_LIBRARY ${hip_LIBRARY})
|
||||
set(HIP_HIPCC_EXECUTABLE ${hip_HIPCC_EXECUTABLE})
|
||||
set(HIP_HIPCONFIG_EXECUTABLE ${hip_HIPCONFIG_EXECUTABLE})
|
||||
|
||||
@@ -89,7 +89,8 @@ if (HIPIFY_CLANG_TESTS)
|
||||
if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
|
||||
(CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
|
||||
(CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
|
||||
(CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0"))
|
||||
(CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR
|
||||
CUDA_VERSION VERSION_EQUAL "10.0")
|
||||
message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
|
||||
if (CUDA_VERSION VERSION_LESS "7.0")
|
||||
message(STATUS "Please install CUDA 7.0 or higher.")
|
||||
|
||||
@@ -30,11 +30,16 @@
|
||||
## <a name="dependencies"></a> Dependencies
|
||||
|
||||
`hipify-clang` requires:
|
||||
1. LLVM+CLANG of at least version 3.8.0, latest stable and recommended release is 6.0.1.
|
||||
2. CUDA at least version 7.5, latest supported release is 9.0.
|
||||
1. LLVM+CLANG of at least version 3.8.0, latest stable and recommended releases:
|
||||
|
||||
6.0.1 (linux and windows),
|
||||
|
||||
| **LLVM release version** | **CUDA latest supported version** |
|
||||
|:------------------------:|:---------------------------------:|
|
||||
7.0.0 (linux only).
|
||||
|
||||
2. CUDA at least version 7.5, latest supported release is 9.2.
|
||||
|
||||
| **LLVM release version** | **CUDA latest supported version** | **Comments** |
|
||||
|:------------------------:|:---------------------------------:|:------------:|
|
||||
| 3.8.0 | 7.5 |
|
||||
| 3.8.1 | 7.5 |
|
||||
| 3.9.0 | 7.5 |
|
||||
@@ -46,6 +51,8 @@
|
||||
| 5.0.2 | 8.0 |
|
||||
| 6.0.0 | 9.0 |
|
||||
| 6.0.1 | 9.0 |
|
||||
| 7.0.0 | 9.2 | linux only |
|
||||
| | 10.0 | not yet supported |
|
||||
|
||||
In most cases, you can get a suitable version of LLVM+CLANG with your package manager.
|
||||
|
||||
|
||||
@@ -1417,22 +1417,23 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
|
||||
|
||||
/////////////////////////////// CUDA RT API ///////////////////////////////
|
||||
// Data types
|
||||
{"cudaDataType_t", {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"cudaDataType", {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_16F", {"hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_16F", {"hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_32F", {"hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_32F", {"hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_64F", {"hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_64F", {"hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_8I", {"hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_8I", {"hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_8U", {"hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_8U", {"hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_32I", {"hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_32I", {"hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_R_32U", {"hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
{"CUDA_C_32U", {"hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},
|
||||
// TODO: rename hipblasDatatype_t to hipDataType_t and move from hipBLAS to HIP
|
||||
{"cudaDataType_t", {"hipblasDatatype_t", CONV_TYPE, API_RUNTIME}},
|
||||
{"cudaDataType", {"hipblasDatatype_t", CONV_TYPE, API_RUNTIME}},
|
||||
{"CUDA_R_16F", {"HIPBLAS_R_16F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 2 // 150
|
||||
{"CUDA_C_16F", {"HIPBLAS_C_16F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 6 // 153
|
||||
{"CUDA_R_32F", {"HIPBLAS_R_32F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 0 // 151
|
||||
{"CUDA_C_32F", {"HIPBLAS_C_32F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 4 // 154
|
||||
{"CUDA_R_64F", {"HIPBLAS_R_64F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 1 // 152
|
||||
{"CUDA_C_64F", {"HIPBLAS_C_64F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 5 // 155
|
||||
{"CUDA_R_8I", {"HIPBLAS_R_8I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 3 //
|
||||
{"CUDA_C_8I", {"HIPBLAS_C_8I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 7 //
|
||||
{"CUDA_R_8U", {"HIPBLAS_R_8U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 8 //
|
||||
{"CUDA_C_8U", {"HIPBLAS_C_8U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 9 //
|
||||
{"CUDA_R_32I", {"HIPBLAS_R_32I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 10 //
|
||||
{"CUDA_C_32I", {"HIPBLAS_C_32I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 11 //
|
||||
{"CUDA_R_32U", {"HIPBLAS_R_32U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 12 //
|
||||
{"CUDA_C_32U", {"HIPBLAS_C_32U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 13 //
|
||||
|
||||
// Library property types
|
||||
// IMPORTANT: no cuda prefix
|
||||
@@ -2143,11 +2144,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
|
||||
{"CUBLAS_ATOMICS_ALLOWED", {"HIPBLAS_ATOMICS_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
|
||||
|
||||
// Blas Data Type
|
||||
{"cublasDataType_t", {"hipblasDataType_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"CUBLAS_DATA_FLOAT", {"HIPBLAS_DATA_FLOAT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"CUBLAS_DATA_DOUBLE", {"HIPBLAS_DATA_DOUBLE", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"CUBLAS_DATA_HALF", {"HIPBLAS_DATA_HALF", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"CUBLAS_DATA_INT8", {"HIPBLAS_DATA_INT8", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"cublasDataType_t", {"hipblasDatatype_t", CONV_TYPE, API_BLAS}},
|
||||
|
||||
// Blas Math mode/tensor operation
|
||||
{"cublasMath_t", {"hipblasMath_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}},
|
||||
@@ -2155,9 +2152,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
|
||||
{"CUBLAS_TENSOR_OP_MATH", {"HIPBLAS_TENSOR_OP_MATH", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}},
|
||||
|
||||
// Blass different GEMM algorithms
|
||||
{"cublasGemmAlgo_t", {"hipblasGemmAlgo_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"CUBLAS_GEMM_DFALT", {"HIPBLAS_GEMM_DFALT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // -1
|
||||
{"CUBLAS_GEMM_DEFAULT", {"HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // -1
|
||||
{"cublasGemmAlgo_t", {"hipblasGemmAlgo_t", CONV_TYPE, API_BLAS}},
|
||||
{"CUBLAS_GEMM_DFALT", {"HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS}}, // -1 // 160
|
||||
{"CUBLAS_GEMM_DEFAULT", {"HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS}}, // -1 // 160
|
||||
{"CUBLAS_GEMM_ALGO0", {"HIPBLAS_GEMM_ALGO0", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // 0
|
||||
{"CUBLAS_GEMM_ALGO1", {"HIPBLAS_GEMM_ALGO1", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // 1
|
||||
{"CUBLAS_GEMM_ALGO2", {"HIPBLAS_GEMM_ALGO2", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // 2
|
||||
@@ -2715,7 +2712,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
|
||||
|
||||
//IO in FP16 / FP32, computation in float
|
||||
{"cublasSgemmEx", {"hipblasSgemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"cublasGemmEx", {"hipblasGemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"cublasGemmEx", {"hipblasGemmEx", CONV_MATH_FUNC, API_BLAS}},
|
||||
{"cublasGemmBatchedEx", {"hipblasGemmBatchedEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
|
||||
{"cublasGemmStridedBatchedEx", {"hipblasGemmStridedBatchedEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}},
|
||||
// IO in Int8 complex/cuComplex, computation in cuComplex
|
||||
|
||||
@@ -27,11 +27,12 @@ THE SOFTWARE.
|
||||
#include "math_fwd.h"
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <stddef.h>
|
||||
|
||||
|
||||
#include <hip/hip_vector_types.h>
|
||||
#include <hip/hcc_detail/device_library_decls.h>
|
||||
#include <hip/hcc_detail/llvm_intrinsics.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/*
|
||||
Integer Intrinsics
|
||||
*/
|
||||
@@ -1036,4 +1037,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) {
|
||||
unsigned char val8 = static_cast<unsigned char>(val);
|
||||
return __hip_hc_memset(ptr, val8, size);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -29,7 +29,7 @@ THE SOFTWARE.
|
||||
#include <utility>
|
||||
#endif
|
||||
|
||||
#if defined(__clang__) && (__clang_major__ > 3)
|
||||
#if defined(__clang__) && (__clang_major__ > 5)
|
||||
typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
|
||||
|
||||
struct __half_raw {
|
||||
|
||||
@@ -26,6 +26,8 @@ THE SOFTWARE.
|
||||
// Half Math Functions
|
||||
// */
|
||||
|
||||
#include "host_defines.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
|
||||
|
||||
// Implementation of malloc and free device functions.
|
||||
// HIP heap is implemented as a global array with fixed size. Users may define
|
||||
// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap.
|
||||
|
||||
// Size of page in bytes.
|
||||
#ifndef __HIP_SIZE_OF_PAGE
|
||||
#define __HIP_SIZE_OF_PAGE 64
|
||||
#endif
|
||||
|
||||
// Total number of pages
|
||||
#ifndef __HIP_NUM_PAGES
|
||||
#define __HIP_NUM_PAGES (16 * 64 * 64)
|
||||
#endif
|
||||
|
||||
#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE)
|
||||
|
||||
#if __HCC__ || __HIP__
|
||||
|
||||
__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];
|
||||
__attribute__((weak)) __device__
|
||||
uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
|
||||
|
||||
extern "C" inline __device__ void* __hip_malloc(size_t size) {
|
||||
char* heap = (char*)__hip_device_heap;
|
||||
if (size > __HIP_SIZE_OF_HEAP) {
|
||||
return (void*)nullptr;
|
||||
}
|
||||
uint32_t totalThreads =
|
||||
hipBlockDim_x * hipGridDim_x * hipBlockDim_y
|
||||
* hipGridDim_y * hipBlockDim_z * hipGridDim_z;
|
||||
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
|
||||
+ (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
|
||||
+ (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
|
||||
* hipBlockDim_y;
|
||||
|
||||
uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads;
|
||||
uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads;
|
||||
|
||||
uint32_t stride = size / __HIP_SIZE_OF_PAGE;
|
||||
uint32_t start = numHeapsPerWorkItem * currentWorkItem;
|
||||
|
||||
uint32_t k = 0;
|
||||
|
||||
while (__hip_device_page_flag[k] > 0) {
|
||||
k++;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < stride - 1; i++) {
|
||||
__hip_device_page_flag[i + start + k] = 1;
|
||||
}
|
||||
|
||||
__hip_device_page_flag[start + stride - 1 + k] = 2;
|
||||
|
||||
void* ptr = (void*)(heap
|
||||
+ heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
extern "C" inline __device__ void* __hip_free(void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap;
|
||||
uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE;
|
||||
|
||||
while (__hip_device_page_flag[offsetPage] != 0) {
|
||||
if (__hip_device_page_flag[offsetPage] == 2) {
|
||||
__hip_device_page_flag[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
break;
|
||||
} else {
|
||||
__hip_device_page_flag[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
|
||||
@@ -128,7 +128,7 @@ extern int HIP_TRACE_API;
|
||||
|
||||
|
||||
// Feature tests:
|
||||
#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
|
||||
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
|
||||
// Device compile and not host compile:
|
||||
|
||||
// 32-bit Atomics:
|
||||
@@ -245,11 +245,11 @@ static constexpr Coordinates<hc_get_workitem_id> threadIdx;
|
||||
|
||||
#endif // defined __HCC__
|
||||
#if __HCC_OR_HIP_CLANG__
|
||||
extern "C" __device__ void* __hip_hc_malloc(size_t);
|
||||
extern "C" __device__ void* __hip_hc_free(void* ptr);
|
||||
extern "C" __device__ void* __hip_malloc(size_t);
|
||||
extern "C" __device__ void* __hip_free(void* ptr);
|
||||
|
||||
static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); }
|
||||
static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); }
|
||||
static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
|
||||
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
|
||||
|
||||
#if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
|
||||
template <typename... All>
|
||||
@@ -331,13 +331,13 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri
|
||||
typedef int hipLaunchParm;
|
||||
|
||||
template <typename... Args, typename F = void (*)(Args...)>
|
||||
inline void hipLaunchKernelGGL(F kernelName, const dim3& numblocks, const dim3& numthreads,
|
||||
inline void hipLaunchKernelGGL(F&& kernelName, const dim3& numblocks, const dim3& numthreads,
|
||||
unsigned memperblock, hipStream_t streamId, Args... args) {
|
||||
kernelName<<<numblocks, numthreads, memperblock, streamId>>>(args...);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
|
||||
inline void hipLaunchKernel(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
inline void hipLaunchKernel(F&& kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t groupMemBytes, hipStream_t stream, Args... args) {
|
||||
hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, groupMemBytes, stream, hipLaunchParm{},
|
||||
std::move(args)...);
|
||||
@@ -423,6 +423,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
|
||||
#define hipGridDim_y gridDim.y
|
||||
#define hipGridDim_z gridDim.z
|
||||
|
||||
#include <hip/hcc_detail/math_functions.h>
|
||||
|
||||
#if __HIP_HCC_COMPAT_MODE__
|
||||
// Define HCC work item functions in terms of HIP builtin variables.
|
||||
#pragma push_macro("__DEFINE_HCC_FUNC")
|
||||
@@ -462,7 +464,6 @@ hc_get_workitem_absolute_id(int dim)
|
||||
#undef __CUDA__
|
||||
#pragma pop_macro("__CUDA__")
|
||||
|
||||
#include <hip/hcc_detail/math_functions.h>
|
||||
|
||||
hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
@@ -474,4 +475,6 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
|
||||
#endif // defined(__clang__) && defined(__HIP__)
|
||||
|
||||
#include <hip/hcc_detail/hip_memory.h>
|
||||
|
||||
#endif // HIP_HCC_DETAIL_RUNTIME_H
|
||||
|
||||
@@ -22,6 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "hip_fp16_math_fwd.h"
|
||||
#include "math_fwd.h"
|
||||
|
||||
#include <hip/hcc_detail/host_defines.h>
|
||||
@@ -1159,6 +1160,16 @@ long long llabs(long long x)
|
||||
#endif
|
||||
// END INTEGER
|
||||
|
||||
__DEVICE__
|
||||
inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) {
|
||||
return __ocml_fma_f16(x, y, z);
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline float fma(float x, float y, float z) {
|
||||
return fmaf(x, y, z);
|
||||
}
|
||||
|
||||
#pragma push_macro("__DEF_FLOAT_FUN")
|
||||
#pragma push_macro("__DEF_FLOAT_FUN2")
|
||||
#pragma push_macro("__DEF_FLOAT_FUN2I")
|
||||
@@ -1352,10 +1363,10 @@ __DEVICE__ inline static unsigned long long max(long long arg1, unsigned long lo
|
||||
return max((unsigned long long) arg1, arg2);
|
||||
}*/
|
||||
#else
|
||||
__DEVICE__ inline static int min(int arg1, int arg2) {
|
||||
__DEVICE__ inline int min(int arg1, int arg2) {
|
||||
return (arg1 < arg2) ? arg1 : arg2;
|
||||
}
|
||||
__DEVICE__ inline static int max(int arg1, int arg2) {
|
||||
__DEVICE__ inline int max(int arg1, int arg2) {
|
||||
return (arg1 > arg2) ? arg1 : arg2;
|
||||
}
|
||||
|
||||
|
||||
@@ -118,6 +118,9 @@ typedef int hipLaunchParm;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define __clock() clock()
|
||||
#define __clock64() clock64()
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -25,7 +25,7 @@ popd () {
|
||||
function setupENV()
|
||||
{
|
||||
sudo apt-get update
|
||||
sudo apt-get install dpkg-dev rpm doxygen libelf-dev
|
||||
sudo apt-get install dpkg-dev rpm doxygen libelf-dev rename
|
||||
}
|
||||
|
||||
function buildHIP()
|
||||
|
||||
@@ -27,70 +27,6 @@ THE SOFTWARE.
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <atomic>
|
||||
|
||||
//=================================================================================================
|
||||
/*
|
||||
Implementation of malloc and free device functions.
|
||||
|
||||
This is the best place to put them because the device
|
||||
global variables need to be initialized at the start.
|
||||
*/
|
||||
__device__ char gpuHeap[SIZE_OF_HEAP];
|
||||
__device__ uint32_t gpuFlags[NUM_PAGES];
|
||||
|
||||
__device__ void* __hip_hc_malloc(size_t size) {
|
||||
char* heap = (char*)gpuHeap;
|
||||
if (size > SIZE_OF_HEAP) {
|
||||
return (void*)nullptr;
|
||||
}
|
||||
uint32_t totalThreads =
|
||||
blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
|
||||
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
|
||||
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
|
||||
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
|
||||
|
||||
uint32_t stride = size / SIZE_OF_PAGE;
|
||||
uint32_t start = numHeapsPerWorkItem * currentWorkItem;
|
||||
|
||||
uint32_t k = 0;
|
||||
|
||||
while (gpuFlags[k] > 0) {
|
||||
k++;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < stride - 1; i++) {
|
||||
gpuFlags[i + start + k] = 1;
|
||||
}
|
||||
|
||||
gpuFlags[start + stride - 1 + k] = 2;
|
||||
|
||||
void* ptr = (void*)(heap + heapSizePerWorkItem * currentWorkItem + k * SIZE_OF_PAGE);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
__device__ void* __hip_hc_free(void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
uint32_t offsetByte = (uint64_t)ptr - (uint64_t)gpuHeap;
|
||||
uint32_t offsetPage = offsetByte / SIZE_OF_PAGE;
|
||||
|
||||
while (gpuFlags[offsetPage] != 0) {
|
||||
if (gpuFlags[offsetPage] == 2) {
|
||||
gpuFlags[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
break;
|
||||
} else {
|
||||
gpuFlags[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// abort
|
||||
__device__ void abort() { return hc::abort(); }
|
||||
|
||||
|
||||
@@ -29,14 +29,6 @@ THE SOFTWARE.
|
||||
Heap size computation for malloc and free device functions.
|
||||
*/
|
||||
|
||||
#define NUM_PAGES_PER_THREAD 16
|
||||
#define SIZE_OF_PAGE 64
|
||||
#define NUM_THREADS_PER_CU 64
|
||||
#define NUM_CUS_PER_GPU 64 // Specific for r9 Nano
|
||||
#define NUM_PAGES NUM_PAGES_PER_THREAD* NUM_THREADS_PER_CU* NUM_CUS_PER_GPU
|
||||
#define SIZE_MALLOC NUM_PAGES* SIZE_OF_PAGE
|
||||
#define SIZE_OF_HEAP SIZE_MALLOC
|
||||
|
||||
#define HIP_SQRT_2 1.41421356237
|
||||
#define HIP_SQRT_PI 1.77245385091
|
||||
|
||||
@@ -62,9 +54,6 @@ THE SOFTWARE.
|
||||
|
||||
#define HIP_PI 3.14159265358979323846
|
||||
|
||||
__device__ void* __hip_hc_malloc(size_t size);
|
||||
__device__ void* __hip_hc_free(void* ptr);
|
||||
|
||||
__device__ float __hip_erfinvf(float x);
|
||||
__device__ double __hip_erfinv(double x);
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
if (limit == hipLimitMallocHeapSize) {
|
||||
*pValue = (size_t)SIZE_OF_HEAP;
|
||||
*pValue = (size_t)__HIP_SIZE_OF_HEAP;
|
||||
return ihipLogStatus(hipSuccess);
|
||||
} else {
|
||||
return ihipLogStatus(hipErrorUnsupportedLimit);
|
||||
|
||||
@@ -1543,6 +1543,8 @@ hipError_t ihipMemset(void* dst, int value, size_t sizeBytes, hipStream_t strea
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
if (sizeBytes == 0) return e;
|
||||
|
||||
if (stream && (dst != NULL)) {
|
||||
if(copyDataType == ihipMemsetDataTypeChar){
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
|
||||
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s NVCC_OPTIONS -std=c++11
|
||||
* RUN: %t EXCLUDE_HIP_PLATFORM nvcc
|
||||
* HIT_END
|
||||
*/
|
||||
#include "test_common.h"
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
// Tolerance for error
|
||||
const double tolerance = 1e-6;
|
||||
const bool verbose = false;
|
||||
|
||||
#define BLKDIM_X 64
|
||||
#define BLKDIM_Y 1
|
||||
#define BLKDIM_Z 1
|
||||
#define NUM_BLK_X 1
|
||||
#define NUM_BLK_Y 1
|
||||
#define NUM_BLK_Z 1
|
||||
|
||||
#define LEN (BLKDIM_X * BLKDIM_Y * BLKDIM_Z * NUM_BLK_X * NUM_BLK_Y * NUM_BLK_Z)
|
||||
|
||||
#define ALL_FUN \
|
||||
OP(add) \
|
||||
OP(sub) \
|
||||
OP(mul) \
|
||||
OP(div)
|
||||
|
||||
#define OP(x) CK_##x,
|
||||
enum CalcKind {
|
||||
ALL_FUN
|
||||
};
|
||||
#undef OP
|
||||
|
||||
#define OP(x) case CK_##x: return #x;
|
||||
std::string getName(enum CalcKind CK) {
|
||||
switch(CK){
|
||||
ALL_FUN
|
||||
}
|
||||
}
|
||||
#undef OP
|
||||
|
||||
// Calculates function.
|
||||
// If the function has one argument, B is ignored.
|
||||
#define ONE_ARG(func) \
|
||||
case CK_##func: \
|
||||
return std::func(A);
|
||||
|
||||
template <typename FloatT>
|
||||
__device__ __host__ FloatT calc(FloatT A, FloatT B, enum CalcKind CK) {
|
||||
switch (CK) {
|
||||
case CK_add:
|
||||
return A + B;
|
||||
case CK_sub:
|
||||
return A - B;
|
||||
case CK_mul:
|
||||
return A * B;
|
||||
case CK_div:
|
||||
return A / B;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate memory in kernel and save the address to pA and pB.
|
||||
// Copy value from A, B to allocated memory.
|
||||
template <typename FloatT>
|
||||
__global__ void kernel_alloc(FloatT* A, FloatT* B, FloatT** pA, FloatT** pB) {
|
||||
int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
|
||||
+ (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
|
||||
+ (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
|
||||
* hipBlockDim_y;
|
||||
if (tx == 0) {
|
||||
*pA = (FloatT*)malloc(sizeof(FloatT) * LEN);
|
||||
*pB = (FloatT*)malloc(sizeof(FloatT) * LEN);
|
||||
for (int i = 0; i < LEN; i++) {
|
||||
(*pA)[i] = A[i];
|
||||
(*pB)[i] = B[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Do calculation using values saved in allocated memmory. pA, pB are buffers
|
||||
// containing the address of the device-side allocated array.
|
||||
template <typename FloatT>
|
||||
__global__ void kernel_free(FloatT** pA, FloatT** pB, FloatT* C, enum CalcKind CK) {
|
||||
int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x
|
||||
+ (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x
|
||||
+ (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x
|
||||
* hipBlockDim_y;
|
||||
C[tx] = calc<FloatT>((*pA)[tx], (*pB)[tx], CK);
|
||||
if (tx == 0) {
|
||||
free(*pA);
|
||||
free(*pB);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename FloatT>
|
||||
void test() {
|
||||
FloatT *A, *Ad, *B, *Bd, *C, *Cd, *D;
|
||||
A = new FloatT[LEN];
|
||||
B = new FloatT[LEN];
|
||||
C = new FloatT[LEN];
|
||||
D = new FloatT[LEN];
|
||||
hipMalloc((void**)&Ad, sizeof(FloatT) * LEN);
|
||||
hipMalloc((void**)&Bd, sizeof(FloatT) * LEN);
|
||||
hipMalloc((void**)&Cd, sizeof(FloatT) * LEN);
|
||||
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
A[i] = (i + 1) * 1.0f;
|
||||
B[i] = A[i];
|
||||
C[i] = A[i];
|
||||
}
|
||||
hipMemcpy(Ad, A, sizeof(FloatT) * LEN, hipMemcpyHostToDevice);
|
||||
hipMemcpy(Bd, B, sizeof(FloatT) * LEN, hipMemcpyHostToDevice);
|
||||
|
||||
// Run kernel for a calculation kind and verify by comparing with host
|
||||
// calculation result. Returns false if fails.
|
||||
auto test_fun = [&](enum CalcKind CK) {
|
||||
// kernel_alloc allocates memory on device side and initialize it.
|
||||
// kernel_free uses allocated memory from kernel_alloc and does the
|
||||
// calculation then free the memory.
|
||||
// pA and pB are buffers to pass the device-side allocated memory address
|
||||
// from kernel_alloc to kernel_free.
|
||||
FloatT **pA, **pB;
|
||||
hipMalloc((FloatT***)&pA, sizeof(FloatT*));
|
||||
hipMalloc((FloatT***)&pB, sizeof(FloatT*));
|
||||
dim3 blkDim(BLKDIM_X, BLKDIM_Y, BLKDIM_Z);
|
||||
dim3 numBlk(NUM_BLK_X, NUM_BLK_Y, NUM_BLK_Z);
|
||||
hipLaunchKernelGGL(kernel_alloc<FloatT>, numBlk, blkDim, 0, 0,
|
||||
Ad, Bd, pA, pB);
|
||||
hipDeviceSynchronize();
|
||||
hipLaunchKernelGGL(kernel_free<FloatT>, numBlk, blkDim, 0, 0,
|
||||
pA, pB, Cd, CK);
|
||||
hipMemcpy(C, Cd, sizeof(FloatT) * LEN, hipMemcpyDeviceToHost);
|
||||
hipFree(pA);
|
||||
hipFree(pB);
|
||||
for (int i = 0; i < LEN; i++) {
|
||||
FloatT Expected = calc(A[i], B[i], CK);
|
||||
FloatT error = std::abs(C[i] - Expected);
|
||||
if (std::abs(Expected) > tolerance) error /= std::abs(Expected);
|
||||
bool pass = error < tolerance;
|
||||
if (verbose || !pass) {
|
||||
std::cout << "Function: " << getName(CK) << " Operands: " << A[i] << " " << B[i]
|
||||
<< " Result: " << C[i] << " Expected: " << Expected << " Error: " << error
|
||||
<< " Pass: " << pass << std::endl;
|
||||
}
|
||||
if (!pass)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
#define OP(x) assert(test_fun(CK_##x));
|
||||
ALL_FUN
|
||||
#undef OP
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
delete[] C;
|
||||
delete[] D;
|
||||
}
|
||||
|
||||
int main() {
|
||||
test<float>();
|
||||
test<double>();
|
||||
passed();
|
||||
return 0;
|
||||
}
|
||||
@@ -40,10 +40,10 @@ __global__ void floatMath(hipLaunchParm lp, float* In, float* Out) {
|
||||
Out[tid] = __exp10f(Out[tid]);
|
||||
Out[tid] = __expf(Out[tid]);
|
||||
Out[tid] = __frsqrt_rn(Out[tid]);
|
||||
Out[tid] = __fsqrt_rd(Out[tid]);
|
||||
Out[tid] = __fsqrt_rn(Out[tid]);
|
||||
Out[tid] = __fsqrt_ru(Out[tid]);
|
||||
Out[tid] = __fsqrt_rz(Out[tid]);
|
||||
//Out[tid] = __fsqrt_rd(Out[tid]);
|
||||
//Out[tid] = __fsqrt_rn(Out[tid]);
|
||||
//Out[tid] = __fsqrt_ru(Out[tid]);
|
||||
//Out[tid] = __fsqrt_rz(Out[tid]);
|
||||
Out[tid] = __log10f(Out[tid]);
|
||||
Out[tid] = __log2f(Out[tid]);
|
||||
Out[tid] = __logf(Out[tid]);
|
||||
|
||||
@@ -18,7 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 --gpu-architecture=sm_60
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -334,4 +334,4 @@ int main(int argc, char** argv) {
|
||||
hipDeviceReset();
|
||||
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
|
||||
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,8 +33,6 @@ THE SOFTWARE.
|
||||
#define LEN 512
|
||||
#define SIZE 2048
|
||||
|
||||
struct TestClock {
|
||||
|
||||
static __global__ void kernel1(int* Ad) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = clock() + clock64() + __clock() + __clock64();
|
||||
@@ -61,9 +59,8 @@ struct TestClock {
|
||||
assert(0 != A[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
int main() {
|
||||
TestClock().run();
|
||||
run();
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/math_functions.h>
|
||||
#include <iostream>
|
||||
|
||||
#define HIP_ASSERT(status) assert(status == hipSuccess)
|
||||
|
||||
#define LEN 50
|
||||
#define SIZE (LEN * sizeof(bool))
|
||||
|
||||
__global__ void kernelTestFMA(bool *Ad) {
|
||||
float f = 1.0f / 3.0f;
|
||||
double d = f;
|
||||
int i = 0;
|
||||
auto Check = [&](bool Cond) { Ad[i++] = Cond; };
|
||||
// f * f + 3.0f will be different if promoted to double.
|
||||
float floatResult = fma(f, f, 3.0f);
|
||||
double doubleResult = fma(d, d, 3.0);
|
||||
Check(floatResult != doubleResult);
|
||||
|
||||
// check promote to float.
|
||||
Check(fma(f, f, 3) == floatResult);
|
||||
Check(fma(f, f, (char)3) == floatResult);
|
||||
Check(fma(f, f, (unsigned char)3) == floatResult);
|
||||
Check(fma(f, f, (short)3) == floatResult);
|
||||
Check(fma(f, f, (unsigned short)3) == floatResult);
|
||||
Check(fma(f, f, (int)3) == floatResult);
|
||||
Check(fma(f, f, (unsigned int)3) == floatResult);
|
||||
Check(fma(f, f, (long)3) == floatResult);
|
||||
Check(fma(f, f, (unsigned long)3) == floatResult);
|
||||
Check(fma(f, f, true) == fma(f, f, 1.0f));
|
||||
|
||||
// check promote to double.
|
||||
Check(fma(d, (double)f, 3) == doubleResult);
|
||||
Check(fma(d, (double)f, (char)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (unsigned char)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (short)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (unsigned short)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (int)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (unsigned int)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (long)3) == doubleResult);
|
||||
Check(fma(d, (double)f, (unsigned long)3) == doubleResult);
|
||||
Check(fma(d, (double)f, true) == fma((double)f, (double)f, 1.0));
|
||||
|
||||
while (i < LEN)
|
||||
Check(true);
|
||||
}
|
||||
|
||||
void runTestFMA() {
|
||||
bool *Ad;
|
||||
bool A[LEN];
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
A[i] = 0;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void **)&Ad, SIZE));
|
||||
hipLaunchKernelGGL(kernelTestFMA, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad);
|
||||
HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
|
||||
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
assert(A[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernelTestHalfFMA(bool *Ad) {
|
||||
_Float16 h = (_Float16)(1.0f/3.0f);
|
||||
float f = h;
|
||||
double d = f;
|
||||
int i = 0;
|
||||
auto Check = [&](bool Cond) { Ad[i++] = Cond; };
|
||||
// h * h + 3 will be different if promoted to float.
|
||||
_Float16 halfResult = fma(h, h, (_Float16)3);
|
||||
float floatResult = fma(f, f, 3.0f);
|
||||
double doubleResult = fma(d, d, 3.0);
|
||||
Check(halfResult != floatResult);
|
||||
Check(halfResult != doubleResult);
|
||||
|
||||
// check promote to half.
|
||||
Check(fma(h, h, 3) == halfResult);
|
||||
Check(fma(h, h, (char)3) == halfResult);
|
||||
Check(fma(h, h, (unsigned char)3) == halfResult);
|
||||
Check(fma(h, h, (short)3) == halfResult);
|
||||
Check(fma(h, h, (unsigned short)3) == halfResult);
|
||||
Check(fma(h, h, (int)3) == halfResult);
|
||||
Check(fma(h, h, (unsigned int)3) == halfResult);
|
||||
Check(fma(h, h, (long)3) == halfResult);
|
||||
Check(fma(h, h, (unsigned long)3) == halfResult);
|
||||
Check(fma(h, h, true) == fma(h, h, (_Float16)1));
|
||||
|
||||
while (i < LEN)
|
||||
Check(true);
|
||||
}
|
||||
|
||||
void runTestHalfFMA() {
|
||||
bool *Ad;
|
||||
bool A[LEN];
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
A[i] = 0;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void **)&Ad, SIZE));
|
||||
hipLaunchKernelGGL(kernelTestHalfFMA, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad);
|
||||
HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
|
||||
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
assert(A[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
runTestFMA();
|
||||
runTestHalfFMA();
|
||||
passed();
|
||||
}
|
||||
@@ -30,6 +30,7 @@ THE SOFTWARE.
|
||||
// Incorrect implementation causes compilation failure due to conflict
|
||||
// declartions.
|
||||
|
||||
#include <new>
|
||||
#include <hip/math_functions.h>
|
||||
|
||||
// Test __HIP_DEVICE_COMPILE__ is defined after math_functions.h
|
||||
@@ -45,14 +46,6 @@ __device__ __host__ inline void throw_std_bad_alloc()
|
||||
#endif
|
||||
}
|
||||
|
||||
// Test __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ and __HIP_ARCH_HAS_DYNAMIC_PARALLEL__
|
||||
// is defined. Eigen HIP/hcc/Half.h __ldg depends on this.
|
||||
#if !defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) || \
|
||||
!defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__)
|
||||
#error \
|
||||
"__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ or __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ not defined"
|
||||
#endif
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "test_common.h"
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ THE SOFTWARE.
|
||||
#define LEN 512
|
||||
#define SIZE 2048
|
||||
|
||||
struct TestPlacementNew {
|
||||
class A {
|
||||
public:
|
||||
__device__ A() {
|
||||
@@ -63,9 +62,8 @@ struct TestPlacementNew {
|
||||
assert(i == A[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
int main() {
|
||||
TestPlacementNew().run();
|
||||
run();
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -34,21 +34,26 @@ THE SOFTWARE.
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#define HIP_ASSERT(x) (assert((x) == hipSuccess))
|
||||
|
||||
#define LEN 512
|
||||
#define SIZE LEN << 2
|
||||
|
||||
#define TEST_DEBUG (0)
|
||||
|
||||
__global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d,
|
||||
float* sin_pd, float* cos_pd) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
sin_d[tid] = __sinf(In[tid]);
|
||||
cos_d[tid] = __cosf(In[tid]);
|
||||
tan_d[tid] = __tanf(In[tid]);
|
||||
__sincosf(In[tid], &sin_pd[tid], &cos_pd[tid]);
|
||||
sin_d[tid] = sinf(In[tid]);
|
||||
cos_d[tid] = cosf(In[tid]);
|
||||
tan_d[tid] = tanf(In[tid]);
|
||||
sincosf(In[tid], &sin_pd[tid], &cos_pd[tid]);
|
||||
}
|
||||
|
||||
int main() {
|
||||
float *In, *sin_h, *cos_h, *tan_h, *sin_ph, *cos_ph;
|
||||
float *In_d, *sin_d, *cos_d, *tan_d, *sin_pd, *cos_pd;
|
||||
int errors = 0;
|
||||
In = new float[LEN];
|
||||
sin_h = new float[LEN];
|
||||
cos_h = new float[LEN];
|
||||
@@ -63,14 +68,16 @@ int main() {
|
||||
sin_ph[i] = 0.0f;
|
||||
cos_ph[i] = 0.0f;
|
||||
}
|
||||
hipMalloc((void**)&In_d, SIZE);
|
||||
hipMalloc((void**)&sin_d, SIZE);
|
||||
hipMalloc((void**)&cos_d, SIZE);
|
||||
hipMalloc((void**)&tan_d, SIZE);
|
||||
hipMalloc((void**)&sin_pd, SIZE);
|
||||
hipMalloc((void**)&cos_pd, SIZE);
|
||||
HIP_ASSERT(hipMalloc((void**)&In_d, SIZE));
|
||||
HIP_ASSERT(hipMalloc((void**)&sin_d, SIZE));
|
||||
HIP_ASSERT(hipMalloc((void**)&cos_d, SIZE));
|
||||
HIP_ASSERT(hipMalloc((void**)&tan_d, SIZE));
|
||||
HIP_ASSERT(hipMalloc((void**)&sin_pd, SIZE));
|
||||
HIP_ASSERT(hipMalloc((void**)&cos_pd, SIZE));
|
||||
|
||||
hipMemcpy(In_d, In, SIZE, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, In_d, sin_d, cos_d, tan_d,
|
||||
hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
|
||||
In_d, sin_d, cos_d, tan_d,
|
||||
sin_pd, cos_pd);
|
||||
hipMemcpy(sin_h, sin_d, SIZE, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(cos_h, cos_d, SIZE, hipMemcpyDeviceToHost);
|
||||
@@ -79,8 +86,28 @@ int main() {
|
||||
hipMemcpy(cos_ph, cos_pd, SIZE, hipMemcpyDeviceToHost);
|
||||
for (int i = 0; i < LEN; i++) {
|
||||
if (sin_h[i] != sin_ph[i] || cos_h[i] != cos_ph[i] || tan_h[i] * cos_h[i] != sin_h[i]) {
|
||||
std::cout << "Failed!" << std::endl;
|
||||
errors++;
|
||||
#if TEST_DEBUG
|
||||
std::cout << "Check Failed!" << std::endl;
|
||||
std::cout << " sin_h: " << sin_h[i] << " sin_ph: " << sin_ph[i] << "\n"
|
||||
<< " cos_h: " << cos_h[i] << " cos_ph:" << cos_ph[i] << "\n"
|
||||
<< " tan_h * cos_h: " << tan_h[i] * cos_h[i] << " sin_h[i]: " << sin_h[i] << "\n";
|
||||
#endif
|
||||
}
|
||||
}
|
||||
passed();
|
||||
|
||||
HIP_ASSERT(hipFree(In_d));
|
||||
HIP_ASSERT(hipFree(sin_d));
|
||||
HIP_ASSERT(hipFree(cos_d));
|
||||
HIP_ASSERT(hipFree(tan_d));
|
||||
HIP_ASSERT(hipFree(sin_pd));
|
||||
HIP_ASSERT(hipFree(cos_pd));
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "hip_trig FAILED!" << std::endl;
|
||||
return -1;
|
||||
} else {
|
||||
std::cout << "hip_trig PASSED!" << std::endl;
|
||||
}
|
||||
return errors;
|
||||
}
|
||||
|
||||
@@ -38,7 +38,10 @@ int getDeviceNumber() {
|
||||
string str;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
if (!(in = popen("./directed_tests/hipEnvVar -c", "r"))) {
|
||||
return 1;
|
||||
// Check at same level
|
||||
if (!(in = popen("./hipEnvVar -c", "r"))) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
while (fgets(buff, 512, in) != NULL) {
|
||||
cout << buff;
|
||||
@@ -54,7 +57,11 @@ void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) {
|
||||
str += std::to_string(deviceID);
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
if (!(in = popen(str.c_str(), "r"))) {
|
||||
exit(1);
|
||||
// Check at same level
|
||||
if (!(in = popen("./hipEnvVar -d ", "r"))) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
while (fgets(pciBusID, 100, in) != NULL) {
|
||||
cout << pciBusID;
|
||||
|
||||
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <iostream>
|
||||
#include "test_common.h"
|
||||
|
||||
#define HIP_ASSERT(status) assert(status == hipSuccess)
|
||||
|
||||
#define LEN 512
|
||||
#define SIZE 2048
|
||||
|
||||
__constant__ int ConstantGlobalVar = 123;
|
||||
|
||||
static __global__ void kernel(int* Ad) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = ConstantGlobalVar;
|
||||
}
|
||||
|
||||
void runTestConstantGlobalVar() {
|
||||
int *A, *Ad;
|
||||
A = new int[LEN];
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
A[i] = 0;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
|
||||
hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
|
||||
HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
|
||||
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
assert(123 == A[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ int GlobalArray[LEN];
|
||||
|
||||
static __global__ void kernelWrite() {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
GlobalArray[tid] = tid;
|
||||
}
|
||||
static __global__ void kernelRead(int* Ad) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = GlobalArray[tid];
|
||||
}
|
||||
|
||||
void runTestGlobalArray() {
|
||||
int *A, *Ad;
|
||||
A = new int[LEN];
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
A[i] = 0;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
|
||||
hipLaunchKernelGGL(kernelWrite, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0);
|
||||
hipLaunchKernelGGL(kernelRead, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
|
||||
HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
|
||||
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
assert(i == A[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
runTestConstantGlobalVar();
|
||||
runTestGlobalArray();
|
||||
passed();
|
||||
}
|
||||
@@ -170,8 +170,8 @@ void runTests(int64_t numElements) {
|
||||
// for (int waitStart=0; waitStart<2; waitStart++) {
|
||||
for (int waitStart = 1; waitStart >= 0; waitStart--) {
|
||||
unsigned W = waitStart ? 0x1000 : 0;
|
||||
test(W | 0x01, C_d, C_h, numElements, 0, waitStart, syncNone);
|
||||
test(W | 0x02, C_d, C_h, numElements, stream, waitStart, syncNone);
|
||||
test(W | 0x01, C_d, C_h, numElements, 0, 0, syncNone);
|
||||
test(W | 0x02, C_d, C_h, numElements, stream, 0, syncNone);
|
||||
test(W | 0x04, C_d, C_h, numElements, 0, waitStart, syncStream);
|
||||
test(W | 0x08, C_d, C_h, numElements, stream, waitStart, syncStream);
|
||||
test(W | 0x10, C_d, C_h, numElements, 0, waitStart, syncStopEvent);
|
||||
|
||||
@@ -56,7 +56,7 @@ int runTest() {
|
||||
int testResult = 1;
|
||||
float *texBuf;
|
||||
float val[N], output[N];
|
||||
size_t size = 0;
|
||||
size_t offset = 0;
|
||||
float *devBuf;
|
||||
for (int i = 0; i < N; i++) {
|
||||
val[i] = (float)i;
|
||||
@@ -74,7 +74,8 @@ int runTest() {
|
||||
tex.filterMode = hipFilterModePoint;
|
||||
tex.normalized = 0;
|
||||
|
||||
HIPCHECK(hipBindTexture(&size, tex, (void *)texBuf, chanDesc, N * sizeof(float)));
|
||||
HIPCHECK(hipBindTexture(&offset, tex, (void *)texBuf, chanDesc, N * sizeof(float)));
|
||||
HIPCHECK(hipGetTextureAlignmentOffset(&offset,&tex));
|
||||
|
||||
dim3 dimBlock(64, 1, 1);
|
||||
dim3 dimGrid(N / dimBlock.x, 1, 1);
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "test_common.h"
|
||||
|
||||
using namespace std;
|
||||
#define R 8 //rows, height
|
||||
#define C 8 //columns, width
|
||||
|
||||
texture<int, hipTextureType2D,hipReadModeElementType> tex;
|
||||
|
||||
bool runTest(void);
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
bool testResult=runTest();
|
||||
|
||||
if (testResult) {
|
||||
passed();
|
||||
} else {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
bool runTest()
|
||||
{
|
||||
int val[R][C],i,j;
|
||||
size_t offset;
|
||||
|
||||
for(i=0;i<R;i++)
|
||||
for(j=0;j<C;j++)
|
||||
{
|
||||
val[i][j]=(i+1)*(j+1);
|
||||
}
|
||||
hipChannelFormatDesc chan_desc=hipCreateChannelDesc(32,0,0,0,hipChannelFormatKindSigned);
|
||||
hipArray *hipArray;
|
||||
HIPCHECK(hipMallocArray(&hipArray, &chan_desc,C,R,0));
|
||||
|
||||
HIPCHECK(hipMemcpyToArray(hipArray,0,0, val, R*C*sizeof(int), hipMemcpyHostToDevice));
|
||||
|
||||
tex.addressMode[0]=hipAddressModeWrap;
|
||||
tex.addressMode[1]=hipAddressModeWrap;
|
||||
tex.filterMode=hipFilterModePoint;
|
||||
tex.normalized=0;
|
||||
|
||||
HIPCHECK(hipBindTextureToArray(&tex, hipArray, &chan_desc));
|
||||
HIPCHECK(hipGetTextureAlignmentOffset(&offset,&tex));
|
||||
HIPCHECK(hipUnbindTexture(&tex));
|
||||
HIPCHECK(hipFreeArray(hipArray));
|
||||
if(offset != 0)
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
Reference in New Issue
Block a user