From 02d0e936012ef91fc0ed2173b753254f48f87b24 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Tue, 24 Jul 2018 18:12:32 -0400 Subject: [PATCH 01/34] Support malloc/free for hip-clang --- docs/markdown/hip_programming_guide.md | 16 +- include/hip/hcc_detail/device_functions.h | 1 + include/hip/hcc_detail/hip_memory.h | 102 ++++++++++++ include/hip/hcc_detail/hip_runtime.h | 13 +- src/device_util.cpp | 64 -------- src/device_util.h | 11 -- src/hip_device.cpp | 2 +- tests/src/deviceLib/hipDeviceMalloc.cpp | 190 ++++++++++++++++++++++ 8 files changed, 317 insertions(+), 82 deletions(-) create mode 100644 include/hip/hcc_detail/hip_memory.h create mode 100644 tests/src/deviceLib/hipDeviceMalloc.cpp diff --git a/docs/markdown/hip_programming_guide.md b/docs/markdown/hip_programming_guide.md index 9313eb22e1..52d250cab5 100644 --- a/docs/markdown/hip_programming_guide.md +++ b/docs/markdown/hip_programming_guide.md @@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m Following environment variables can be used to control the transfer thresholds: -- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64. +- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64. - HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096. - HCC_D2H_PININPLACE_THRESHOLD - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024. + +## Device-Side Malloc + +hip-hcc and hip-clang supports device-side malloc and free. Users can allocate +memory dynamically in a kernel. The allocated memory are in global address +space, however, different threads get different memory allocations for the same +call of malloc. The allocated memory can be accessed or freed by other threads +or other kernels. It persists in the life time of the HIP program until it is +freed. + +The memory are allocated in pages. Users can define macro +`__HIP_SIZE_OF_PAGE` for controlling the page size in bytes and macro +`__HIP_NUM_PAGES` for controlling the total number of pages that can be +allocated. \ No newline at end of file diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index 6455fa6cd1..34a9a194e2 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -1029,4 +1029,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) { unsigned char val8 = static_cast(val); return __hip_hc_memset(ptr, val8, size); } + #endif diff --git a/include/hip/hcc_detail/hip_memory.h b/include/hip/hcc_detail/hip_memory.h new file mode 100644 index 0000000000..9167baba38 --- /dev/null +++ b/include/hip/hcc_detail/hip_memory.h @@ -0,0 +1,102 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H + +// Implementation of malloc and free device functions. +// HIP heap is implemented as a global array with fixed size. Users may define +// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap. + +// Size of page in bytes. +#ifndef __HIP_SIZE_OF_PAGE +#define __HIP_SIZE_OF_PAGE 64 +#endif + +// Total number of pages +#ifndef __HIP_NUM_PAGES +#define __HIP_NUM_PAGES (16 * 64 * 64) +#endif + +#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE) + +__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP]; +__attribute__((weak)) __device__ + uint32_t __hip_device_page_flag[__HIP_NUM_PAGES]; + +extern "C" inline __device__ void* __hip_malloc(size_t size) { + char* heap = (char*)__hip_device_heap; + if (size > __HIP_SIZE_OF_HEAP) { + return (void*)nullptr; + } + uint32_t totalThreads = + hipBlockDim_x * hipGridDim_x * hipBlockDim_y + * hipGridDim_y * hipBlockDim_z * hipGridDim_z; + uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x; + + uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads; + uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads; + + uint32_t stride = size / __HIP_SIZE_OF_PAGE; + uint32_t start = numHeapsPerWorkItem * currentWorkItem; + + uint32_t k = 0; + + while (__hip_device_page_flag[k] > 0) { + k++; + } + + for (uint32_t i = 0; i < stride - 1; i++) { + __hip_device_page_flag[i + start + k] = 1; + } + + __hip_device_page_flag[start + stride - 1 + k] = 2; + + void* ptr = (void*)(heap + + heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE); + + return ptr; +} + +extern "C" inline __device__ void* __hip_free(void* ptr) { + if (ptr == nullptr) { + return nullptr; + } + + uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap; + uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE; + + while (__hip_device_page_flag[offsetPage] != 0) { + if (__hip_device_page_flag[offsetPage] == 2) { + __hip_device_page_flag[offsetPage] = 0; + offsetPage++; + break; + } else { + __hip_device_page_flag[offsetPage] = 0; + offsetPage++; + } + } + + return nullptr; +} + +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index c2ae6e8e4f..8734feaf5d 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -260,11 +260,11 @@ static constexpr Coordinates threadIdx; #endif // defined __HCC__ #if __HCC_OR_HIP_CLANG__ -extern "C" __device__ void* __hip_hc_malloc(size_t); -extern "C" __device__ void* __hip_hc_free(void* ptr); +extern "C" __device__ void* __hip_malloc(size_t); +extern "C" __device__ void* __hip_free(void* ptr); -static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); } -static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); } +static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); } +static inline __device__ void* free(void* ptr) { return __hip_free(ptr); } #ifdef __HCC_ACCELERATOR__ @@ -438,6 +438,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; #define hipGridDim_y gridDim.y #define hipGridDim_z gridDim.z +#include + // Support std::complex. #pragma push_macro("__CUDA__") #define __CUDA__ @@ -448,8 +450,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; #undef __CUDA__ #pragma pop_macro("__CUDA__") -#include #endif +#include + #endif // HIP_HCC_DETAIL_RUNTIME_H diff --git a/src/device_util.cpp b/src/device_util.cpp index 87fbe0fcbc..7fa77dc5fe 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -28,70 +28,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include -//================================================================================================= -/* - Implementation of malloc and free device functions. - - This is the best place to put them because the device - global variables need to be initialized at the start. -*/ -__device__ char gpuHeap[SIZE_OF_HEAP]; -__device__ uint32_t gpuFlags[NUM_PAGES]; - -__device__ void* __hip_hc_malloc(size_t size) { - char* heap = (char*)gpuHeap; - if (size > SIZE_OF_HEAP) { - return (void*)nullptr; - } - uint32_t totalThreads = - blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z; - uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x; - - uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads; - uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads; - - uint32_t stride = size / SIZE_OF_PAGE; - uint32_t start = numHeapsPerWorkItem * currentWorkItem; - - uint32_t k = 0; - - while (gpuFlags[k] > 0) { - k++; - } - - for (uint32_t i = 0; i < stride - 1; i++) { - gpuFlags[i + start + k] = 1; - } - - gpuFlags[start + stride - 1 + k] = 2; - - void* ptr = (void*)(heap + heapSizePerWorkItem * currentWorkItem + k * SIZE_OF_PAGE); - - return ptr; -} - -__device__ void* __hip_hc_free(void* ptr) { - if (ptr == nullptr) { - return nullptr; - } - - uint32_t offsetByte = (uint64_t)ptr - (uint64_t)gpuHeap; - uint32_t offsetPage = offsetByte / SIZE_OF_PAGE; - - while (gpuFlags[offsetPage] != 0) { - if (gpuFlags[offsetPage] == 2) { - gpuFlags[offsetPage] = 0; - offsetPage++; - break; - } else { - gpuFlags[offsetPage] = 0; - offsetPage++; - } - } - - return nullptr; -} - // abort __device__ void abort() { return hc::abort(); } diff --git a/src/device_util.h b/src/device_util.h index 8fa96da9d9..84dbbf71ed 100644 --- a/src/device_util.h +++ b/src/device_util.h @@ -29,14 +29,6 @@ THE SOFTWARE. Heap size computation for malloc and free device functions. */ -#define NUM_PAGES_PER_THREAD 16 -#define SIZE_OF_PAGE 64 -#define NUM_THREADS_PER_CU 64 -#define NUM_CUS_PER_GPU 64 // Specific for r9 Nano -#define NUM_PAGES NUM_PAGES_PER_THREAD* NUM_THREADS_PER_CU* NUM_CUS_PER_GPU -#define SIZE_MALLOC NUM_PAGES* SIZE_OF_PAGE -#define SIZE_OF_HEAP SIZE_MALLOC - #define HIP_SQRT_2 1.41421356237 #define HIP_SQRT_PI 1.77245385091 @@ -62,9 +54,6 @@ THE SOFTWARE. #define HIP_PI 3.14159265358979323846 -__device__ void* __hip_hc_malloc(size_t size); -__device__ void* __hip_hc_free(void* ptr); - __device__ float __hip_erfinvf(float x); __device__ double __hip_erfinv(double x); diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 72150c3f54..2aae7cf2a8 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) { return ihipLogStatus(hipErrorInvalidValue); } if (limit == hipLimitMallocHeapSize) { - *pValue = (size_t)SIZE_OF_HEAP; + *pValue = (size_t)__HIP_SIZE_OF_HEAP; return ihipLogStatus(hipSuccess); } else { return ihipLogStatus(hipErrorUnsupportedLimit); diff --git a/tests/src/deviceLib/hipDeviceMalloc.cpp b/tests/src/deviceLib/hipDeviceMalloc.cpp new file mode 100644 index 0000000000..8eb8cdcc3c --- /dev/null +++ b/tests/src/deviceLib/hipDeviceMalloc.cpp @@ -0,0 +1,190 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/* HIT_START + * BUILD: %t %s NVCC_OPTIONS -std=c++11 + * RUN: %t EXCLUDE_HIP_PLATFORM nvcc + * HIT_END + */ +#include "test_common.h" +#include +#include + +// Tolerance for error +const double tolerance = 1e-6; +const bool verbose = false; + +#define LEN 64 + +#define ALL_FUN \ + OP(add) \ + OP(sub) \ + OP(mul) \ + OP(div) + +#define OP(x) CK_##x, +enum CalcKind { + ALL_FUN +}; +#undef OP + +#define OP(x) case CK_##x: return #x; +std::string getName(enum CalcKind CK) { + switch(CK){ + ALL_FUN + } +} +#undef OP + +// Calculates function. +// If the function has one argument, B is ignored. +// If the function returns real number, converts it to a complex number. +#define ONE_ARG(func) \ + case CK_##func: \ + return std::complex(std::func(A)); + +template +__device__ __host__ std::complex calc(std::complex A, + std::complex B, + enum CalcKind CK) { + switch(CK) { + case CK_add: + return A + B; + case CK_sub: + return A - B; + case CK_mul: + return A * B; + case CK_div: + return A / B; + + } +} + +// Allocate memory in kernel and save the address to pA and pB. +// Copy value from A, B to allocated memory. +template +__global__ void kernel_alloc(std::complex* A, + std::complex* B, + std::complex** pA, + std::complex** pB) { + typedef std::complex CFloatT; + int tx = threadIdx.x + blockIdx.x * blockDim.x; + if (tx == 0) { + *pA = (CFloatT*)malloc(sizeof(CFloatT)*LEN); + *pB = (CFloatT*)malloc(sizeof(CFloatT)*LEN); + for (int i = 0; i < LEN; i++) { + (*pA)[i] = A[i]; + (*pB)[i] = B[i]; + } + } +} + +// Do calculation using values saved in allocated memmory. pA, pB are buffers +// containing the address of the device-side allocated array. +template +__global__ void kernel_free(std::complex** pA, + std::complex** pB, std::complex* C, + enum CalcKind CK) { + typedef std::complex CFloatT; + int tx = threadIdx.x + blockIdx.x * blockDim.x; + C[tx] = calc((*pA)[tx], (*pB)[tx], CK); + if (tx == 0) { + free(*pA); + free(*pB); + } +} + +template +void test() { + typedef std::complex ComplexT; + ComplexT *A, *Ad, *B, *Bd, *C, *Cd, *D; + A = new ComplexT[LEN]; + B = new ComplexT[LEN]; + C = new ComplexT[LEN]; + D = new ComplexT[LEN]; + hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN); + hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN); + hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN); + + for (uint32_t i = 0; i < LEN; i++) { + A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f); + B[i] = A[i]; + C[i] = A[i]; + } + hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice); + hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice); + + // Run kernel for a calculation kind and verify by comparing with host + // calculation result. Returns false if fails. + auto test_fun = [&](enum CalcKind CK) { + // kernel_alloc allocates memory on device side and initialize it. + // kernel_free uses allocated memory from kernel_alloc and does the + // calculation then free the memory. + // pA and pB are buffers to pass the device-side allocated memory address + // from kernel_alloc to kernel_free. + ComplexT **pA, **pB; + hipMalloc((ComplexT***)&pA, sizeof(ComplexT*)); + hipMalloc((ComplexT***)&pB, sizeof(ComplexT*)); + hipLaunchKernelGGL(kernel_alloc, dim3(1), dim3(LEN), 0, 0, + Ad, Bd, pA, pB); + hipDeviceSynchronize(); + hipLaunchKernelGGL(kernel_free, dim3(1), dim3(LEN), 0, 0, + pA, pB, Cd, CK); + hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost); + hipFree(pA); + hipFree(pB); + for (int i = 0; i < LEN; i++) { + ComplexT Expected = calc(A[i], B[i], CK); + FloatT error = std::abs(C[i] - Expected); + if (std::abs(Expected) > tolerance) + error /= std::abs(Expected); + bool pass = error < tolerance; + if (verbose || !pass) { + std::cout << "Function: " << getName(CK) + << " Operands: " << A[i] << " " << B[i] + << " Result: " << C[i] + << " Expected: " << Expected + << " Error: " << error + << " Pass: " << pass + << std::endl; + } + if (!pass) + return false; + } + return true; + }; + +#define OP(x) assert(test_fun(CK_##x)); + ALL_FUN +#undef OP + + hipFree(Ad); + hipFree(Bd); + hipFree(Cd); + delete[] A; + delete[] B; + delete[] C; + delete[] D; +} + +int main() { + test(); + test(); + passed(); + return 0; +} From 03320890de9c1499737a66662af0df1bb4cf01ba Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Wed, 25 Jul 2018 16:56:39 -0400 Subject: [PATCH 02/34] Fix thread index calculation in __hip_malloc --- include/hip/hcc_detail/hip_memory.h | 5 ++++- tests/src/deviceLib/hipDeviceMalloc.cpp | 25 ++++++++++++++++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/include/hip/hcc_detail/hip_memory.h b/include/hip/hcc_detail/hip_memory.h index 9167baba38..2394a05d0f 100644 --- a/include/hip/hcc_detail/hip_memory.h +++ b/include/hip/hcc_detail/hip_memory.h @@ -51,7 +51,10 @@ extern "C" inline __device__ void* __hip_malloc(size_t size) { uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z; - uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x; + uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x + + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x + + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x + * hipBlockDim_y; uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads; uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads; diff --git a/tests/src/deviceLib/hipDeviceMalloc.cpp b/tests/src/deviceLib/hipDeviceMalloc.cpp index 8eb8cdcc3c..a9d62db025 100644 --- a/tests/src/deviceLib/hipDeviceMalloc.cpp +++ b/tests/src/deviceLib/hipDeviceMalloc.cpp @@ -29,7 +29,14 @@ THE SOFTWARE. const double tolerance = 1e-6; const bool verbose = false; -#define LEN 64 +#define BLKDIM_X 64 +#define BLKDIM_Y 1 +#define BLKDIM_Z 1 +#define NUM_BLK_X 1 +#define NUM_BLK_Y 1 +#define NUM_BLK_Z 1 + +#define LEN (BLKDIM_X * BLKDIM_Y * BLKDIM_Z * NUM_BLK_X * NUM_BLK_Y * NUM_BLK_Z) #define ALL_FUN \ OP(add) \ @@ -83,7 +90,10 @@ __global__ void kernel_alloc(std::complex* A, std::complex** pA, std::complex** pB) { typedef std::complex CFloatT; - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x + + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x + + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x + * hipBlockDim_y; if (tx == 0) { *pA = (CFloatT*)malloc(sizeof(CFloatT)*LEN); *pB = (CFloatT*)malloc(sizeof(CFloatT)*LEN); @@ -101,7 +111,10 @@ __global__ void kernel_free(std::complex** pA, std::complex** pB, std::complex* C, enum CalcKind CK) { typedef std::complex CFloatT; - int tx = threadIdx.x + blockIdx.x * blockDim.x; + int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x + + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x + + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x + * hipBlockDim_y; C[tx] = calc((*pA)[tx], (*pB)[tx], CK); if (tx == 0) { free(*pA); @@ -140,10 +153,12 @@ void test() { ComplexT **pA, **pB; hipMalloc((ComplexT***)&pA, sizeof(ComplexT*)); hipMalloc((ComplexT***)&pB, sizeof(ComplexT*)); - hipLaunchKernelGGL(kernel_alloc, dim3(1), dim3(LEN), 0, 0, + dim3 blkDim(BLKDIM_X, BLKDIM_Y, BLKDIM_Z); + dim3 numBlk(NUM_BLK_X, NUM_BLK_Y, NUM_BLK_Z); + hipLaunchKernelGGL(kernel_alloc, numBlk, blkDim, 0, 0, Ad, Bd, pA, pB); hipDeviceSynchronize(); - hipLaunchKernelGGL(kernel_free, dim3(1), dim3(LEN), 0, 0, + hipLaunchKernelGGL(kernel_free, numBlk, blkDim, 0, 0, pA, pB, Cd, CK); hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost); hipFree(pA); From f06894e2f0abb89c852ff6c3bdf27b9758cc4922 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Fri, 27 Jul 2018 17:07:00 -0400 Subject: [PATCH 03/34] Do not use std::complex in test hipDeviceMalloc --- tests/src/deviceLib/hipDeviceMalloc.cpp | 105 ++++++++++-------------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/tests/src/deviceLib/hipDeviceMalloc.cpp b/tests/src/deviceLib/hipDeviceMalloc.cpp index a9d62db025..4ec10077c5 100644 --- a/tests/src/deviceLib/hipDeviceMalloc.cpp +++ b/tests/src/deviceLib/hipDeviceMalloc.cpp @@ -60,43 +60,35 @@ std::string getName(enum CalcKind CK) { // Calculates function. // If the function has one argument, B is ignored. -// If the function returns real number, converts it to a complex number. -#define ONE_ARG(func) \ - case CK_##func: \ - return std::complex(std::func(A)); +#define ONE_ARG(func) \ + case CK_##func: \ + return std::func(A); -template -__device__ __host__ std::complex calc(std::complex A, - std::complex B, - enum CalcKind CK) { - switch(CK) { - case CK_add: - return A + B; - case CK_sub: - return A - B; - case CK_mul: - return A * B; - case CK_div: - return A / B; - - } +template +__device__ __host__ FloatT calc(FloatT A, FloatT B, enum CalcKind CK) { + switch (CK) { + case CK_add: + return A + B; + case CK_sub: + return A - B; + case CK_mul: + return A * B; + case CK_div: + return A / B; + } } // Allocate memory in kernel and save the address to pA and pB. // Copy value from A, B to allocated memory. -template -__global__ void kernel_alloc(std::complex* A, - std::complex* B, - std::complex** pA, - std::complex** pB) { - typedef std::complex CFloatT; +template +__global__ void kernel_alloc(FloatT* A, FloatT* B, FloatT** pA, FloatT** pB) { int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x * hipBlockDim_y; if (tx == 0) { - *pA = (CFloatT*)malloc(sizeof(CFloatT)*LEN); - *pB = (CFloatT*)malloc(sizeof(CFloatT)*LEN); + *pA = (FloatT*)malloc(sizeof(FloatT) * LEN); + *pB = (FloatT*)malloc(sizeof(FloatT) * LEN); for (int i = 0; i < LEN; i++) { (*pA)[i] = A[i]; (*pB)[i] = B[i]; @@ -106,11 +98,8 @@ __global__ void kernel_alloc(std::complex* A, // Do calculation using values saved in allocated memmory. pA, pB are buffers // containing the address of the device-side allocated array. -template -__global__ void kernel_free(std::complex** pA, - std::complex** pB, std::complex* C, - enum CalcKind CK) { - typedef std::complex CFloatT; +template +__global__ void kernel_free(FloatT** pA, FloatT** pB, FloatT* C, enum CalcKind CK) { int tx = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * hipBlockDim_x + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * hipBlockDim_x @@ -124,23 +113,22 @@ __global__ void kernel_free(std::complex** pA, template void test() { - typedef std::complex ComplexT; - ComplexT *A, *Ad, *B, *Bd, *C, *Cd, *D; - A = new ComplexT[LEN]; - B = new ComplexT[LEN]; - C = new ComplexT[LEN]; - D = new ComplexT[LEN]; - hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN); - hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN); - hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN); + FloatT *A, *Ad, *B, *Bd, *C, *Cd, *D; + A = new FloatT[LEN]; + B = new FloatT[LEN]; + C = new FloatT[LEN]; + D = new FloatT[LEN]; + hipMalloc((void**)&Ad, sizeof(FloatT) * LEN); + hipMalloc((void**)&Bd, sizeof(FloatT) * LEN); + hipMalloc((void**)&Cd, sizeof(FloatT) * LEN); for (uint32_t i = 0; i < LEN; i++) { - A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f); + A[i] = (i + 1) * 1.0f; B[i] = A[i]; C[i] = A[i]; } - hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice); - hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice); + hipMemcpy(Ad, A, sizeof(FloatT) * LEN, hipMemcpyHostToDevice); + hipMemcpy(Bd, B, sizeof(FloatT) * LEN, hipMemcpyHostToDevice); // Run kernel for a calculation kind and verify by comparing with host // calculation result. Returns false if fails. @@ -150,9 +138,9 @@ void test() { // calculation then free the memory. // pA and pB are buffers to pass the device-side allocated memory address // from kernel_alloc to kernel_free. - ComplexT **pA, **pB; - hipMalloc((ComplexT***)&pA, sizeof(ComplexT*)); - hipMalloc((ComplexT***)&pB, sizeof(ComplexT*)); + FloatT **pA, **pB; + hipMalloc((FloatT***)&pA, sizeof(FloatT*)); + hipMalloc((FloatT***)&pB, sizeof(FloatT*)); dim3 blkDim(BLKDIM_X, BLKDIM_Y, BLKDIM_Z); dim3 numBlk(NUM_BLK_X, NUM_BLK_Y, NUM_BLK_Z); hipLaunchKernelGGL(kernel_alloc, numBlk, blkDim, 0, 0, @@ -160,23 +148,18 @@ void test() { hipDeviceSynchronize(); hipLaunchKernelGGL(kernel_free, numBlk, blkDim, 0, 0, pA, pB, Cd, CK); - hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost); + hipMemcpy(C, Cd, sizeof(FloatT) * LEN, hipMemcpyDeviceToHost); hipFree(pA); hipFree(pB); for (int i = 0; i < LEN; i++) { - ComplexT Expected = calc(A[i], B[i], CK); - FloatT error = std::abs(C[i] - Expected); - if (std::abs(Expected) > tolerance) - error /= std::abs(Expected); - bool pass = error < tolerance; - if (verbose || !pass) { - std::cout << "Function: " << getName(CK) - << " Operands: " << A[i] << " " << B[i] - << " Result: " << C[i] - << " Expected: " << Expected - << " Error: " << error - << " Pass: " << pass - << std::endl; + FloatT Expected = calc(A[i], B[i], CK); + FloatT error = std::abs(C[i] - Expected); + if (std::abs(Expected) > tolerance) error /= std::abs(Expected); + bool pass = error < tolerance; + if (verbose || !pass) { + std::cout << "Function: " << getName(CK) << " Operands: " << A[i] << " " << B[i] + << " Result: " << C[i] << " Expected: " << Expected << " Error: " << error + << " Pass: " << pass << std::endl; } if (!pass) return false; From a6c7aeed7237b71122c85c18663f8d740902e9ea Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Sat, 28 Jul 2018 09:02:38 -0400 Subject: [PATCH 04/34] Add HIP directed test hipTestGlobalVariable.cpp --- tests/src/kernel/hipTestGlobalVariable.cpp | 97 ++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 tests/src/kernel/hipTestGlobalVariable.cpp diff --git a/tests/src/kernel/hipTestGlobalVariable.cpp b/tests/src/kernel/hipTestGlobalVariable.cpp new file mode 100644 index 0000000000..2209e2c254 --- /dev/null +++ b/tests/src/kernel/hipTestGlobalVariable.cpp @@ -0,0 +1,97 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include +#include +#include +#include "test_common.h" + +#define HIP_ASSERT(status) assert(status == hipSuccess) + +#define LEN 512 +#define SIZE 2048 + +struct TestConstantGlobalVar { + static __constant__ int ConstantGlobalVar; + + static __global__ void kernel(int* Ad) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + Ad[tid] = ConstantGlobalVar; + } + + void run() { + int *A, *Ad; + A = new int[LEN]; + for (unsigned i = 0; i < LEN; i++) { + A[i] = 0; + } + + HIP_ASSERT(hipMalloc((void**)&Ad, SIZE)); + hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad); + HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); + + for (unsigned i = 0; i < LEN; i++) { + assert(123 == A[i]); + } + } +}; +__constant__ int TestConstantGlobalVar::ConstantGlobalVar = 123; + +struct TestGlobalArray { + static __device__ int GlobalArray[LEN]; + + static __global__ void kernelWrite() { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + GlobalArray[tid] = tid; + } + static __global__ void kernelRead(int* Ad) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + Ad[tid] = GlobalArray[tid]; + } + + void run() { + int *A, *Ad; + A = new int[LEN]; + for (unsigned i = 0; i < LEN; i++) { + A[i] = 0; + } + + HIP_ASSERT(hipMalloc((void**)&Ad, SIZE)); + hipLaunchKernelGGL(kernelWrite, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0); + hipLaunchKernelGGL(kernelRead, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad); + HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); + + for (unsigned i = 0; i < LEN; i++) { + assert(i == A[i]); + } + } +}; +__device__ int TestGlobalArray::GlobalArray[LEN]; + +int main() { + TestConstantGlobalVar().run(); + TestGlobalArray().run(); + passed(); +} From 8e0e373f69b33dd1d9f4b0ef8e4bf974d3e08188 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 17 Aug 2018 12:14:42 -0700 Subject: [PATCH 05/34] Check for hipEnvVar at the same level if directed_tests location fails --- tests/src/hipEnvVarDriver.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/src/hipEnvVarDriver.cpp b/tests/src/hipEnvVarDriver.cpp index 599b138565..e52aa46063 100644 --- a/tests/src/hipEnvVarDriver.cpp +++ b/tests/src/hipEnvVarDriver.cpp @@ -38,7 +38,10 @@ int getDeviceNumber() { string str; std::this_thread::sleep_for(std::chrono::milliseconds(10)); if (!(in = popen("./directed_tests/hipEnvVar -c", "r"))) { - return 1; + // Check at same level + if (!(in = popen("./hipEnvVar -c", "r"))) { + return 1; + } } while (fgets(buff, 512, in) != NULL) { cout << buff; @@ -54,7 +57,11 @@ void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) { str += std::to_string(deviceID); std::this_thread::sleep_for(std::chrono::milliseconds(10)); if (!(in = popen(str.c_str(), "r"))) { - exit(1); + // Check at same level + if (!(in = popen("./hipEnvVar -d ", "r"))) { + exit(1); + } + } while (fgets(pciBusID, 100, in) != NULL) { cout << pciBusID; From 1daee67eb682c6962013109f679a65d20a5615b3 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Fri, 31 Aug 2018 12:54:23 -0700 Subject: [PATCH 06/34] Fix record_event and hipStreamSync2 tests. The test should expect null stream to complete if synchrionize is called as per the spec --- tests/src/runtimeApi/event/record_event.cpp | 4 ++-- tests/src/runtimeApi/stream/hipStreamSync2.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/src/runtimeApi/event/record_event.cpp b/tests/src/runtimeApi/event/record_event.cpp index afd6bef2ef..3eb54e1735 100644 --- a/tests/src/runtimeApi/event/record_event.cpp +++ b/tests/src/runtimeApi/event/record_event.cpp @@ -170,8 +170,8 @@ void runTests(int64_t numElements) { // for (int waitStart=0; waitStart<2; waitStart++) { for (int waitStart = 1; waitStart >= 0; waitStart--) { unsigned W = waitStart ? 0x1000 : 0; - test(W | 0x01, C_d, C_h, numElements, 0, waitStart, syncNone); - test(W | 0x02, C_d, C_h, numElements, stream, waitStart, syncNone); + test(W | 0x01, C_d, C_h, numElements, 0, 0, syncNone); + test(W | 0x02, C_d, C_h, numElements, stream, 0, syncNone); test(W | 0x04, C_d, C_h, numElements, 0, waitStart, syncStream); test(W | 0x08, C_d, C_h, numElements, stream, waitStart, syncStream); test(W | 0x10, C_d, C_h, numElements, 0, waitStart, syncStopEvent); diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp index cf25d0bd2b..652c799792 100644 --- a/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -178,7 +178,7 @@ void runTests(int64_t numElements) { { test(0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/); test(0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/); - test(0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); + test(0x04, C_d, C_h, numElements, syncOtherStream, false /*expectMismatch*/); test(0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); // Sending a marker to to null stream may synchronize the otherStream From cf12a9c049cf9254cd7d1f0e62cee161d71820d7 Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Thu, 13 Sep 2018 13:55:43 -0500 Subject: [PATCH 07/34] Only LLVM6 and higher contain the necessary intrinsics. --- include/hip/hcc_detail/hip_fp16.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index 68f0e35f5f..5a6e650069 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -29,7 +29,7 @@ THE SOFTWARE. #include #endif -#if defined(__clang__) && (__clang_major__ > 3) +#if defined(__clang__) && (__clang_major__ > 5) typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2))); struct __half_raw { From 9b2107749cd31c34434be03388a136fd17aee69e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Sat, 15 Sep 2018 13:23:38 +0530 Subject: [PATCH 08/34] Revert changes to runtime/stream/hipStreamSync2 --- tests/src/runtimeApi/stream/hipStreamSync2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp index 652c799792..cf25d0bd2b 100644 --- a/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -178,7 +178,7 @@ void runTests(int64_t numElements) { { test(0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/); test(0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/); - test(0x04, C_d, C_h, numElements, syncOtherStream, false /*expectMismatch*/); + test(0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); test(0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); // Sending a marker to to null stream may synchronize the otherStream From 0b211bb4292954230d9576ac50b9491e1ef7def7 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 17 Sep 2018 15:15:49 +0530 Subject: [PATCH 09/34] [dtests] Fix deviceLib/hipTestIncludeMath on nvcc path Change-Id: Ifb121886bec6c1134e98bd194ff1713f28454d33 --- tests/src/deviceLib/hipTestIncludeMath.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/src/deviceLib/hipTestIncludeMath.cpp b/tests/src/deviceLib/hipTestIncludeMath.cpp index 6063eee76c..92b93eda36 100644 --- a/tests/src/deviceLib/hipTestIncludeMath.cpp +++ b/tests/src/deviceLib/hipTestIncludeMath.cpp @@ -30,6 +30,7 @@ THE SOFTWARE. // Incorrect implementation causes compilation failure due to conflict // declartions. +#include #include // Test __HIP_DEVICE_COMPILE__ is defined after math_functions.h @@ -45,14 +46,6 @@ __device__ __host__ inline void throw_std_bad_alloc() #endif } -// Test __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ and __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ -// is defined. Eigen HIP/hcc/Half.h __ldg depends on this. -#if !defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) || \ - !defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__) -#error \ - "__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ or __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ not defined" -#endif - #include #include "test_common.h" From cef5261fa9e1e63672d2857e22c63839b629aea0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 17 Sep 2018 15:23:30 +0530 Subject: [PATCH 10/34] Add mappings for __clock* in nvcc_detail/hip_runtime.h Change-Id: Ibcecf52f3e69298268d921efc036090544fa0ed0 --- include/hip/nvcc_detail/hip_runtime.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/hip/nvcc_detail/hip_runtime.h b/include/hip/nvcc_detail/hip_runtime.h index fe0b19bf8c..19d740a1ee 100644 --- a/include/hip/nvcc_detail/hip_runtime.h +++ b/include/hip/nvcc_detail/hip_runtime.h @@ -118,6 +118,9 @@ typedef int hipLaunchParm; } #endif +#define __clock() clock() +#define __clock64() clock64() + #endif #endif From 94a0589eb5de05a05c6ae8e3295b7a4b418face9 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 17 Sep 2018 15:26:45 +0530 Subject: [PATCH 11/34] [dtests] Fix deviceLib/hipSimpleAtomicsTest for nvcc Change-Id: I23cbea2820d41da6f6a1bcab4bc3b59ac21799dd --- tests/src/deviceLib/hipSimpleAtomicsTest.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp index 129d7c1aa8..ec38a5d327 100644 --- a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp +++ b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 --gpu-architecture=sm_60 * RUN: %t * HIT_END */ @@ -215,6 +215,7 @@ template< typename T, typename enable_if< is_same{} || is_same{}>::type* = nullptr> +__device__ void testKernelSub(T* g_odata) { // Atomic subtraction (final should be 0) atomicSub(&g_odata[1], 10); @@ -333,4 +334,4 @@ int main(int argc, char** argv) { hipDeviceReset(); printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); -} \ No newline at end of file +} From cca2c5afc80818601a338b25c2c353a31f65a05e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 17 Sep 2018 15:32:05 +0530 Subject: [PATCH 12/34] [dests] Fix hipTestClock, hipTestNew & hipTestGlobalVariable tests for nvcc nvcc does not support global kernels in struct/class Change-Id: I2d7297e0c3725564215e20dbdd31c0bb8d7a07de --- tests/src/deviceLib/hipTestClock.cpp | 5 +---- tests/src/deviceLib/hipTestNew.cpp | 4 +--- tests/src/kernel/hipTestGlobalVariable.cpp | 18 ++++++------------ 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/tests/src/deviceLib/hipTestClock.cpp b/tests/src/deviceLib/hipTestClock.cpp index 46f64e35a3..ee6dca8a42 100644 --- a/tests/src/deviceLib/hipTestClock.cpp +++ b/tests/src/deviceLib/hipTestClock.cpp @@ -33,8 +33,6 @@ THE SOFTWARE. #define LEN 512 #define SIZE 2048 -struct TestClock { - static __global__ void kernel1(int* Ad) { int tid = threadIdx.x + blockIdx.x * blockDim.x; Ad[tid] = clock() + clock64() + __clock() + __clock64(); @@ -61,9 +59,8 @@ struct TestClock { assert(0 != A[i]); } } -}; int main() { - TestClock().run(); + run(); passed(); } diff --git a/tests/src/deviceLib/hipTestNew.cpp b/tests/src/deviceLib/hipTestNew.cpp index 60774ff21d..d644f8b483 100644 --- a/tests/src/deviceLib/hipTestNew.cpp +++ b/tests/src/deviceLib/hipTestNew.cpp @@ -33,7 +33,6 @@ THE SOFTWARE. #define LEN 512 #define SIZE 2048 -struct TestPlacementNew { class A { public: __device__ A() { @@ -63,9 +62,8 @@ struct TestPlacementNew { assert(i == A[i]); } } -}; int main() { - TestPlacementNew().run(); + run(); passed(); } diff --git a/tests/src/kernel/hipTestGlobalVariable.cpp b/tests/src/kernel/hipTestGlobalVariable.cpp index 2209e2c254..8ab8bef9c2 100644 --- a/tests/src/kernel/hipTestGlobalVariable.cpp +++ b/tests/src/kernel/hipTestGlobalVariable.cpp @@ -33,15 +33,14 @@ THE SOFTWARE. #define LEN 512 #define SIZE 2048 -struct TestConstantGlobalVar { - static __constant__ int ConstantGlobalVar; + __constant__ int ConstantGlobalVar = 123; static __global__ void kernel(int* Ad) { int tid = threadIdx.x + blockIdx.x * blockDim.x; Ad[tid] = ConstantGlobalVar; } - void run() { + void runTestConstantGlobalVar() { int *A, *Ad; A = new int[LEN]; for (unsigned i = 0; i < LEN; i++) { @@ -56,11 +55,8 @@ struct TestConstantGlobalVar { assert(123 == A[i]); } } -}; -__constant__ int TestConstantGlobalVar::ConstantGlobalVar = 123; -struct TestGlobalArray { - static __device__ int GlobalArray[LEN]; + __device__ int GlobalArray[LEN]; static __global__ void kernelWrite() { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -71,7 +67,7 @@ struct TestGlobalArray { Ad[tid] = GlobalArray[tid]; } - void run() { + void runTestGlobalArray() { int *A, *Ad; A = new int[LEN]; for (unsigned i = 0; i < LEN; i++) { @@ -87,11 +83,9 @@ struct TestGlobalArray { assert(i == A[i]); } } -}; -__device__ int TestGlobalArray::GlobalArray[LEN]; int main() { - TestConstantGlobalVar().run(); - TestGlobalArray().run(); + runTestConstantGlobalVar(); + runTestGlobalArray(); passed(); } From 30acc99e134314e43e77b72fa6dc10a5fdc3acb9 Mon Sep 17 00:00:00 2001 From: VincentSC Date: Mon, 17 Sep 2018 13:25:12 +0200 Subject: [PATCH 13/34] Update install.sh 'rename' added as a Linux prerequisite. --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index f8ad640798..c7dc48e1a8 100755 --- a/install.sh +++ b/install.sh @@ -25,7 +25,7 @@ popd () { function setupENV() { sudo apt-get update - sudo apt-get install dpkg-dev rpm doxygen libelf-dev + sudo apt-get install dpkg-dev rpm doxygen libelf-dev rename } function buildHIP() From fc228c7ea6f36b8a3ec75505ae1500be02606635 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Mon, 17 Sep 2018 11:19:35 -0400 Subject: [PATCH 14/34] Fix hipLaunchKernelGGL for hip-clang Do not decay function pointer type of the kernel argument passed to hipLaunchKernelGGL and hipLaunchKernel, otherwise some type information is lost which may cause type inference failure for the template. This issue caused compilation error of FeatureLPPooling in Caffe2/PyTorch and this patch fixes that. --- include/hip/hcc_detail/hip_runtime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index e5f0fb52fa..d8093e6646 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -334,13 +334,13 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri typedef int hipLaunchParm; template -inline void hipLaunchKernelGGL(F kernelName, const dim3& numblocks, const dim3& numthreads, +inline void hipLaunchKernelGGL(F&& kernelName, const dim3& numblocks, const dim3& numthreads, unsigned memperblock, hipStream_t streamId, Args... args) { kernelName<<>>(args...); } template -inline void hipLaunchKernel(F kernel, const dim3& numBlocks, const dim3& dimBlocks, +inline void hipLaunchKernel(F&& kernel, const dim3& numBlocks, const dim3& dimBlocks, std::uint32_t groupMemBytes, hipStream_t stream, Args... args) { hipLaunchKernelGGL(kernel, numBlocks, dimBlocks, groupMemBytes, stream, hipLaunchParm{}, std::move(args)...); From cdfd82f1dee9b180b46bd256b81184600532c53a Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Mon, 17 Sep 2018 16:50:42 -0400 Subject: [PATCH 15/34] Disable device code for gcc in hip_memory.h These device code should only be seen by HCC or hip-clang. They causd build failure for HIP-VDI runtime and should be disabled for gcc. --- include/hip/hcc_detail/hip_memory.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/hip/hcc_detail/hip_memory.h b/include/hip/hcc_detail/hip_memory.h index 2394a05d0f..739e488ca3 100644 --- a/include/hip/hcc_detail/hip_memory.h +++ b/include/hip/hcc_detail/hip_memory.h @@ -39,6 +39,8 @@ THE SOFTWARE. #define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE) +#if __HCC__ || __HIP__ + __attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP]; __attribute__((weak)) __device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES]; @@ -102,4 +104,6 @@ extern "C" inline __device__ void* __hip_free(void* ptr) { return nullptr; } +#endif + #endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H From 8da3a96f2bfb1b4aeb5ce0bcf0c7f7a009c076cf Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 18 Sep 2018 14:10:03 +0530 Subject: [PATCH 16/34] Fixed get alignment offset test for NVCC - Removed dtest hipGetAlgntoffset2D.cpp - hipGetTextureAlignmentOffset is being tested now in hipBindTexRef1DFetch.cpp --- tests/src/texture/hipBindTexRef1DFetch.cpp | 5 +- tests/src/texture/hipGetAlgntoffset2D.cpp | 78 ---------------------- 2 files changed, 3 insertions(+), 80 deletions(-) delete mode 100644 tests/src/texture/hipGetAlgntoffset2D.cpp diff --git a/tests/src/texture/hipBindTexRef1DFetch.cpp b/tests/src/texture/hipBindTexRef1DFetch.cpp index 5d9a1b103b..fa63efc1bb 100644 --- a/tests/src/texture/hipBindTexRef1DFetch.cpp +++ b/tests/src/texture/hipBindTexRef1DFetch.cpp @@ -56,7 +56,7 @@ int runTest() { int testResult = 1; float *texBuf; float val[N], output[N]; - size_t size = 0; + size_t offset = 0; float *devBuf; for (int i = 0; i < N; i++) { val[i] = (float)i; @@ -74,7 +74,8 @@ int runTest() { tex.filterMode = hipFilterModePoint; tex.normalized = 0; - HIPCHECK(hipBindTexture(&size, tex, (void *)texBuf, chanDesc, N * sizeof(float))); + HIPCHECK(hipBindTexture(&offset, tex, (void *)texBuf, chanDesc, N * sizeof(float))); + HIPCHECK(hipGetTextureAlignmentOffset(&offset,&tex)); dim3 dimBlock(64, 1, 1); dim3 dimGrid(N / dimBlock.x, 1, 1); diff --git a/tests/src/texture/hipGetAlgntoffset2D.cpp b/tests/src/texture/hipGetAlgntoffset2D.cpp deleted file mode 100644 index 3a4cc7dfc8..0000000000 --- a/tests/src/texture/hipGetAlgntoffset2D.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* -Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../test_common.cpp - * RUN: %t - * HIT_END - */ -#include -#include "test_common.h" - -using namespace std; -#define R 8 //rows, height -#define C 8 //columns, width - -texture tex; - -bool runTest(void); - -int main(int argc, char** argv) { - bool testResult=runTest(); - - if (testResult) { - passed(); - } else { - exit(EXIT_FAILURE); - } -} - -bool runTest() -{ -int val[R][C],i,j; -size_t offset; - -for(i=0;i Date: Fri, 17 Aug 2018 11:34:45 -0400 Subject: [PATCH 17/34] Fix build failure of hipTestHalf and hipTestIncludeMath for hip-clang --- include/hip/hcc_detail/device_functions.h | 5 +++-- include/hip/hcc_detail/hip_runtime.h | 2 +- include/hip/hcc_detail/math_functions.h | 10 ++++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index b60a38aeea..b319f26e03 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -27,11 +27,12 @@ THE SOFTWARE. #include "math_fwd.h" #include +#include + + #include #include #include -#include - /* Integer Intrinsics */ diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index d8093e6646..d4207c56fe 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -131,7 +131,7 @@ extern int HIP_TRACE_API; // Feature tests: -#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0) +#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__ // Device compile and not host compile: // 32-bit Atomics: diff --git a/include/hip/hcc_detail/math_functions.h b/include/hip/hcc_detail/math_functions.h index 6c8510fcbb..702c120b86 100644 --- a/include/hip/hcc_detail/math_functions.h +++ b/include/hip/hcc_detail/math_functions.h @@ -22,14 +22,16 @@ THE SOFTWARE. #pragma once -#include "math_fwd.h" - -#include - #include #include #include #include +#include + +#include + +#include "hip_fp16_math_fwd.h" +#include "math_fwd.h" // HCC's own math functions should be included first, otherwise there will // be conflicts when hip/math_functions.h is included before hip/hip_runtime.h. From 2a5a2c66c47072f035de0873b1155418b69a46ba Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 19 Sep 2018 15:06:22 +0530 Subject: [PATCH 18/34] Update Jenkinsfile [ci] Update list of disable tests for automation --- Jenkinsfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2432cea38e..3eb24eebdd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -167,7 +167,8 @@ def docker_build_inside_image( def build_image, String inside_args, String platf } // Cap the maximum amount of testing, in case of hangs - // Excluding hipPrintfKernel test from automation; variable fails on CI test machines + // Excluding hipVectorTypes test from automation; due to regression from HCC commit 2367133 + // Excluding hipFloatMath test from automation; due to regression from ROCDL commit 2fc04e1 timeout(time: 1, unit: 'HOURS') { stage("${platform} unit testing") @@ -177,7 +178,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf cd ${build_dir_rel} make install -j\$(nproc) make build_tests -i -j\$(nproc) - ctest -E hipVectorTypes + ctest -E "(hipVectorTypes.tst|hipVectorTypesDevice.tst|hipFloatMath.tst)" """ // If unit tests output a junit or xunit file in the future, jenkins can parse that file // to display test results on the dashboard From bd622a4b4ab4679c8d902f797b3cd60b95f13d9c Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Fri, 10 Aug 2018 11:12:53 -0400 Subject: [PATCH 19/34] Add fma function with float and _Float16 arguments --- include/hip/hcc_detail/math_functions.h | 10 ++ tests/src/deviceLib/hipTestFMA.cpp | 142 ++++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 tests/src/deviceLib/hipTestFMA.cpp diff --git a/include/hip/hcc_detail/math_functions.h b/include/hip/hcc_detail/math_functions.h index 702c120b86..c1adef68fd 100644 --- a/include/hip/hcc_detail/math_functions.h +++ b/include/hip/hcc_detail/math_functions.h @@ -1166,6 +1166,16 @@ long long llabs(long long x) #endif // END INTEGER +__DEVICE__ +inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) { + return __ocml_fma_f16(x, y, z); +} + +__DEVICE__ +inline float fma(float x, float y, float z) { + return fmaf(x, y, z); +} + #pragma push_macro("__DEF_FLOAT_FUN") #pragma push_macro("__DEF_FLOAT_FUN2") #pragma push_macro("__DEF_FLOAT_FUN2I") diff --git a/tests/src/deviceLib/hipTestFMA.cpp b/tests/src/deviceLib/hipTestFMA.cpp new file mode 100644 index 0000000000..5e1913a5c7 --- /dev/null +++ b/tests/src/deviceLib/hipTestFMA.cpp @@ -0,0 +1,142 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include "test_common.h" +#include +#include +#include + +#define HIP_ASSERT(status) assert(status == hipSuccess) + +#define LEN 50 +#define SIZE (LEN * sizeof(bool)) + +struct TestFMA { + static __global__ void kernel(bool *Ad) { + float f = 1.0f / 3.0f; + double d = f; + int i = 0; + auto Check = [&](bool Cond) { Ad[i++] = Cond; }; + // f * f + 3.0f will be different if promoted to double. + float floatResult = fma(f, f, 3.0f); + double doubleResult = fma(d, d, 3.0); + Check(floatResult != doubleResult); + + // check promote to float. + Check(fma(f, f, 3) == floatResult); + Check(fma(f, f, (char)3) == floatResult); + Check(fma(f, f, (unsigned char)3) == floatResult); + Check(fma(f, f, (short)3) == floatResult); + Check(fma(f, f, (unsigned short)3) == floatResult); + Check(fma(f, f, (int)3) == floatResult); + Check(fma(f, f, (unsigned int)3) == floatResult); + Check(fma(f, f, (long)3) == floatResult); + Check(fma(f, f, (unsigned long)3) == floatResult); + Check(fma(f, f, true) == fma(f, f, 1.0f)); + + // check promote to double. + Check(fma(d, (double)f, 3) == doubleResult); + Check(fma(d, (double)f, (char)3) == doubleResult); + Check(fma(d, (double)f, (unsigned char)3) == doubleResult); + Check(fma(d, (double)f, (short)3) == doubleResult); + Check(fma(d, (double)f, (unsigned short)3) == doubleResult); + Check(fma(d, (double)f, (int)3) == doubleResult); + Check(fma(d, (double)f, (unsigned int)3) == doubleResult); + Check(fma(d, (double)f, (long)3) == doubleResult); + Check(fma(d, (double)f, (unsigned long)3) == doubleResult); + Check(fma(d, (double)f, true) == fma((double)f, (double)f, 1.0)); + + while (i < LEN) + Check(true); + } + void run() { + bool *Ad; + bool A[LEN]; + for (unsigned i = 0; i < LEN; i++) { + A[i] = 0; + } + + HIP_ASSERT(hipMalloc((void **)&Ad, SIZE)); + hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad); + HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); + + for (unsigned i = 0; i < LEN; i++) { + assert(A[i]); + } + } +}; + +struct TestHalfFMA { + static __global__ void kernel(bool *Ad) { + _Float16 h = (_Float16)(1.0f/3.0f); + float f = h; + double d = f; + int i = 0; + auto Check = [&](bool Cond) { Ad[i++] = Cond; }; + // h * h + 3 will be different if promoted to float. + _Float16 halfResult = fma(h, h, (_Float16)3); + float floatResult = fma(f, f, 3.0f); + double doubleResult = fma(d, d, 3.0); + Check(halfResult != floatResult); + Check(halfResult != doubleResult); + + // check promote to half. + Check(fma(h, h, 3) == halfResult); + Check(fma(h, h, (char)3) == halfResult); + Check(fma(h, h, (unsigned char)3) == halfResult); + Check(fma(h, h, (short)3) == halfResult); + Check(fma(h, h, (unsigned short)3) == halfResult); + Check(fma(h, h, (int)3) == halfResult); + Check(fma(h, h, (unsigned int)3) == halfResult); + Check(fma(h, h, (long)3) == halfResult); + Check(fma(h, h, (unsigned long)3) == halfResult); + Check(fma(h, h, true) == fma(h, h, (_Float16)1)); + + while (i < LEN) + Check(true); + } + + void run() { + bool *Ad; + bool A[LEN]; + for (unsigned i = 0; i < LEN; i++) { + A[i] = 0; + } + + HIP_ASSERT(hipMalloc((void **)&Ad, SIZE)); + hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad); + HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); + + for (unsigned i = 0; i < LEN; i++) { + assert(A[i]); + } + } +}; + +int main() { + TestFMA().run(); + TestHalfFMA().run(); + passed(); +} From a5c961e26c0f6220cf1b2634d86f75a5e8426a5c Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Wed, 19 Sep 2018 10:38:48 -0400 Subject: [PATCH 20/34] Silent warnings about duplicate static keyword static is already in __DEVICE__, so should be removed. --- include/hip/hcc_detail/math_functions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/math_functions.h b/include/hip/hcc_detail/math_functions.h index 702c120b86..b618790133 100644 --- a/include/hip/hcc_detail/math_functions.h +++ b/include/hip/hcc_detail/math_functions.h @@ -1359,10 +1359,10 @@ __DEVICE__ inline static unsigned long long max(long long arg1, unsigned long lo return max((unsigned long long) arg1, arg2); }*/ #else -__DEVICE__ inline static int min(int arg1, int arg2) { +__DEVICE__ inline int min(int arg1, int arg2) { return (arg1 < arg2) ? arg1 : arg2; } -__DEVICE__ inline static int max(int arg1, int arg2) { +__DEVICE__ inline int max(int arg1, int arg2) { return (arg1 > arg2) ? arg1 : arg2; } From e01c53b185c3e530530ebeebeffb307749f47790 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Wed, 19 Sep 2018 18:29:56 +0000 Subject: [PATCH 21/34] Create separate config files for hcc and clang The state of HIP_COMPILER is not passing into config files, therefore config files cannot use if statements to determine dependency. For HIP-Clang, we should remove find_dependency(hcc), so we create separate config files depending on compiler path. This fixes issue in apps that uses hip-config.cmake. --- CMakeLists.txt | 12 +++- ...nfig.cmake.in => hip-config-clang.cmake.in | 3 - hip-config-hcc.cmake.in | 65 +++++++++++++++++++ 3 files changed, 76 insertions(+), 4 deletions(-) rename hip-config.cmake.in => hip-config-clang.cmake.in (97%) create mode 100644 hip-config-hcc.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index edb8c2d238..3c62ea4365 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,12 +288,22 @@ if(HIP_PLATFORM STREQUAL "hcc") install(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::) include(CMakePackageConfigHelpers) + if(HIP_COMPILER STREQUAL "hcc") configure_package_config_file( - hip-config.cmake.in + hip-config-hcc.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR ) + elseif(HIP_COMPILER STREQUAL "clang") + configure_package_config_file( + hip-config-clang.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake + INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} + PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR + ) + endif() + write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake VERSION "${HIP_VERSION}" diff --git a/hip-config.cmake.in b/hip-config-clang.cmake.in similarity index 97% rename from hip-config.cmake.in rename to hip-config-clang.cmake.in index d5dc6803fc..240f01f60e 100644 --- a/hip-config.cmake.in +++ b/hip-config-clang.cmake.in @@ -48,9 +48,6 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" ) set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc") set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig") -if(HIP_COMPILER STREQUAL "hcc") - find_dependency(hcc) -endif() include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" ) set( hip_LIBRARIES hip::host hip::device) diff --git a/hip-config-hcc.cmake.in b/hip-config-hcc.cmake.in new file mode 100644 index 0000000000..efcdf708bb --- /dev/null +++ b/hip-config-hcc.cmake.in @@ -0,0 +1,65 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND) +if (NOT _CMakeFindDependencyMacro_FOUND) + macro(find_dependency dep) + if (NOT ${dep}_FOUND) + set(cmake_fd_version) + if (${ARGC} GREATER 1) + set(cmake_fd_version ${ARGV1}) + endif() + set(cmake_fd_exact_arg) + if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT) + set(cmake_fd_exact_arg EXACT) + endif() + set(cmake_fd_quiet_arg) + if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) + set(cmake_fd_quiet_arg QUIET) + endif() + set(cmake_fd_required_arg) + if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) + set(cmake_fd_required_arg REQUIRED) + endif() + find_package(${dep} ${cmake_fd_version} + ${cmake_fd_exact_arg} + ${cmake_fd_quiet_arg} + ${cmake_fd_required_arg} + ) + string(TOUPPER ${dep} cmake_dep_upper) + if (NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND) + set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.") + set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False) + return() + endif() + set(cmake_fd_version) + set(cmake_fd_required_arg) + set(cmake_fd_quiet_arg) + set(cmake_fd_exact_arg) + endif() + endmacro() +endif() + + +set_and_check( hip_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" ) +set_and_check( hip_INCLUDE_DIRS "${hip_INCLUDE_DIR}" ) +set_and_check( hip_LIB_INSTALL_DIR "@PACKAGE_LIB_INSTALL_DIR@" ) +set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" ) + +set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc") +set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig") + +find_dependency(hcc) +include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" ) + +set( hip_LIBRARIES hip::host hip::device) +set( hip_LIBRARY ${hip_LIBRARIES}) + +set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR}) +set(HIP_INCLUDE_DIRS ${hip_INCLUDE_DIRS}) +set(HIP_LIB_INSTALL_DIR ${hip_LIB_INSTALL_DIR}) +set(HIP_BIN_INSTALL_DIR ${hip_BIN_INSTALL_DIR}) +set(HIP_LIBRARIES ${hip_LIBRARIES}) +set(HIP_LIBRARY ${hip_LIBRARY}) +set(HIP_HIPCC_EXECUTABLE ${hip_HIPCC_EXECUTABLE}) +set(HIP_HIPCONFIG_EXECUTABLE ${hip_HIPCONFIG_EXECUTABLE}) + From ecd6a212c7e1cbac802b415adf3bd9ecb72648f8 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 20 Sep 2018 11:23:51 +0530 Subject: [PATCH 22/34] Update hipTestFMA.cpp Fix the test so that it works on nvcc path as well. --- tests/src/deviceLib/hipTestFMA.cpp | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/src/deviceLib/hipTestFMA.cpp b/tests/src/deviceLib/hipTestFMA.cpp index 5e1913a5c7..1f1a5bc921 100644 --- a/tests/src/deviceLib/hipTestFMA.cpp +++ b/tests/src/deviceLib/hipTestFMA.cpp @@ -33,8 +33,7 @@ THE SOFTWARE. #define LEN 50 #define SIZE (LEN * sizeof(bool)) -struct TestFMA { - static __global__ void kernel(bool *Ad) { + __global__ void kernelTestFMA(bool *Ad) { float f = 1.0f / 3.0f; double d = f; int i = 0; @@ -71,7 +70,8 @@ struct TestFMA { while (i < LEN) Check(true); } - void run() { + + void runTestFMA() { bool *Ad; bool A[LEN]; for (unsigned i = 0; i < LEN; i++) { @@ -79,17 +79,15 @@ struct TestFMA { } HIP_ASSERT(hipMalloc((void **)&Ad, SIZE)); - hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad); + hipLaunchKernelGGL(kernelTestFMA, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad); HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); for (unsigned i = 0; i < LEN; i++) { assert(A[i]); } } -}; -struct TestHalfFMA { - static __global__ void kernel(bool *Ad) { + __global__ void kernelTestHalfFMA(bool *Ad) { _Float16 h = (_Float16)(1.0f/3.0f); float f = h; double d = f; @@ -118,7 +116,7 @@ struct TestHalfFMA { Check(true); } - void run() { + void runTestHalfFMA() { bool *Ad; bool A[LEN]; for (unsigned i = 0; i < LEN; i++) { @@ -126,17 +124,16 @@ struct TestHalfFMA { } HIP_ASSERT(hipMalloc((void **)&Ad, SIZE)); - hipLaunchKernelGGL(kernel, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad); + hipLaunchKernelGGL(kernelTestHalfFMA, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, Ad); HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); for (unsigned i = 0; i < LEN; i++) { assert(A[i]); } } -}; int main() { - TestFMA().run(); - TestHalfFMA().run(); + runTestFMA(); + runTestHalfFMA(); passed(); } From c11220f22432aa620688ecfec5bda52869c10d2b Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Thu, 20 Sep 2018 16:33:32 -0400 Subject: [PATCH 23/34] Disable non-default-rounded functions Device library has removed the non-default-rounded functions, so hipFloatMath will fail to build. These include the removal of __ocml_sqrt_rte, __ocml_sqrt_rtn, __ocml_sqrt_rtp, and __ocml_sqrt_rtz. As seen here: https://github.com/RadeonOpenCompute/ROCm-Device-Libs/commit/2fc04e10e1354edee331ce700f98a60f8255effb . Disable these function tests for now, until they are re-enabled, or deleted completely. --- tests/src/deviceLib/hipFloatMath.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/src/deviceLib/hipFloatMath.cpp b/tests/src/deviceLib/hipFloatMath.cpp index 157ea046ec..d363efc271 100644 --- a/tests/src/deviceLib/hipFloatMath.cpp +++ b/tests/src/deviceLib/hipFloatMath.cpp @@ -39,10 +39,10 @@ __global__ void floatMath(hipLaunchParm lp, float* In, float* Out) { Out[tid] = __exp10f(Out[tid]); Out[tid] = __expf(Out[tid]); Out[tid] = __frsqrt_rn(Out[tid]); - Out[tid] = __fsqrt_rd(Out[tid]); - Out[tid] = __fsqrt_rn(Out[tid]); - Out[tid] = __fsqrt_ru(Out[tid]); - Out[tid] = __fsqrt_rz(Out[tid]); + //Out[tid] = __fsqrt_rd(Out[tid]); + //Out[tid] = __fsqrt_rn(Out[tid]); + //Out[tid] = __fsqrt_ru(Out[tid]); + //Out[tid] = __fsqrt_rz(Out[tid]); Out[tid] = __log10f(Out[tid]); Out[tid] = __log2f(Out[tid]); Out[tid] = __logf(Out[tid]); From 6d794cd5057508c5f33d9ad966c32bfde3b60f64 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 21 Sep 2018 10:02:26 +0530 Subject: [PATCH 24/34] [ci] Renable nvcc testing Change-Id: I7d720b41a3ddc99453fee8b9be30494bfec3a808 --- Jenkinsfile | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 3eb24eebdd..ab6dd6d67f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -386,4 +386,40 @@ rocm_head: docker_clean_images( job_name, hip_image_name ) */ } +}, +cuda_9_x: +{ + node('hip-cuda') + { + //////////////////////////////////////////////////////////////////////// + // Block of string constants customizing behavior for cuda + String nvcc_ver = 'nvcc-9.x' + String from_image = 'ci_test_nodes/cuda-9.x/ubuntu-16.04:latest' + String inside_args = '--runtime=nvidia'; + + // Checkout source code, dependencies and version files + String source_hip_rel = checkout_and_version( nvcc_ver ) + + // Create/reuse a docker image that represents the hip build environment + def hip_build_image = docker_build_image( nvcc_ver, 'hip', '', source_hip_rel, from_image ) + + // Print system information for the log + hip_build_image.inside( inside_args ) + { + sh """#!/usr/bin/env bash + set -x + nvidia-smi + nvcc --version + """ + } + + // Conctruct a binary directory path based on build config + String build_hip_rel = build_directory_rel( build_config ); + + // Build hip inside of the build environment + docker_build_inside_image( hip_build_image, inside_args, nvcc_ver, "-DHIP_NVCC_FLAGS=--Wno-deprecated-gpu-targets", build_config, source_hip_rel, build_hip_rel ) + + // Clean docker image + docker_clean_images( 'hip', docker_build_image_name( ) ) + } } From 255589ae15467b5e9d1be454588f1fc7ca72cd58 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 21 Sep 2018 11:00:08 +0530 Subject: [PATCH 25/34] Update hipTestFMA.cpp --- tests/src/deviceLib/hipTestFMA.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/deviceLib/hipTestFMA.cpp b/tests/src/deviceLib/hipTestFMA.cpp index 1f1a5bc921..2771fac585 100644 --- a/tests/src/deviceLib/hipTestFMA.cpp +++ b/tests/src/deviceLib/hipTestFMA.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../test_common.cpp + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * RUN: %t * HIT_END */ From 357b6844fa9958a98bf9b5eba16fd4efa00765a2 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Fri, 21 Sep 2018 15:28:10 -0400 Subject: [PATCH 26/34] Improve hip_trig test case Hip_trig is failing but returning a passing value on HIP-Clang. Also update with debug output and free arrays used. --- tests/src/deviceLib/hip_trig.cpp | 43 +++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index 3f200fcf8f..b7542c25a4 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -35,6 +35,8 @@ THE SOFTWARE. #define LEN 512 #define SIZE LEN << 2 +#define TEST_DEBUG (0) + __global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d, float* sin_pd, float* cos_pd) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -47,6 +49,7 @@ __global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* co int main() { float *In, *sin_h, *cos_h, *tan_h, *sin_ph, *cos_ph; float *In_d, *sin_d, *cos_d, *tan_d, *sin_pd, *cos_pd; + int errors = 0; In = new float[LEN]; sin_h = new float[LEN]; cos_h = new float[LEN]; @@ -61,14 +64,16 @@ int main() { sin_ph[i] = 0.0f; cos_ph[i] = 0.0f; } - hipMalloc((void**)&In_d, SIZE); - hipMalloc((void**)&sin_d, SIZE); - hipMalloc((void**)&cos_d, SIZE); - hipMalloc((void**)&tan_d, SIZE); - hipMalloc((void**)&sin_pd, SIZE); - hipMalloc((void**)&cos_pd, SIZE); + HIP_ASSERT(hipMalloc((void**)&In_d, SIZE)); + HIP_ASSERT(hipMalloc((void**)&sin_d, SIZE)); + HIP_ASSERT(hipMalloc((void**)&cos_d, SIZE)); + HIP_ASSERT(hipMalloc((void**)&tan_d, SIZE)); + HIP_ASSERT(hipMalloc((void**)&sin_pd, SIZE)); + HIP_ASSERT(hipMalloc((void**)&cos_pd, SIZE)); + hipMemcpy(In_d, In, SIZE, hipMemcpyHostToDevice); - hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, In_d, sin_d, cos_d, tan_d, + hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, + In_d, sin_d, cos_d, tan_d, sin_pd, cos_pd); hipMemcpy(sin_h, sin_d, SIZE, hipMemcpyDeviceToHost); hipMemcpy(cos_h, cos_d, SIZE, hipMemcpyDeviceToHost); @@ -77,8 +82,28 @@ int main() { hipMemcpy(cos_ph, cos_pd, SIZE, hipMemcpyDeviceToHost); for (int i = 0; i < LEN; i++) { if (sin_h[i] != sin_ph[i] || cos_h[i] != cos_ph[i] || tan_h[i] * cos_h[i] != sin_h[i]) { - std::cout << "Failed!" << std::endl; + errors++; +#if TEST_DEBUG + std::cout << "Check Failed!" << std::endl; + std::cout << " sin_h: " << sin_h[i] << " sin_ph: " << sin_ph[i] << "\n" + << " cos_h: " << cos_h[i] << " cos_ph:" << cos_ph[i] << "\n" + << " tan_h * cos_h: " << tan_h[i] * cos_h[i] << " sin_h[i]: " << sin_h[i] << "\n"; +#endif } } - passed(); + + HIP_ASSERT(hipFree(In_d)); + HIP_ASSERT(hipFree(sin_d)); + HIP_ASSERT(hipFree(cos_d)); + HIP_ASSERT(hipFree(tan_d)); + HIP_ASSERT(hipFree(sin_pd)); + HIP_ASSERT(hipFree(cos_pd)); + + if (errors != 0) { + std::cout << "hip_trig FAILED!" << std::endl; + return -1; + } else { + std::cout << "hip_trig PASSED!" << std::endl; + } + return errors; } From 3d3820272a45a9be58d0c77ee4de18c4c620eb24 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Mon, 24 Sep 2018 15:01:24 +0000 Subject: [PATCH 27/34] Fix missing HIP_ASSERT in hip_trig --- tests/src/deviceLib/hip_trig.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index b7542c25a4..29c24cf5b4 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -32,6 +32,8 @@ THE SOFTWARE. #include "test_common.h" #include +#define HIP_ASSERT(x) (assert((x) == hipSuccess)) + #define LEN 512 #define SIZE LEN << 2 From 4e06ab07e272fd3e4d5a6c6983e7fed5ee27f46c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 25 Sep 2018 16:50:40 +0300 Subject: [PATCH 28/34] [HIPIFY][docs] Update CUDNN_API_supported_by_HIP.md --- docs/markdown/CUDNN_API_supported_by_HIP.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/markdown/CUDNN_API_supported_by_HIP.md b/docs/markdown/CUDNN_API_supported_by_HIP.md index ec640fecdd..ffdeb881e5 100644 --- a/docs/markdown/CUDNN_API_supported_by_HIP.md +++ b/docs/markdown/CUDNN_API_supported_by_HIP.md @@ -239,8 +239,8 @@ |`cudnnSetTensor` |`hipdnnSetTensor` | |`cudnnScaleTensor` |`hipdnnScaleTensor` | |`cudnnCreateFilterDescriptor` |`hipdnnCreateFilterDescriptor` | -|`cudnnSetFilter4dDescriptor` | | -|`cudnnGetFilter4dDescriptor` | | +|`cudnnSetFilter4dDescriptor` |`hipdnnSetFilter4dDescriptor` | +|`cudnnGetFilter4dDescriptor` |`hipdnnGetFilter4dDescriptor` | |`cudnnSetFilterNdDescriptor` |`hipdnnSetFilterNdDescriptor` | |`cudnnGetFilterNdDescriptor` |`hipdnnGetFilterNdDescriptor` | |`cudnnDestroyFilterDescriptor` |`hipdnnDestroyFilterDescriptor` | From d279c7a1dde54b407903240aaf95f918e5769d00 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 25 Sep 2018 18:46:23 +0300 Subject: [PATCH 29/34] [HIPIFY][BLAS] Add support of hipblasGemmEx and corresponding types TODO (hipBLAS/HIP): rename hipblasDatatype_t to hipDataType_t and move it from hipBLAS to HIP, as Data types are used not only in BLAS library. --- docs/markdown/CUBLAS_API_supported_by_HIP.md | 8 +- ..._Runtime_API_functions_supported_by_HIP.md | 96 +++++++++++-------- hipify-clang/src/CUDA2HipMap.cpp | 47 +++++---- 3 files changed, 82 insertions(+), 69 deletions(-) diff --git a/docs/markdown/CUBLAS_API_supported_by_HIP.md b/docs/markdown/CUBLAS_API_supported_by_HIP.md index 932092612b..750703f4e1 100644 --- a/docs/markdown/CUBLAS_API_supported_by_HIP.md +++ b/docs/markdown/CUBLAS_API_supported_by_HIP.md @@ -35,9 +35,9 @@ | enum |***`cublasAtomicsMode_t`*** | | | 0 |*`CUBLAS_ATOMICS_NOT_ALLOWED`* | | | 1 |*`CUBLAS_ATOMICS_ALLOWED`* | | -| enum |***`cublasAtomicsMode_t`*** | | -| -1 |*`CUBLAS_GEMM_DFALT`* | | -| -1 |*`CUBLAS_GEMM_DEFAULT`* | | +| enum |***`cublasGemmAlgo_t`*** |***`hipblasGemmAlgo_t`*** | +| -1 |*`CUBLAS_GEMM_DFALT`* |*`HIPBLAS_GEMM_DEFAULT`* | 160 | +| -1 |*`CUBLAS_GEMM_DEFAULT`* |*`HIPBLAS_GEMM_DEFAULT`* | 160 | | 0 |*`CUBLAS_GEMM_ALGO0`* | | | 1 |*`CUBLAS_GEMM_ALGO1`* | | | 2 |*`CUBLAS_GEMM_ALGO2`* | | @@ -391,7 +391,7 @@ |`cublasZgemm3m` | | |`cublasHgemm` |`hipblasHgemm` | |`cublasSgemmEx` | | -|`cublasGemmEx` | | +|`cublasGemmEx` |`hipblasGemmEx` | |`cublasCgemmEx` | | |`cublasUint8gemmBias` | | |`cublasSsyrk` | | diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md index dca2683b12..7f87981000 100644 --- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md @@ -93,10 +93,10 @@ ## **7. Occupancy** -| **CUDA** | **HIP** | -|-----------------------------------------------------------|-------------------------------| +| **CUDA** | **HIP** | +|-----------------------------------------------------------|-----------------------------------------------| | `cudaOccupancyMaxActiveBlocksPerMultiprocessor` | `hipOccupancyMaxActiveBlocksPerMultiprocessor`| -| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | | +| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | | ## **8. Execution Control [deprecated since 7.0]** @@ -326,13 +326,13 @@ ## **25. Texture Object Management** -| **CUDA** | **HIP** | -|-----------------------------------------------------------|-------------------------------| -| `cudaCreateTextureObject` |`hipCreateTextureObject` | -| `cudaDestroyTextureObject` |`hipDestroyTextureObject` | -| `cudaGetTextureObjectResourceDesc` |`hipGetTextureObjectResourceDesc` | +| **CUDA** | **HIP** | +|-----------------------------------------------------------|--------------------------------------| +| `cudaCreateTextureObject` |`hipCreateTextureObject` | +| `cudaDestroyTextureObject` |`hipDestroyTextureObject` | +| `cudaGetTextureObjectResourceDesc` |`hipGetTextureObjectResourceDesc` | | `cudaGetTextureObjectResourceViewDesc` |`hipGetTextureObjectResourceViewDesc` | -| `cudaGetTextureObjectTextureDesc` |`hipGetTextureObjectTextureDesc` | +| `cudaGetTextureObjectTextureDesc` |`hipGetTextureObjectTextureDesc` | ## **26. Surface Object Management** @@ -352,36 +352,36 @@ ## **28. C++ API Routines** *(7.0 contains, 7.5 doesn’t)* -| **CUDA** | **HIP** | -|-----------------------------------------------------------|-------------------------------| -| `cudaBindSurfaceToArray` | | -| `cudaBindTexture` | `hipBindTexture` | -| `cudaBindTexture2D` | | -| `cudaBindTextureToArray` | | -| `cudaBindTextureToMipmappedArray` | | -| `cudaCreateChannelDesc` | `hipCreateChannelDesc` | -| `cudaFuncGetAttributes` | | -| `cudaFuncSetCacheConfig` | | -| `cudaGetSymbolAddress` | | -| `cudaGetSymbolSize` | | -| `cudaGetTextureAlignmentOffset` | | -| `cudaLaunch` | | -| `cudaLaunchKernel` | | -| `cudaMallocHost` | | -| `cudaMallocManaged` | | -| `cudaMemcpyFromSymbol` | | -| `cudaMemcpyFromSymbolAsync` | | -| `cudaMemcpyToSymbol` | | -| `cudaMemcpyToSymbolAsync` | | +| **CUDA** | **HIP** | +|-----------------------------------------------------------|------------------------------------------------| +| `cudaBindSurfaceToArray` | | +| `cudaBindTexture` | `hipBindTexture` | +| `cudaBindTexture2D` | | +| `cudaBindTextureToArray` | | +| `cudaBindTextureToMipmappedArray` | | +| `cudaCreateChannelDesc` | `hipCreateChannelDesc` | +| `cudaFuncGetAttributes` | | +| `cudaFuncSetCacheConfig` | | +| `cudaGetSymbolAddress` | | +| `cudaGetSymbolSize` | | +| `cudaGetTextureAlignmentOffset` | | +| `cudaLaunch` | | +| `cudaLaunchKernel` | | +| `cudaMallocHost` | | +| `cudaMallocManaged` | | +| `cudaMemcpyFromSymbol` | | +| `cudaMemcpyFromSymbolAsync` | | +| `cudaMemcpyToSymbol` | | +| `cudaMemcpyToSymbolAsync` | | | `cudaOccupancyMaxActiveBlocksPerMultiprocessor` | `hipOccupancyMaxActiveBlocksPerMultiprocessor` | -| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | | -| `cudaOccupancyMaxPotentialBlockSize` | `hipOccupancyMaxPotentialBlockSize` | -| `cudaOccupancyMaxPotentialBlockSizeVariableSMem` | | -| `cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags` | | -| `cudaOccupancyMaxPotentialBlockSizeWithFlags` | | -| `cudaSetupArgument` | | -| `cudaStreamAttachMemAsync` | | -| `cudaUnbindTexture` | `hipUnbindTexture` | +| `cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | | +| `cudaOccupancyMaxPotentialBlockSize` | `hipOccupancyMaxPotentialBlockSize` | +| `cudaOccupancyMaxPotentialBlockSizeVariableSMem` | | +| `cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags` | | +| `cudaOccupancyMaxPotentialBlockSizeWithFlags` | | +| `cudaSetupArgument` | | +| `cudaStreamAttachMemAsync` | | +| `cudaUnbindTexture` | `hipUnbindTexture` | ## **30. Profiler Control** @@ -395,8 +395,8 @@ ## **31. Data types** -| **type** | **CUDA** | **HIP** | -|-------------:|-----------------------------------------------|------------------------------------------------------| +| **type** | **CUDA** | **HIP** |**HIP value** (if differs) | +|-------------:|-----------------------------------------------|------------------------------------------------------|---------------------------| | struct | `cudaChannelFormatDesc` | `hipChannelFormatDesc` | | struct | `cudaDeviceProp` | `hipDeviceProp_t` | | struct | `cudaExtent` | `hipExtent` | @@ -790,3 +790,19 @@ | define | `cudaTextureType1DLayered` | `hipTextureType1DLayered` | | define | `cudaTextureType2DLayered` | `hipTextureType2DLayered` | | define | `cudaTextureTypeCubemapLayered` | `hipTextureTypeCubemapLayered` | +| enum |***`cudaDataType_t`*** |***`hipblasDatatype_t`*** | +| enum |***`cudaDataType`*** |***`hipblasDatatype_t`*** | +| 2 |*`CUDA_R_16F`* |*`HIPBLAS_R_16F`* | 150 | +| 6 |*`CUDA_C_16F`* |*`HIPBLAS_C_16F`* | 153 | +| 0 |*`CUDA_R_32F`* |*`HIPBLAS_R_32F`* | 151 | +| 4 |*`CUDA_C_32F`* |*`HIPBLAS_C_32F`* | 154 | +| 1 |*`CUDA_R_64F`* |*`HIPBLAS_R_64F`* | 152 | +| 5 |*`CUDA_C_64F`* |*`HIPBLAS_C_64F`* | 155 | +| 3 |*`CUDA_R_8I`* | | +| 7 |*`CUDA_C_8I`* | | +| 8 |*`CUDA_R_8U`* | | +| 9 |*`CUDA_C_8U`* | | +| 10 |*`CUDA_R_32I`* | | +| 11 |*`CUDA_C_32I`* | | +| 12 |*`CUDA_R_32U`* | | +| 13 |*`CUDA_C_32U`* | | diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp index ca53738abb..d8859dfaa3 100644 --- a/hipify-clang/src/CUDA2HipMap.cpp +++ b/hipify-clang/src/CUDA2HipMap.cpp @@ -1389,22 +1389,23 @@ const std::map CUDA_IDENTIFIER_MAP{ /////////////////////////////// CUDA RT API /////////////////////////////// // Data types - {"cudaDataType_t", {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"cudaDataType", {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_16F", {"hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_16F", {"hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_32F", {"hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_32F", {"hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_64F", {"hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_64F", {"hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_8I", {"hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_8I", {"hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_8U", {"hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_8U", {"hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_32I", {"hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_32I", {"hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_R_32U", {"hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, - {"CUDA_C_32U", {"hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, + // TODO: rename hipblasDatatype_t to hipDataType_t and move from hipBLAS to HIP + {"cudaDataType_t", {"hipblasDatatype_t", CONV_TYPE, API_RUNTIME}}, + {"cudaDataType", {"hipblasDatatype_t", CONV_TYPE, API_RUNTIME}}, + {"CUDA_R_16F", {"HIPBLAS_R_16F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 2 // 150 + {"CUDA_C_16F", {"HIPBLAS_C_16F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 6 // 153 + {"CUDA_R_32F", {"HIPBLAS_R_32F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 0 // 151 + {"CUDA_C_32F", {"HIPBLAS_C_32F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 4 // 154 + {"CUDA_R_64F", {"HIPBLAS_R_64F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 1 // 152 + {"CUDA_C_64F", {"HIPBLAS_C_64F", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 5 // 155 + {"CUDA_R_8I", {"HIPBLAS_R_8I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 3 // + {"CUDA_C_8I", {"HIPBLAS_C_8I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 7 // + {"CUDA_R_8U", {"HIPBLAS_R_8U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 8 // + {"CUDA_C_8U", {"HIPBLAS_C_8U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 9 // + {"CUDA_R_32I", {"HIPBLAS_R_32I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 10 // + {"CUDA_C_32I", {"HIPBLAS_C_32I", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 11 // + {"CUDA_R_32U", {"HIPBLAS_R_32U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 12 // + {"CUDA_C_32U", {"HIPBLAS_C_32U", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 13 // // Library property types // IMPORTANT: no cuda prefix @@ -2115,11 +2116,7 @@ const std::map CUDA_IDENTIFIER_MAP{ {"CUBLAS_ATOMICS_ALLOWED", {"HIPBLAS_ATOMICS_ALLOWED", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // Blas Data Type - {"cublasDataType_t", {"hipblasDataType_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}}, - {"CUBLAS_DATA_FLOAT", {"HIPBLAS_DATA_FLOAT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, - {"CUBLAS_DATA_DOUBLE", {"HIPBLAS_DATA_DOUBLE", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, - {"CUBLAS_DATA_HALF", {"HIPBLAS_DATA_HALF", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, - {"CUBLAS_DATA_INT8", {"HIPBLAS_DATA_INT8", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, + {"cublasDataType_t", {"hipblasDatatype_t", CONV_TYPE, API_BLAS}}, // Blas Math mode/tensor operation {"cublasMath_t", {"hipblasMath_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}}, @@ -2127,9 +2124,9 @@ const std::map CUDA_IDENTIFIER_MAP{ {"CUBLAS_TENSOR_OP_MATH", {"HIPBLAS_TENSOR_OP_MATH", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // Blass different GEMM algorithms - {"cublasGemmAlgo_t", {"hipblasGemmAlgo_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED}}, - {"CUBLAS_GEMM_DFALT", {"HIPBLAS_GEMM_DFALT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // -1 - {"CUBLAS_GEMM_DEFAULT", {"HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // -1 + {"cublasGemmAlgo_t", {"hipblasGemmAlgo_t", CONV_TYPE, API_BLAS}}, + {"CUBLAS_GEMM_DFALT", {"HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS}}, // -1 // 160 + {"CUBLAS_GEMM_DEFAULT", {"HIPBLAS_GEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS}}, // -1 // 160 {"CUBLAS_GEMM_ALGO0", {"HIPBLAS_GEMM_ALGO0", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // 0 {"CUBLAS_GEMM_ALGO1", {"HIPBLAS_GEMM_ALGO1", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // 1 {"CUBLAS_GEMM_ALGO2", {"HIPBLAS_GEMM_ALGO2", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED}}, // 2 @@ -2687,7 +2684,7 @@ const std::map CUDA_IDENTIFIER_MAP{ //IO in FP16 / FP32, computation in float {"cublasSgemmEx", {"hipblasSgemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}}, - {"cublasGemmEx", {"hipblasGemmEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}}, + {"cublasGemmEx", {"hipblasGemmEx", CONV_MATH_FUNC, API_BLAS}}, {"cublasGemmBatchedEx", {"hipblasGemmBatchedEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}}, {"cublasGemmStridedBatchedEx", {"hipblasGemmStridedBatchedEx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}}, // IO in Int8 complex/cuComplex, computation in cuComplex From 5179a72cda7dfebc45c167755e5d439974f7da92 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Tue, 25 Sep 2018 15:58:36 +0000 Subject: [PATCH 30/34] Use trig functions from ocml instead --- tests/src/deviceLib/hip_trig.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index 29c24cf5b4..7b076065fa 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -42,10 +42,10 @@ THE SOFTWARE. __global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d, float* sin_pd, float* cos_pd) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - sin_d[tid] = __sinf(In[tid]); - cos_d[tid] = __cosf(In[tid]); - tan_d[tid] = __tanf(In[tid]); - __sincosf(In[tid], &sin_pd[tid], &cos_pd[tid]); + sin_d[tid] = sinf(In[tid]); + cos_d[tid] = cosf(In[tid]); + tan_d[tid] = tanf(In[tid]); + sincosf(In[tid], &sin_pd[tid], &cos_pd[tid]); } int main() { From 90f57d452a6b75d3c7a70c41072809f57792319e Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 26 Sep 2018 12:47:36 +0530 Subject: [PATCH 31/34] Return hipSuccess when sizeBytes=0 in hipMemset --- src/hip_memory.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index dc5390f014..bda6ad2650 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1543,6 +1543,8 @@ hipError_t ihipMemset(void* dst, int value, size_t sizeBytes, hipStream_t strea { hipError_t e = hipSuccess; + if (sizeBytes == 0) return e; + if (stream && (dst != NULL)) { if(copyDataType == ihipMemsetDataTypeChar){ if ((sizeBytes & 0x3) == 0) { From 221ef1db1f343479639da79bd26b4cb8e9530a2f Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 26 Sep 2018 17:01:59 +0300 Subject: [PATCH 32/34] [HIPIFY][doc] Update README.md due to new LLVM 7.0.0 and CUDA 10.0 releases. --- hipify-clang/README.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index fdb37d2e2d..d89a50ffbe 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -29,11 +29,16 @@ ##Dependencies `hipify-clang` requires: -1. LLVM+CLANG of at least version 3.8.0, latest stable and recommended release is 6.0.1. -2. CUDA at least version 7.5, latest supported release is 9.0. +1. LLVM+CLANG of at least version 3.8.0, latest stable and recommended releases: + + 6.0.1 (linux and windows), -| **LLVM release version** | **CUDA latest supported version** | -|:------------------------:|:---------------------------------:| + 7.0.0 (linux only). + +2. CUDA at least version 7.5, latest supported release is 9.2. + +| **LLVM release version** | **CUDA latest supported version** |**Comments**| +|:------------------------:|:---------------------------------:|:-------- -:| | 3.8.0 | 7.5 | | 3.8.1 | 7.5 | | 3.9.0 | 7.5 | @@ -45,6 +50,8 @@ | 5.0.2 | 8.0 | | 6.0.0 | 9.0 | | 6.0.1 | 9.0 | +| 7.0.0 | 9.2 | linux only | +| | 10.0 | not yet supported | In most cases, you can get a suitable version of LLVM+CLANG with your package manager. From c943885a65d60909021cec9bdd9d245995de07a1 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 26 Sep 2018 17:26:25 +0300 Subject: [PATCH 33/34] [HIPIFY][docs] Fix typos in Readme.md --- hipify-clang/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index 061250a807..02bc9cfed0 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -38,8 +38,8 @@ 2. CUDA at least version 7.5, latest supported release is 9.2. -| **LLVM release version** | **CUDA latest supported version** |**Comments**| -|:------------------------:|:---------------------------------:|:-------- -:| +| **LLVM release version** | **CUDA latest supported version** | **Comments** | +|:------------------------:|:---------------------------------:|:------------:| | 3.8.0 | 7.5 | | 3.8.1 | 7.5 | | 3.9.0 | 7.5 | From d0c683fcdeb97ad441b6ca8ce0d3729bb24036aa Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 27 Sep 2018 19:05:22 +0300 Subject: [PATCH 34/34] [HIPIFY][cmake] CUDA 10.0 is not supported. --- hipify-clang/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index 5d9070be28..e2703356ee 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -89,7 +89,8 @@ if (HIPIFY_CLANG_TESTS) if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR (CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR (CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR - (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0")) + (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0") OR + CUDA_VERSION VERSION_EQUAL "10.0") message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.") if (CUDA_VERSION VERSION_LESS "7.0") message(STATUS "Please install CUDA 7.0 or higher.")