From 097fa4f2008bd0cfdaf7ed08fdb73e8ca83efbba Mon Sep 17 00:00:00 2001 From: emankov Date: Wed, 2 May 2018 22:11:05 +0300 Subject: [PATCH 1/2] [HIPIFY] Initial cuDNN support - based on https://github.com/ROCmSoftwarePlatform/hipDNN. - lit testing was supplemented with CUDA_DNN_ROOT_DIR for cudnn testing. - single cuDNN test was added. [ROCm/hip commit: 2569972dde51db2c383604661625243dcaba1a26] --- projects/hip/hipify-clang/src/CUDA2HipMap.cpp | 80 ++++++ projects/hip/hipify-clang/src/Statistics.h | 1 + .../cuDNN/cudnn_convolution_forward.cu | 254 ++++++++++++++++++ projects/hip/tests/hipify-clang/lit.cfg | 4 +- .../hip/tests/hipify-clang/lit.site.cfg.in | 1 + 5 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 projects/hip/tests/hipify-clang/cuDNN/cudnn_convolution_forward.cu diff --git a/projects/hip/hipify-clang/src/CUDA2HipMap.cpp b/projects/hip/hipify-clang/src/CUDA2HipMap.cpp index 47358802e9..cb96f7b225 100644 --- a/projects/hip/hipify-clang/src/CUDA2HipMap.cpp +++ b/projects/hip/hipify-clang/src/CUDA2HipMap.cpp @@ -403,6 +403,9 @@ const std::map CUDA_INCLUDE_MAP{ {"curand_precalc.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}}, {"curand_uniform.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}}, + // CUDNN includes + {"cudnn.h", {"hipDNN.h", CONV_INCLUDE_CUDA_MAIN_H, API_RAND}}, + // HIP includes // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h // {"cudacommon.h", {"hipcommon.h", CONV_INCLUDE, API_RUNTIME}}, @@ -2880,6 +2883,83 @@ const std::map CUDA_IDENTIFIER_MAP{ {"curand_poisson4", {"hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND}}, {"curand_Philox4x32_10", {"hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED}}, // unchanged function names: skipahead, skipahead_sequence, skipahead_subsequence + + ///////////////////////////// cuDNN ///////////////////////////// + {"cudnnContext", {"hipdnnContext", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnHandle_t", {"hipdnnHandle_t", CONV_TYPE, API_DNN}}, + {"cudnnStatus_t", {"hipdnnStatus_t", CONV_TYPE, API_DNN}}, + {"CUDNN_STATUS_SUCCESS", {"HIPDNN_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_STATUS_NOT_INITIALIZED", {"HIPDNN_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_STATUS_ALLOC_FAILED", {"HIPDNN_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_STATUS_BAD_PARAM", {"HIPDNN_STATUS_BAD_PARAM", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_STATUS_INTERNAL_ERROR", {"HIPDNN_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_STATUS_INVALID_VALUE", {"HIPDNN_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"CUDNN_STATUS_ARCH_MISMATCH", {"HIPDNN_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_DNN}}, // 6 + {"CUDNN_STATUS_MAPPING_ERROR", {"HIPDNN_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_DNN}}, // 7 + {"CUDNN_STATUS_EXECUTION_FAILED", {"HIPDNN_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_DNN}}, // 8 + {"CUDNN_STATUS_NOT_SUPPORTED", {"HIPDNN_STATUS_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_DNN}}, // 9 + {"CUDNN_STATUS_LICENSE_ERROR", {"HIPDNN_STATUS_LICENSE_ERROR", CONV_NUMERIC_LITERAL, API_DNN}}, // 10 + {"CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING", {"HIPDNN_STATUS_RUNTIME_PREREQUISITE_MISSING", CONV_NUMERIC_LITERAL, API_DNN}}, // 11 + {"CUDNN_STATUS_RUNTIME_IN_PROGRESS", {"HIPDNN_STATUS_RUNTIME_IN_PROGRESS", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 12 + {"CUDNN_STATUS_RUNTIME_FP_OVERFLOW", {"HIPDNN_STATUS_RUNTIME_FP_OVERFLOW", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 13 + {"cudnnRuntimeTag_t", {"hipdnnRuntimeTag_t", CONV_TYPE, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnTensorDescriptor_t", {"hipdnnTensorDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnConvolutionDescriptor_t", {"hipdnnConvolutionDescriptor_t", CONV_TYPE, API_DNN}}, + {"cudnnConvolutionMode_t", {"hipdnnConvolutionMode_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION", {"HIPDNN_CONVOLUTION", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CROSS_CORRELATION", {"HIPDNN_CROSS_CORRELATION", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"cudnnTensorFormat_t", {"hipdnnTensorFormat_t", CONV_TYPE, API_DNN}}, + {"CUDNN_TENSOR_NCHW", {"HIPDNN_TENSOR_NCHW", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_TENSOR_NHWC", {"HIPDNN_TENSOR_NHWC", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_TENSOR_NCHW_VECT_C", {"HIPDNN_TENSOR_NCHW_VECT_C", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"cudnnDataType_t", {"hipdnnDataType_t", CONV_TYPE, API_DNN}}, + {"CUDNN_DATA_FLOAT", {"HIPDNN_DATA_FLOAT", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_DATA_DOUBLE", {"HIPDNN_DATA_DOUBLE", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_DATA_HALF", {"HIPDNN_DATA_HALF", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_DATA_INT8", {"HIPDNN_DATA_INT8", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_DATA_INT32", {"HIPDNN_DATA_INT32", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_DATA_INT8x4", {"HIPDNN_DATA_INT8x4", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"CUDNN_DATA_UINT8", {"HIPDNN_DATA_UINT8", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 6 + {"CUDNN_DATA_UINT8x4", {"HIPDNN_DATA_UINT8x4", CONV_NUMERIC_LITERAL, API_DNN, HIP_UNSUPPORTED}}, // 7 + + {"cudnnConvolutionFwdAlgo_t", {"hipdnnConvolutionFwdAlgo_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", {"HIPDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM", {"HIPDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_CONVOLUTION_FWD_ALGO_GEMM", {"HIPDNN_CONVOLUTION_FWD_ALGO_GEMM", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + {"CUDNN_CONVOLUTION_FWD_ALGO_DIRECT", {"HIPDNN_CONVOLUTION_FWD_ALGO_DIRECT", CONV_NUMERIC_LITERAL, API_DNN}}, // 3 + {"CUDNN_CONVOLUTION_FWD_ALGO_FFT", {"HIPDNN_CONVOLUTION_FWD_ALGO_FFT", CONV_NUMERIC_LITERAL, API_DNN}}, // 4 + {"CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING", {"HIPDNN_CONVOLUTION_FWD_ALGO_FFT_TILING", CONV_NUMERIC_LITERAL, API_DNN}}, // 5 + {"CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", {"HIPDNN_CONVOLUTION_FWD_ALGO_WINOGRAD", CONV_NUMERIC_LITERAL, API_DNN}}, // 6 + {"CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED", {"HIPDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED", CONV_NUMERIC_LITERAL, API_DNN}}, // 7 + {"CUDNN_CONVOLUTION_FWD_ALGO_COUNT", {"HIPDNN_CONVOLUTION_FWD_ALGO_COUNT", CONV_NUMERIC_LITERAL, API_DNN}}, // 8 + + {"cudnnConvolutionFwdPreference_t", {"hipdnnConvolutionFwdPreference_t", CONV_TYPE, API_DNN}}, + {"CUDNN_CONVOLUTION_FWD_NO_WORKSPACE", {"HIPDNN_CONVOLUTION_FWD_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_DNN}}, // 0 + {"CUDNN_CONVOLUTION_FWD_PREFER_FASTEST", {"HIPDNN_CONVOLUTION_FWD_PREFER_FASTEST", CONV_NUMERIC_LITERAL, API_DNN}}, // 1 + {"CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT", {"HIPDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT", CONV_NUMERIC_LITERAL, API_DNN}}, // 2 + + {"cudnnFilterDescriptor_t", {"hipdnnFilterDescriptor_t", CONV_TYPE, API_DNN}}, + + {"cudnnGetVersion", {"hipdnnGetVersion", CONV_VERSION, API_DNN}}, + {"cudnnGetCudartVersion", {"hipdnnGetCudartVersion", CONV_VERSION, API_DNN, HIP_UNSUPPORTED}}, + {"cudnnGetErrorString", {"hipdnnGetErrorString", CONV_ERROR, API_DNN}}, + {"cudnnCreate", {"hipdnnCreate", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateTensorDescriptor", {"hipdnnCreateTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetTensor4dDescriptor", {"hipdnnSetTensor4dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetConvolution2dDescriptor", {"hipdnnSetConvolution2dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolution2dForwardOutputDim", {"hipdnnGetConvolution2dForwardOutputDim", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateFilterDescriptor", {"hipdnnCreateFilterDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnSetFilter4dDescriptor", {"hipdnnSetFilter4dDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnCreateConvolutionDescriptor", {"hipdnnCreateConvolutionDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionForwardAlgorithm", {"hipdnnGetConvolutionForwardAlgorithm", CONV_MATH_FUNC, API_DNN}}, + {"cudnnConvolutionForward", {"hipdnnConvolutionForward", CONV_MATH_FUNC, API_DNN}}, + {"cudnnGetConvolutionForwardWorkspaceSize", {"hipdnnGetConvolutionForwardWorkspaceSize", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyTensorDescriptor", {"hipdnnDestroyTensorDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyConvolutionDescriptor", {"hipdnnDestroyConvolutionDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroyFilterDescriptor", {"hipdnnDestroyFilterDescriptor", CONV_MATH_FUNC, API_DNN}}, + {"cudnnDestroy", {"hipdnnDestroy", CONV_MATH_FUNC, API_DNN}}, + }; const std::map& CUDA_RENAMES_MAP() { diff --git a/projects/hip/hipify-clang/src/Statistics.h b/projects/hip/hipify-clang/src/Statistics.h index 81be7b09a8..77bf9b0518 100644 --- a/projects/hip/hipify-clang/src/Statistics.h +++ b/projects/hip/hipify-clang/src/Statistics.h @@ -56,6 +56,7 @@ enum ApiTypes { API_RUNTIME, API_BLAS, API_RAND, + API_DNN, API_LAST }; constexpr int NUM_API_TYPES = (int) ApiTypes::API_LAST; diff --git a/projects/hip/tests/hipify-clang/cuDNN/cudnn_convolution_forward.cu b/projects/hip/tests/hipify-clang/cuDNN/cudnn_convolution_forward.cu new file mode 100644 index 0000000000..e58116e22a --- /dev/null +++ b/projects/hip/tests/hipify-clang/cuDNN/cudnn_convolution_forward.cu @@ -0,0 +1,254 @@ +// RUN: %run_test hipify "%s" "%t" %cuda_args + +#include +#include +#include +#include + +// CHECK: #include +#include +// CHECK: #include "hipDNN.h" +#include "cudnn.h" + +// CHECK: hipError_t err = (f); \ +// CHECK: if (err != hipSuccess) { \ + +#define CUDA_CALL(f) { \ + cudaError_t err = (f); \ + if (err != cudaSuccess) { \ + std::cout \ + << " Error occurred: " << err << std::endl; \ + std::exit(1); \ + } \ +} +// CHECK: hipdnnStatus_t err = (f); \ +// CHECK: if (err != HIPDNN_STATUS_SUCCESS) { \ + +#define CUDNN_CALL(f) { \ + cudnnStatus_t err = (f); \ + if (err != CUDNN_STATUS_SUCCESS) { \ + std::cout \ + << " Error occurred: " << err << std::endl; \ + std::exit(1); \ + } \ +} + +__global__ void dev_const(float *px, float k) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + px[tid] = k; +} + +__global__ void dev_iota(float *px) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + px[tid] = tid; +} + +void print(const float *data, int n, int c, int h, int w) { + std::vector buffer(1 << 20); + // CHECK: CUDA_CALL(hipMemcpy( + CUDA_CALL(cudaMemcpy( + buffer.data(), data, + n * c * h * w * sizeof(float), + // CHECK: hipMemcpyDeviceToHost)); + cudaMemcpyDeviceToHost)); + int a = 0; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < c; ++j) { + std::cout << "n=" << i << ", c=" << j << ":" << std::endl; + for (int k = 0; k < h; ++k) { + for (int l = 0; l < w; ++l) { + std::cout << std::setw(4) << std::right << buffer[a]; + ++a; + } + std::cout << std::endl; + } + } + } + std::cout << std::endl; +} + +int main() { + // CHECK: hipdnnHandle_t cudnn; + cudnnHandle_t cudnn; + // CHECK: CUDNN_CALL(hipdnnCreate(&cudnn)); + CUDNN_CALL(cudnnCreate(&cudnn)); + + // input + const int in_n = 1; + const int in_c = 1; + const int in_h = 5; + const int in_w = 5; + std::cout << "in_n: " << in_n << std::endl; + std::cout << "in_c: " << in_c << std::endl; + std::cout << "in_h: " << in_h << std::endl; + std::cout << "in_w: " << in_w << std::endl; + std::cout << std::endl; + // CHECK: hipdnnTensorDescriptor_t in_desc; + cudnnTensorDescriptor_t in_desc; + // CHECK: CUDNN_CALL(hipdnnCreateTensorDescriptor(&in_desc)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc)); + // CHECK: CUDNN_CALL(hipdnnSetTensor4dDescriptor( + CUDNN_CALL(cudnnSetTensor4dDescriptor( + // CHECK: in_desc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, + in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, + in_n, in_c, in_h, in_w)); + + float *in_data; + // CHECK: CUDA_CALL(hipMalloc( + CUDA_CALL(cudaMalloc( + &in_data, in_n * in_c * in_h * in_w * sizeof(float))); + + // filter + const int filt_k = 1; + const int filt_c = 1; + const int filt_h = 2; + const int filt_w = 2; + std::cout << "filt_k: " << filt_k << std::endl; + std::cout << "filt_c: " << filt_c << std::endl; + std::cout << "filt_h: " << filt_h << std::endl; + std::cout << "filt_w: " << filt_w << std::endl; + std::cout << std::endl; + + // CHECK: hipdnnFilterDescriptor_t filt_desc; + cudnnFilterDescriptor_t filt_desc; + // CHECK: CUDNN_CALL(hipdnnCreateFilterDescriptor(&filt_desc)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc)); + // CHECK: CUDNN_CALL(hipdnnSetFilter4dDescriptor( + CUDNN_CALL(cudnnSetFilter4dDescriptor( + // CHECK: filt_desc, HIPDNN_DATA_FLOAT, HIPDNN_TENSOR_NCHW, + filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, + filt_k, filt_c, filt_h, filt_w)); + + float *filt_data; + // CUDA_CALL(hipMalloc( + CUDA_CALL(cudaMalloc( + &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float))); + + // convolution + const int pad_h = 1; + const int pad_w = 1; + const int str_h = 1; + const int str_w = 1; + const int dil_h = 1; + const int dil_w = 1; + std::cout << "pad_h: " << pad_h << std::endl; + std::cout << "pad_w: " << pad_w << std::endl; + std::cout << "str_h: " << str_h << std::endl; + std::cout << "str_w: " << str_w << std::endl; + std::cout << "dil_h: " << dil_h << std::endl; + std::cout << "dil_w: " << dil_w << std::endl; + std::cout << std::endl; + + // CHECK: hipdnnConvolutionDescriptor_t conv_desc; + cudnnConvolutionDescriptor_t conv_desc; + // CUDNN_CALL(hipdnnCreateConvolutionDescriptor(&conv_desc)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc)); + // CHECK: CUDNN_CALL(hipdnnSetConvolution2dDescriptor( + CUDNN_CALL(cudnnSetConvolution2dDescriptor( + conv_desc, + pad_h, pad_w, str_h, str_w, dil_h, dil_w, + // CHECK: HIPDNN_CONVOLUTION, HIPDNN_DATA_FLOAT)); + CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT)); + + // output + int out_n; + int out_c; + int out_h; + int out_w; + + // CHECK: CUDNN_CALL(hipdnnGetConvolution2dForwardOutputDim( + CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim( + conv_desc, in_desc, filt_desc, + &out_n, &out_c, &out_h, &out_w)); + + std::cout << "out_n: " << out_n << std::endl; + std::cout << "out_c: " << out_c << std::endl; + std::cout << "out_h: " << out_h << std::endl; + std::cout << "out_w: " << out_w << std::endl; + std::cout << std::endl; + // CHECK: hipdnnTensorDescriptor_t out_desc; + cudnnTensorDescriptor_t out_desc; + // CHECK: CUDNN_CALL(hipdnnCreateTensorDescriptor(&out_desc)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc)); + // CHECK: CUDNN_CALL(hipdnnSetTensor4dDescriptor( + CUDNN_CALL(cudnnSetTensor4dDescriptor( + // CHECK: out_desc, HIPDNN_TENSOR_NCHW, HIPDNN_DATA_FLOAT, + out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, + out_n, out_c, out_h, out_w)); + + float *out_data; + // CHECK: CUDA_CALL(hipMalloc( + CUDA_CALL(cudaMalloc( + &out_data, out_n * out_c * out_h * out_w * sizeof(float))); + + // algorithm + // CHECK: hipdnnConvolutionFwdAlgo_t algo; + cudnnConvolutionFwdAlgo_t algo; + // CHECK: CUDNN_CALL(hipdnnGetConvolutionForwardAlgorithm( + CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm( + cudnn, + in_desc, filt_desc, conv_desc, out_desc, + // CHECK: HIPDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo)); + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo)); + + std::cout << "Convolution algorithm: " << algo << std::endl; + std::cout << std::endl; + + // workspace + size_t ws_size; + // CHECK: CUDNN_CALL(hipdnnGetConvolutionForwardWorkspaceSize( + CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize( + cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size)); + + float *ws_data; + // CHECK: CUDA_CALL(hipMalloc(&ws_data, ws_size)); + CUDA_CALL(cudaMalloc(&ws_data, ws_size)); + + std::cout << "Workspace size: " << ws_size << std::endl; + std::cout << std::endl; + + // perform + float alpha = 1.f; + float beta = 0.f; + // CHECK: hipLaunchKernelGGL(dev_iota, dim3(in_w * in_h), dim3(in_n * in_c), 0, 0, in_data); + // CHECK: hipLaunchKernelGGL(dev_const, dim3(filt_w * filt_h), dim3(filt_k * filt_c), 0, 0, filt_data, 1.f); + dev_iota<<>>(in_data); + dev_const<<>>(filt_data, 1.f); + // CHECK: CUDNN_CALL(hipdnnConvolutionForward( + CUDNN_CALL(cudnnConvolutionForward( + cudnn, + &alpha, in_desc, in_data, filt_desc, filt_data, + conv_desc, algo, ws_data, ws_size, + &beta, out_desc, out_data)); + + // results + std::cout << "in_data:" << std::endl; + print(in_data, in_n, in_c, in_h, in_w); + + std::cout << "filt_data:" << std::endl; + print(filt_data, filt_k, filt_c, filt_h, filt_w); + + std::cout << "out_data:" << std::endl; + print(out_data, out_n, out_c, out_h, out_w); + + // finalizing + // CHECK: CUDA_CALL(hipFree(ws_data)); + CUDA_CALL(cudaFree(ws_data)); + // CHECK: CUDA_CALL(hipFree(out_data)); + CUDA_CALL(cudaFree(out_data)); + // CHECK: CUDNN_CALL(hipdnnDestroyTensorDescriptor(out_desc)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc)); + // CHECK: CUDNN_CALL(hipdnnDestroyConvolutionDescriptor(conv_desc)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc)); + // CHECK: CUDA_CALL(hipFree(filt_data)); + CUDA_CALL(cudaFree(filt_data)); + // CHECK: CUDNN_CALL(hipdnnDestroyFilterDescriptor(filt_desc)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc)); + // CHECK: CUDA_CALL(hipFree(in_data)); + CUDA_CALL(cudaFree(in_data)); + // CHECK: CUDNN_CALL(hipdnnDestroyTensorDescriptor(in_desc)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc)); + // CHECK: CUDNN_CALL(hipdnnDestroy(cudnn)); + CUDNN_CALL(cudnnDestroy(cudnn)); + return 0; +} diff --git a/projects/hip/tests/hipify-clang/lit.cfg b/projects/hip/tests/hipify-clang/lit.cfg index 104cee8311..98961fe166 100644 --- a/projects/hip/tests/hipify-clang/lit.cfg +++ b/projects/hip/tests/hipify-clang/lit.cfg @@ -57,6 +57,8 @@ else: run_test_ext = ".sh" clang_args += " -isystem'%s'/samples/common/inc" -config.substitutions.append(("%cuda_args", clang_args % (config.cuda_root, config.cuda_sdk_root))) +clang_args += " -I'%s'/include" + +config.substitutions.append(("%cuda_args", clang_args % (config.cuda_root, config.cuda_sdk_root, config.cuda_dnn_root))) config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"')) config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"')) diff --git a/projects/hip/tests/hipify-clang/lit.site.cfg.in b/projects/hip/tests/hipify-clang/lit.site.cfg.in index e52db7b026..99aa92b49f 100644 --- a/projects/hip/tests/hipify-clang/lit.site.cfg.in +++ b/projects/hip/tests/hipify-clang/lit.site.cfg.in @@ -4,6 +4,7 @@ import os config.llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@" config.obj_root = "@CMAKE_CURRENT_BINARY_DIR@" config.cuda_root = "@CUDA_TOOLKIT_ROOT_DIR@" +config.cuda_dnn_root = "@CUDA_DNN_ROOT_DIR@" if sys.platform in ['win32']: config.cuda_sdk_root = "@CUDA_SDK_ROOT_DIR@" if not config.cuda_sdk_root or config.cuda_sdk_root == "CUDA_SDK_ROOT_DIR-NOTFOUND": From 2ad9863d31a48b9d5474c2ceaff728f1da03aef8 Mon Sep 17 00:00:00 2001 From: emankov Date: Fri, 4 May 2018 22:23:16 +0300 Subject: [PATCH 2/2] [HIPIFY][test] Undo commit "Apply .clangformat to all repo source files" Commit broke tests due to code and comments formatting changes, thus FileCheck fails on checks, which are in comments. [ROCm/hip commit: 21b79cd467149621382115ecc1ae334b8be381a3] --- .../cuRAND/benchmark_curand_generate.cpp | 33 +- .../cuRAND/benchmark_curand_kernel.cpp | 513 +++++++++++------- 2 files changed, 320 insertions(+), 226 deletions(-) diff --git a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp index 01ea661204..af30969c6f 100644 --- a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp +++ b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp @@ -35,7 +35,7 @@ // CHECK: #include #include -// CHECK: if((x)!=hipSuccess) { +// CHECK: if ((x) != hipSuccess) { #define CUDA_CALL(x) \ do { \ if ((x) != cudaSuccess) { \ @@ -43,7 +43,7 @@ exit(EXIT_FAILURE); \ } \ } while (0) -// CHECK: if((x)!=HIPRAND_STATUS_SUCCESS) { +// CHECK: if ((x) != HIPRAND_STATUS_SUCCESS) { #define CURAND_CALL(x) \ do { \ if ((x) != CURAND_STATUS_SUCCESS) { \ @@ -59,9 +59,8 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; // CHECK: typedef hiprandRngType_t rng_type_t; typedef curandRngType rng_type_t; -// CHECK: using generate_func_type = std::function; template +// CHECK: using generate_func_type = std::function; using generate_func_type = std::function; template @@ -71,7 +70,7 @@ void run_benchmark(const cli::Parser& parser, const rng_type_t rng_type, const size_t trials = parser.get("trials"); T* data; - // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T))); + // CHECK: CUDA_CALL(hipMalloc((void**)&data, size * sizeof(T))); CUDA_CALL(cudaMalloc((void**)&data, size * sizeof(T))); // CHECK: hiprandGenerator_t generator; @@ -80,8 +79,8 @@ void run_benchmark(const cli::Parser& parser, const rng_type_t rng_type, CURAND_CALL(curandCreateGenerator(&generator, rng_type)); const size_t dimensions = parser.get("dimensions"); - // CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, - // dimensions); CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR) + // CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, dimensions); + // CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR) curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions); if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random { @@ -123,12 +122,12 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, const std::string& distribution) { if (distribution == "uniform-uint") { // CHECK: if (rng_type != HIPRAND_RNG_QUASI_SOBOL64 && - // CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64) + // CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64) { if (rng_type != CURAND_RNG_QUASI_SOBOL64 && rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64) { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, unsigned int * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, unsigned int* data, size_t size) { // CHECK: return hiprandGenerate(gen, data, size); [](curandGenerator_t gen, unsigned int* data, size_t size) { return curandGenerate(gen, data, size); @@ -142,7 +141,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64) { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, unsigned long long * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, unsigned long long* data, size_t size) { [](curandGenerator_t gen, unsigned long long* data, size_t size) { // curandGenerateLongLong is yet unsupported by HIP // CHECK-NOT: return hiprandGenerateLongLong(gen, data, size); @@ -152,7 +151,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "uniform-float") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, float* data, size_t size) { // CHECK: return hiprandGenerateUniform(gen, data, size); [](curandGenerator_t gen, float* data, size_t size) { return curandGenerateUniform(gen, data, size); @@ -160,7 +159,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "uniform-double") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, double* data, size_t size) { // CHECK: return hiprandGenerateUniformDouble(gen, data, size); [](curandGenerator_t gen, double* data, size_t size) { return curandGenerateUniformDouble(gen, data, size); @@ -168,7 +167,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "normal-float") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, float* data, size_t size) { // CHECK: return hiprandGenerateNormal(gen, data, size, 0.0f, 1.0f); [](curandGenerator_t gen, float* data, size_t size) { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); @@ -177,7 +176,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, if (distribution == "normal-double") { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, double* data, size_t size) { // CHECK: return hiprandGenerateNormalDouble(gen, data, size, 0.0, 1.0); [](curandGenerator_t gen, double* data, size_t size) { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); @@ -185,7 +184,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "log-normal-float") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, float* data, size_t size) { // CHECK: return hiprandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); [](curandGenerator_t gen, float* data, size_t size) { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); @@ -194,7 +193,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, if (distribution == "log-normal-double") { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, double* data, size_t size) { // CHECK: return hiprandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); [](curandGenerator_t gen, double* data, size_t size) { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); @@ -207,7 +206,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; run_benchmark( parser, rng_type, - // CHECK: [lambda](hiprandGenerator_t gen, unsigned int * data, size_t size) { + // CHECK: [lambda](hiprandGenerator_t gen, unsigned int* data, size_t size) { // CHECK: return hiprandGeneratePoisson(gen, data, size, lambda); [lambda](curandGenerator_t gen, unsigned int* data, size_t size) { return curandGeneratePoisson(gen, data, size, lambda); diff --git a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp index 13eba909f8..222e30570a 100644 --- a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp +++ b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp @@ -42,16 +42,15 @@ #include #include -// CHECK: hipError_t error = (x); -// CHECK: if(error!=hipSuccess) { +// CHECK: if ((x) != hipSuccess) { #define CUDA_CALL(x) \ do { \ - cudaError_t error = (x); \ - if (error != cudaSuccess) { \ - printf("Error %d at %s:%d\n", error, __FILE__, __LINE__); \ + if ((x) != cudaSuccess) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ exit(EXIT_FAILURE); \ } \ } while (0) +// CHECK: if ((x) != HIPRAND_STATUS_SUCCESS) { #define CURAND_CALL(x) \ do { \ if ((x) != CURAND_STATUS_SUCCESS) { \ @@ -64,17 +63,22 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; #endif -size_t next_power2(size_t x) { +size_t next_power2(size_t x) +{ size_t power = 1; - while (power < x) { + while (power < x) + { power *= 2; } return power; } -template -__global__ void init_kernel(GeneratorState* states, const unsigned long long seed, - const unsigned long long offset) { +template +__global__ +void init_kernel(GeneratorState * states, + const unsigned long long seed, + const unsigned long long offset) +{ const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; GeneratorState state; // CHECK: hiprand_init(seed, state_id, offset, &state); @@ -82,32 +86,42 @@ __global__ void init_kernel(GeneratorState* states, const unsigned long long see states[state_id] = state; } -template -__global__ void generate_kernel(GeneratorState* states, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { +template +__global__ +void generate_kernel(GeneratorState * states, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) +{ const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int stride = gridDim.x * blockDim.x; GeneratorState state = states[state_id]; unsigned int index = state_id; - while (index < size) { + while(index < size) + { data[index] = generate_func(&state, extra); index += stride; } states[state_id] = state; } -template -struct runner { - GeneratorState* states; +template +struct runner +{ + GeneratorState * states; - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long seed, const unsigned long long offset) { + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { const size_t states_size = blocks * threads; // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState))); - CUDA_CALL(cudaMalloc((void**)&states, states_size * sizeof(GeneratorState))); - // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, - // offset); + CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState))); + // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset); init_kernel<<>>(states, seed, offset); // CHECK: CUDA_CALL(hipPeekAtLastError()); // CHECK: CUDA_CALL(hipDeviceSynchronize()); @@ -115,21 +129,33 @@ struct runner { CUDA_CALL(cudaDeviceSynchronize()); } - ~runner() { CUDA_CALL(cudaFree(states)); } + ~runner() + { + CUDA_CALL(cudaFree(states)); + } - template - void generate(const size_t blocks, const size_t threads, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { - // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, - // data, size, generate_func, extra); + template + void generate(const size_t blocks, + const size_t threads, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) + { + // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra); generate_kernel<<>>(states, data, size, generate_func, extra); } }; // CHECK: void generate_kernel(hiprandStateMtgp32_t * states, -template -__global__ void generate_kernel(curandStateMtgp32_t* states, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { +template +__global__ +void generate_kernel(curandStateMtgp32_t * states, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) +{ const unsigned int state_id = blockIdx.x; const unsigned int thread_id = threadIdx.x; unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -137,67 +163,80 @@ __global__ void generate_kernel(curandStateMtgp32_t* states, T* data, const size // CHECK: __shared__ hiprandStateMtgp32_t state; __shared__ curandStateMtgp32_t state; - if (thread_id == 0) state = states[state_id]; + if (thread_id == 0) + state = states[state_id]; __syncthreads(); - const size_t r = size % blockDim.x; + const size_t r = size%blockDim.x; const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r); - while (index < size_rounded_up) { + while(index < size_rounded_up) + { auto value = generate_func(&state, extra); - if (index < size) data[index] = value; + if(index < size) + data[index] = value; index += stride; } __syncthreads(); - if (thread_id == 0) states[state_id] = state; + if (thread_id == 0) + states[state_id] = state; } // CHECK: struct runner -template <> -struct runner { +template<> +struct runner +{ // CHECK: hiprandStateMtgp32_t * states; - curandStateMtgp32_t* states; - mtgp32_kernel_params_t* d_param; + curandStateMtgp32_t * states; + mtgp32_kernel_params_t * d_param; - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long seed, const unsigned long long offset) { + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { const size_t states_size = std::min((size_t)200, blocks); - // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * - // sizeof(hiprandStateMtgp32_t))); - CUDA_CALL(cudaMalloc((void**)&states, states_size * sizeof(curandStateMtgp32_t))); + // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateMtgp32_t))); + CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateMtgp32_t))); // CHECK: CUDA_CALL(hipMalloc((void **)&d_param, sizeof(mtgp32_kernel_params))); - CUDA_CALL(cudaMalloc((void**)&d_param, sizeof(mtgp32_kernel_params))); + CUDA_CALL(cudaMalloc((void **)&d_param, sizeof(mtgp32_kernel_params))); // curandMakeMTGP32Constants is yet unsupported by HIP // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param)); CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param)); // curandMakeMTGP32KernelState is yet unsupported by HIP - // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, - // d_param, states_size, seed)); - CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, - states_size, seed)); + // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed)); + CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed)); } - ~runner() { + ~runner() + { // CHECK: CUDA_CALL(hipFree(states)); // CHECK: CUDA_CALL(hipFree(d_param)); CUDA_CALL(cudaFree(states)); CUDA_CALL(cudaFree(d_param)); } - template - void generate(const size_t blocks, const size_t threads, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { - // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), - // dim3(256), 0, 0, states, data, size, generate_func, extra); - generate_kernel<<>>(states, data, size, generate_func, - extra); + template + void generate(const size_t blocks, + const size_t threads, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) + { + // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra); + generate_kernel<<>>(states, data, size, generate_func, extra); } }; // CHECK: void init_kernel(hiprandStateSobol32_t * states, -template -__global__ void init_kernel(curandStateSobol32_t* states, const Directions directions, - const unsigned long long offset) { +template +__global__ +void init_kernel(curandStateSobol32_t * states, + const Directions directions, + const unsigned long long offset) +{ const unsigned int dimension = blockIdx.y; const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; // CHECK: hiprandStateSobol32_t state; @@ -208,9 +247,14 @@ __global__ void init_kernel(curandStateSobol32_t* states, const Directions direc } // CHECK: void generate_kernel(hiprandStateSobol32_t * states, -template -__global__ void generate_kernel(curandStateSobol32_t* states, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { +template +__global__ +void generate_kernel(curandStateSobol32_t * states, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) +{ const unsigned int dimension = blockIdx.y; const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int stride = gridDim.x * blockDim.x; @@ -218,7 +262,8 @@ __global__ void generate_kernel(curandStateSobol32_t* states, T* data, const siz curandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id]; const unsigned int offset = dimension * size; unsigned int index = state_id; - while (index < size) { + while(index < size) + { data[offset + index] = generate_func(&state, extra); skipahead(stride - 1, &state); index += stride; @@ -229,39 +274,39 @@ __global__ void generate_kernel(curandStateSobol32_t* states, T* data, const siz } // CHECK: struct runner -template <> -struct runner { +template<> +struct runner +{ // CHECK: hiprandStateSobol32_t * states; - curandStateSobol32_t* states; + curandStateSobol32_t * states; size_t dimensions; - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long seed, const unsigned long long offset) { + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { this->dimensions = dimensions; - // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * - // sizeof(hiprandStateSobol32_t))); + // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateSobol32_t))); const size_t states_size = blocks * threads * dimensions; - CUDA_CALL(cudaMalloc((void**)&states, states_size * sizeof(curandStateSobol32_t))); + CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateSobol32_t))); // CHECK: hiprandDirectionVectors32_t * directions; - curandDirectionVectors32_t* directions; + curandDirectionVectors32_t * directions; // CHECK: const size_t size = dimensions * sizeof(hiprandDirectionVectors32_t); const size_t size = dimensions * sizeof(curandDirectionVectors32_t); // CHECK: CUDA_CALL(hipMalloc((void **)&directions, size)); - CUDA_CALL(cudaMalloc((void**)&directions, size)); + CUDA_CALL(cudaMalloc((void **)&directions, size)); // CHECK: hiprandDirectionVectors32_t * h_directions; - curandDirectionVectors32_t* h_directions; - // hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of - // hiprandDirectionVectorSet_t) are yet unsupported by HIP CHECK-NOT: - // CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, - // HIPRAND_DIRECTION_VECTORS_32_JOEKUO6)); - CURAND_CALL( - curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6)); + curandDirectionVectors32_t * h_directions; + // hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of hiprandDirectionVectorSet_t) are yet unsupported by HIP + // CHECK-NOT: CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, HIPRAND_DIRECTION_VECTORS_32_JOEKUO6)); + CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6)); // CHECK: CUDA_CALL(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice)); const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), - // 0, 0, states, directions, offset); + // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset); init_kernel<<>>(states, directions, offset); // CHECK: CUDA_CALL(hipPeekAtLastError()); // CHECK: CUDA_CALL(hipDeviceSynchronize()); @@ -271,25 +316,31 @@ struct runner { CUDA_CALL(cudaFree(directions)); } - ~runner() { + ~runner() + { // CHECK: CUDA_CALL(hipFree(states)); CUDA_CALL(cudaFree(states)); } - template - void generate(const size_t blocks, const size_t threads, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { + template + void generate(const size_t blocks, + const size_t threads, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) + { const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), - // dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra); - generate_kernel<<>>(states, data, size / dimensions, - generate_func, extra); + // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra); + generate_kernel<<>>(states, data, size / dimensions, generate_func, extra); } }; -template -void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, - const Extra extra) { +template +void run_benchmark(const cli::Parser& parser, + const GenerateFunc& generate_func, + const Extra extra) +{ const size_t size = parser.get("size"); const size_t dimensions = parser.get("dimensions"); const size_t trials = parser.get("trials"); @@ -297,14 +348,15 @@ void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, const size_t blocks = parser.get("blocks"); const size_t threads = parser.get("threads"); - T* data; + T * data; // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T))); - CUDA_CALL(cudaMalloc((void**)&data, size * sizeof(T))); + CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T))); runner r(dimensions, blocks, threads, 12345ULL, 6789ULL); // Warm-up - for (size_t i = 0; i < 5; i++) { + for (size_t i = 0; i < 5; i++) + { r.generate(blocks, threads, data, size, generate_func, extra); // CHECK: CUDA_CALL(hipPeekAtLastError()); // CHECK: CUDA_CALL(hipDeviceSynchronize()); @@ -316,7 +368,8 @@ void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, // Measurement auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < trials; i++) { + for (size_t i = 0; i < trials; i++) + { r.generate(blocks, threads, data, size, generate_func, extra); } // CHECK: CUDA_CALL(hipPeekAtLastError()); @@ -326,132 +379,147 @@ void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; - std::cout << std::fixed << std::setprecision(3) << " " - << "Throughput = " << std::setw(8) - << (trials * size * sizeof(T)) / (elapsed.count() / 1e3 * (1 << 30)) - << " GB/s, Samples = " << std::setw(8) - << (trials * size) / (elapsed.count() / 1e3 * (1 << 30)) - << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed.count() / trials - << " ms, Time (all) = " << std::setw(8) << elapsed.count() << " ms, Size = " << size + std::cout << std::fixed << std::setprecision(3) + << " " + << "Throughput = " + << std::setw(8) << (trials * size * sizeof(T)) / + (elapsed.count() / 1e3 * (1 << 30)) + << " GB/s, Samples = " + << std::setw(8) << (trials * size) / + (elapsed.count() / 1e3 * (1 << 30)) + << " GSample/s, AvgTime (1 trial) = " + << std::setw(8) << elapsed.count() / trials + << " ms, Time (all) = " + << std::setw(8) << elapsed.count() + << " ms, Size = " << size << std::endl; // CHECK: CUDA_CALL(hipFree(data)); CUDA_CALL(cudaFree(data)); } -template -void run_benchmarks(const cli::Parser& parser, const std::string& distribution) { - if (distribution == "uniform-uint") { +template +void run_benchmarks(const cli::Parser& parser, + const std::string& distribution) +{ + if (distribution == "uniform-uint") + { // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP // CHECK-NOT: if (!std::is_same::value && // CHECK-NOT: !std::is_same::value) if (!std::is_same::value && - !std::is_same::value) { + !std::is_same::value) + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand(state); - return curand(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand(state); + return curand(state); + }, 0 + ); } } - if (distribution == "uniform-long-long") { + if (distribution == "uniform-long-long") + { // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP // CHECK-NOT: if (!std::is_same::value && // CHECK-NOT: !std::is_same::value) if (std::is_same::value || - std::is_same::value) { - run_benchmark( - parser, - [] __device__(GeneratorState * state, int) { + std::is_same::value) + { + run_benchmark(parser, + [] __device__ (GeneratorState * state, int) { // CHECK: return hiprand(state); return curand(state); - }, - 0); + }, 0 + ); } } - if (distribution == "uniform-float") { + if (distribution == "uniform-float") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_uniform(state); - return curand_uniform(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_uniform(state); + return curand_uniform(state); + }, 0 + ); } - if (distribution == "uniform-double") { + if (distribution == "uniform-double") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_uniform_double(state); - return curand_uniform_double(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_uniform_double(state); + return curand_uniform_double(state); + }, 0 + ); } - if (distribution == "normal-float") { + if (distribution == "normal-float") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_normal(state); - return curand_normal(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_normal(state); + return curand_normal(state); + }, 0 + ); } - if (distribution == "normal-double") { + if (distribution == "normal-double") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_normal_double(state); - return curand_normal_double(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_normal_double(state); + return curand_normal_double(state); + }, 0 + ); } - if (distribution == "log-normal-float") { + if (distribution == "log-normal-float") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_log_normal(state, - // 0.0f, 1.0f); - return curand_log_normal(state, 0.0f, 1.0f); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_log_normal(state, 0.0f, 1.0f); + return curand_log_normal(state, 0.0f, 1.0f); + }, 0 + ); } - if (distribution == "log-normal-double") { + if (distribution == "log-normal-double") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_log_normal_double(state, - // 0.0, 1.0); - return curand_log_normal_double(state, 0.0, 1.0); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_log_normal_double(state, 0.0, 1.0); + return curand_log_normal_double(state, 0.0, 1.0); + }, 0 + ); } - if (distribution == "poisson") { + if (distribution == "poisson") + { const auto lambdas = parser.get>("lambda"); - for (double lambda : lambdas) { - std::cout << " " - << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; - run_benchmark( - parser, - [] __device__(GeneratorState * state, double lambda) { + for (double lambda : lambdas) + { + std::cout << " " << "lambda " + << std::fixed << std::setprecision(1) << lambda << std::endl; + run_benchmark(parser, + [] __device__ (GeneratorState * state, double lambda) { // CHECK: return hiprand_poisson(state, lambda); return curand_poisson(state, lambda); - }, - lambda); + }, lambda + ); } } - if (distribution == "discrete-poisson") { + if (distribution == "discrete-poisson") + { const auto lambdas = parser.get>("lambda"); - for (double lambda : lambdas) { - std::cout << " " - << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; + for (double lambda : lambdas) + { + std::cout << " " << "lambda " + << std::fixed << std::setprecision(1) << lambda << std::endl; // CHECK: hiprandDiscreteDistribution_t discrete_distribution; curandDiscreteDistribution_t discrete_distribution; // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(lambda, &discrete_distribution)); CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution)); - run_benchmark( - parser, - // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t - // discrete_distribution) { - [] __device__(GeneratorState * state, - curandDiscreteDistribution_t discrete_distribution) { + run_benchmark(parser, + // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t discrete_distribution) { + [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) { // CHECK: return hiprand_discrete(state, discrete_distribution); return curand_discrete(state, discrete_distribution); - }, - discrete_distribution); + }, discrete_distribution + ); // CHECK: CURAND_CALL(hiprandDestroyDistribution(discrete_distribution)); CURAND_CALL(curandDestroyDistribution(discrete_distribution)); } @@ -459,9 +527,12 @@ void run_benchmarks(const cli::Parser& parser, const std::string& distribution) } const std::vector all_engines = { - "xorwow", "mrg32k3a", "mtgp32", + "xorwow", + "mrg32k3a", + "mtgp32", // "mt19937", - "philox", "sobol32", + "philox", + "sobol32", // "scrambled_sobol32", // "sobol64", // "scrambled_sobol64", @@ -480,42 +551,50 @@ const std::vector all_distributions = { "discrete-poisson", }; -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) +{ cli::Parser parser(argc, argv); const std::string distribution_desc = "space-separated list of distributions:" + std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(), - [](std::string a, std::string b) { return a + "\n " + b; }) + + [](std::string a, std::string b) { + return a + "\n " + b; + } + ) + "\n or all"; const std::string engine_desc = "space-separated list of random number engines:" + std::accumulate(all_engines.begin(), all_engines.end(), std::string(), - [](std::string a, std::string b) { return a + "\n " + b; }) + + [](std::string a, std::string b) { + return a + "\n " + b; + } + ) + "\n or all"; parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("dimensions", "dimensions", 1, - "number of dimensions of quasi-random values"); + parser.set_optional("dimensions", "dimensions", 1, "number of dimensions of quasi-random values"); parser.set_optional("trials", "trials", 20, "number of trials"); parser.set_optional("blocks", "blocks", 256, "number of blocks"); parser.set_optional("threads", "threads", 256, "number of threads in each block"); - parser.set_optional>("dis", "dis", {"uniform-uint"}, - distribution_desc.c_str()); - parser.set_optional>("engine", "engine", {"philox"}, - engine_desc.c_str()); - parser.set_optional>( - "lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution"); + parser.set_optional>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str()); + parser.set_optional>("engine", "engine", {"philox"}, engine_desc.c_str()); + parser.set_optional>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution"); parser.run_and_exit_if_error(); std::vector engines; { auto es = parser.get>("engine"); - if (std::find(es.begin(), es.end(), "all") != es.end()) { + if (std::find(es.begin(), es.end(), "all") != es.end()) + { engines = all_engines; - } else { - for (auto e : all_engines) { - if (std::find(es.begin(), es.end(), e) != es.end()) engines.push_back(e); + } + else + { + for (auto e : all_engines) + { + if (std::find(es.begin(), es.end(), e) != es.end()) + engines.push_back(e); } } } @@ -523,11 +602,16 @@ int main(int argc, char* argv[]) { std::vector distributions; { auto ds = parser.get>("dis"); - if (std::find(ds.begin(), ds.end(), "all") != ds.end()) { + if (std::find(ds.begin(), ds.end(), "all") != ds.end()) + { distributions = all_distributions; - } else { - for (auto d : all_distributions) { - if (std::find(ds.begin(), ds.end(), d) != ds.end()) distributions.push_back(d); + } + else + { + for (auto d : all_distributions) + { + if (std::find(ds.begin(), ds.end(), d) != ds.end()) + distributions.push_back(d); } } } @@ -552,24 +636,35 @@ int main(int argc, char* argv[]) { std::cout << "Device: " << props.name; std::cout << std::endl << std::endl; - for (auto engine : engines) { + for (auto engine : engines) + { std::cout << engine << ":" << std::endl; - for (auto distribution : distributions) { + for (auto distribution : distributions) + { std::cout << " " << distribution << ":" << std::endl; const std::string plot_name = engine + "-" + distribution; - if (engine == "xorwow") { + if (engine == "xorwow") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "mrg32k3a") { + } + else if (engine == "mrg32k3a") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "philox") { + } + else if (engine == "philox") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "sobol32") { + } + else if (engine == "sobol32") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "mtgp32") { + } + else if (engine == "mtgp32") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); }