diff --git a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp index 01ea661204..af30969c6f 100644 --- a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp +++ b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp @@ -35,7 +35,7 @@ // CHECK: #include #include -// CHECK: if((x)!=hipSuccess) { +// CHECK: if ((x) != hipSuccess) { #define CUDA_CALL(x) \ do { \ if ((x) != cudaSuccess) { \ @@ -43,7 +43,7 @@ exit(EXIT_FAILURE); \ } \ } while (0) -// CHECK: if((x)!=HIPRAND_STATUS_SUCCESS) { +// CHECK: if ((x) != HIPRAND_STATUS_SUCCESS) { #define CURAND_CALL(x) \ do { \ if ((x) != CURAND_STATUS_SUCCESS) { \ @@ -59,9 +59,8 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; // CHECK: typedef hiprandRngType_t rng_type_t; typedef curandRngType rng_type_t; -// CHECK: using generate_func_type = std::function; template +// CHECK: using generate_func_type = std::function; using generate_func_type = std::function; template @@ -71,7 +70,7 @@ void run_benchmark(const cli::Parser& parser, const rng_type_t rng_type, const size_t trials = parser.get("trials"); T* data; - // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T))); + // CHECK: CUDA_CALL(hipMalloc((void**)&data, size * sizeof(T))); CUDA_CALL(cudaMalloc((void**)&data, size * sizeof(T))); // CHECK: hiprandGenerator_t generator; @@ -80,8 +79,8 @@ void run_benchmark(const cli::Parser& parser, const rng_type_t rng_type, CURAND_CALL(curandCreateGenerator(&generator, rng_type)); const size_t dimensions = parser.get("dimensions"); - // CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, - // dimensions); CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR) + // CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, dimensions); + // CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR) curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions); if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random { @@ -123,12 +122,12 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, const std::string& distribution) { if (distribution == "uniform-uint") { // CHECK: if (rng_type != HIPRAND_RNG_QUASI_SOBOL64 && - // CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64) + // CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64) { if (rng_type != CURAND_RNG_QUASI_SOBOL64 && rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64) { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, unsigned int * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, unsigned int* data, size_t size) { // CHECK: return hiprandGenerate(gen, data, size); [](curandGenerator_t gen, unsigned int* data, size_t size) { return curandGenerate(gen, data, size); @@ -142,7 +141,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64) { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, unsigned long long * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, unsigned long long* data, size_t size) { [](curandGenerator_t gen, unsigned long long* data, size_t size) { // curandGenerateLongLong is yet unsupported by HIP // CHECK-NOT: return hiprandGenerateLongLong(gen, data, size); @@ -152,7 +151,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "uniform-float") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, float* data, size_t size) { // CHECK: return hiprandGenerateUniform(gen, data, size); [](curandGenerator_t gen, float* data, size_t size) { return curandGenerateUniform(gen, data, size); @@ -160,7 +159,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "uniform-double") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, double* data, size_t size) { // CHECK: return hiprandGenerateUniformDouble(gen, data, size); [](curandGenerator_t gen, double* data, size_t size) { return curandGenerateUniformDouble(gen, data, size); @@ -168,7 +167,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "normal-float") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, float* data, size_t size) { // CHECK: return hiprandGenerateNormal(gen, data, size, 0.0f, 1.0f); [](curandGenerator_t gen, float* data, size_t size) { return curandGenerateNormal(gen, data, size, 0.0f, 1.0f); @@ -177,7 +176,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, if (distribution == "normal-double") { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, double* data, size_t size) { // CHECK: return hiprandGenerateNormalDouble(gen, data, size, 0.0, 1.0); [](curandGenerator_t gen, double* data, size_t size) { return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0); @@ -185,7 +184,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, } if (distribution == "log-normal-float") { run_benchmark(parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, float* data, size_t size) { // CHECK: return hiprandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); [](curandGenerator_t gen, float* data, size_t size) { return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f); @@ -194,7 +193,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, if (distribution == "log-normal-double") { run_benchmark( parser, rng_type, - // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) { + // CHECK: [](hiprandGenerator_t gen, double* data, size_t size) { // CHECK: return hiprandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); [](curandGenerator_t gen, double* data, size_t size) { return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0); @@ -207,7 +206,7 @@ void run_benchmarks(const cli::Parser& parser, const rng_type_t rng_type, << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; run_benchmark( parser, rng_type, - // CHECK: [lambda](hiprandGenerator_t gen, unsigned int * data, size_t size) { + // CHECK: [lambda](hiprandGenerator_t gen, unsigned int* data, size_t size) { // CHECK: return hiprandGeneratePoisson(gen, data, size, lambda); [lambda](curandGenerator_t gen, unsigned int* data, size_t size) { return curandGeneratePoisson(gen, data, size, lambda); diff --git a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp index 13eba909f8..222e30570a 100644 --- a/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp +++ b/projects/hip/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp @@ -42,16 +42,15 @@ #include #include -// CHECK: hipError_t error = (x); -// CHECK: if(error!=hipSuccess) { +// CHECK: if ((x) != hipSuccess) { #define CUDA_CALL(x) \ do { \ - cudaError_t error = (x); \ - if (error != cudaSuccess) { \ - printf("Error %d at %s:%d\n", error, __FILE__, __LINE__); \ + if ((x) != cudaSuccess) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ exit(EXIT_FAILURE); \ } \ } while (0) +// CHECK: if ((x) != HIPRAND_STATUS_SUCCESS) { #define CURAND_CALL(x) \ do { \ if ((x) != CURAND_STATUS_SUCCESS) { \ @@ -64,17 +63,22 @@ const size_t DEFAULT_RAND_N = 1024 * 1024 * 128; #endif -size_t next_power2(size_t x) { +size_t next_power2(size_t x) +{ size_t power = 1; - while (power < x) { + while (power < x) + { power *= 2; } return power; } -template -__global__ void init_kernel(GeneratorState* states, const unsigned long long seed, - const unsigned long long offset) { +template +__global__ +void init_kernel(GeneratorState * states, + const unsigned long long seed, + const unsigned long long offset) +{ const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; GeneratorState state; // CHECK: hiprand_init(seed, state_id, offset, &state); @@ -82,32 +86,42 @@ __global__ void init_kernel(GeneratorState* states, const unsigned long long see states[state_id] = state; } -template -__global__ void generate_kernel(GeneratorState* states, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { +template +__global__ +void generate_kernel(GeneratorState * states, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) +{ const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int stride = gridDim.x * blockDim.x; GeneratorState state = states[state_id]; unsigned int index = state_id; - while (index < size) { + while(index < size) + { data[index] = generate_func(&state, extra); index += stride; } states[state_id] = state; } -template -struct runner { - GeneratorState* states; +template +struct runner +{ + GeneratorState * states; - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long seed, const unsigned long long offset) { + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { const size_t states_size = blocks * threads; // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState))); - CUDA_CALL(cudaMalloc((void**)&states, states_size * sizeof(GeneratorState))); - // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, - // offset); + CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState))); + // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset); init_kernel<<>>(states, seed, offset); // CHECK: CUDA_CALL(hipPeekAtLastError()); // CHECK: CUDA_CALL(hipDeviceSynchronize()); @@ -115,21 +129,33 @@ struct runner { CUDA_CALL(cudaDeviceSynchronize()); } - ~runner() { CUDA_CALL(cudaFree(states)); } + ~runner() + { + CUDA_CALL(cudaFree(states)); + } - template - void generate(const size_t blocks, const size_t threads, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { - // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, - // data, size, generate_func, extra); + template + void generate(const size_t blocks, + const size_t threads, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) + { + // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra); generate_kernel<<>>(states, data, size, generate_func, extra); } }; // CHECK: void generate_kernel(hiprandStateMtgp32_t * states, -template -__global__ void generate_kernel(curandStateMtgp32_t* states, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { +template +__global__ +void generate_kernel(curandStateMtgp32_t * states, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) +{ const unsigned int state_id = blockIdx.x; const unsigned int thread_id = threadIdx.x; unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -137,67 +163,80 @@ __global__ void generate_kernel(curandStateMtgp32_t* states, T* data, const size // CHECK: __shared__ hiprandStateMtgp32_t state; __shared__ curandStateMtgp32_t state; - if (thread_id == 0) state = states[state_id]; + if (thread_id == 0) + state = states[state_id]; __syncthreads(); - const size_t r = size % blockDim.x; + const size_t r = size%blockDim.x; const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r); - while (index < size_rounded_up) { + while(index < size_rounded_up) + { auto value = generate_func(&state, extra); - if (index < size) data[index] = value; + if(index < size) + data[index] = value; index += stride; } __syncthreads(); - if (thread_id == 0) states[state_id] = state; + if (thread_id == 0) + states[state_id] = state; } // CHECK: struct runner -template <> -struct runner { +template<> +struct runner +{ // CHECK: hiprandStateMtgp32_t * states; - curandStateMtgp32_t* states; - mtgp32_kernel_params_t* d_param; + curandStateMtgp32_t * states; + mtgp32_kernel_params_t * d_param; - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long seed, const unsigned long long offset) { + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { const size_t states_size = std::min((size_t)200, blocks); - // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * - // sizeof(hiprandStateMtgp32_t))); - CUDA_CALL(cudaMalloc((void**)&states, states_size * sizeof(curandStateMtgp32_t))); + // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateMtgp32_t))); + CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateMtgp32_t))); // CHECK: CUDA_CALL(hipMalloc((void **)&d_param, sizeof(mtgp32_kernel_params))); - CUDA_CALL(cudaMalloc((void**)&d_param, sizeof(mtgp32_kernel_params))); + CUDA_CALL(cudaMalloc((void **)&d_param, sizeof(mtgp32_kernel_params))); // curandMakeMTGP32Constants is yet unsupported by HIP // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param)); CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param)); // curandMakeMTGP32KernelState is yet unsupported by HIP - // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, - // d_param, states_size, seed)); - CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, - states_size, seed)); + // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed)); + CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed)); } - ~runner() { + ~runner() + { // CHECK: CUDA_CALL(hipFree(states)); // CHECK: CUDA_CALL(hipFree(d_param)); CUDA_CALL(cudaFree(states)); CUDA_CALL(cudaFree(d_param)); } - template - void generate(const size_t blocks, const size_t threads, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { - // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), - // dim3(256), 0, 0, states, data, size, generate_func, extra); - generate_kernel<<>>(states, data, size, generate_func, - extra); + template + void generate(const size_t blocks, + const size_t threads, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) + { + // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra); + generate_kernel<<>>(states, data, size, generate_func, extra); } }; // CHECK: void init_kernel(hiprandStateSobol32_t * states, -template -__global__ void init_kernel(curandStateSobol32_t* states, const Directions directions, - const unsigned long long offset) { +template +__global__ +void init_kernel(curandStateSobol32_t * states, + const Directions directions, + const unsigned long long offset) +{ const unsigned int dimension = blockIdx.y; const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; // CHECK: hiprandStateSobol32_t state; @@ -208,9 +247,14 @@ __global__ void init_kernel(curandStateSobol32_t* states, const Directions direc } // CHECK: void generate_kernel(hiprandStateSobol32_t * states, -template -__global__ void generate_kernel(curandStateSobol32_t* states, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { +template +__global__ +void generate_kernel(curandStateSobol32_t * states, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) +{ const unsigned int dimension = blockIdx.y; const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int stride = gridDim.x * blockDim.x; @@ -218,7 +262,8 @@ __global__ void generate_kernel(curandStateSobol32_t* states, T* data, const siz curandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id]; const unsigned int offset = dimension * size; unsigned int index = state_id; - while (index < size) { + while(index < size) + { data[offset + index] = generate_func(&state, extra); skipahead(stride - 1, &state); index += stride; @@ -229,39 +274,39 @@ __global__ void generate_kernel(curandStateSobol32_t* states, T* data, const siz } // CHECK: struct runner -template <> -struct runner { +template<> +struct runner +{ // CHECK: hiprandStateSobol32_t * states; - curandStateSobol32_t* states; + curandStateSobol32_t * states; size_t dimensions; - runner(const size_t dimensions, const size_t blocks, const size_t threads, - const unsigned long long seed, const unsigned long long offset) { + runner(const size_t dimensions, + const size_t blocks, + const size_t threads, + const unsigned long long seed, + const unsigned long long offset) + { this->dimensions = dimensions; - // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * - // sizeof(hiprandStateSobol32_t))); + // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateSobol32_t))); const size_t states_size = blocks * threads * dimensions; - CUDA_CALL(cudaMalloc((void**)&states, states_size * sizeof(curandStateSobol32_t))); + CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateSobol32_t))); // CHECK: hiprandDirectionVectors32_t * directions; - curandDirectionVectors32_t* directions; + curandDirectionVectors32_t * directions; // CHECK: const size_t size = dimensions * sizeof(hiprandDirectionVectors32_t); const size_t size = dimensions * sizeof(curandDirectionVectors32_t); // CHECK: CUDA_CALL(hipMalloc((void **)&directions, size)); - CUDA_CALL(cudaMalloc((void**)&directions, size)); + CUDA_CALL(cudaMalloc((void **)&directions, size)); // CHECK: hiprandDirectionVectors32_t * h_directions; - curandDirectionVectors32_t* h_directions; - // hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of - // hiprandDirectionVectorSet_t) are yet unsupported by HIP CHECK-NOT: - // CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, - // HIPRAND_DIRECTION_VECTORS_32_JOEKUO6)); - CURAND_CALL( - curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6)); + curandDirectionVectors32_t * h_directions; + // hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of hiprandDirectionVectorSet_t) are yet unsupported by HIP + // CHECK-NOT: CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, HIPRAND_DIRECTION_VECTORS_32_JOEKUO6)); + CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6)); // CHECK: CUDA_CALL(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice)); CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice)); const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), - // 0, 0, states, directions, offset); + // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset); init_kernel<<>>(states, directions, offset); // CHECK: CUDA_CALL(hipPeekAtLastError()); // CHECK: CUDA_CALL(hipDeviceSynchronize()); @@ -271,25 +316,31 @@ struct runner { CUDA_CALL(cudaFree(directions)); } - ~runner() { + ~runner() + { // CHECK: CUDA_CALL(hipFree(states)); CUDA_CALL(cudaFree(states)); } - template - void generate(const size_t blocks, const size_t threads, T* data, const size_t size, - const GenerateFunc& generate_func, const Extra extra) { + template + void generate(const size_t blocks, + const size_t threads, + T * data, + const size_t size, + const GenerateFunc& generate_func, + const Extra extra) + { const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions); - // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), - // dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra); - generate_kernel<<>>(states, data, size / dimensions, - generate_func, extra); + // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra); + generate_kernel<<>>(states, data, size / dimensions, generate_func, extra); } }; -template -void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, - const Extra extra) { +template +void run_benchmark(const cli::Parser& parser, + const GenerateFunc& generate_func, + const Extra extra) +{ const size_t size = parser.get("size"); const size_t dimensions = parser.get("dimensions"); const size_t trials = parser.get("trials"); @@ -297,14 +348,15 @@ void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, const size_t blocks = parser.get("blocks"); const size_t threads = parser.get("threads"); - T* data; + T * data; // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T))); - CUDA_CALL(cudaMalloc((void**)&data, size * sizeof(T))); + CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T))); runner r(dimensions, blocks, threads, 12345ULL, 6789ULL); // Warm-up - for (size_t i = 0; i < 5; i++) { + for (size_t i = 0; i < 5; i++) + { r.generate(blocks, threads, data, size, generate_func, extra); // CHECK: CUDA_CALL(hipPeekAtLastError()); // CHECK: CUDA_CALL(hipDeviceSynchronize()); @@ -316,7 +368,8 @@ void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, // Measurement auto start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < trials; i++) { + for (size_t i = 0; i < trials; i++) + { r.generate(blocks, threads, data, size, generate_func, extra); } // CHECK: CUDA_CALL(hipPeekAtLastError()); @@ -326,132 +379,147 @@ void run_benchmark(const cli::Parser& parser, const GenerateFunc& generate_func, auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; - std::cout << std::fixed << std::setprecision(3) << " " - << "Throughput = " << std::setw(8) - << (trials * size * sizeof(T)) / (elapsed.count() / 1e3 * (1 << 30)) - << " GB/s, Samples = " << std::setw(8) - << (trials * size) / (elapsed.count() / 1e3 * (1 << 30)) - << " GSample/s, AvgTime (1 trial) = " << std::setw(8) << elapsed.count() / trials - << " ms, Time (all) = " << std::setw(8) << elapsed.count() << " ms, Size = " << size + std::cout << std::fixed << std::setprecision(3) + << " " + << "Throughput = " + << std::setw(8) << (trials * size * sizeof(T)) / + (elapsed.count() / 1e3 * (1 << 30)) + << " GB/s, Samples = " + << std::setw(8) << (trials * size) / + (elapsed.count() / 1e3 * (1 << 30)) + << " GSample/s, AvgTime (1 trial) = " + << std::setw(8) << elapsed.count() / trials + << " ms, Time (all) = " + << std::setw(8) << elapsed.count() + << " ms, Size = " << size << std::endl; // CHECK: CUDA_CALL(hipFree(data)); CUDA_CALL(cudaFree(data)); } -template -void run_benchmarks(const cli::Parser& parser, const std::string& distribution) { - if (distribution == "uniform-uint") { +template +void run_benchmarks(const cli::Parser& parser, + const std::string& distribution) +{ + if (distribution == "uniform-uint") + { // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP // CHECK-NOT: if (!std::is_same::value && // CHECK-NOT: !std::is_same::value) if (!std::is_same::value && - !std::is_same::value) { + !std::is_same::value) + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand(state); - return curand(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand(state); + return curand(state); + }, 0 + ); } } - if (distribution == "uniform-long-long") { + if (distribution == "uniform-long-long") + { // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP // CHECK-NOT: if (!std::is_same::value && // CHECK-NOT: !std::is_same::value) if (std::is_same::value || - std::is_same::value) { - run_benchmark( - parser, - [] __device__(GeneratorState * state, int) { + std::is_same::value) + { + run_benchmark(parser, + [] __device__ (GeneratorState * state, int) { // CHECK: return hiprand(state); return curand(state); - }, - 0); + }, 0 + ); } } - if (distribution == "uniform-float") { + if (distribution == "uniform-float") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_uniform(state); - return curand_uniform(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_uniform(state); + return curand_uniform(state); + }, 0 + ); } - if (distribution == "uniform-double") { + if (distribution == "uniform-double") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_uniform_double(state); - return curand_uniform_double(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_uniform_double(state); + return curand_uniform_double(state); + }, 0 + ); } - if (distribution == "normal-float") { + if (distribution == "normal-float") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_normal(state); - return curand_normal(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_normal(state); + return curand_normal(state); + }, 0 + ); } - if (distribution == "normal-double") { + if (distribution == "normal-double") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_normal_double(state); - return curand_normal_double(state); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_normal_double(state); + return curand_normal_double(state); + }, 0 + ); } - if (distribution == "log-normal-float") { + if (distribution == "log-normal-float") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_log_normal(state, - // 0.0f, 1.0f); - return curand_log_normal(state, 0.0f, 1.0f); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_log_normal(state, 0.0f, 1.0f); + return curand_log_normal(state, 0.0f, 1.0f); + }, 0 + ); } - if (distribution == "log-normal-double") { + if (distribution == "log-normal-double") + { run_benchmark(parser, - [] __device__(GeneratorState * state, int) { - // CHECK: return hiprand_log_normal_double(state, - // 0.0, 1.0); - return curand_log_normal_double(state, 0.0, 1.0); - }, - 0); + [] __device__ (GeneratorState * state, int) { + // CHECK: return hiprand_log_normal_double(state, 0.0, 1.0); + return curand_log_normal_double(state, 0.0, 1.0); + }, 0 + ); } - if (distribution == "poisson") { + if (distribution == "poisson") + { const auto lambdas = parser.get>("lambda"); - for (double lambda : lambdas) { - std::cout << " " - << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; - run_benchmark( - parser, - [] __device__(GeneratorState * state, double lambda) { + for (double lambda : lambdas) + { + std::cout << " " << "lambda " + << std::fixed << std::setprecision(1) << lambda << std::endl; + run_benchmark(parser, + [] __device__ (GeneratorState * state, double lambda) { // CHECK: return hiprand_poisson(state, lambda); return curand_poisson(state, lambda); - }, - lambda); + }, lambda + ); } } - if (distribution == "discrete-poisson") { + if (distribution == "discrete-poisson") + { const auto lambdas = parser.get>("lambda"); - for (double lambda : lambdas) { - std::cout << " " - << "lambda " << std::fixed << std::setprecision(1) << lambda << std::endl; + for (double lambda : lambdas) + { + std::cout << " " << "lambda " + << std::fixed << std::setprecision(1) << lambda << std::endl; // CHECK: hiprandDiscreteDistribution_t discrete_distribution; curandDiscreteDistribution_t discrete_distribution; // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(lambda, &discrete_distribution)); CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution)); - run_benchmark( - parser, - // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t - // discrete_distribution) { - [] __device__(GeneratorState * state, - curandDiscreteDistribution_t discrete_distribution) { + run_benchmark(parser, + // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t discrete_distribution) { + [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) { // CHECK: return hiprand_discrete(state, discrete_distribution); return curand_discrete(state, discrete_distribution); - }, - discrete_distribution); + }, discrete_distribution + ); // CHECK: CURAND_CALL(hiprandDestroyDistribution(discrete_distribution)); CURAND_CALL(curandDestroyDistribution(discrete_distribution)); } @@ -459,9 +527,12 @@ void run_benchmarks(const cli::Parser& parser, const std::string& distribution) } const std::vector all_engines = { - "xorwow", "mrg32k3a", "mtgp32", + "xorwow", + "mrg32k3a", + "mtgp32", // "mt19937", - "philox", "sobol32", + "philox", + "sobol32", // "scrambled_sobol32", // "sobol64", // "scrambled_sobol64", @@ -480,42 +551,50 @@ const std::vector all_distributions = { "discrete-poisson", }; -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) +{ cli::Parser parser(argc, argv); const std::string distribution_desc = "space-separated list of distributions:" + std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(), - [](std::string a, std::string b) { return a + "\n " + b; }) + + [](std::string a, std::string b) { + return a + "\n " + b; + } + ) + "\n or all"; const std::string engine_desc = "space-separated list of random number engines:" + std::accumulate(all_engines.begin(), all_engines.end(), std::string(), - [](std::string a, std::string b) { return a + "\n " + b; }) + + [](std::string a, std::string b) { + return a + "\n " + b; + } + ) + "\n or all"; parser.set_optional("size", "size", DEFAULT_RAND_N, "number of values"); - parser.set_optional("dimensions", "dimensions", 1, - "number of dimensions of quasi-random values"); + parser.set_optional("dimensions", "dimensions", 1, "number of dimensions of quasi-random values"); parser.set_optional("trials", "trials", 20, "number of trials"); parser.set_optional("blocks", "blocks", 256, "number of blocks"); parser.set_optional("threads", "threads", 256, "number of threads in each block"); - parser.set_optional>("dis", "dis", {"uniform-uint"}, - distribution_desc.c_str()); - parser.set_optional>("engine", "engine", {"philox"}, - engine_desc.c_str()); - parser.set_optional>( - "lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution"); + parser.set_optional>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str()); + parser.set_optional>("engine", "engine", {"philox"}, engine_desc.c_str()); + parser.set_optional>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution"); parser.run_and_exit_if_error(); std::vector engines; { auto es = parser.get>("engine"); - if (std::find(es.begin(), es.end(), "all") != es.end()) { + if (std::find(es.begin(), es.end(), "all") != es.end()) + { engines = all_engines; - } else { - for (auto e : all_engines) { - if (std::find(es.begin(), es.end(), e) != es.end()) engines.push_back(e); + } + else + { + for (auto e : all_engines) + { + if (std::find(es.begin(), es.end(), e) != es.end()) + engines.push_back(e); } } } @@ -523,11 +602,16 @@ int main(int argc, char* argv[]) { std::vector distributions; { auto ds = parser.get>("dis"); - if (std::find(ds.begin(), ds.end(), "all") != ds.end()) { + if (std::find(ds.begin(), ds.end(), "all") != ds.end()) + { distributions = all_distributions; - } else { - for (auto d : all_distributions) { - if (std::find(ds.begin(), ds.end(), d) != ds.end()) distributions.push_back(d); + } + else + { + for (auto d : all_distributions) + { + if (std::find(ds.begin(), ds.end(), d) != ds.end()) + distributions.push_back(d); } } } @@ -552,24 +636,35 @@ int main(int argc, char* argv[]) { std::cout << "Device: " << props.name; std::cout << std::endl << std::endl; - for (auto engine : engines) { + for (auto engine : engines) + { std::cout << engine << ":" << std::endl; - for (auto distribution : distributions) { + for (auto distribution : distributions) + { std::cout << " " << distribution << ":" << std::endl; const std::string plot_name = engine + "-" + distribution; - if (engine == "xorwow") { + if (engine == "xorwow") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "mrg32k3a") { + } + else if (engine == "mrg32k3a") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "philox") { + } + else if (engine == "philox") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "sobol32") { + } + else if (engine == "sobol32") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); - } else if (engine == "mtgp32") { + } + else if (engine == "mtgp32") + { // CHECK: run_benchmarks(parser, distribution); run_benchmarks(parser, distribution); }