Re-sync with upstream.
Этот коммит содержится в:
поставляемый
+4
-3
@@ -151,7 +151,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
|
||||
// The rm command needs to run as sudo because the test steps below create files owned by root
|
||||
sh """#!/usr/bin/env bash
|
||||
set -x
|
||||
sudo rm -rf ${build_dir_rel}
|
||||
rm -rf ${build_dir_rel}
|
||||
mkdir -p ${build_dir_rel}
|
||||
cd ${build_dir_rel}
|
||||
cmake -DCMAKE_BUILD_TYPE=${build_config} -DCMAKE_INSTALL_PREFIX=staging ${optional_configure} ${source_hip_abs}
|
||||
@@ -160,6 +160,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
|
||||
}
|
||||
|
||||
// Cap the maximum amount of testing, in case of hangs
|
||||
// Excluding hipPrintfKernel test from automation; variable fails on CI test machines
|
||||
timeout(time: 1, unit: 'HOURS')
|
||||
{
|
||||
stage("${platform} unit testing")
|
||||
@@ -169,7 +170,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
|
||||
cd ${build_dir_rel}
|
||||
make install -j\$(nproc)
|
||||
make build_tests -i -j\$(nproc)
|
||||
make test
|
||||
ctest -E hipPrintfKernel
|
||||
"""
|
||||
// If unit tests output a junit or xunit file in the future, jenkins can parse that file
|
||||
// to display test results on the dashboard
|
||||
@@ -193,7 +194,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
|
||||
if( platform.toLowerCase( ).startsWith( 'hcc-ctu' ) )
|
||||
{
|
||||
archiveArtifacts artifacts: "${build_dir_rel}/*.deb", fingerprint: true
|
||||
archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
|
||||
// archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +101,8 @@
|
||||
| struct | `curandStateMRG32k3a_t` | `hiprandStateMRG32k3a_t` |
|
||||
| struct | `curandStatePhilox4_32_10_t` | `hiprandStatePhilox4_32_10_t` |
|
||||
| struct | `curandStateXORWOW_t` | `hiprandStateXORWOW_t` |
|
||||
| struct | `curandState_t` | `hiprandState_t` |
|
||||
| struct | `curandState` | `hiprandState_t` |
|
||||
|
||||
## **2. Host API Functions**
|
||||
|
||||
@@ -154,6 +156,8 @@
|
||||
| `curand_normal2_double` | `hiprand_normal2_double` |
|
||||
| `curand_normal4` | `hiprand_normal4` |
|
||||
| `curand_normal4_double` | `hiprand_normal4_double` |
|
||||
| `curand_uniform` | `hiprand_uniform` |
|
||||
| `curand_uniform_double` | `hiprand_uniform_double` |
|
||||
| `curand_uniform2_double` | `hiprand_uniform2_double` |
|
||||
| `curand_uniform4` | `hiprand_uniform4` |
|
||||
| `curand_uniform4_double` | `hiprand_uniform4_double` |
|
||||
|
||||
@@ -74,7 +74,7 @@ HIP provides 3 environment variables in the HIP_*_BLOCKING family. These introd
|
||||
- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
|
||||
|
||||
These options cause HCC to serialize. Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.
|
||||
- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
|
||||
- HCC_SERIALIZE_KERNEL : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
|
||||
- HCC_SERIALIZE_COPY : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
|
||||
|
||||
- HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines.
|
||||
|
||||
@@ -54,7 +54,7 @@ A stronger system-level fence can be specified when the event is created with hi
|
||||
- HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.
|
||||
|
||||
|
||||
## Unpinned Memory Transfer Optimizations
|
||||
## Unpinned Memory Transfer Optimization
|
||||
Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
|
||||
|
||||
### On Small BAR Setup
|
||||
@@ -79,11 +79,20 @@ stage the copy through an optimized pinned staging buffer, to implement H2D and
|
||||
|
||||
PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA engine.
|
||||
|
||||
By default staging buffers are used for unpinned memory transfers. Environment variables allow control over the unpinned copy algorithm and parameters:
|
||||
Unpinned memory transfer mode can be controlled using environment variable HCC_UNPINNED_COPY_MODE.
|
||||
|
||||
- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
|
||||
By default HCC_UNPINNED_COPY_MODE is set to 0, which uses default threshold values to decide which transfer way to use based on data size.
|
||||
|
||||
- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
|
||||
- HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
|
||||
- HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
|
||||
- HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
|
||||
Setting HCC_UNPINNED_COPY_MODE = 1, forces all unpinned transfer to use PinInPlace logic.
|
||||
|
||||
Setting HCC_UNPINNED_COPY_MODE = 2, forces all unpinned transfer to use Staging buffers.
|
||||
|
||||
Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct memcpy on large BAR systems.
|
||||
|
||||
Following environment variables can be used to control the transfer thresholds:
|
||||
|
||||
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
|
||||
|
||||
- HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.
|
||||
|
||||
- HCC_D2H_PININPLACE_THRESHOLD - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
|
||||
|
||||
@@ -362,29 +362,46 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
|
||||
{"curandStateMRG32k3a_t", {"hiprandStateMRG32k3a_t", CONV_TYPE, API_RAND}},
|
||||
{"curandStatePhilox4_32_10_t", {"hiprandStatePhilox4_32_10_t", CONV_TYPE, API_RAND}},
|
||||
{"curandStateXORWOW_t", {"hiprandStateXORWOW_t", CONV_TYPE, API_RAND}},
|
||||
{"curandState_t", {"hiprandState_t", CONV_TYPE, API_RAND}},
|
||||
{"curandState", {"hiprandState_t", CONV_TYPE, API_RAND}},
|
||||
};
|
||||
|
||||
/// Maps cuda header names to hip header names.
|
||||
const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
|
||||
// CUDA includes
|
||||
{"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
|
||||
{"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
|
||||
{"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"device_functions.h", {"hip/device_functions.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"driver_types.h", {"hip/driver_types.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuComplex.h", {"hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_fp16.h", {"hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_texture_types.h", {"hip/hip_texture_types.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"vector_types.h", {"hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
|
||||
{"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
|
||||
{"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"device_functions.h", {"hip/device_functions.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"driver_types.h", {"hip/driver_types.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuComplex.h", {"hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_fp16.h", {"hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"cuda_texture_types.h", {"hip/hip_texture_types.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
{"vector_types.h", {"hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME}},
|
||||
|
||||
// CUBLAS includes
|
||||
{"cublas.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}},
|
||||
{"cublas_v2.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}},
|
||||
{"cublas.h", {"hipblas.h", CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
|
||||
{"cublas_v2.h", {"hipblas.h", CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
|
||||
|
||||
// CURAND includes
|
||||
{"curand.h", {"hiprand.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_kernel.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand.h", {"hiprand.h", CONV_INCLUDE_CUDA_MAIN_H, API_RAND}},
|
||||
{"curand_kernel.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_discrete.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_discrete2.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_globals.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_lognormal.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_mrg32k3a.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_mtgp32.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_mtgp32_host.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_mtgp32_kernel.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_mtgp32dc_p_11213.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_normal.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_normal_static.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_philox4x32_x.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_poisson.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_precalc.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
{"curand_uniform.h", {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
|
||||
|
||||
// HIP includes
|
||||
// TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h
|
||||
@@ -2852,10 +2869,12 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
|
||||
{"curand_normal2_double", {"hiprand_normal2_double", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_normal4", {"hiprand_normal4", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_normal4_double", {"hiprand_normal4_double", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_uniform", {"hiprand_uniform", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_uniform_double", {"hiprand_uniform_double", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_uniform2_double", {"hiprand_uniform2_double", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_uniform4", {"hiprand_uniform4", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_uniform4_double", {"hiprand_uniform4_double", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_discrete", {"hiprand_discrete4", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_discrete", {"hiprand_discrete", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_discrete4", {"hiprand_discrete4", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_poisson", {"hiprand_poisson", CONV_DEVICE_FUNC, API_RAND}},
|
||||
{"curand_poisson4", {"hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND}},
|
||||
|
||||
@@ -137,6 +137,48 @@ std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Exp
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
bool HipifyAction::Exclude(const hipCounter & hipToken) {
|
||||
switch (hipToken.type) {
|
||||
case CONV_INCLUDE_CUDA_MAIN_H:
|
||||
switch (hipToken.apiType) {
|
||||
case API_DRIVER:
|
||||
case API_RUNTIME:
|
||||
if (insertedRuntimeHeader) { return true; }
|
||||
insertedRuntimeHeader = true;
|
||||
return false;
|
||||
case API_BLAS:
|
||||
if (insertedBLASHeader) { return true; }
|
||||
insertedBLASHeader = true;
|
||||
return false;
|
||||
case API_RAND:
|
||||
if (hipToken.hipName == "hiprand_kernel.h") {
|
||||
if (insertedRAND_kernelHeader) { return true; }
|
||||
insertedRAND_kernelHeader = true;
|
||||
return false;
|
||||
} else if (hipToken.hipName == "hiprand.h") {
|
||||
if (insertedRANDHeader) { return true; }
|
||||
insertedRANDHeader = true;
|
||||
return false;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
case CONV_INCLUDE:
|
||||
switch (hipToken.apiType) {
|
||||
case API_RAND:
|
||||
if (insertedRAND_kernelHeader) { return true; }
|
||||
insertedRAND_kernelHeader = true;
|
||||
return false;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
const clang::Token&,
|
||||
@@ -149,24 +191,17 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
if (!SM.isWrittenInMainFile(hash_loc)) {
|
||||
return;
|
||||
}
|
||||
if (!firstHeader) {
|
||||
firstHeader = true;
|
||||
firstHeaderLoc = hash_loc;
|
||||
}
|
||||
|
||||
const auto found = CUDA_INCLUDE_MAP.find(file_name);
|
||||
if (found == CUDA_INCLUDE_MAP.end()) {
|
||||
if (!firstNotMainHeader) {
|
||||
firstNotMainHeader = true;
|
||||
firstNotMainHeaderLoc = hash_loc;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Special-casing to avoid duplication of the hip_runtime include.
|
||||
bool secondMainInclude = false;
|
||||
if (found->second.hipName == "hip/hip_runtime.h") {
|
||||
if (insertedRuntimeHeader) {
|
||||
secondMainInclude = true;
|
||||
}
|
||||
insertedRuntimeHeader = true;
|
||||
}
|
||||
bool exclude = Exclude(found->second);
|
||||
|
||||
Statistics::current().incrementCounter(found->second, file_name.str());
|
||||
|
||||
@@ -180,7 +215,7 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
clang::StringRef newInclude;
|
||||
|
||||
// Keep the same include type that the user gave.
|
||||
if (!secondMainInclude) {
|
||||
if (!exclude) {
|
||||
clang::SmallString<128> includeBuffer;
|
||||
if (is_angled) {
|
||||
newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
|
||||
@@ -198,8 +233,13 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
|
||||
}
|
||||
|
||||
void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
|
||||
if (pragmaOnce) { return; }
|
||||
if (pragmaOnce) {
|
||||
return;
|
||||
}
|
||||
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
|
||||
if (!SM.isWrittenInMainFile(Loc)) {
|
||||
return;
|
||||
}
|
||||
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
|
||||
const clang::Token tok = PP.LookAhead(0);
|
||||
StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
|
||||
@@ -356,8 +396,8 @@ void HipifyAction::EndSourceFileAction() {
|
||||
clang::SourceLocation sl;
|
||||
if (pragmaOnce) {
|
||||
sl = pragmaOnceLoc;
|
||||
} else if (firstNotMainHeader) {
|
||||
sl = firstNotMainHeaderLoc;
|
||||
} else if (firstHeader) {
|
||||
sl = firstHeaderLoc;
|
||||
} else {
|
||||
sl = SM.getLocForStartOfFile(SM.getMainFileID());
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include "clang/Tooling/Core/Replacement.h"
|
||||
#include "clang/ASTMatchers/ASTMatchFinder.h"
|
||||
#include "ReplacementsFrontendActionFactory.h"
|
||||
#include "Statistics.h"
|
||||
|
||||
namespace ct = clang::tooling;
|
||||
|
||||
@@ -23,9 +24,12 @@ private:
|
||||
// not, we insert it at the top of the file when we finish processing it.
|
||||
// This approach means we do the best it's possible to do w.r.t preserving the user's include order.
|
||||
bool insertedRuntimeHeader = false;
|
||||
bool firstNotMainHeader = false;
|
||||
bool insertedBLASHeader = false;
|
||||
bool insertedRANDHeader = false;
|
||||
bool insertedRAND_kernelHeader = false;
|
||||
bool firstHeader = false;
|
||||
bool pragmaOnce = false;
|
||||
clang::SourceLocation firstNotMainHeaderLoc;
|
||||
clang::SourceLocation firstHeaderLoc;
|
||||
clang::SourceLocation pragmaOnceLoc;
|
||||
|
||||
/**
|
||||
@@ -89,4 +93,6 @@ protected:
|
||||
void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
|
||||
|
||||
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
|
||||
|
||||
bool Exclude(const hipCounter & hipToken);
|
||||
};
|
||||
|
||||
@@ -53,8 +53,8 @@ void printStat(std::ostream *csv, llvm::raw_ostream* printOut, const std::string
|
||||
|
||||
void StatCounter::incrementCounter(const hipCounter& counter, std::string name) {
|
||||
counters[name]++;
|
||||
apiCounters[(int) counter.countApiType]++;
|
||||
convTypeCounters[(int) counter.countType]++;
|
||||
apiCounters[(int) counter.apiType]++;
|
||||
convTypeCounters[(int) counter.type]++;
|
||||
}
|
||||
|
||||
void StatCounter::add(const StatCounter& other) {
|
||||
|
||||
@@ -67,8 +67,8 @@ extern const char *apiNames[NUM_API_TYPES];
|
||||
|
||||
struct hipCounter {
|
||||
llvm::StringRef hipName;
|
||||
ConvTypes countType;
|
||||
ApiTypes countApiType;
|
||||
ConvTypes type;
|
||||
ApiTypes apiType;
|
||||
bool unsupported;
|
||||
};
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ THE SOFTWARE.
|
||||
#include "LLVMCompat.h"
|
||||
#include "HipifyAction.h"
|
||||
#include "ArgParse.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
#define DEBUG_TYPE "cuda2hip"
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ namespace hip_impl
|
||||
std::copy_n(it, sizeof(y.header.cbuf), y.header.cbuf);
|
||||
it += sizeof(y.header.cbuf);
|
||||
|
||||
y.triple.assign(it, it + y.header.triple_sz);
|
||||
y.triple.assign(it, it + y.triple_sz);
|
||||
|
||||
std::copy_n(
|
||||
f + y.header.offset,
|
||||
@@ -152,9 +152,9 @@ namespace hip_impl
|
||||
};
|
||||
|
||||
// CREATORS
|
||||
template<typename I>
|
||||
Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
|
||||
template<typename RandomAccessIterator>
|
||||
Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l) : Bundled_code_header{}
|
||||
{
|
||||
read(f, l, *this);
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -1420,7 +1420,56 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
|
||||
hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
|
||||
const void* src, size_t count, hipMemcpyKind kind);
|
||||
|
||||
/**
|
||||
* @brief Copies data between host and device.
|
||||
*
|
||||
* @param[in] dst Destination memory address
|
||||
* @param[in] srcArray Source memory address
|
||||
* @param[in] woffset Source starting X offset
|
||||
* @param[in] hOffset Source starting Y offset
|
||||
* @param[in] count Size in bytes to copy
|
||||
* @param[in] kind Type of transfer
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
|
||||
*
|
||||
* @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
|
||||
*/
|
||||
hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
|
||||
size_t count, hipMemcpyKind kind);
|
||||
|
||||
/**
|
||||
* @brief Copies data between host and device.
|
||||
*
|
||||
* @param[in] dst Destination memory address
|
||||
* @param[in] srcArray Source array
|
||||
* @param[in] srcoffset Offset in bytes of source array
|
||||
* @param[in] count Size of memory copy in bytes
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
|
||||
*
|
||||
* @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
|
||||
*/
|
||||
hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count);
|
||||
|
||||
/**
|
||||
* @brief Copies data between host and device.
|
||||
*
|
||||
* @param[in] dstArray Destination memory address
|
||||
* @param[in] dstOffset Offset in bytes of destination array
|
||||
* @param[in] srcHost Source host pointer
|
||||
* @param[in] count Size of memory copy in bytes
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
|
||||
*
|
||||
* @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
|
||||
*/
|
||||
hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count);
|
||||
|
||||
/**
|
||||
* @brief Copies data between host and device.
|
||||
*
|
||||
* @param[in] p 3D memory copy parameters
|
||||
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
|
||||
*
|
||||
* @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
|
||||
*/
|
||||
hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p);
|
||||
|
||||
// doxygen end Memory
|
||||
|
||||
@@ -69,18 +69,16 @@ namespace hip_impl
|
||||
}
|
||||
};
|
||||
|
||||
using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
|
||||
|
||||
const std::unordered_map<
|
||||
hsa_agent_t, std::vector<hsa_executable_t>>& executables();
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
|
||||
const std::unordered_map<std::uintptr_t, std::string>& function_names();
|
||||
std::unordered_map<std::string, RAII_global>& globals();
|
||||
std::unordered_map<std::string, void*>& globals();
|
||||
|
||||
hsa_executable_t load_executable(
|
||||
const std::string& file,
|
||||
hsa_executable_t executable,
|
||||
hsa_agent_t agent);
|
||||
} // Namespace hip_impl.
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -483,6 +483,18 @@ inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t
|
||||
return hipCUDAErrorTohipError(cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset, size_t count, hipMemcpyKind kind) {
|
||||
return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, hipMemcpyKindToCudaMemcpyKind(kind)));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count) {
|
||||
return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count) {
|
||||
return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
|
||||
}
|
||||
|
||||
inline static hipError_t hipDeviceSynchronize() {
|
||||
return hipCUDAErrorTohipError(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
@@ -8,12 +8,12 @@ if(DOXYGEN_EXE)
|
||||
install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
|
||||
endif()
|
||||
|
||||
find_program(GRIP_EXE grip)
|
||||
if(GRIP_EXE)
|
||||
add_custom_target(convert_md_to_html ALL
|
||||
COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
|
||||
install(DIRECTORY md2html/ DESTINATION docs)
|
||||
endif()
|
||||
#find_program(GRIP_EXE grip)
|
||||
#if(GRIP_EXE)
|
||||
# add_custom_target(convert_md_to_html ALL
|
||||
# COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
|
||||
# install(DIRECTORY md2html/ DESTINATION docs)
|
||||
#endif()
|
||||
|
||||
#############################
|
||||
# Packaging steps
|
||||
|
||||
@@ -37,6 +37,7 @@ THE SOFTWARE.
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
|
||||
#include <hc.hpp>
|
||||
#include <hc_am.hpp>
|
||||
@@ -1409,9 +1410,38 @@ void ihipInit()
|
||||
tprintf(DB_SYNC, "pid=%u %-30s g_numLogicalThreads=%u\n", getpid(), "<ihipInit>", g_numLogicalThreads);
|
||||
}
|
||||
|
||||
hipError_t ihipStreamSynchronize(hipStream_t stream)
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
if (stream == hipStreamNull) {
|
||||
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
|
||||
ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
|
||||
} else {
|
||||
// note this does not synchornize with the NULL stream:
|
||||
stream->locked_wait();
|
||||
e = hipSuccess;
|
||||
}
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
void ihipStreamCallbackHandler(ihipStreamCallback_t *cb)
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
// Notify hipStreamAddCallback that callback handler thread is active
|
||||
std::lock_guard<std::mutex> guard(cb->_mtx);
|
||||
cb->_ready = true;
|
||||
|
||||
// Synchronize stream
|
||||
tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n", ToString(cb->_stream).c_str());
|
||||
e = ihipStreamSynchronize(cb->_stream);
|
||||
|
||||
// Call registered callback function
|
||||
cb->_callback(cb->_stream, e, cb->_userData);
|
||||
delete cb;
|
||||
}
|
||||
|
||||
//---
|
||||
// Get the stream to use for a command submission.
|
||||
|
||||
@@ -622,6 +622,24 @@ private: // Data
|
||||
};
|
||||
|
||||
|
||||
//----
|
||||
// Internal structure for stream callback handler
|
||||
class ihipStreamCallback_t {
|
||||
public:
|
||||
ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void *userData) :
|
||||
_stream(stream),
|
||||
_callback(callback),
|
||||
_userData(userData)
|
||||
{
|
||||
_ready = false;
|
||||
};
|
||||
hipStream_t _stream;
|
||||
hipStreamCallback_t _callback;
|
||||
void* _userData;
|
||||
std::mutex _mtx;
|
||||
bool _ready;
|
||||
};
|
||||
|
||||
|
||||
//----
|
||||
// Internal event structure:
|
||||
@@ -931,6 +949,8 @@ ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
|
||||
|
||||
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t);
|
||||
hipError_t ihipStreamSynchronize(hipStream_t stream);
|
||||
void ihipStreamCallbackHandler(ihipStreamCallback_t *cb);
|
||||
|
||||
// Stream printf functions:
|
||||
inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
|
||||
|
||||
@@ -453,6 +453,7 @@ hipError_t hipArrayCreate ( hipArray** array, const HIP_ARRAY_DESCRIPTOR* pAlloc
|
||||
array[0]->width = pAllocateArray->width;
|
||||
array[0]->height = pAllocateArray->height;
|
||||
array[0]->isDrv = true;
|
||||
array[0]->textureType = hipTextureType2D;
|
||||
void ** ptr = &array[0]->data;
|
||||
if (ctx) {
|
||||
const unsigned am_flags = 0;
|
||||
@@ -1411,6 +1412,65 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
|
||||
size_t count, hipMemcpyKind kind) {
|
||||
|
||||
HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, srcArray, wOffset, hOffset, count, kind);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
try {
|
||||
stream->locked_copySync((char *)dst, (char*)srcArray->data + wOffset, count, kind);
|
||||
}
|
||||
catch (ihipException &ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count)
|
||||
{
|
||||
HIP_INIT_SPECIAL_API((TRACE_MCMD), dstArray, dstOffset, srcHost, count);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
try {
|
||||
stream->locked_copySync((char *)dstArray->data + dstOffset, srcHost, count, hipMemcpyHostToDevice);
|
||||
} catch (ihipException &ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count)
|
||||
{
|
||||
HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, srcArray, srcOffset, count);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
try {
|
||||
stream->locked_copySync((char *)dst, (char*)srcArray->data + srcOffset, count, hipMemcpyDeviceToHost);
|
||||
}
|
||||
catch (ihipException &ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
|
||||
{
|
||||
HIP_INIT_SPECIAL_API((TRACE_MCMD), p);
|
||||
|
||||
@@ -594,7 +594,6 @@ hipError_t hipModuleGetTexRef(
|
||||
const auto it = globals().find(name);
|
||||
if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue);
|
||||
|
||||
*texRef = static_cast<textureReference*>(it->second.get());
|
||||
|
||||
*texRef = reinterpret_cast<textureReference*>(it->second);
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "trace_helper.h"
|
||||
@@ -147,20 +149,8 @@ hipError_t hipStreamSynchronize(hipStream_t stream)
|
||||
{
|
||||
HIP_INIT_SPECIAL_API(TRACE_SYNC, stream);
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
if (stream == hipStreamNull) {
|
||||
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
|
||||
ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
|
||||
} else {
|
||||
// note this does not synchornize with the NULL stream:
|
||||
stream->locked_wait();
|
||||
e = hipSuccess;
|
||||
}
|
||||
|
||||
|
||||
return ihipLogStatus(e);
|
||||
};
|
||||
return ihipLogStatus(ihipStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
@@ -216,8 +206,20 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
|
||||
{
|
||||
HIP_INIT_API(stream, callback, userData, flags);
|
||||
hipError_t e = hipSuccess;
|
||||
//--- explicitly synchronize stream to add callback routines
|
||||
hipStreamSynchronize(stream);
|
||||
callback(stream, e, userData);
|
||||
|
||||
// Create a thread in detached mode to handle callback
|
||||
ihipStreamCallback_t *cb = new ihipStreamCallback_t(stream, callback, userData);
|
||||
std::thread (ihipStreamCallbackHandler, cb).detach();
|
||||
|
||||
// Wait for thread to be ready
|
||||
cb->_mtx.lock();
|
||||
while(cb->_ready != true)
|
||||
{
|
||||
cb->_mtx.unlock();
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
cb->_mtx.lock();
|
||||
}
|
||||
cb->_mtx.unlock();
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
@@ -151,7 +151,7 @@ namespace
|
||||
lock_guard<mutex> lck{mtx};
|
||||
|
||||
if (globals().find(x) != globals().cend()) return;
|
||||
|
||||
globals().emplace(x, (void*)(it1->second.first));
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(it1->second.first),
|
||||
@@ -163,7 +163,6 @@ namespace
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
|
||||
globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -444,9 +443,9 @@ namespace hip_impl
|
||||
return r;
|
||||
}
|
||||
|
||||
unordered_map<string, RAII_global>& globals()
|
||||
unordered_map<string, void*>& globals()
|
||||
{
|
||||
static unordered_map<string, RAII_global> r;
|
||||
static unordered_map<string, void*> r;
|
||||
static once_flag f;
|
||||
call_once(f, []() { r.reserve(symbol_addresses().size()); });
|
||||
|
||||
@@ -473,4 +472,4 @@ namespace hip_impl
|
||||
|
||||
return executable;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// To measure effects of memory coalescing. Coalescing.cu
|
||||
// B. Wilkinson Jan 30, 2011
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda.h>
|
||||
|
||||
#define BlockSize 16 // Size of blocks, 32 x 32 threads, fixed, used globally
|
||||
|
||||
__global__ void gpu_Comput (int *h, int N, int T) {
|
||||
|
||||
// Array loaded with global thread ID that acesses that location
|
||||
|
||||
int col = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
int row = threadIdx.y + blockDim.y * blockIdx.y;
|
||||
|
||||
int threadID = col + row * N;
|
||||
int index = row + col * N; // sequentially down each row
|
||||
|
||||
for (int t = 0; t < T; t++) // loop to repeat to reduce other time effects
|
||||
h[index] = threadID; // load array with flattened global thread ID
|
||||
}
|
||||
|
||||
void printArray(int *h, int N) {
|
||||
|
||||
printf("Results of computation, every N/8 numbers, eight numbers\n");
|
||||
|
||||
for (int row = 0; row < N; row += N/8) {
|
||||
for (int col = 0; col < N; col += N/8)
|
||||
printf("%6d ", h[col + row * N]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
int T = 100; // number of iterations, entered at keyboard
|
||||
int B = 1; // number of blocks, entered at keyboard
|
||||
char key;
|
||||
|
||||
int *h, *dev_h; // ptr to array holding numbers on host and device
|
||||
// CHECK: hipEvent_t start, stop;
|
||||
cudaEvent_t start, stop; // cuda events to measure time
|
||||
float elapsed_time_ms1;
|
||||
// CHECK: hipEventCreate( &start );
|
||||
// CHECK: hipEventCreate( &stop );
|
||||
cudaEventCreate( &start );
|
||||
cudaEventCreate( &stop );
|
||||
|
||||
/* ------------------------- Keyboard input -----------------------------------*/
|
||||
|
||||
do { // loop to repeat complete program
|
||||
|
||||
printf("Grid Structure 2-D grid, 2-D blocks\n");
|
||||
printf("Blocks fixed at 16 x 16 threads, 512 threads, max for compute cap. 1.x\n");
|
||||
printf("Enter number of blocks in grid, each dimension, currently %d\n",B);
|
||||
scanf("%d",&B);
|
||||
printf("Enter number of iterations, currently %d\n",T);
|
||||
scanf("%d",&T);
|
||||
|
||||
int N = B * BlockSize; // size of data array, given input data
|
||||
|
||||
printf("Array size (and total grid-block size) %d x %d\n", N, N);
|
||||
|
||||
dim3 Block(BlockSize, BlockSize); //Block structure, 32 x 32 max
|
||||
dim3 Grid(B, B); //Grid structure, B x B
|
||||
|
||||
/* ------------------------- Allocate Memory-----------------------------------*/
|
||||
|
||||
int size = N * N * sizeof(int); // number of bytes in total in array
|
||||
h = (int*) malloc(size); // Array on host
|
||||
// CHECK: hipMalloc((void**)&dev_h, size);
|
||||
cudaMalloc((void**)&dev_h, size); // allocate device memory
|
||||
|
||||
/* ------------------------- GPU Computation -----------------------------------*/
|
||||
|
||||
// CHECK: hipEventRecord( start, 0 );
|
||||
cudaEventRecord( start, 0 );
|
||||
// CHECK: hipLaunchKernelGGL(gpu_Comput, dim3(Grid), dim3(Block), 0, 0, dev_h, N, T);
|
||||
gpu_Comput<<< Grid, Block >>>(dev_h, N, T);
|
||||
// CHECK: hipEventRecord( stop, 0 );
|
||||
// CHECK: hipEventSynchronize( stop );
|
||||
// CHECK: hipEventElapsedTime( &elapsed_time_ms1, start, stop );
|
||||
cudaEventRecord( stop, 0 ); // instrument code to measue end time
|
||||
cudaEventSynchronize( stop ); // wait for all work done by threads
|
||||
cudaEventElapsedTime( &elapsed_time_ms1, start, stop );
|
||||
// CHECK: hipMemcpy(h,dev_h, size ,hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(h,dev_h, size ,cudaMemcpyDeviceToHost); //Get results to check
|
||||
|
||||
printArray(h,N);
|
||||
printf("\nTime to calculate results on GPU: %f ms.\n", elapsed_time_ms1);
|
||||
|
||||
/* -------------------------REPEAT PROGRAM INPUT-----------------------------------*/
|
||||
|
||||
printf("\nEnter c to repeat, return to terminate\n");
|
||||
|
||||
scanf("%c",&key);
|
||||
scanf("%c",&key);
|
||||
|
||||
} while (key == 'c'); // loop of complete program
|
||||
|
||||
/* -------------- clean up ---------------------------------------*/
|
||||
|
||||
free(h);
|
||||
// CHECK: hipFree(dev_h);
|
||||
cudaFree(dev_h);
|
||||
// CHECK: hipEventDestroy(start);
|
||||
// CHECK: hipEventDestroy(stop);
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,393 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
#include "cmdparser.hpp"
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda_runtime.h>
|
||||
// CHECK: #include <hiprand.h>
|
||||
#include <curand.h>
|
||||
|
||||
// CHECK: if((x)!=hipSuccess) {
|
||||
#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__);\
|
||||
exit(EXIT_FAILURE);}} while(0)
|
||||
// CHECK: if((x)!=HIPRAND_STATUS_SUCCESS) {
|
||||
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__);\
|
||||
exit(EXIT_FAILURE);}} while(0)
|
||||
|
||||
#ifndef DEFAULT_RAND_N
|
||||
const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
|
||||
#endif
|
||||
|
||||
// CHECK: typedef hiprandRngType_t rng_type_t;
|
||||
typedef curandRngType rng_type_t;
|
||||
|
||||
// CHECK: using generate_func_type = std::function<hiprandStatus_t(hiprandGenerator_t, T *, size_t)>;
|
||||
template<typename T>
|
||||
using generate_func_type = std::function<curandStatus_t(curandGenerator_t, T *, size_t)>;
|
||||
|
||||
template<typename T>
|
||||
void run_benchmark(const cli::Parser& parser,
|
||||
const rng_type_t rng_type,
|
||||
generate_func_type<T> generate_func)
|
||||
{
|
||||
const size_t size = parser.get<size_t>("size");
|
||||
const size_t trials = parser.get<size_t>("trials");
|
||||
|
||||
T * data;
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T)));
|
||||
CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T)));
|
||||
|
||||
// CHECK: hiprandGenerator_t generator;
|
||||
// CHECK: CURAND_CALL(hiprandCreateGenerator(&generator, rng_type));
|
||||
curandGenerator_t generator;
|
||||
CURAND_CALL(curandCreateGenerator(&generator, rng_type));
|
||||
|
||||
const size_t dimensions = parser.get<size_t>("dimensions");
|
||||
// CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, dimensions);
|
||||
// CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR)
|
||||
curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
|
||||
if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
|
||||
{
|
||||
CURAND_CALL(status);
|
||||
}
|
||||
|
||||
// Warm-up
|
||||
for (size_t i = 0; i < 5; i++)
|
||||
{
|
||||
CURAND_CALL(generate_func(generator, data, size));
|
||||
}
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
|
||||
// Measurement
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
for (size_t i = 0; i < trials; i++)
|
||||
{
|
||||
CURAND_CALL(generate_func(generator, data, size));
|
||||
}
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> elapsed = end - start;
|
||||
|
||||
std::cout << std::fixed << std::setprecision(3)
|
||||
<< " "
|
||||
<< "Throughput = "
|
||||
<< std::setw(8) << (trials * size * sizeof(T)) /
|
||||
(elapsed.count() / 1e3 * (1 << 30))
|
||||
<< " GB/s, Samples = "
|
||||
<< std::setw(8) << (trials * size) /
|
||||
(elapsed.count() / 1e3 * (1 << 30))
|
||||
<< " GSample/s, AvgTime (1 trial) = "
|
||||
<< std::setw(8) << elapsed.count() / trials
|
||||
<< " ms, Time (all) = "
|
||||
<< std::setw(8) << elapsed.count()
|
||||
<< " ms, Size = " << size
|
||||
<< std::endl;
|
||||
// CHECK: CURAND_CALL(hiprandDestroyGenerator(generator));
|
||||
// CHECK: CUDA_CALL(hipFree(data));
|
||||
CURAND_CALL(curandDestroyGenerator(generator));
|
||||
CUDA_CALL(cudaFree(data));
|
||||
}
|
||||
|
||||
void run_benchmarks(const cli::Parser& parser,
|
||||
const rng_type_t rng_type,
|
||||
const std::string& distribution)
|
||||
{
|
||||
if (distribution == "uniform-uint")
|
||||
{
|
||||
// CHECK: if (rng_type != HIPRAND_RNG_QUASI_SOBOL64 &&
|
||||
// CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64)
|
||||
if (rng_type != CURAND_RNG_QUASI_SOBOL64 &&
|
||||
rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
|
||||
{
|
||||
run_benchmark<unsigned int>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, unsigned int * data, size_t size) {
|
||||
// CHECK: return hiprandGenerate(gen, data, size);
|
||||
[](curandGenerator_t gen, unsigned int * data, size_t size) {
|
||||
return curandGenerate(gen, data, size);
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
if (distribution == "uniform-long-long")
|
||||
{
|
||||
// CHECK: if (rng_type == HIPRAND_RNG_QUASI_SOBOL64 ||
|
||||
// CHECK: rng_type == HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64)
|
||||
if (rng_type == CURAND_RNG_QUASI_SOBOL64 ||
|
||||
rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
|
||||
{
|
||||
run_benchmark<unsigned long long>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, unsigned long long * data, size_t size) {
|
||||
[](curandGenerator_t gen, unsigned long long * data, size_t size) {
|
||||
// curandGenerateLongLong is yet unsupported by HIP
|
||||
// CHECK-NOT: return hiprandGenerateLongLong(gen, data, size);
|
||||
return curandGenerateLongLong(gen, data, size);
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
if (distribution == "uniform-float")
|
||||
{
|
||||
run_benchmark<float>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
|
||||
// CHECK: return hiprandGenerateUniform(gen, data, size);
|
||||
[](curandGenerator_t gen, float * data, size_t size) {
|
||||
return curandGenerateUniform(gen, data, size);
|
||||
}
|
||||
);
|
||||
}
|
||||
if (distribution == "uniform-double")
|
||||
{
|
||||
run_benchmark<double>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
|
||||
// CHECK: return hiprandGenerateUniformDouble(gen, data, size);
|
||||
[](curandGenerator_t gen, double * data, size_t size) {
|
||||
return curandGenerateUniformDouble(gen, data, size);
|
||||
}
|
||||
);
|
||||
}
|
||||
if (distribution == "normal-float")
|
||||
{
|
||||
run_benchmark<float>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
|
||||
// CHECK: return hiprandGenerateNormal(gen, data, size, 0.0f, 1.0f);
|
||||
[](curandGenerator_t gen, float * data, size_t size) {
|
||||
return curandGenerateNormal(gen, data, size, 0.0f, 1.0f);
|
||||
}
|
||||
);
|
||||
}
|
||||
if (distribution == "normal-double")
|
||||
{
|
||||
run_benchmark<double>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
|
||||
// CHECK: return hiprandGenerateNormalDouble(gen, data, size, 0.0, 1.0);
|
||||
[](curandGenerator_t gen, double * data, size_t size) {
|
||||
return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0);
|
||||
}
|
||||
);
|
||||
}
|
||||
if (distribution == "log-normal-float")
|
||||
{
|
||||
run_benchmark<float>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
|
||||
// CHECK: return hiprandGenerateLogNormal(gen, data, size, 0.0f, 1.0f);
|
||||
[](curandGenerator_t gen, float * data, size_t size) {
|
||||
return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f);
|
||||
}
|
||||
);
|
||||
}
|
||||
if (distribution == "log-normal-double")
|
||||
{
|
||||
run_benchmark<double>(parser, rng_type,
|
||||
// CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
|
||||
// CHECK: return hiprandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0);
|
||||
[](curandGenerator_t gen, double * data, size_t size) {
|
||||
return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0);
|
||||
}
|
||||
);
|
||||
}
|
||||
if (distribution == "poisson")
|
||||
{
|
||||
const auto lambdas = parser.get<std::vector<double>>("lambda");
|
||||
for (double lambda : lambdas)
|
||||
{
|
||||
std::cout << " " << "lambda "
|
||||
<< std::fixed << std::setprecision(1) << lambda << std::endl;
|
||||
run_benchmark<unsigned int>(parser, rng_type,
|
||||
// CHECK: [lambda](hiprandGenerator_t gen, unsigned int * data, size_t size) {
|
||||
// CHECK: return hiprandGeneratePoisson(gen, data, size, lambda);
|
||||
[lambda](curandGenerator_t gen, unsigned int * data, size_t size) {
|
||||
return curandGeneratePoisson(gen, data, size, lambda);
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<std::string> all_engines = {
|
||||
"xorwow",
|
||||
"mrg32k3a",
|
||||
"mtgp32",
|
||||
// "mt19937",
|
||||
"philox",
|
||||
"sobol32",
|
||||
// "scrambled_sobol32",
|
||||
// "sobol64",
|
||||
// "scrambled_sobol64",
|
||||
};
|
||||
|
||||
const std::vector<std::string> all_distributions = {
|
||||
"uniform-uint",
|
||||
"uniform-long-long",
|
||||
"uniform-float",
|
||||
"uniform-double",
|
||||
"normal-float",
|
||||
"normal-double",
|
||||
"log-normal-float",
|
||||
"log-normal-double",
|
||||
"poisson"
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
cli::Parser parser(argc, argv);
|
||||
|
||||
const std::string distribution_desc =
|
||||
"space-separated list of distributions:" +
|
||||
std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(),
|
||||
[](std::string a, std::string b) {
|
||||
return a + "\n " + b;
|
||||
}
|
||||
) +
|
||||
"\n or all";
|
||||
const std::string engine_desc =
|
||||
"space-separated list of random number engines:" +
|
||||
std::accumulate(all_engines.begin(), all_engines.end(), std::string(),
|
||||
[](std::string a, std::string b) {
|
||||
return a + "\n " + b;
|
||||
}
|
||||
) +
|
||||
"\n or all";
|
||||
|
||||
parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
|
||||
parser.set_optional<size_t>("dimensions", "dimensions", 1, "number of dimensions of quasi-random values");
|
||||
parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
|
||||
parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
|
||||
parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
|
||||
parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
|
||||
parser.run_and_exit_if_error();
|
||||
|
||||
std::vector<std::string> engines;
|
||||
{
|
||||
auto es = parser.get<std::vector<std::string>>("engine");
|
||||
if (std::find(es.begin(), es.end(), "all") != es.end())
|
||||
{
|
||||
engines = all_engines;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto e : all_engines)
|
||||
{
|
||||
if (std::find(es.begin(), es.end(), e) != es.end())
|
||||
engines.push_back(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> distributions;
|
||||
{
|
||||
auto ds = parser.get<std::vector<std::string>>("dis");
|
||||
if (std::find(ds.begin(), ds.end(), "all") != ds.end())
|
||||
{
|
||||
distributions = all_distributions;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto d : all_distributions)
|
||||
{
|
||||
if (std::find(ds.begin(), ds.end(), d) != ds.end())
|
||||
distributions.push_back(d);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int version;
|
||||
// CHECK: CURAND_CALL(hiprandGetVersion(&version));
|
||||
CURAND_CALL(curandGetVersion(&version));
|
||||
int runtime_version;
|
||||
// cudaRuntimeGetVersion is yet unsupported by HIP
|
||||
// CHECK-NOT: CUDA_CALL(hipRuntimeGetVersion(&runtime_version));
|
||||
CUDA_CALL(cudaRuntimeGetVersion(&runtime_version));
|
||||
int device_id;
|
||||
// CHECK: CUDA_CALL(hipGetDevice(&device_id));
|
||||
// CHECK: hipDeviceProp_t props;
|
||||
// CHECK: CUDA_CALL(hipGetDeviceProperties(&props, device_id));
|
||||
CUDA_CALL(cudaGetDevice(&device_id));
|
||||
cudaDeviceProp props;
|
||||
CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
|
||||
|
||||
std::cout << "cuRAND: " << version << " ";
|
||||
std::cout << "Runtime: " << runtime_version << " ";
|
||||
std::cout << "Device: " << props.name;
|
||||
std::cout << std::endl << std::endl;
|
||||
|
||||
for (auto engine : engines)
|
||||
{
|
||||
// CHECK: rng_type_t rng_type = HIPRAND_RNG_PSEUDO_XORWOW;
|
||||
// CHECK: rng_type = HIPRAND_RNG_PSEUDO_XORWOW;
|
||||
// CHECK: rng_type = HIPRAND_RNG_PSEUDO_MRG32K3A;
|
||||
// CHECK: rng_type = HIPRAND_RNG_PSEUDO_MTGP32;
|
||||
// CHECK: rng_type = HIPRAND_RNG_PSEUDO_MT19937;
|
||||
// CHECK: rng_type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10;
|
||||
// CHECK: rng_type = HIPRAND_RNG_QUASI_SOBOL32;
|
||||
// CHECK: rng_type = HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32;
|
||||
// CHECK: rng_type = HIPRAND_RNG_QUASI_SOBOL64;
|
||||
// CHECK: rng_type = HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64;
|
||||
rng_type_t rng_type = CURAND_RNG_PSEUDO_XORWOW;
|
||||
if (engine == "xorwow")
|
||||
rng_type = CURAND_RNG_PSEUDO_XORWOW;
|
||||
else if (engine == "mrg32k3a")
|
||||
rng_type = CURAND_RNG_PSEUDO_MRG32K3A;
|
||||
else if (engine == "mtgp32")
|
||||
rng_type = CURAND_RNG_PSEUDO_MTGP32;
|
||||
else if (engine == "mt19937")
|
||||
rng_type = CURAND_RNG_PSEUDO_MT19937;
|
||||
else if (engine == "philox")
|
||||
rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10;
|
||||
else if (engine == "sobol32")
|
||||
rng_type = CURAND_RNG_QUASI_SOBOL32;
|
||||
else if (engine == "scrambled_sobol32")
|
||||
rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32;
|
||||
else if (engine == "sobol64")
|
||||
rng_type = CURAND_RNG_QUASI_SOBOL64;
|
||||
else if (engine == "scrambled_sobol64")
|
||||
rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64;
|
||||
else
|
||||
{
|
||||
std::cout << "Wrong engine name" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::cout << engine << ":" << std::endl;
|
||||
|
||||
for (auto distribution : distributions)
|
||||
{
|
||||
std::cout << " " << distribution << ":" << std::endl;
|
||||
run_benchmarks(parser, rng_type, distribution);
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,669 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <type_traits>
|
||||
#include <algorithm>
|
||||
|
||||
#include "cmdparser.hpp"
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda_runtime.h>
|
||||
// CHECK: #include <hiprand.h>
|
||||
#include <curand.h>
|
||||
// CHECK: #include <hiprand_kernel.h>
|
||||
#include <curand_kernel.h>
|
||||
// CHECK-NOT: #include <curand_mtgp32_host.h>
|
||||
// CHECK-NOT: #include <curand_mtgp32dc_p_11213.h>
|
||||
#include <curand_mtgp32_host.h>
|
||||
#include <curand_mtgp32dc_p_11213.h>
|
||||
|
||||
// CHECK: hipError_t error = (x);
|
||||
// CHECK: if(error!=hipSuccess) {
|
||||
#define CUDA_CALL(x) do { \
|
||||
cudaError_t error = (x);\
|
||||
if(error!=cudaSuccess) { \
|
||||
printf("Error %d at %s:%d\n",error,__FILE__,__LINE__);\
|
||||
exit(EXIT_FAILURE);}} while(0)
|
||||
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__);\
|
||||
exit(EXIT_FAILURE);}} while(0)
|
||||
|
||||
#ifndef DEFAULT_RAND_N
|
||||
const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
|
||||
#endif
|
||||
|
||||
size_t next_power2(size_t x)
|
||||
{
|
||||
size_t power = 1;
|
||||
while (power < x)
|
||||
{
|
||||
power *= 2;
|
||||
}
|
||||
return power;
|
||||
}
|
||||
|
||||
template<typename GeneratorState>
|
||||
__global__
|
||||
void init_kernel(GeneratorState * states,
|
||||
const unsigned long long seed,
|
||||
const unsigned long long offset)
|
||||
{
|
||||
const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
GeneratorState state;
|
||||
// CHECK: hiprand_init(seed, state_id, offset, &state);
|
||||
curand_init(seed, state_id, offset, &state);
|
||||
states[state_id] = state;
|
||||
}
|
||||
|
||||
template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
|
||||
__global__
|
||||
void generate_kernel(GeneratorState * states,
|
||||
T * data,
|
||||
const size_t size,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const unsigned int stride = gridDim.x * blockDim.x;
|
||||
|
||||
GeneratorState state = states[state_id];
|
||||
unsigned int index = state_id;
|
||||
while(index < size)
|
||||
{
|
||||
data[index] = generate_func(&state, extra);
|
||||
index += stride;
|
||||
}
|
||||
states[state_id] = state;
|
||||
}
|
||||
|
||||
template<typename GeneratorState>
|
||||
struct runner
|
||||
{
|
||||
GeneratorState * states;
|
||||
|
||||
runner(const size_t dimensions,
|
||||
const size_t blocks,
|
||||
const size_t threads,
|
||||
const unsigned long long seed,
|
||||
const unsigned long long offset)
|
||||
{
|
||||
const size_t states_size = blocks * threads;
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
|
||||
CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
|
||||
// CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
|
||||
init_kernel<<<blocks, threads>>>(states, seed, offset);
|
||||
// CHECK: CUDA_CALL(hipPeekAtLastError());
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaPeekAtLastError());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
~runner()
|
||||
{
|
||||
CUDA_CALL(cudaFree(states));
|
||||
}
|
||||
|
||||
template<typename T, typename GenerateFunc, typename Extra>
|
||||
void generate(const size_t blocks,
|
||||
const size_t threads,
|
||||
T * data,
|
||||
const size_t size,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
|
||||
generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
|
||||
}
|
||||
};
|
||||
|
||||
// CHECK: void generate_kernel(hiprandStateMtgp32_t * states,
|
||||
template<typename T, typename GenerateFunc, typename Extra>
|
||||
__global__
|
||||
void generate_kernel(curandStateMtgp32_t * states,
|
||||
T * data,
|
||||
const size_t size,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
const unsigned int state_id = blockIdx.x;
|
||||
const unsigned int thread_id = threadIdx.x;
|
||||
unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
unsigned int stride = gridDim.x * blockDim.x;
|
||||
// CHECK: __shared__ hiprandStateMtgp32_t state;
|
||||
__shared__ curandStateMtgp32_t state;
|
||||
|
||||
if (thread_id == 0)
|
||||
state = states[state_id];
|
||||
__syncthreads();
|
||||
|
||||
const size_t r = size%blockDim.x;
|
||||
const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
|
||||
while(index < size_rounded_up)
|
||||
{
|
||||
auto value = generate_func(&state, extra);
|
||||
if(index < size)
|
||||
data[index] = value;
|
||||
index += stride;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (thread_id == 0)
|
||||
states[state_id] = state;
|
||||
}
|
||||
|
||||
// CHECK: struct runner<hiprandStateMtgp32_t>
|
||||
template<>
|
||||
struct runner<curandStateMtgp32_t>
|
||||
{
|
||||
// CHECK: hiprandStateMtgp32_t * states;
|
||||
curandStateMtgp32_t * states;
|
||||
mtgp32_kernel_params_t * d_param;
|
||||
|
||||
runner(const size_t dimensions,
|
||||
const size_t blocks,
|
||||
const size_t threads,
|
||||
const unsigned long long seed,
|
||||
const unsigned long long offset)
|
||||
{
|
||||
const size_t states_size = std::min((size_t)200, blocks);
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateMtgp32_t)));
|
||||
CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateMtgp32_t)));
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&d_param, sizeof(mtgp32_kernel_params)));
|
||||
CUDA_CALL(cudaMalloc((void **)&d_param, sizeof(mtgp32_kernel_params)));
|
||||
// curandMakeMTGP32Constants is yet unsupported by HIP
|
||||
// CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param));
|
||||
CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param));
|
||||
// curandMakeMTGP32KernelState is yet unsupported by HIP
|
||||
// CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed));
|
||||
CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed));
|
||||
}
|
||||
|
||||
~runner()
|
||||
{
|
||||
// CHECK: CUDA_CALL(hipFree(states));
|
||||
// CHECK: CUDA_CALL(hipFree(d_param));
|
||||
CUDA_CALL(cudaFree(states));
|
||||
CUDA_CALL(cudaFree(d_param));
|
||||
}
|
||||
|
||||
template<typename T, typename GenerateFunc, typename Extra>
|
||||
void generate(const size_t blocks,
|
||||
const size_t threads,
|
||||
T * data,
|
||||
const size_t size,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
|
||||
generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
|
||||
}
|
||||
};
|
||||
|
||||
// CHECK: void init_kernel(hiprandStateSobol32_t * states,
|
||||
template<typename Directions>
|
||||
__global__
|
||||
void init_kernel(curandStateSobol32_t * states,
|
||||
const Directions directions,
|
||||
const unsigned long long offset)
|
||||
{
|
||||
const unsigned int dimension = blockIdx.y;
|
||||
const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// CHECK: hiprandStateSobol32_t state;
|
||||
// CHECK: hiprand_init(directions[dimension], offset + state_id, &state);
|
||||
curandStateSobol32_t state;
|
||||
curand_init(directions[dimension], offset + state_id, &state);
|
||||
states[gridDim.x * blockDim.x * dimension + state_id] = state;
|
||||
}
|
||||
|
||||
// CHECK: void generate_kernel(hiprandStateSobol32_t * states,
|
||||
template<typename T, typename GenerateFunc, typename Extra>
|
||||
__global__
|
||||
void generate_kernel(curandStateSobol32_t * states,
|
||||
T * data,
|
||||
const size_t size,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
const unsigned int dimension = blockIdx.y;
|
||||
const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const unsigned int stride = gridDim.x * blockDim.x;
|
||||
// CHECK: hiprandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id];
|
||||
curandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id];
|
||||
const unsigned int offset = dimension * size;
|
||||
unsigned int index = state_id;
|
||||
while(index < size)
|
||||
{
|
||||
data[offset + index] = generate_func(&state, extra);
|
||||
skipahead(stride - 1, &state);
|
||||
index += stride;
|
||||
}
|
||||
state = states[gridDim.x * blockDim.x * dimension + state_id];
|
||||
skipahead(static_cast<unsigned int>(size), &state);
|
||||
states[gridDim.x * blockDim.x * dimension + state_id] = state;
|
||||
}
|
||||
|
||||
// CHECK: struct runner<hiprandStateSobol32_t>
|
||||
template<>
|
||||
struct runner<curandStateSobol32_t>
|
||||
{
|
||||
// CHECK: hiprandStateSobol32_t * states;
|
||||
curandStateSobol32_t * states;
|
||||
size_t dimensions;
|
||||
|
||||
runner(const size_t dimensions,
|
||||
const size_t blocks,
|
||||
const size_t threads,
|
||||
const unsigned long long seed,
|
||||
const unsigned long long offset)
|
||||
{
|
||||
this->dimensions = dimensions;
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateSobol32_t)));
|
||||
const size_t states_size = blocks * threads * dimensions;
|
||||
CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateSobol32_t)));
|
||||
// CHECK: hiprandDirectionVectors32_t * directions;
|
||||
curandDirectionVectors32_t * directions;
|
||||
// CHECK: const size_t size = dimensions * sizeof(hiprandDirectionVectors32_t);
|
||||
const size_t size = dimensions * sizeof(curandDirectionVectors32_t);
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&directions, size));
|
||||
CUDA_CALL(cudaMalloc((void **)&directions, size));
|
||||
// CHECK: hiprandDirectionVectors32_t * h_directions;
|
||||
curandDirectionVectors32_t * h_directions;
|
||||
// hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of hiprandDirectionVectorSet_t) are yet unsupported by HIP
|
||||
// CHECK-NOT: CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, HIPRAND_DIRECTION_VECTORS_32_JOEKUO6));
|
||||
CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
|
||||
// CHECK: CUDA_CALL(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
|
||||
CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
|
||||
|
||||
const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
|
||||
// CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
|
||||
init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
|
||||
// CHECK: CUDA_CALL(hipPeekAtLastError());
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaPeekAtLastError());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
// CHECK: CUDA_CALL(hipFree(directions));
|
||||
CUDA_CALL(cudaFree(directions));
|
||||
}
|
||||
|
||||
~runner()
|
||||
{
|
||||
// CHECK: CUDA_CALL(hipFree(states));
|
||||
CUDA_CALL(cudaFree(states));
|
||||
}
|
||||
|
||||
template<typename T, typename GenerateFunc, typename Extra>
|
||||
void generate(const size_t blocks,
|
||||
const size_t threads,
|
||||
T * data,
|
||||
const size_t size,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
|
||||
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
|
||||
generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename GeneratorState, typename GenerateFunc, typename Extra>
|
||||
void run_benchmark(const cli::Parser& parser,
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
const size_t size = parser.get<size_t>("size");
|
||||
const size_t dimensions = parser.get<size_t>("dimensions");
|
||||
const size_t trials = parser.get<size_t>("trials");
|
||||
|
||||
const size_t blocks = parser.get<size_t>("blocks");
|
||||
const size_t threads = parser.get<size_t>("threads");
|
||||
|
||||
T * data;
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T)));
|
||||
CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T)));
|
||||
|
||||
runner<GeneratorState> r(dimensions, blocks, threads, 12345ULL, 6789ULL);
|
||||
|
||||
// Warm-up
|
||||
for (size_t i = 0; i < 5; i++)
|
||||
{
|
||||
r.generate(blocks, threads, data, size, generate_func, extra);
|
||||
// CHECK: CUDA_CALL(hipPeekAtLastError());
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaPeekAtLastError());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
|
||||
// Measurement
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
for (size_t i = 0; i < trials; i++)
|
||||
{
|
||||
r.generate(blocks, threads, data, size, generate_func, extra);
|
||||
}
|
||||
// CHECK: CUDA_CALL(hipPeekAtLastError());
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
CUDA_CALL(cudaPeekAtLastError());
|
||||
CUDA_CALL(cudaDeviceSynchronize());
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> elapsed = end - start;
|
||||
|
||||
std::cout << std::fixed << std::setprecision(3)
|
||||
<< " "
|
||||
<< "Throughput = "
|
||||
<< std::setw(8) << (trials * size * sizeof(T)) /
|
||||
(elapsed.count() / 1e3 * (1 << 30))
|
||||
<< " GB/s, Samples = "
|
||||
<< std::setw(8) << (trials * size) /
|
||||
(elapsed.count() / 1e3 * (1 << 30))
|
||||
<< " GSample/s, AvgTime (1 trial) = "
|
||||
<< std::setw(8) << elapsed.count() / trials
|
||||
<< " ms, Time (all) = "
|
||||
<< std::setw(8) << elapsed.count()
|
||||
<< " ms, Size = " << size
|
||||
<< std::endl;
|
||||
// CHECK: CUDA_CALL(hipFree(data));
|
||||
CUDA_CALL(cudaFree(data));
|
||||
}
|
||||
|
||||
template<typename GeneratorState>
|
||||
void run_benchmarks(const cli::Parser& parser,
|
||||
const std::string& distribution)
|
||||
{
|
||||
if (distribution == "uniform-uint")
|
||||
{
|
||||
// curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP
|
||||
// CHECK-NOT: if (!std::is_same<GeneratorState, hiprandStateSobol64_t>::value &&
|
||||
// CHECK-NOT: !std::is_same<GeneratorState, hiprandStateScrambledSobol64_t>::value)
|
||||
if (!std::is_same<GeneratorState, curandStateSobol64_t>::value &&
|
||||
!std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
|
||||
{
|
||||
run_benchmark<unsigned int, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand(state);
|
||||
return curand(state);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
}
|
||||
if (distribution == "uniform-long-long")
|
||||
{
|
||||
// curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP
|
||||
// CHECK-NOT: if (!std::is_same<GeneratorState, hiprandStateSobol64_t>::value &&
|
||||
// CHECK-NOT: !std::is_same<GeneratorState, hiprandStateScrambledSobol64_t>::value)
|
||||
if (std::is_same<GeneratorState, curandStateSobol64_t>::value ||
|
||||
std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
|
||||
{
|
||||
run_benchmark<unsigned long long, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand(state);
|
||||
return curand(state);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
}
|
||||
if (distribution == "uniform-float")
|
||||
{
|
||||
run_benchmark<float, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand_uniform(state);
|
||||
return curand_uniform(state);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
if (distribution == "uniform-double")
|
||||
{
|
||||
run_benchmark<double, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand_uniform_double(state);
|
||||
return curand_uniform_double(state);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
if (distribution == "normal-float")
|
||||
{
|
||||
run_benchmark<float, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand_normal(state);
|
||||
return curand_normal(state);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
if (distribution == "normal-double")
|
||||
{
|
||||
run_benchmark<double, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand_normal_double(state);
|
||||
return curand_normal_double(state);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
if (distribution == "log-normal-float")
|
||||
{
|
||||
run_benchmark<float, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand_log_normal(state, 0.0f, 1.0f);
|
||||
return curand_log_normal(state, 0.0f, 1.0f);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
if (distribution == "log-normal-double")
|
||||
{
|
||||
run_benchmark<double, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, int) {
|
||||
// CHECK: return hiprand_log_normal_double(state, 0.0, 1.0);
|
||||
return curand_log_normal_double(state, 0.0, 1.0);
|
||||
}, 0
|
||||
);
|
||||
}
|
||||
if (distribution == "poisson")
|
||||
{
|
||||
const auto lambdas = parser.get<std::vector<double>>("lambda");
|
||||
for (double lambda : lambdas)
|
||||
{
|
||||
std::cout << " " << "lambda "
|
||||
<< std::fixed << std::setprecision(1) << lambda << std::endl;
|
||||
run_benchmark<unsigned int, GeneratorState>(parser,
|
||||
[] __device__ (GeneratorState * state, double lambda) {
|
||||
// CHECK: return hiprand_poisson(state, lambda);
|
||||
return curand_poisson(state, lambda);
|
||||
}, lambda
|
||||
);
|
||||
}
|
||||
}
|
||||
if (distribution == "discrete-poisson")
|
||||
{
|
||||
const auto lambdas = parser.get<std::vector<double>>("lambda");
|
||||
for (double lambda : lambdas)
|
||||
{
|
||||
std::cout << " " << "lambda "
|
||||
<< std::fixed << std::setprecision(1) << lambda << std::endl;
|
||||
// CHECK: hiprandDiscreteDistribution_t discrete_distribution;
|
||||
curandDiscreteDistribution_t discrete_distribution;
|
||||
// CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(lambda, &discrete_distribution));
|
||||
CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution));
|
||||
run_benchmark<unsigned int, GeneratorState>(parser,
|
||||
// CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t discrete_distribution) {
|
||||
[] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) {
|
||||
// CHECK: return hiprand_discrete(state, discrete_distribution);
|
||||
return curand_discrete(state, discrete_distribution);
|
||||
}, discrete_distribution
|
||||
);
|
||||
// CHECK: CURAND_CALL(hiprandDestroyDistribution(discrete_distribution));
|
||||
CURAND_CALL(curandDestroyDistribution(discrete_distribution));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<std::string> all_engines = {
|
||||
"xorwow",
|
||||
"mrg32k3a",
|
||||
"mtgp32",
|
||||
// "mt19937",
|
||||
"philox",
|
||||
"sobol32",
|
||||
// "scrambled_sobol32",
|
||||
// "sobol64",
|
||||
// "scrambled_sobol64",
|
||||
};
|
||||
|
||||
const std::vector<std::string> all_distributions = {
|
||||
"uniform-uint",
|
||||
// "uniform-long-long",
|
||||
"uniform-float",
|
||||
"uniform-double",
|
||||
"normal-float",
|
||||
"normal-double",
|
||||
"log-normal-float",
|
||||
"log-normal-double",
|
||||
"poisson",
|
||||
"discrete-poisson",
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
cli::Parser parser(argc, argv);
|
||||
|
||||
const std::string distribution_desc =
|
||||
"space-separated list of distributions:" +
|
||||
std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(),
|
||||
[](std::string a, std::string b) {
|
||||
return a + "\n " + b;
|
||||
}
|
||||
) +
|
||||
"\n or all";
|
||||
const std::string engine_desc =
|
||||
"space-separated list of random number engines:" +
|
||||
std::accumulate(all_engines.begin(), all_engines.end(), std::string(),
|
||||
[](std::string a, std::string b) {
|
||||
return a + "\n " + b;
|
||||
}
|
||||
) +
|
||||
"\n or all";
|
||||
|
||||
parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
|
||||
parser.set_optional<size_t>("dimensions", "dimensions", 1, "number of dimensions of quasi-random values");
|
||||
parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
|
||||
parser.set_optional<size_t>("blocks", "blocks", 256, "number of blocks");
|
||||
parser.set_optional<size_t>("threads", "threads", 256, "number of threads in each block");
|
||||
parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
|
||||
parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
|
||||
parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
|
||||
parser.run_and_exit_if_error();
|
||||
|
||||
std::vector<std::string> engines;
|
||||
{
|
||||
auto es = parser.get<std::vector<std::string>>("engine");
|
||||
if (std::find(es.begin(), es.end(), "all") != es.end())
|
||||
{
|
||||
engines = all_engines;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto e : all_engines)
|
||||
{
|
||||
if (std::find(es.begin(), es.end(), e) != es.end())
|
||||
engines.push_back(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> distributions;
|
||||
{
|
||||
auto ds = parser.get<std::vector<std::string>>("dis");
|
||||
if (std::find(ds.begin(), ds.end(), "all") != ds.end())
|
||||
{
|
||||
distributions = all_distributions;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto d : all_distributions)
|
||||
{
|
||||
if (std::find(ds.begin(), ds.end(), d) != ds.end())
|
||||
distributions.push_back(d);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int version;
|
||||
// CHECK: CURAND_CALL(hiprandGetVersion(&version));
|
||||
CURAND_CALL(curandGetVersion(&version));
|
||||
int runtime_version;
|
||||
// cudaRuntimeGetVersion is yet unsupported by HIP
|
||||
// CHECK-NOT: CUDA_CALL(hipRuntimeGetVersion(&runtime_version));
|
||||
CUDA_CALL(cudaRuntimeGetVersion(&runtime_version));
|
||||
int device_id;
|
||||
// CHECK: CUDA_CALL(hipGetDevice(&device_id));
|
||||
// CHECK: hipDeviceProp_t props;
|
||||
// CHECK: CUDA_CALL(hipGetDeviceProperties(&props, device_id));
|
||||
CUDA_CALL(cudaGetDevice(&device_id));
|
||||
cudaDeviceProp props;
|
||||
CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
|
||||
|
||||
std::cout << "cuRAND: " << version << " ";
|
||||
std::cout << "Runtime: " << runtime_version << " ";
|
||||
std::cout << "Device: " << props.name;
|
||||
std::cout << std::endl << std::endl;
|
||||
|
||||
for (auto engine : engines)
|
||||
{
|
||||
std::cout << engine << ":" << std::endl;
|
||||
for (auto distribution : distributions)
|
||||
{
|
||||
std::cout << " " << distribution << ":" << std::endl;
|
||||
const std::string plot_name = engine + "-" + distribution;
|
||||
if (engine == "xorwow")
|
||||
{
|
||||
// CHECK: run_benchmarks<hiprandStateXORWOW_t>(parser, distribution);
|
||||
run_benchmarks<curandStateXORWOW_t>(parser, distribution);
|
||||
}
|
||||
else if (engine == "mrg32k3a")
|
||||
{
|
||||
// CHECK: run_benchmarks<hiprandStateMRG32k3a_t>(parser, distribution);
|
||||
run_benchmarks<curandStateMRG32k3a_t>(parser, distribution);
|
||||
}
|
||||
else if (engine == "philox")
|
||||
{
|
||||
// CHECK: run_benchmarks<hiprandStatePhilox4_32_10_t>(parser, distribution);
|
||||
run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution);
|
||||
}
|
||||
else if (engine == "sobol32")
|
||||
{
|
||||
// CHECK: run_benchmarks<hiprandStateSobol32_t>(parser, distribution);
|
||||
run_benchmarks<curandStateSobol32_t>(parser, distribution);
|
||||
}
|
||||
else if (engine == "mtgp32")
|
||||
{
|
||||
// CHECK: run_benchmarks<hiprandStateMtgp32_t>(parser, distribution);
|
||||
run_benchmarks<curandStateMtgp32_t>(parser, distribution);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,513 @@
|
||||
// The MIT License (MIT)
|
||||
//
|
||||
// Copyright (c) 2015 - 2016 Florian Rappl
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
/*
|
||||
This file is part of the C++ CmdParser utility.
|
||||
Copyright (c) 2015 - 2016 Florian Rappl
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <functional>
|
||||
|
||||
namespace cli {
|
||||
struct CallbackArgs {
|
||||
const std::vector<std::string>& arguments;
|
||||
std::ostream& output;
|
||||
std::ostream& error;
|
||||
};
|
||||
class Parser {
|
||||
private:
|
||||
class CmdBase {
|
||||
public:
|
||||
explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) :
|
||||
name(name),
|
||||
command(name.size() > 0 ? "-" + name : ""),
|
||||
alternative(alternative.size() > 0 ? "--" + alternative : ""),
|
||||
description(description),
|
||||
required(required),
|
||||
handled(false),
|
||||
arguments({}),
|
||||
dominant(dominant),
|
||||
variadic(variadic) {
|
||||
}
|
||||
|
||||
virtual ~CmdBase() {
|
||||
}
|
||||
|
||||
std::string name;
|
||||
std::string command;
|
||||
std::string alternative;
|
||||
std::string description;
|
||||
bool required;
|
||||
bool handled;
|
||||
std::vector<std::string> arguments;
|
||||
bool const dominant;
|
||||
bool const variadic;
|
||||
|
||||
virtual std::string print_value() const = 0;
|
||||
virtual bool parse(std::ostream& output, std::ostream& error) = 0;
|
||||
|
||||
bool is(const std::string& given) const {
|
||||
return given == command || given == alternative;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct ArgumentCountChecker
|
||||
{
|
||||
static constexpr bool Variadic = false;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct ArgumentCountChecker<std::vector<T>>
|
||||
{
|
||||
static constexpr bool Variadic = true;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class CmdFunction final : public CmdBase {
|
||||
public:
|
||||
explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) :
|
||||
CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker<T>::Variadic) {
|
||||
}
|
||||
|
||||
virtual bool parse(std::ostream& output, std::ostream& error) {
|
||||
try {
|
||||
CallbackArgs args { arguments, output, error };
|
||||
value = callback(args);
|
||||
return true;
|
||||
} catch (...) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
virtual std::string print_value() const {
|
||||
return "";
|
||||
}
|
||||
|
||||
std::function<T(CallbackArgs&)> callback;
|
||||
T value;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class CmdArgument final : public CmdBase {
|
||||
public:
|
||||
explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) :
|
||||
CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker<T>::Variadic) {
|
||||
}
|
||||
|
||||
virtual bool parse(std::ostream&, std::ostream&) {
|
||||
try {
|
||||
value = Parser::parse(arguments, value);
|
||||
return true;
|
||||
} catch (...) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
virtual std::string print_value() const {
|
||||
return stringify(value);
|
||||
}
|
||||
|
||||
T value;
|
||||
};
|
||||
|
||||
static int parse(const std::vector<std::string>& elements, const int&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stoi(elements[0]);
|
||||
}
|
||||
|
||||
static bool parse(const std::vector<std::string>& elements, const bool& defval) {
|
||||
if (elements.size() != 0)
|
||||
throw std::runtime_error("A boolean command line parameter cannot have any arguments.");
|
||||
|
||||
return !defval;
|
||||
}
|
||||
|
||||
static double parse(const std::vector<std::string>& elements, const double&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stod(elements[0]);
|
||||
}
|
||||
|
||||
static float parse(const std::vector<std::string>& elements, const float&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stof(elements[0]);
|
||||
}
|
||||
|
||||
static long double parse(const std::vector<std::string>& elements, const long double&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stold(elements[0]);
|
||||
}
|
||||
|
||||
static unsigned int parse(const std::vector<std::string>& elements, const unsigned int&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return static_cast<unsigned int>(std::stoul(elements[0]));
|
||||
}
|
||||
|
||||
static unsigned long parse(const std::vector<std::string>& elements, const unsigned long&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stoul(elements[0]);
|
||||
}
|
||||
|
||||
static unsigned long long parse(const std::vector<std::string>& elements, const unsigned long long&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stoull(elements[0]);
|
||||
}
|
||||
|
||||
static long parse(const std::vector<std::string>& elements, const long&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return std::stol(elements[0]);
|
||||
}
|
||||
|
||||
static std::string parse(const std::vector<std::string>& elements, const std::string&) {
|
||||
if (elements.size() != 1)
|
||||
throw std::bad_cast();
|
||||
|
||||
return elements[0];
|
||||
}
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> parse(const std::vector<std::string>& elements, const std::vector<T>&) {
|
||||
const T defval = T();
|
||||
std::vector<T> values { };
|
||||
std::vector<std::string> buffer(1);
|
||||
|
||||
for (const auto& element : elements) {
|
||||
buffer[0] = element;
|
||||
values.push_back(parse(buffer, defval));
|
||||
}
|
||||
|
||||
return values;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
static std::string stringify(const T& value) {
|
||||
return std::to_string(value);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
static std::string stringify(const std::vector<T>& values) {
|
||||
std::stringstream ss { };
|
||||
ss << "[ ";
|
||||
|
||||
for (const auto& value : values) {
|
||||
ss << stringify(value) << " ";
|
||||
}
|
||||
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
static std::string stringify(const std::string& str) {
|
||||
return str;
|
||||
}
|
||||
|
||||
public:
|
||||
explicit Parser(int argc, const char** argv) :
|
||||
_appname(argv[0]) {
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
_arguments.push_back(argv[i]);
|
||||
}
|
||||
enable_help();
|
||||
}
|
||||
|
||||
explicit Parser(int argc, char** argv) :
|
||||
_appname(argv[0]) {
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
_arguments.push_back(argv[i]);
|
||||
}
|
||||
enable_help();
|
||||
}
|
||||
|
||||
~Parser() {
|
||||
for (int i = 0, n = _commands.size(); i < n; ++i) {
|
||||
delete _commands[i];
|
||||
}
|
||||
}
|
||||
|
||||
bool has_help() const {
|
||||
for (const auto command : _commands) {
|
||||
if (command->name == "h" && command->alternative == "--help") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void enable_help() {
|
||||
set_callback("h", "help", std::function<bool(CallbackArgs&)>([this](CallbackArgs& args){
|
||||
args.output << this->usage();
|
||||
exit(0);
|
||||
return false;
|
||||
}), "", true);
|
||||
}
|
||||
|
||||
void disable_help() {
|
||||
for (auto command = _commands.begin(); command != _commands.end(); ++command) {
|
||||
if ((*command)->name == "h" && (*command)->alternative == "--help") {
|
||||
_commands.erase(command);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void set_default(bool is_required, const std::string& description = "") {
|
||||
auto command = new CmdArgument<T> { "", "", description, is_required, false };
|
||||
_commands.push_back(command);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) {
|
||||
auto command = new CmdArgument<T> { name, alternative, description, true, dominant };
|
||||
_commands.push_back(command);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) {
|
||||
auto command = new CmdArgument<T> { name, alternative, description, false, dominant };
|
||||
command->value = defaultValue;
|
||||
_commands.push_back(command);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void set_callback(const std::string& name, const std::string& alternative, std::function<T(CallbackArgs&)> callback, const std::string& description = "", bool dominant = false) {
|
||||
auto command = new CmdFunction<T> { name, alternative, description, false, dominant };
|
||||
command->callback = callback;
|
||||
_commands.push_back(command);
|
||||
}
|
||||
|
||||
inline void run_and_exit_if_error() {
|
||||
if (run() == false) {
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
inline bool run() {
|
||||
return run(std::cout, std::cerr);
|
||||
}
|
||||
|
||||
inline bool run(std::ostream& output) {
|
||||
return run(output, std::cerr);
|
||||
}
|
||||
|
||||
bool run(std::ostream& output, std::ostream& error) {
|
||||
if (_arguments.size() > 0) {
|
||||
auto current = find_default();
|
||||
|
||||
for (int i = 0, n = _arguments.size(); i < n; ++i) {
|
||||
auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-';
|
||||
auto associated = isarg ? find(_arguments[i]) : nullptr;
|
||||
|
||||
if (associated != nullptr) {
|
||||
current = associated;
|
||||
associated->handled = true;
|
||||
} else if (current == nullptr) {
|
||||
error << no_default();
|
||||
return false;
|
||||
} else {
|
||||
current->arguments.push_back(_arguments[i]);
|
||||
current->handled = true;
|
||||
if (!current->variadic)
|
||||
{
|
||||
// If the current command is not variadic, then no more arguments
|
||||
// should be added to it. In this case, switch back to the default
|
||||
// command.
|
||||
current = find_default();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// First, parse dominant arguments since they succeed even if required
|
||||
// arguments are missing.
|
||||
for (auto command : _commands) {
|
||||
if (command->handled && command->dominant && !command->parse(output, error)) {
|
||||
error << howto_use(command);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Next, check for any missing arguments.
|
||||
for (auto command : _commands) {
|
||||
if (command->required && !command->handled) {
|
||||
error << howto_required(command);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, parse all remaining arguments.
|
||||
for (auto command : _commands) {
|
||||
if (command->handled && !command->dominant && !command->parse(output, error)) {
|
||||
error << howto_use(command);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
T get(const std::string& name) const {
|
||||
for (const auto& command : _commands) {
|
||||
if (command->name == name) {
|
||||
auto cmd = dynamic_cast<CmdArgument<T>*>(command);
|
||||
|
||||
if (cmd == nullptr) {
|
||||
throw std::runtime_error("Invalid usage of the parameter " + name + " detected.");
|
||||
}
|
||||
|
||||
return cmd->value;
|
||||
}
|
||||
}
|
||||
|
||||
throw std::runtime_error("The parameter " + name + " could not be found.");
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
T get_if(const std::string& name, std::function<T(T)> callback) const {
|
||||
auto value = get<T>(name);
|
||||
return callback(value);
|
||||
}
|
||||
|
||||
int requirements() const {
|
||||
int count = 0;
|
||||
|
||||
for (const auto& command : _commands) {
|
||||
if (command->required) {
|
||||
++count;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
int commands() const {
|
||||
return static_cast<int>(_commands.size());
|
||||
}
|
||||
|
||||
inline const std::string& app_name() const {
|
||||
return _appname;
|
||||
}
|
||||
|
||||
protected:
|
||||
CmdBase* find(const std::string& name) {
|
||||
for (auto command : _commands) {
|
||||
if (command->is(name)) {
|
||||
return command;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
CmdBase* find_default() {
|
||||
for (auto command : _commands) {
|
||||
if (command->name == "") {
|
||||
return command;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::string usage() const {
|
||||
std::stringstream ss { };
|
||||
ss << "Available parameters:\n\n";
|
||||
|
||||
for (const auto& command : _commands) {
|
||||
ss << " " << command->command << "\t" << command->alternative;
|
||||
|
||||
if (command->required == true) {
|
||||
ss << "\t(required)";
|
||||
}
|
||||
|
||||
ss << "\n " << command->description;
|
||||
|
||||
if (command->required == false) {
|
||||
ss << "\n " << "This parameter is optional. The default value is '" + command->print_value() << "'.";
|
||||
}
|
||||
|
||||
ss << "\n\n";
|
||||
}
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void print_help(std::stringstream& ss) const {
|
||||
if (has_help()) {
|
||||
ss << "For more help use --help or -h.\n";
|
||||
}
|
||||
}
|
||||
|
||||
std::string howto_required(CmdBase* command) const {
|
||||
std::stringstream ss { };
|
||||
ss << "The parameter " << command->name << " is required.\n";
|
||||
ss << command->description << '\n';
|
||||
print_help(ss);
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string howto_use(CmdBase* command) const {
|
||||
std::stringstream ss { };
|
||||
ss << "The parameter " << command->name << " has invalid arguments.\n";
|
||||
ss << command->description << '\n';
|
||||
print_help(ss);
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string no_default() const {
|
||||
std::stringstream ss { };
|
||||
ss << "No default parameter has been specified.\n";
|
||||
ss << "The given argument must be used with a parameter.\n";
|
||||
print_help(ss);
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
private:
|
||||
const std::string _appname;
|
||||
std::vector<std::string> _arguments;
|
||||
std::vector<CmdBase*> _commands;
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,417 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// Taken from: http://docs.nvidia.com/cuda/curand/device-api-overview.html#poisson-api-example
|
||||
/*
|
||||
* This program uses CURAND library for Poisson distribution
|
||||
* to simulate queues in store for 16 hours. It shows the
|
||||
* difference of using 3 different APIs:
|
||||
* - HOST API -arrival of customers is described by Poisson(4)
|
||||
* - SIMPLE DEVICE API -arrival of customers is described by
|
||||
* Poisson(4*(sin(x/100)+1)), where x is number of minutes
|
||||
* from store opening time.
|
||||
* - ROBUST DEVICE API -arrival of customers is described by:
|
||||
* - Poisson(2) for first 3 hours.
|
||||
* - Poisson(1) for second 3 hours.
|
||||
* - Poisson(3) after 6 hours.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda.h>
|
||||
// CHECK: #include <hiprand_kernel.h>
|
||||
#include <curand_kernel.h>
|
||||
// CHECK: #include <hiprand.h>
|
||||
#include <curand.h>
|
||||
|
||||
// CHECK: #define CUDA_CALL(x) do { if((x) != hipSuccess) {
|
||||
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__); \
|
||||
return EXIT_FAILURE;}} while(0)
|
||||
// CHECK: #define CURAND_CALL(x) do { if((x)!=HIPRAND_STATUS_SUCCESS) {
|
||||
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__);\
|
||||
return EXIT_FAILURE;}} while(0)
|
||||
|
||||
|
||||
#define HOURS 16
|
||||
#define OPENING_HOUR 7
|
||||
#define CLOSING_HOUR (OPENING_HOUR + HOURS)
|
||||
|
||||
#define access_2D(type, ptr, row, column, pitch)\
|
||||
*((type*)((char*)ptr + (row) * pitch) + column)
|
||||
|
||||
enum API_TYPE {
|
||||
HOST_API = 0,
|
||||
SIMPLE_DEVICE_API = 1,
|
||||
ROBUST_DEVICE_API = 2,
|
||||
};
|
||||
|
||||
/* global variables */
|
||||
API_TYPE api;
|
||||
int report_break;
|
||||
int cashiers_load_h[HOURS];
|
||||
__constant__ int cashiers_load[HOURS];
|
||||
// CHECK: __global__ void setup_kernel(hiprandState_t *state)
|
||||
__global__ void setup_kernel(curandState *state)
|
||||
{
|
||||
int id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
/* Each thread gets same seed, a different sequence
|
||||
number, no offset */
|
||||
// CHECK: hiprand_init(1234, id, 0, &state[id]);
|
||||
curand_init(1234, id, 0, &state[id]);
|
||||
}
|
||||
|
||||
__inline__ __device__
|
||||
void update_queue(int id, int min, unsigned int new_customers,
|
||||
unsigned int &queue_length,
|
||||
unsigned int *queue_lengths, size_t pitch)
|
||||
{
|
||||
int balance;
|
||||
balance = new_customers - 2 * cashiers_load[(min-1)/60];
|
||||
if (balance + (int)queue_length <= 0){
|
||||
queue_length = 0;
|
||||
}else{
|
||||
queue_length += balance;
|
||||
}
|
||||
/* Store results */
|
||||
access_2D(unsigned int, queue_lengths, min-1, id, pitch)
|
||||
= queue_length;
|
||||
}
|
||||
|
||||
// CHECK: __global__ void simple_device_API_kernel(hiprandState_t *state,
|
||||
__global__ void simple_device_API_kernel(curandState *state,
|
||||
unsigned int *queue_lengths, size_t pitch)
|
||||
{
|
||||
int id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
unsigned int new_customers;
|
||||
unsigned int queue_length = 0;
|
||||
/* Copy state to local memory for efficiency */
|
||||
// CHECK: hiprandState_t localState = state[id];
|
||||
curandState localState = state[id];
|
||||
/* Simulate queue in time */
|
||||
for(int min = 1; min <= 60 * HOURS; min++) {
|
||||
/* Draw number of new customers depending on API */
|
||||
// CHECK: new_customers = hiprand_poisson(&localState,
|
||||
new_customers = curand_poisson(&localState,
|
||||
4*(sin((float)min/100.0)+1));
|
||||
/* Update queue */
|
||||
update_queue(id, min, new_customers, queue_length,
|
||||
queue_lengths, pitch);
|
||||
}
|
||||
/* Copy state back to global memory */
|
||||
state[id] = localState;
|
||||
}
|
||||
|
||||
|
||||
__global__ void host_API_kernel(unsigned int *poisson_numbers,
|
||||
unsigned int *queue_lengths, size_t pitch)
|
||||
{
|
||||
int id = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
unsigned int new_customers;
|
||||
unsigned int queue_length = 0;
|
||||
/* Simulate queue in time */
|
||||
for(int min = 1; min <= 60 * HOURS; min++) {
|
||||
/* Get random number from global memory */
|
||||
new_customers = poisson_numbers
|
||||
[blockDim.x * gridDim.x * (min -1) + id];
|
||||
/* Update queue */
|
||||
update_queue(id, min, new_customers, queue_length,
|
||||
queue_lengths, pitch);
|
||||
}
|
||||
}
|
||||
// CHECK: __global__ void robust_device_API_kernel(hiprandState_t *state,
|
||||
// CHECK: hiprandDiscreteDistribution_t poisson_1,
|
||||
// CHECK: hiprandDiscreteDistribution_t poisson_2,
|
||||
// CHECK: hiprandDiscreteDistribution_t poisson_3,
|
||||
__global__ void robust_device_API_kernel(curandState *state,
|
||||
curandDiscreteDistribution_t poisson_1,
|
||||
curandDiscreteDistribution_t poisson_2,
|
||||
curandDiscreteDistribution_t poisson_3,
|
||||
unsigned int *queue_lengths, size_t pitch)
|
||||
{
|
||||
int id = threadIdx.x + blockIdx.x * 64;
|
||||
unsigned int new_customers;
|
||||
unsigned int queue_length = 0;
|
||||
/* Copy state to local memory for efficiency */
|
||||
// CHECK: hiprandState_t localState = state[id];
|
||||
curandState localState = state[id];
|
||||
/* Simulate queue in time */
|
||||
/* first 3 hours */
|
||||
for(int min = 1; min <= 60 * 3; min++) {
|
||||
/* draw number of new customers depending on API */
|
||||
new_customers =
|
||||
// CHECK: hiprand_discrete(&localState, poisson_2);
|
||||
curand_discrete(&localState, poisson_2);
|
||||
/* Update queue */
|
||||
update_queue(id, min, new_customers, queue_length,
|
||||
queue_lengths, pitch);
|
||||
}
|
||||
/* second 3 hours */
|
||||
for(int min = 60 * 3 + 1; min <= 60 * 6; min++) {
|
||||
/* draw number of new customers depending on API */
|
||||
new_customers =
|
||||
// CHECK: hiprand_discrete(&localState, poisson_1);
|
||||
curand_discrete(&localState, poisson_1);
|
||||
/* Update queue */
|
||||
update_queue(id, min, new_customers, queue_length,
|
||||
queue_lengths, pitch);
|
||||
}
|
||||
/* after 6 hours */
|
||||
for(int min = 60 * 6 + 1; min <= 60 * HOURS; min++) {
|
||||
/* draw number of new customers depending on API */
|
||||
new_customers =
|
||||
// CHECK: hiprand_discrete(&localState, poisson_3);
|
||||
curand_discrete(&localState, poisson_3);
|
||||
/* Update queue */
|
||||
update_queue(id, min, new_customers, queue_length,
|
||||
queue_lengths, pitch);
|
||||
}
|
||||
/* Copy state back to global memory */
|
||||
state[id] = localState;
|
||||
}
|
||||
|
||||
/* Set time intervals between reports */
|
||||
void report_settings()
|
||||
{
|
||||
do{
|
||||
printf("Set time intervals between queue reports");
|
||||
printf("(in minutes > 0)\n");
|
||||
if (scanf("%d", &report_break) == 0) continue;
|
||||
}while(report_break <= 0);
|
||||
}
|
||||
|
||||
|
||||
/* Set number of cashiers each hour */
|
||||
void add_cachiers(int *cashiers_load)
|
||||
{
|
||||
int i, min, max, begin, end;
|
||||
printf("Cashier serves 2 customers per minute...\n");
|
||||
for (i = 0; i < HOURS; i++){
|
||||
cashiers_load_h[i] = 0;
|
||||
}
|
||||
while (true){
|
||||
printf("Adding cashier...\n");
|
||||
min = OPENING_HOUR;
|
||||
max = CLOSING_HOUR-1;
|
||||
do{
|
||||
printf("Set hour that cahier comes (%d-%d)",
|
||||
min, max);
|
||||
printf(" [type 0 to finish adding cashiers]\n");
|
||||
if (scanf("%d", &begin) == 0) continue;
|
||||
}while (begin > max || (begin < min && begin != 0));
|
||||
if (begin == 0) break;
|
||||
min = begin+1;
|
||||
max = CLOSING_HOUR;
|
||||
do{
|
||||
printf("Set hour that cahier leaves (%d-%d)",
|
||||
min, max);
|
||||
printf(" [type 0 to finish adding cashiers]\n");
|
||||
if (scanf("%d", &end) == 0) continue;
|
||||
}while (end > max || (end < min && end != 0));
|
||||
if (end == 0) break;
|
||||
for (i = begin - OPENING_HOUR;
|
||||
i < end - OPENING_HOUR; i++){
|
||||
cashiers_load_h[i]++;
|
||||
}
|
||||
}
|
||||
for (i = OPENING_HOUR; i < CLOSING_HOUR; i++){
|
||||
printf("\n%2d:00 - %2d:00 %d cashier",
|
||||
i, i+1, cashiers_load_h[i-OPENING_HOUR]);
|
||||
if (cashiers_load[i-OPENING_HOUR] != 1) printf("s");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
/* Set API type */
|
||||
API_TYPE set_API_type()
|
||||
{
|
||||
printf("Choose API type:\n");
|
||||
int choose;
|
||||
do{
|
||||
printf("type 1 for HOST API\n");
|
||||
printf("type 2 for SIMPLE DEVICE API\n");
|
||||
printf("type 3 for ROBUST DEVICE API\n");
|
||||
if (scanf("%d", &choose) == 0) continue;
|
||||
}while( choose < 1 || choose > 3);
|
||||
switch(choose){
|
||||
case 1: return HOST_API;
|
||||
case 2: return SIMPLE_DEVICE_API;
|
||||
case 3: return ROBUST_DEVICE_API;
|
||||
default:
|
||||
fprintf(stderr, "wrong API\n");
|
||||
return HOST_API;
|
||||
}
|
||||
}
|
||||
|
||||
void settings()
|
||||
{
|
||||
add_cachiers(cashiers_load);
|
||||
// CHECK: hipMemcpyToSymbol("cashiers_load", cashiers_load_h,
|
||||
// CHECK: HOURS * sizeof(int), 0, hipMemcpyHostToDevice);
|
||||
cudaMemcpyToSymbol("cashiers_load", cashiers_load_h,
|
||||
HOURS * sizeof(int), 0, cudaMemcpyHostToDevice);
|
||||
report_settings();
|
||||
api = set_API_type();
|
||||
}
|
||||
|
||||
void print_statistics(unsigned int *hostResults, size_t pitch)
|
||||
{
|
||||
int min, i, hour, minute;
|
||||
unsigned int sum;
|
||||
for(min = report_break; min <= 60 * HOURS;
|
||||
min += report_break) {
|
||||
sum = 0;
|
||||
for(i = 0; i < 64 * 64; i++) {
|
||||
sum += access_2D(unsigned int, hostResults,
|
||||
min-1, i, pitch);
|
||||
}
|
||||
hour = OPENING_HOUR + min/60;
|
||||
minute = min%60;
|
||||
printf("%2d:%02d # of waiting customers = %10.4g |",
|
||||
hour, minute, (float)sum/(64.0 * 64.0));
|
||||
printf(" # of cashiers = %d | ",
|
||||
cashiers_load_h[(min-1)/60]);
|
||||
printf("# of new customers/min ~= ");
|
||||
switch (api){
|
||||
case HOST_API:
|
||||
printf("%2.2f\n", 4.0);
|
||||
break;
|
||||
case SIMPLE_DEVICE_API:
|
||||
printf("%2.2f\n",
|
||||
4*(sin((float)min/100.0)+1));
|
||||
break;
|
||||
case ROBUST_DEVICE_API:
|
||||
if (min <= 3 * 60){
|
||||
printf("%2.2f\n", 2.0);
|
||||
}else{
|
||||
if (min <= 6 * 60){
|
||||
printf("%2.2f\n", 1.0);
|
||||
}else{
|
||||
printf("%2.2f\n", 3.0);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Wrong API\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int n;
|
||||
size_t pitch;
|
||||
// CHECK: hiprandState_t *devStates;
|
||||
curandState *devStates;
|
||||
unsigned int *devResults, *hostResults;
|
||||
unsigned int *poisson_numbers_d;
|
||||
// CHECK: hiprandDiscreteDistribution_t poisson_1, poisson_2;
|
||||
// CHECK: hiprandDiscreteDistribution_t poisson_3;
|
||||
// CHECK: hiprandGenerator_t gen;
|
||||
curandDiscreteDistribution_t poisson_1, poisson_2;
|
||||
curandDiscreteDistribution_t poisson_3;
|
||||
curandGenerator_t gen;
|
||||
|
||||
/* Setting cashiers, report and API */
|
||||
settings();
|
||||
|
||||
/* Allocate space for results on device */
|
||||
// CHECK: CUDA_CALL(hipMallocPitch((void **)&devResults, &pitch,
|
||||
CUDA_CALL(cudaMallocPitch((void **)&devResults, &pitch,
|
||||
64 * 64 * sizeof(unsigned int), 60 * HOURS));
|
||||
|
||||
/* Allocate space for results on host */
|
||||
hostResults = (unsigned int *)calloc(pitch * 60 * HOURS,
|
||||
sizeof(unsigned int));
|
||||
|
||||
/* Allocate space for prng states on device */
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&devStates, 64 * 64 *
|
||||
// CHECK: sizeof(hiprandState_t)));
|
||||
CUDA_CALL(cudaMalloc((void **)&devStates, 64 * 64 *
|
||||
sizeof(curandState)));
|
||||
|
||||
/* Setup prng states */
|
||||
if (api != HOST_API){
|
||||
// CHECK: hipLaunchKernelGGL(setup_kernel, dim3(64), dim3(64), 0, 0, devStates);
|
||||
setup_kernel<<<64, 64>>>(devStates);
|
||||
}
|
||||
/* Simulate queue */
|
||||
switch (api){
|
||||
case HOST_API:
|
||||
/* Create pseudo-random number generator */
|
||||
// CHECK: CURAND_CALL(hiprandCreateGenerator(&gen,
|
||||
// CHECK: HIPRAND_RNG_PSEUDO_DEFAULT));
|
||||
CURAND_CALL(curandCreateGenerator(&gen,
|
||||
CURAND_RNG_PSEUDO_DEFAULT));
|
||||
/* Set seed */
|
||||
// CHECK: CURAND_CALL(hiprandSetPseudoRandomGeneratorSeed(
|
||||
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
|
||||
gen, 1234ULL));
|
||||
/* compute n */
|
||||
n = 64 * 64 * HOURS * 60;
|
||||
/* Allocate n unsigned ints on device */
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&poisson_numbers_d,
|
||||
CUDA_CALL(cudaMalloc((void **)&poisson_numbers_d,
|
||||
n * sizeof(unsigned int)));
|
||||
/* Generate n unsigned ints on device */
|
||||
// CHECK: CURAND_CALL(hiprandGeneratePoisson(gen,
|
||||
CURAND_CALL(curandGeneratePoisson(gen,
|
||||
poisson_numbers_d, n, 4.0));
|
||||
// CHECK: hipLaunchKernelGGL(host_API_kernel, dim3(64), dim3(64), 0, 0, poisson_numbers_d,
|
||||
host_API_kernel<<<64, 64>>>(poisson_numbers_d,
|
||||
devResults, pitch);
|
||||
/* Cleanup */
|
||||
// CHECK: CURAND_CALL(hiprandDestroyGenerator(gen));
|
||||
CURAND_CALL(curandDestroyGenerator(gen));
|
||||
break;
|
||||
case SIMPLE_DEVICE_API:
|
||||
// CHECK: hipLaunchKernelGGL(simple_device_API_kernel, dim3(64), dim3(64), 0, 0, devStates,
|
||||
simple_device_API_kernel<<<64, 64>>>(devStates,
|
||||
devResults, pitch);
|
||||
break;
|
||||
case ROBUST_DEVICE_API:
|
||||
/* Create histograms for Poisson(1) */
|
||||
// CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(1.0,
|
||||
CURAND_CALL(curandCreatePoissonDistribution(1.0,
|
||||
&poisson_1));
|
||||
/* Create histograms for Poisson(2) */
|
||||
// CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(2.0,
|
||||
CURAND_CALL(curandCreatePoissonDistribution(2.0,
|
||||
&poisson_2));
|
||||
/* Create histograms for Poisson(3) */
|
||||
// CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(3.0,
|
||||
CURAND_CALL(curandCreatePoissonDistribution(3.0,
|
||||
&poisson_3));
|
||||
// CHECK: hipLaunchKernelGGL(robust_device_API_kernel, dim3(64), dim3(64), 0, 0, devStates,
|
||||
robust_device_API_kernel<<<64, 64>>>(devStates,
|
||||
poisson_1, poisson_2, poisson_3,
|
||||
devResults, pitch);
|
||||
/* Cleanup */
|
||||
// CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_1));
|
||||
// CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_2));
|
||||
// CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_3));
|
||||
CURAND_CALL(curandDestroyDistribution(poisson_1));
|
||||
CURAND_CALL(curandDestroyDistribution(poisson_2));
|
||||
CURAND_CALL(curandDestroyDistribution(poisson_3));
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Wrong API\n");
|
||||
}
|
||||
/* Copy device memory to host */
|
||||
// CHECK: CUDA_CALL(hipMemcpy2D(hostResults, pitch, devResults,
|
||||
// CHECK: 60 * HOURS, hipMemcpyDeviceToHost));
|
||||
CUDA_CALL(cudaMemcpy2D(hostResults, pitch, devResults,
|
||||
pitch, 64 * 64 * sizeof(unsigned int),
|
||||
60 * HOURS, cudaMemcpyDeviceToHost));
|
||||
/* Show result */
|
||||
print_statistics(hostResults, pitch);
|
||||
/* Cleanup */
|
||||
// CHECK: CUDA_CALL(hipFree(devStates));
|
||||
// CHECK: CUDA_CALL(hipFree(devResults));
|
||||
CUDA_CALL(cudaFree(devStates));
|
||||
CUDA_CALL(cudaFree(devResults));
|
||||
free(hostResults);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
@@ -22,7 +22,12 @@ THE SOFTWARE.
|
||||
#include<cuda.h>
|
||||
#include<cuda_runtime.h>
|
||||
#include<iostream>
|
||||
#include<unistd.h>
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#define sleep(x) Sleep(x)
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include<stdio.h>
|
||||
#include<malloc.h>
|
||||
|
||||
@@ -33,7 +38,7 @@ THE SOFTWARE.
|
||||
// CHECK: if(status != hipSuccess) {
|
||||
#define check(msg, status){ \
|
||||
if(status != cudaSuccess) { \
|
||||
printf("%s failed. \n", #msg); \
|
||||
printf("%s failed. \n", #msg); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
// CHECK-NOT: #include <cuda_runtime.h>
|
||||
// CHECK: #include <stdio.h>
|
||||
#include <cuda.h>
|
||||
// CHECK-NOT: #include<cuda_runtime.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda_runtime.h>
|
||||
// CHECK-NOT: #include<cuda.h>
|
||||
#include <cuda.h>
|
||||
// CHECK: #include "hip/hip_runtime.h"
|
||||
// CHECK-NOT: #include "cuda_runtime.h"
|
||||
// CHECK: #include <stdio.h>
|
||||
#include "cuda.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include <stdio.h>
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// CHECK: #include <hipblas.h>
|
||||
// CHECK-NOT: #include <cublas_v2.h>
|
||||
// CHECK: #include <stdio.h>
|
||||
#include <cublas.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <stdio.h>
|
||||
@@ -0,0 +1,8 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// CHECK: #include "hipblas.h"
|
||||
// CHECK-NOT: #include "cublas.h"
|
||||
// CHECK: #include <stdio.h>
|
||||
#include "cublas_v2.h"
|
||||
#include "cublas.h"
|
||||
#include <stdio.h>
|
||||
@@ -0,0 +1,14 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
// CHECK-NOT: #include <cuda_runtime.h>
|
||||
// CHECK: #include <iostream>
|
||||
// CHECK: #include "hipblas.h"
|
||||
// CHECK-NOT: #include "cublas.h"
|
||||
// CHECK: #include <stdio.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
#include "cublas_v2.h"
|
||||
#include "cublas.h"
|
||||
#include <stdio.h>
|
||||
@@ -0,0 +1,93 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
// CHECK: #include <memory>
|
||||
|
||||
// CHECK-NOT: #include <cuda_runtime.h>
|
||||
// CHECK-NOT: #include <hip/hip_runtime.h>
|
||||
|
||||
// CHECK: #include "hip/hip_runtime_api.h"
|
||||
// CHECK: #include "hip/channel_descriptor.h"
|
||||
// CHECK: #include "hip/device_functions.h"
|
||||
// CHECK: #include "hip/driver_types.h"
|
||||
// CHECK: #include "hip/hip_complex.h"
|
||||
// CHECK: #include "hip/hip_fp16.h"
|
||||
// CHECK: #include "hip/hip_texture_types.h"
|
||||
// CHECK: #include "hip/hip_vector_types.h"
|
||||
|
||||
// CHECK: #include <iostream>
|
||||
|
||||
// CHECK: #include "hipblas.h"
|
||||
// CHECK-NOT: #include "cublas.h"
|
||||
|
||||
// CHECK: #include <stdio.h>
|
||||
|
||||
// CHECK: #include "hiprand.h"
|
||||
// CHECK: #include "hiprand_kernel.h"
|
||||
|
||||
// CHECK: #include <algorithm>
|
||||
|
||||
// CHECK-NOT: #include "hiprand.h"
|
||||
// CHECK-NOT: #include "hiprand_kernel.h"
|
||||
// CHECK-NOT: #include "curand_discrete.h"
|
||||
// CHECK-NOT: #include "curand_discrete2.h"
|
||||
// CHECK-NOT: #include "curand_globals.h"
|
||||
// CHECK-NOT: #include "curand_lognormal.h"
|
||||
// CHECK-NOT: #include "curand_mrg32k3a.h"
|
||||
// CHECK-NOT: #include "curand_mtgp32.h"
|
||||
// CHECK-NOT: #include "curand_mtgp32_host.h"
|
||||
// CHECK-NOT: #include "curand_mtgp32_kernel.h"
|
||||
// CHECK-NOT: #include "curand_mtgp32dc_p_11213.h"
|
||||
// CHECK-NOT: #include "curand_normal.h"
|
||||
// CHECK-NOT: #include "curand_normal_static.h"
|
||||
// CHECK-NOT: #include "curand_philox4x32_x.h"
|
||||
// CHECK-NOT: #include "curand_poisson.h"
|
||||
// CHECK-NOT: #include "curand_precalc.h"
|
||||
// CHECK-NOT: #include "curand_uniform.h"
|
||||
|
||||
// CHECK: #include <string>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "cuda_runtime_api.h"
|
||||
#include "channel_descriptor.h"
|
||||
#include "device_functions.h"
|
||||
#include "driver_types.h"
|
||||
#include "cuComplex.h"
|
||||
#include "cuda_fp16.h"
|
||||
#include "cuda_texture_types.h"
|
||||
#include "vector_types.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cublas_v2.h"
|
||||
#include "cublas.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "curand.h"
|
||||
#include "curand_kernel.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "curand_discrete.h"
|
||||
#include "curand_discrete2.h"
|
||||
#include "curand_globals.h"
|
||||
#include "curand_lognormal.h"
|
||||
#include "curand_mrg32k3a.h"
|
||||
#include "curand_mtgp32.h"
|
||||
#include "curand_mtgp32_host.h"
|
||||
#include "curand_mtgp32_kernel.h"
|
||||
#include "curand_mtgp32dc_p_11213.h"
|
||||
#include "curand_normal.h"
|
||||
#include "curand_normal_static.h"
|
||||
#include "curand_philox4x32_x.h"
|
||||
#include "curand_poisson.h"
|
||||
#include "curand_precalc.h"
|
||||
#include "curand_uniform.h"
|
||||
|
||||
#include <string>
|
||||
@@ -0,0 +1,174 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <cuda.h>
|
||||
|
||||
#define K_THREADS 64
|
||||
#define K_INDEX() ((gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x)
|
||||
#define RND() ((rand() & 0x7FFF) / float(0x8000))
|
||||
#define ERRORCHECK() cErrorCheck(__FILE__, __LINE__)
|
||||
|
||||
// CHECK: hipEvent_t t##_start, t##_end; \
|
||||
// CHECK: hipEventCreate(&t##_start); \
|
||||
// CHECK: hipEventCreate(&t##_end);
|
||||
#define TIMER_CREATE(t) \
|
||||
cudaEvent_t t##_start, t##_end; \
|
||||
cudaEventCreate(&t##_start); \
|
||||
cudaEventCreate(&t##_end);
|
||||
|
||||
// CHECK: hipEventRecord(t##_start); \
|
||||
// CHECK: hipEventSynchronize(t##_start);
|
||||
#define TIMER_START(t) \
|
||||
cudaEventRecord(t##_start); \
|
||||
cudaEventSynchronize(t##_start); \
|
||||
|
||||
// CHECK: hipEventRecord(t##_start); \
|
||||
// CHECK: hipEventSynchronize(t##_start); \
|
||||
// CHECK: hipEventRecord(t##_end); \
|
||||
// CHECK: hipEventSynchronize(t##_end); \
|
||||
// CHECK: hipEventElapsedTime(&t, t##_start, t##_end);
|
||||
#define TIMER_END(t) \
|
||||
cudaEventRecord(t##_start); \
|
||||
cudaEventSynchronize(t##_start); \
|
||||
cudaEventRecord(t##_end); \
|
||||
cudaEventSynchronize(t##_end); \
|
||||
cudaEventElapsedTime(&t, t##_start, t##_end);
|
||||
|
||||
|
||||
inline void cErrorCheck(const char *file, int line) {
|
||||
// CHECK: hipDeviceSynchronize();
|
||||
// CHECK: hipError_t err = hipGetLastError();
|
||||
// CHECK: if (err != hipSuccess) {
|
||||
// CHECK: printf("Error: %s\n", hipGetErrorString(err));
|
||||
cudaThreadSynchronize();
|
||||
cudaError_t err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
printf("Error: %s\n", cudaGetErrorString(err));
|
||||
printf(" @ %s: %d\n", file, line);
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
inline dim3 K_GRID(int n, int threads = K_THREADS) {
|
||||
int blocks = (int)ceilf(sqrtf((float)n/threads));
|
||||
dim3 grid(blocks, blocks);
|
||||
return grid;
|
||||
}
|
||||
|
||||
typedef struct data {
|
||||
int n;
|
||||
float4 *r, *v, *f;
|
||||
} data;
|
||||
|
||||
data cpu, gpu;
|
||||
|
||||
#define N 20
|
||||
|
||||
__global__ void repulsion(data gpu);
|
||||
__global__ void integration(data gpu);
|
||||
|
||||
|
||||
int main() {
|
||||
printf("Cuda Test 1\n");
|
||||
|
||||
int count = 0;
|
||||
// CHECK: hipGetDeviceCount(&count);
|
||||
cudaGetDeviceCount(&count);
|
||||
printf(" %d CUDA devices found\n", count);
|
||||
if(!count) {
|
||||
::exit(EXIT_FAILURE);
|
||||
}
|
||||
// CHECK: hipFree(0);
|
||||
cudaFree(0);
|
||||
|
||||
cpu.n = N;
|
||||
|
||||
cpu.r = (float4*)malloc(N * sizeof(float4));
|
||||
cpu.v = (float4*)malloc(N * sizeof(float4));
|
||||
cpu.f = (float4*)malloc(N * sizeof(float4));
|
||||
|
||||
for(int i = 0; i < N; ++i) {
|
||||
cpu.v[i] = make_float4(0,0,0,0);
|
||||
cpu.r[i] = make_float4(RND(), RND(), RND(), 0);
|
||||
cpu.f[i] = make_float4(0,0.01,0,0);
|
||||
}
|
||||
|
||||
gpu = cpu;
|
||||
// CHECK: hipMalloc(&gpu.r, N * sizeof(float4));
|
||||
// CHECK: hipMalloc(&gpu.v, N * sizeof(float4));
|
||||
// CHECK: hipMalloc(&gpu.f, N * sizeof(float4));
|
||||
cudaMalloc(&gpu.r, N * sizeof(float4));
|
||||
cudaMalloc(&gpu.v, N * sizeof(float4));
|
||||
cudaMalloc(&gpu.f, N * sizeof(float4));
|
||||
// CHECK: hipMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
|
||||
// CHECK: hipMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
|
||||
// CHECK: hipMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
|
||||
cudaMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
|
||||
|
||||
ERRORCHECK();
|
||||
float rep;
|
||||
TIMER_CREATE(rep);
|
||||
TIMER_START(rep);
|
||||
// CHECK: hipLaunchKernelGGL(integration, dim3(K_GRID(cpu.n)), dim3(K_THREADS), 0, 0, gpu);
|
||||
integration <<< K_GRID(cpu.n), K_THREADS >>>(gpu);
|
||||
|
||||
TIMER_END(rep);
|
||||
printf("Took: %f ms\n", rep);
|
||||
ERRORCHECK();
|
||||
// CHECK: hipMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
|
||||
// CHECK: hipMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
|
||||
// CHECK: hipMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
|
||||
// CHECK: hipHostFree(cpu.r);
|
||||
// CHECK: hipHostFree(cpu.v);
|
||||
// CHECK: hipHostFree(cpu.f);
|
||||
cudaFreeHost(cpu.r);
|
||||
cudaFreeHost(cpu.v);
|
||||
cudaFreeHost(cpu.f);
|
||||
// CHECK: hipFree(gpu.r);
|
||||
// CHECK: hipFree(gpu.v);
|
||||
// CHECK: hipFree(gpu.f);
|
||||
cudaFree(gpu.r);
|
||||
cudaFree(gpu.v);
|
||||
cudaFree(gpu.f);
|
||||
// CHECK: hipDeviceReset();
|
||||
cudaDeviceReset();
|
||||
|
||||
printf("Results: \n");
|
||||
for(int i = 0; i < N; ++i) {
|
||||
printf("%f, %f, %f \n", cpu.r[i].x, cpu.r[i].y, cpu.r[i].z);
|
||||
}
|
||||
|
||||
printf("Ready...\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
__global__ void repulsion(data gpu) {
|
||||
int idx = K_INDEX();
|
||||
if(idx < N) {
|
||||
gpu.r[idx].x = 1;
|
||||
gpu.r[idx].y = 1;
|
||||
gpu.r[idx].z = 1;
|
||||
}
|
||||
}
|
||||
|
||||
#define MULT4(v, s) v.x *= s; v.y *= s; v.z *= s; v.w *= s;
|
||||
#define ADD4(v1, v2) v1.x += v2.x; v1.y += v2.y; v1.z += v2.z; v1.w += v2.w;
|
||||
|
||||
__global__ void integration(data gpu) {
|
||||
int i = K_INDEX();
|
||||
if(i < N) {
|
||||
MULT4(gpu.f[i], 0.01);
|
||||
MULT4(gpu.v[i], 0.01);
|
||||
ADD4(gpu.v[i], gpu.f[i]);
|
||||
ADD4(gpu.r[i], gpu.v[i]);
|
||||
gpu.f[i] = make_float4(0,0,0,0);
|
||||
}
|
||||
}
|
||||
@@ -21,6 +21,8 @@ config.test_format = lit.formats.ShTest()
|
||||
# test_source_root: The root path where tests are located.
|
||||
config.test_source_root = os.path.dirname(__file__)
|
||||
|
||||
config.excludes = ['cmdparser.hpp']
|
||||
|
||||
# test_exec_root: The path where tests are located (default is the test suite root).
|
||||
#config.test_exec_root = config.test_source_root
|
||||
|
||||
|
||||
@@ -14,5 +14,6 @@ set clang_args=%4%clang_args%
|
||||
|
||||
%HIPIFY% -o=%TMP_FILE% %IN_FILE% -- %clang_args%
|
||||
if errorlevel 1 (echo Error: hipify-clang.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
|
||||
%FILE_CHECK% %IN_FILE% -input-file=%TMP_FILE%
|
||||
|
||||
findstr /v /r /c:"[ ]*//[ ]*[CHECK*|RUN]" %TMP_FILE% | %FILE_CHECK% %IN_FILE%
|
||||
if errorlevel 1 (echo Error: FileCheck.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
// Kernel definition
|
||||
__global__ void vecAdd(float* A, float* B, float* C)
|
||||
{
|
||||
int i = threadIdx.x;
|
||||
A[i] = 0;
|
||||
B[i] = i;
|
||||
C[i] = A[i] + B[i];
|
||||
}
|
||||
// CHECK: #include <hip/hip_runtime.h>
|
||||
#include <stdio.h>
|
||||
#define SIZE 10
|
||||
#define KERNELINVOKES 5000000
|
||||
int vecadd(int gpudevice, int rank)
|
||||
{
|
||||
int devcheck(int, int);
|
||||
devcheck(gpudevice, rank);
|
||||
float A[SIZE], B[SIZE], C[SIZE];
|
||||
// Kernel invocation
|
||||
float *devPtrA;
|
||||
float *devPtrB;
|
||||
float *devPtrC;
|
||||
int memsize = SIZE * sizeof(float);
|
||||
// CHECK: hipMalloc((void**)&devPtrA, memsize);
|
||||
// CHECK: hipMalloc((void**)&devPtrB, memsize);
|
||||
// CHECK: hipMalloc((void**)&devPtrC, memsize);
|
||||
cudaMalloc((void**)&devPtrA, memsize);
|
||||
cudaMalloc((void**)&devPtrB, memsize);
|
||||
cudaMalloc((void**)&devPtrC, memsize);
|
||||
// CHECK: hipMemcpy(devPtrA, A, memsize, hipMemcpyHostToDevice);
|
||||
// CHECK: hipMemcpy(devPtrB, B, memsize, hipMemcpyHostToDevice);
|
||||
cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
|
||||
for (int i = 0; i<KERNELINVOKES; i++)
|
||||
{
|
||||
// CHECK: hipLaunchKernelGGL(vecAdd, dim3(1), dim3(gpudevice), 0, 0, devPtrA, devPtrB, devPtrC);
|
||||
vecAdd <<< 1, gpudevice >>>(devPtrA, devPtrB, devPtrC);
|
||||
}
|
||||
// CHECK: hipMemcpy(C, devPtrC, memsize, hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
|
||||
// calculate only up to gpudevice to show the unique output
|
||||
// of each rank's kernel launch
|
||||
for (int i = 0; i<gpudevice; i++)
|
||||
printf("rank %d: C[%d]=%f\n", rank, i, C[i]);
|
||||
// CHECK: hipFree(devPtrA);
|
||||
// CHECK: hipFree(devPtrA);
|
||||
// CHECK: hipFree(devPtrA);
|
||||
cudaFree(devPtrA);
|
||||
cudaFree(devPtrA);
|
||||
cudaFree(devPtrA);
|
||||
}
|
||||
int devcheck(int gpudevice, int rank)
|
||||
{
|
||||
int device_count = 0;
|
||||
int device; // used with cudaGetDevice() to verify cudaSetDevice()
|
||||
// CHECK: hipGetDeviceCount(&device_count);
|
||||
cudaGetDeviceCount(&device_count);
|
||||
if (gpudevice >= device_count)
|
||||
{
|
||||
printf("gpudevice >= device_count ... exiting\n");
|
||||
exit(1);
|
||||
}
|
||||
// CHECK: hipError_t cudareturn;
|
||||
// CHECK: hipDeviceProp_t deviceProp;
|
||||
// CHECK: hipGetDeviceProperties(&deviceProp, gpudevice);
|
||||
cudaError_t cudareturn;
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, gpudevice);
|
||||
// CHECK: if (deviceProp.hipWarpSize <= 1)
|
||||
if (deviceProp.warpSize <= 1)
|
||||
{
|
||||
printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
|
||||
exit(1);
|
||||
}
|
||||
// CHECK: cudareturn = hipSetDevice(gpudevice);
|
||||
cudareturn = cudaSetDevice(gpudevice);
|
||||
// CHECK: if (cudareturn == hipErrorInvalidDevice)
|
||||
if (cudareturn == cudaErrorInvalidDevice)
|
||||
{
|
||||
// CHECK: perror("hipSetDevice returned hipErrorInvalidDevice");
|
||||
perror("cudaSetDevice returned cudaErrorInvalidDevice");
|
||||
}
|
||||
else
|
||||
{
|
||||
// CHECK: hipGetDevice(&device);
|
||||
cudaGetDevice(&device);
|
||||
printf("rank %d: cudaGetDevice()=%d\n", rank, device);
|
||||
}
|
||||
}
|
||||
@@ -18,13 +18,14 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
// Test under-development. Call hipStreamAddCallback function and see if it works as expected.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
@@ -32,32 +33,61 @@ THE SOFTWARE.
|
||||
#define HIPRT_CB
|
||||
#endif
|
||||
|
||||
class CallbackClass
|
||||
__global__ void vector_square(float *C_d, float *A_d, size_t N)
|
||||
{
|
||||
public:
|
||||
static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData);
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
private:
|
||||
void callbackFunc(hipError_t status);
|
||||
};
|
||||
|
||||
void HIPRT_CB CallbackClass::Callback(hipStream_t stream, hipError_t status, void *userData)
|
||||
{
|
||||
CallbackClass* obj = (CallbackClass*) userData;
|
||||
obj->callbackFunc(status);
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
void CallbackClass::callbackFunc(hipError_t status)
|
||||
float *A_h, *C_h;
|
||||
bool cbDone = false;
|
||||
|
||||
static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData)
|
||||
{
|
||||
HIPASSERT(status==hipSuccess);
|
||||
for (size_t i=0; i<N; i++) {
|
||||
if (C_h[i] != A_h[i] * A_h[i]) {
|
||||
warn("Data mismatch %zu", i);
|
||||
}
|
||||
}
|
||||
printf ("PASSED!\n");
|
||||
cbDone = true;
|
||||
}
|
||||
|
||||
int main(){
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
float *A_d, *C_d;
|
||||
size_t Nbytes = N * sizeof(float);
|
||||
|
||||
A_h = (float*)malloc(Nbytes);
|
||||
HIPCHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
C_h = (float*)malloc(Nbytes);
|
||||
HIPCHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
|
||||
// Fill with Phi + i
|
||||
for (size_t i=0; i<N; i++)
|
||||
{
|
||||
A_h[i] = 1.618f + i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipMalloc(&A_d, Nbytes));
|
||||
HIPCHECK(hipMalloc(&C_d, Nbytes));
|
||||
|
||||
hipStream_t mystream;
|
||||
HIPCHECK(hipStreamCreate(&mystream));
|
||||
CallbackClass* obj = new CallbackClass;
|
||||
HIPCHECK(hipStreamAddCallback(mystream, CallbackClass::Callback, obj, 0));
|
||||
HIPCHECK(hipStreamAddCallback(NULL, CallbackClass::Callback, obj, 0));
|
||||
HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
|
||||
|
||||
passed();
|
||||
HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
|
||||
|
||||
const unsigned blocks = 512;
|
||||
const unsigned threadsPerBlock = 256;
|
||||
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, N);
|
||||
|
||||
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
|
||||
HIPCHECK(hipStreamAddCallback(mystream, Callback, NULL, 0));
|
||||
|
||||
while(!cbDone)
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user