Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP into feature_purge_genco

This commit is contained in:
Alex Voicu
2018-01-17 14:02:19 +00:00
35 fájl változott, egészen pontosan 529 új sor hozzáadva és 413 régi sor törölve
+6 -1
Fájl megtekintése
@@ -368,6 +368,12 @@ if(POLICY CMP0037)
cmake_policy(POP)
endif()
#############################
# Code analysis
#############################
# Target: static_check
add_custom_target(static_check COMMAND cppcheck --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
#############################
# Testing steps
#############################
@@ -395,5 +401,4 @@ else()
message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
endif()
# vim: ts=4:sw=4:expandtab:smartindent
+6 -10
Fájl megtekintése
@@ -367,11 +367,11 @@ if( params.hcc_integration_test )
// The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda
parallel hcc_ctu:
{
node('docker && rocm')
node('docker && rocm && dkms')
{
String hcc_ver = 'hcc-ctu'
String from_image = 'compute-artifactory:5001/radeonopencompute/hcc/clang_tot_upgrade/hcc-lc-ubuntu-16.04:latest'
String inside_args = '--device=/dev/kfd'
String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'
// Checkout source code, dependencies and version files
String source_hip_rel = checkout_and_version( hcc_ver )
@@ -408,11 +408,11 @@ parallel hcc_ctu:
},
hcc_1_6:
{
node('docker && rocm')
node('docker && rocm && !dkms')
{
String hcc_ver = 'hcc-1.6'
String from_image = 'compute-artifactory:5001/radeonopencompute/hcc/roc-1.6.x/hcc-lc-ubuntu-16.04:latest'
String inside_args = '--device=/dev/kfd'
String from_image = 'rocm/dev-ubuntu-16.04:latest'
String inside_args = '--device=/dev/kfd --device=/dev/dri'
// Checkout source code, dependencies and version files
String source_hip_rel = checkout_and_version( hcc_ver )
@@ -449,11 +449,7 @@ nvcc:
// Block of string constants customizing behavior for cuda
String nvcc_ver = 'nvcc-9.0'
String from_image = 'nvidia/cuda:9.0-devel'
// This unfortunately hardcodes the driver version nvidia_driver_384.90 in the volume mount. Research if a way
// exists to get volume driver to customize the volume names to leave out driver version
String inside_args = '''--device=/dev/nvidiactl --device=/dev/nvidia0 --device=/dev/nvidia-uvm --device=/dev/nvidia-uvm-tools
--volume-driver=nvidia-docker --volume=nvidia_driver_384.90:/usr/local/nvidia:ro''';
String inside_args = '--runtime=nvidia';
// Checkout source code, dependencies and version files
String source_hip_rel = checkout_and_version( nvcc_ver )
@@ -7,6 +7,7 @@ MAINTAINER Kent Knox <kent.knox@amd>
ARG user_uid
# Install Packages
# python and libnuma1 are dependencies of rocm_agent_enumerator
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
sudo \
build-essential \
@@ -14,6 +15,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
git \
libelf-dev \
rpm \
python \
libnuma1 \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
@@ -5,8 +5,8 @@
| **type** | **CUDA** | **HIP** |
|-------------:|---------------------------------------------------------------|------------------------------------------------------------|
| struct | `CUDA_ARRAY3D_DESCRIPTOR` | |
| struct | `CUDA_ARRAY_DESCRIPTOR` | |
| struct | `CUDA_MEMCPY2D` | |
| struct | `CUDA_ARRAY_DESCRIPTOR` | `HIP_ARRAY_DESCRIPTOR` |
| struct | `CUDA_MEMCPY2D` | `hip_Memcpy2D` |
| struct | `CUDA_MEMCPY3D` | |
| struct | `CUDA_MEMCPY3D_PEER` | |
| struct | `CUDA_POINTER_ATTRIBUTE_P2P_TOKENS` | |
@@ -27,15 +27,15 @@
| 0x03 |*`CU_CUBEMAP_FACE_NEGATIVE_Y`* | |
| 0x04 |*`CU_CUBEMAP_FACE_POSITIVE_Z`* | |
| 0x05 |*`CU_CUBEMAP_FACE_NEGATIVE_Z`* | |
| enum |***`CUarray_format`*** | |
| 0x01 |*`CU_AD_FORMAT_UNSIGNED_INT8`* | |
| 0x02 |*`CU_AD_FORMAT_UNSIGNED_INT16`* | |
| 0x03 |*`CU_AD_FORMAT_UNSIGNED_INT32`* | |
| 0x08 |*`CU_AD_FORMAT_SIGNED_INT8`* | |
| 0x09 |*`CU_AD_FORMAT_SIGNED_INT16`* | |
| 0x0a |*`CU_AD_FORMAT_SIGNED_INT32`* | |
| 0x10 |*`CU_AD_FORMAT_HALF`* | |
| 0x20 |*`CU_AD_FORMAT_FLOAT`* | |
| enum |***`CUarray_format`*** |***`hipArray_format`*** |
| 0x01 |*`CU_AD_FORMAT_UNSIGNED_INT8`* |*`HIP_AD_FORMAT_UNSIGNED_INT8`* |
| 0x02 |*`CU_AD_FORMAT_UNSIGNED_INT16`* |*`HIP_AD_FORMAT_UNSIGNED_INT16`* |
| 0x03 |*`CU_AD_FORMAT_UNSIGNED_INT32`* |*`HIP_AD_FORMAT_UNSIGNED_INT32`* |
| 0x08 |*`CU_AD_FORMAT_SIGNED_INT8`* |*`HIP_AD_FORMAT_SIGNED_INT8`* |
| 0x09 |*`CU_AD_FORMAT_SIGNED_INT16`* |*`HIP_AD_FORMAT_SIGNED_INT16`* |
| 0x0a |*`CU_AD_FORMAT_SIGNED_INT32`* |*`HIP_AD_FORMAT_SIGNED_INT32`* |
| 0x10 |*`CU_AD_FORMAT_HALF`* |*`HIP_AD_FORMAT_HALF`* |
| 0x20 |*`CU_AD_FORMAT_FLOAT`* |*`HIP_AD_FORMAT_FLOAT`* |
| enum |***`CUctx_flags`*** | |
| 0x00 |*`CU_CTX_SCHED_AUTO`* | |
| 0x01 |*`CU_CTX_SCHED_SPIN`* | |
@@ -518,7 +518,7 @@
| **CUDA** | **HIP** |
|-----------------------------------------------------------|-------------------------------|
| `cuArray3DCreate` | |
| `cuArray3DCreate` | `hipArray3DCreate` |
| `cuArray3DGetDescriptor` | |
| `cuArrayCreate` | |
| `cuArrayDestroy` | |
+23 -5
Fájl megtekintése
@@ -65,22 +65,40 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DI
install(TARGETS hipify-clang DESTINATION bin)
if (HIPIFY_CLANG_TESTS)
find_package(PythonInterp 2.7 REQUIRED EXACT)
find_package(PythonInterp 2.7 REQUIRED)
function (require_program PROGRAM_NAME)
find_program(FOUND_PROGRAM ${PROGRAM_NAME})
if (NOT FOUND_PROGRAM)
message(FATAL_ERROR "Can't find ${PROGRAM_NAME}. Either set HIPIFY_CLANG_TESTS to OFF to disable hipify tests, or install the missing program.")
find_program(FOUND_${PROGRAM_NAME} ${PROGRAM_NAME})
if (FOUND_${PROGRAM_NAME})
message(STATUS "Found ${PROGRAM_NAME}: ${FOUND_${PROGRAM_NAME}}")
else()
message(SEND_ERROR "Can't find ${PROGRAM_NAME}. Either set HIPIFY_CLANG_TESTS to OFF to disable hipify tests, or install the missing program.")
endif()
endfunction()
require_program(lit)
require_program(FileCheck)
require_program(socat)
# Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
# value of --cuda-path for the test runs.
find_package(CUDA REQUIRED)
if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
(CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
(CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
(CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0"))
message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
if (CUDA_VERSION VERSION_LESS "7.0")
message(STATUS "Please install CUDA 7.0 or higher.")
elseif ((CUDA_VERSION VERSION_EQUAL "7.0") OR (CUDA_VERSION VERSION_EQUAL "7.5"))
message(STATUS "Please install clang 3.8 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "8.0")
message(STATUS "Please install clang 4.0 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "9.0")
message(STATUS "Please install clang 6.0 or higher.")
elseif (CUDA_VERSION VERSION_EQUAL "9.1")
message(STATUS "Please install clang 7.0 or higher.")
endif()
endif()
configure_file(
${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
@@ -10,8 +10,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
///////////////////////////// CUDA DRIVER API /////////////////////////////
{"CUDA_ARRAY3D_DESCRIPTOR", {"HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUDA_ARRAY_DESCRIPTOR", {"HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUDA_MEMCPY2D", {"HIP_MEMCPY2D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUDA_ARRAY_DESCRIPTOR", {"HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER}},
{"CUDA_MEMCPY2D", {"hip_Memcpy2D", CONV_TYPE, API_DRIVER}},
{"CUDA_MEMCPY3D", {"HIP_MEMCPY3D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUDA_MEMCPY3D_PEER", {"HIP_MEMCPY3D_PEER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUDA_POINTER_ATTRIBUTE_P2P_TOKENS", {"HIP_POINTER_ATTRIBUTE_P2P_TOKENS", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
@@ -23,7 +23,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
{"CUaddress_mode", {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUarray_cubemap_face", {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUarray_format", {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
{"CUarray_format", {"hipArray_format", CONV_TYPE, API_DRIVER}},
{"CUcomputemode", {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // API_RUNTIME ANALOGUE (cudaComputeMode)
{"CUmem_advise", {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // API_RUNTIME ANALOGUE (cudaComputeMode)
{"CUmem_range_attribute", {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // API_RUNTIME ANALOGUE (cudaMemRangeAttribute)
@@ -213,6 +213,13 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
{"cudaMipmappedArray_t", {"hipMipmappedArray_t", CONV_MEM, API_RUNTIME}},
{"cudaMipmappedArray_const_t", {"hipMipmappedArray_const_t", CONV_MEM, API_RUNTIME}},
// defines
{"cudaArrayDefault", {"hipArrayDefault", CONV_MEM, API_RUNTIME}},
{"cudaArrayLayered", {"hipArrayLayered", CONV_MEM, API_RUNTIME}},
{"cudaArraySurfaceLoadStore", {"hipArraySurfaceLoadStore", CONV_MEM, API_RUNTIME}},
{"cudaArrayCubemap", {"hipArrayCubemap", CONV_MEM, API_RUNTIME}},
{"cudaArrayTextureGather", {"hipArrayTextureGather", CONV_MEM, API_RUNTIME}},
{"cudaMemoryAdvise", {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, // API_Driver ANALOGUE (CUmem_advise)
{"cudaMemRangeAttribute", {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, // API_Driver ANALOGUE (CUmem_range_attribute)
{"cudaMemcpyKind", {"hipMemcpyKind", CONV_MEM, API_RUNTIME}},
@@ -263,6 +270,15 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
{"cudaSurfaceFormatMode", {"hipSurfaceFormatMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED}},
// defines
{"cudaTextureType1D", {"hipTextureType1D", CONV_TEX, API_RUNTIME}},
{"cudaTextureType2D", {"hipTextureType2D", CONV_TEX, API_RUNTIME}},
{"cudaTextureType3D", {"hipTextureType3D", CONV_TEX, API_RUNTIME}},
{"cudaTextureTypeCubemap", {"hipTextureTypeCubemap", CONV_TEX, API_RUNTIME}},
{"cudaTextureType1DLayered", {"hipTextureType1DLayered", CONV_TEX, API_RUNTIME}},
{"cudaTextureType2DLayered", {"hipTextureType2DLayered", CONV_TEX, API_RUNTIME}},
{"cudaTextureTypeCubemapLayered", {"hipTextureTypeCubemapLayered", CONV_TEX, API_RUNTIME}},
// Inter-Process Communication (IPC)
{"cudaIpcEventHandle_t", {"hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME}},
{"cudaIpcEventHandle_st", {"hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME}},
@@ -548,14 +564,14 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
{"CU_CUBEMAP_FACE_NEGATIVE_Z", {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x05
// CUarray_format enum
{"CU_AD_FORMAT_UNSIGNED_INT8", {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01
{"CU_AD_FORMAT_UNSIGNED_INT16", {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x02
{"CU_AD_FORMAT_UNSIGNED_INT32", {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x03
{"CU_AD_FORMAT_SIGNED_INT8", {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x08
{"CU_AD_FORMAT_SIGNED_INT16", {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x09
{"CU_AD_FORMAT_SIGNED_INT32", {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x0a
{"CU_AD_FORMAT_HALF", {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x10
{"CU_AD_FORMAT_FLOAT", {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x20
{"CU_AD_FORMAT_UNSIGNED_INT8", {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER}}, // 0x01
{"CU_AD_FORMAT_UNSIGNED_INT16", {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER}}, // 0x02
{"CU_AD_FORMAT_UNSIGNED_INT32", {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER}}, // 0x03
{"CU_AD_FORMAT_SIGNED_INT8", {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER}}, // 0x08
{"CU_AD_FORMAT_SIGNED_INT16", {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER}}, // 0x09
{"CU_AD_FORMAT_SIGNED_INT32", {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER}}, // 0x0a
{"CU_AD_FORMAT_HALF", {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER}}, // 0x10
{"CU_AD_FORMAT_FLOAT", {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER}}, // 0x20
// CUcomputemode enum
{"CU_COMPUTEMODE_DEFAULT", {"hipComputeModeDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // 0 // API_RUNTIME ANALOGUE (cudaComputeModeDefault = 0)
@@ -1047,7 +1063,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
{"cuStreamBatchMemOp", {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}}, // // no API_Runtime ANALOGUE
// Memory management
{"cuArray3DCreate", {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
{"cuArray3DCreate", {"hipArray3DCreate", CONV_MEM, API_DRIVER}},
{"cuArray3DGetDescriptor", {"hipArray3DGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
{"cuArrayCreate", {"hipArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
{"cuArrayDestroy", {"hipArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
@@ -152,16 +152,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
const auto found = CUDA_INCLUDE_MAP.find(file_name);
if (found == CUDA_INCLUDE_MAP.end()) {
// Not a CUDA include - don't touch it.
if (!firstNotMainHeader) {
firstNotMainHeader = true;
firstNotMainHeaderLoc = hash_loc;
}
return;
}
// Special-casing to avoid duplication of the hip_runtime include.
bool secondMainInclude = false;
if (found->second.hipName == "hip/hip_runtime.h") {
if (insertedRuntimeHeader) {
return;
secondMainInclude = true;
}
insertedRuntimeHeader = true;
}
@@ -169,28 +172,42 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
clang::SourceLocation sl = filename_range.getBegin();
if (found->second.unsupported) {
// An unsupported CUDA header? Oh dear. Print a warning.
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
return;
}
const char *B = SM.getCharacterData(sl);
const char *E = SM.getCharacterData(filename_range.getEnd());
clang::SmallString<128> includeBuffer;
clang::StringRef newInclude;
// Keep the same include type that the user gave.
if (is_angled) {
newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
if (!secondMainInclude) {
clang::SmallString<128> includeBuffer;
if (is_angled) {
newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
} else {
newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
}
} else {
newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
// hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
sl = hash_loc;
}
const char *B = SM.getCharacterData(sl);
const char *E = SM.getCharacterData(filename_range.getEnd());
ct::Replacement Rep(SM, sl, E - B, newInclude);
insertReplacement(Rep, clang::FullSourceLoc{sl, SM});
}
void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
if (pragmaOnce) { return; }
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
const clang::Token tok = PP.LookAhead(0);
StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
if (Text == "once") {
pragmaOnce = true;
pragmaOnceLoc = PP.LookAhead(1).getLocation();
}
}
bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
StringRef refName = "cudaLaunchKernel";
@@ -336,10 +353,16 @@ void HipifyAction::EndSourceFileAction() {
// implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert
// one copy of the hip include into every file.
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
clang::SourceLocation sl = SM.getLocForStartOfFile(SM.getMainFileID());
clang::SourceLocation sl;
if (pragmaOnce) {
sl = pragmaOnceLoc;
} else if (firstNotMainHeader) {
sl = firstNotMainHeaderLoc;
} else {
sl = SM.getLocForStartOfFile(SM.getMainFileID());
}
clang::FullSourceLoc fullSL(sl, SM);
ct::Replacement Rep(SM, sl, 0, "#include <hip/hip_runtime.h>\n");
ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
insertReplacement(Rep, fullSL);
}
@@ -364,6 +387,10 @@ public:
const clang::Module* imported) override {
hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported);
}
void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) override {
hipifyAction.PragmaDirective(Loc, Introducer);
}
};
}
@@ -23,6 +23,10 @@ private:
// not, we insert it at the top of the file when we finish processing it.
// This approach means we do the best it's possible to do w.r.t preserving the user's include order.
bool insertedRuntimeHeader = false;
bool firstNotMainHeader = false;
bool pragmaOnce = false;
clang::SourceLocation firstNotMainHeaderLoc;
clang::SourceLocation pragmaOnceLoc;
/**
* Rewrite a string literal to refer to hip, not CUDA.
@@ -57,6 +61,11 @@ public:
StringRef relative_path,
const clang::Module *imported);
/**
* Called by the preprocessor for each pragma directive during the non-raw lexing pass.
*/
void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer);
protected:
/**
* Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
@@ -25,7 +25,7 @@ ct::Replacements& getReplacements(ct::RefactoringTool& Tool, clang::StringRef fi
void insertReplacement(ct::Replacements& replacements, const ct::Replacement& rep) {
#if LLVM_VERSION_MAJOR > 3
// New clang added error checking to Replacements, and *insists* that you explicitly check it.
llvm::Error e = replacements.add(rep);
llvm::consumeError(replacements.add(rep));
#else
// In older versions, it's literally an std::set<Replacement>
replacements.insert(rep);
@@ -62,6 +62,8 @@ struct HIP_ARRAY_DESCRIPTOR {
unsigned int numChannels;
size_t width;
size_t height;
unsigned int flags;
size_t depth;
};
struct hipArray {
@@ -73,6 +75,7 @@ struct hipArray {
unsigned int depth;
struct HIP_ARRAY_DESCRIPTOR drvDesc;
bool isDrv;
unsigned int textureType;
};
typedef struct hip_Memcpy2D {
@@ -251,6 +254,30 @@ struct hipMemcpy3DParms {
struct hipExtent extent;
enum hipMemcpyKind kind;
size_t Depth;
size_t Height;
size_t WidthInBytes;
hipDeviceptr_t dstDevice;
size_t dstHeight;
void * dstHost;
size_t dstLOD;
hipMemoryType dstMemoryType;
size_t dstPitch;
size_t dstXInBytes;
size_t dstY;
size_t dstZ;
void * reserved0;
void * reserved1;
hipDeviceptr_t srcDevice;
size_t srcHeight;
const void * srcHost;
size_t srcLOD;
hipMemoryType srcMemoryType;
size_t srcPitch;
size_t srcXInBytes;
size_t srcY;
size_t srcZ;
};
static __inline__ struct hipPitchedPtr make_hipPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
@@ -1317,6 +1317,11 @@ hipError_t hipMallocArray(hipArray** array, const struct hipChannelFormatDesc* d
size_t width, size_t height, unsigned int flags);
#endif
hipError_t hipArrayCreate ( hipArray** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray );
hipError_t hipArray3DCreate(hipArray_t *array, const HIP_ARRAY_DESCRIPTOR* pAllocateArray );
hipError_t hipMalloc3D (hipPitchedPtr* pitchedDevPtr, hipExtent extent );
/**
* @brief Frees an array on the device.
*
+1 -1
Fájl megtekintése
@@ -448,7 +448,7 @@ __device__ __half2 __lowhigh2highlow(const __half2 a) {
__device__ __half2 __lows2half2(const __half2 a, const __half2 b) {
__half2 c;
c.y = a.x;
c.x = a.x;
c.y = b.x;
return c;
}
+239 -55
Fájl megtekintése
@@ -344,24 +344,16 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
return hipHostMalloc(ptr, sizeBytes, flags);
};
// width in bytes
hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
hipError_t ihipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height, size_t depth)
{
HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, pitch, width, height);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
if(width == 0 || height == 0)
return ihipLogStatus(hipErrorUnknown);
// hardcoded 128 bytes
*pitch = ((((int)width-1)/128) + 1)*128;
const size_t sizeBytes = (*pitch)*height;
auto ctx = ihipGetTlsDefaultCtx();
//err = hipMalloc(ptr, (*pitch)*height);
if (ctx) {
hc::accelerator acc = ctx->getDevice()->_acc;
hsa_agent_t* agent =static_cast<hsa_agent_t*>(acc.get_hsa_agent());
@@ -373,9 +365,12 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
hsa_ext_image_descriptor_t imageDescriptor;
imageDescriptor.width = *pitch;
imageDescriptor.height = height;
imageDescriptor.depth = 0;
imageDescriptor.depth = 0;//depth;
imageDescriptor.array_size = 0;
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
if(depth == 0)
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
else
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_3D;
imageDescriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
imageDescriptor.format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
@@ -394,6 +389,42 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
hip_status = hipErrorMemoryAllocation;
}
return hip_status;
}
// width in bytes
hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
{
HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, pitch, width, height);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
if(width == 0 || height == 0)
return ihipLogStatus(hipErrorUnknown);
hip_status = ihipMallocPitch(ptr, pitch, width, height, 0);
return ihipLogStatus(hip_status);
}
hipError_t hipMalloc3D (hipPitchedPtr* pitchedDevPtr, hipExtent extent )
{
HIP_INIT_API(pitchedDevPtr, &extent);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
if(extent.width == 0 || extent.height == 0)
return ihipLogStatus(hipErrorUnknown);
if(!pitchedDevPtr)
return ihipLogStatus(hipErrorInvalidValue);
void* ptr;
size_t pitch;
hip_status = ihipMallocPitch(&pitchedDevPtr->ptr, &pitch, extent.width, extent.height, extent.depth);
if(hip_status == hipSuccess) {
pitchedDevPtr->pitch = pitch;
pitchedDevPtr->xsize = extent.width;
pitchedDevPtr->ysize = extent.height;
}
return ihipLogStatus(hip_status);
}
@@ -531,7 +562,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
array[0]->depth = 1;
array[0]->desc = *desc;
array[0]->isDrv = false;
array[0]->textureType = hipTextureType2D;
void ** ptr = &array[0]->data;
if (ctx) {
@@ -610,12 +641,132 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
return ihipLogStatus(hip_status);
}
hipError_t hipArray3DCreate(hipArray_t *array, const HIP_ARRAY_DESCRIPTOR* pAllocateArray )
{
HIP_INIT_SPECIAL_API((TRACE_MEM), array, pAllocateArray);
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
*array = (hipArray*)malloc(sizeof(hipArray));
array[0]->type = pAllocateArray->flags;
array[0]->width = pAllocateArray->width;
array[0]->height = pAllocateArray->height;
array[0]->depth = pAllocateArray->depth;
array[0]->drvDesc = *pAllocateArray;
array[0]->isDrv = true;
array[0]->textureType = hipTextureType3D;
void ** ptr = &array[0]->data;
if (ctx) {
const unsigned am_flags = 0;
const size_t size = pAllocateArray->width*pAllocateArray->height*pAllocateArray->depth;
size_t allocSize = 0;
hsa_ext_image_channel_type_t channelType;
switch(pAllocateArray->format) {
case HIP_AD_FORMAT_UNSIGNED_INT8:
allocSize = size * sizeof(uint8_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
break;
case HIP_AD_FORMAT_UNSIGNED_INT16:
allocSize = size * sizeof(uint16_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16;
break;
case HIP_AD_FORMAT_UNSIGNED_INT32:
allocSize = size * sizeof(uint32_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
break;
case HIP_AD_FORMAT_SIGNED_INT8:
allocSize = size * sizeof(int8_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8;
break;
case HIP_AD_FORMAT_SIGNED_INT16:
allocSize = size * sizeof(int16_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16;
break;
case HIP_AD_FORMAT_SIGNED_INT32:
allocSize = size * sizeof(int32_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
break;
case HIP_AD_FORMAT_HALF:
allocSize = size * sizeof(int16_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT;
break;
case HIP_AD_FORMAT_FLOAT:
allocSize = size * sizeof(float);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
break;
default:
hip_status = hipErrorUnknown;
break;
}
hc::accelerator acc = ctx->getDevice()->_acc;
hsa_agent_t* agent =static_cast<hsa_agent_t*>(acc.get_hsa_agent());
size_t allocGranularity = 0;
hsa_amd_memory_pool_t *allocRegion = static_cast<hsa_amd_memory_pool_t*>(acc.get_hsa_am_region());
hsa_amd_memory_pool_get_info(*allocRegion, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &allocGranularity);
hsa_ext_image_descriptor_t imageDescriptor;
imageDescriptor.width = pAllocateArray->width;
imageDescriptor.height = pAllocateArray->height;
imageDescriptor.depth = 0;
imageDescriptor.array_size = 0;
switch (pAllocateArray->flags) {
case hipArrayLayered:
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA;
imageDescriptor.array_size = pAllocateArray->depth;
break;
case hipArraySurfaceLoadStore:
case hipArrayTextureGather:
case hipArrayDefault:
assert(0);
break;
case hipArrayCubemap:
default:
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_3D;
imageDescriptor.depth = pAllocateArray->depth;
break;
}
hsa_ext_image_channel_order_t channelOrder;
//getChannelOrderAndType(*desc, hipReadModeElementType, &channelOrder, &channelType);
if (pAllocateArray->numChannels == 4) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
} else if (pAllocateArray->numChannels == 2) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
} else if (pAllocateArray->numChannels == 1) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
}
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
hsa_ext_image_data_info_t imageInfo;
hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo);
size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment;
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment);
if (size && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
}
} else {
hip_status = hipErrorMemoryAllocation;
}
return ihipLogStatus(hip_status);
}
hipError_t hipMalloc3DArray(hipArray_t *array,
const struct hipChannelFormatDesc* desc,
struct hipExtent extent,
unsigned int flags)
{
HIP_INIT();
HIP_INIT_API(array, desc, &extent, flags);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
@@ -627,7 +778,8 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
array[0]->height = extent.height;
array[0]->depth = extent.depth;
array[0]->desc = *desc;
array[0]->isDrv = false;
array[0]->textureType = hipTextureType3D;
void ** ptr = &array[0]->data;
if (ctx) {
@@ -702,7 +854,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
hip_status = hipErrorMemoryAllocation;
}
return hip_status;
return ihipLogStatus(hip_status);
}
hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
@@ -1262,53 +1414,85 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
{
HIP_INIT_SPECIAL_API((TRACE_MCMD), p);
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
hc::completion_future marker;
hipError_t e = hipSuccess;
size_t byteSize;
if(p) {
switch(p->dstArray->desc.f) {
case hipChannelFormatKindSigned:
byteSize = sizeof(int);
break;
case hipChannelFormatKindUnsigned:
byteSize = sizeof(unsigned int);
break;
case hipChannelFormatKindFloat:
byteSize = sizeof(float);
break;
case hipChannelFormatKindNone:
byteSize = sizeof(size_t);
break;
default:
byteSize = 0;
break;
size_t byteSize;
size_t depth;
size_t height;
size_t widthInBytes;
size_t dstWidthInbytes;
size_t srcPitch;
size_t dstPitch;
void *srcPtr;
void *dstPtr;
size_t ySize;
if(p->dstArray != nullptr) {
if(p->dstArray->isDrv == false) {
switch(p->dstArray->desc.f) {
case hipChannelFormatKindSigned:
byteSize = sizeof(int);
break;
case hipChannelFormatKindUnsigned:
byteSize = sizeof(unsigned int);
break;
case hipChannelFormatKindFloat:
byteSize = sizeof(float);
break;
case hipChannelFormatKindNone:
byteSize = sizeof(size_t);
break;
default:
byteSize = 0;
break;
}
depth = p->extent.depth;
height = p->extent.height;
widthInBytes = p->extent.width * byteSize;
srcPitch = p->srcPtr.pitch;
srcPtr = p->srcPtr.ptr;
ySize = p->srcPtr.ysize;
dstWidthInbytes = p->dstArray->width*byteSize;
dstPtr = p->dstArray->data;
} else {
depth = p->Depth;
height = p->Height;
widthInBytes = p->WidthInBytes;
dstWidthInbytes = p->dstArray->width*4;
srcPitch = p->srcPitch;
srcPtr = (void*)p->srcHost;
ySize = p->srcHeight;
dstPtr = p->dstArray->data;
}
} else {
//Non array destination
depth = p->extent.depth;
height = p->extent.height;
widthInBytes = p->extent.width;
srcPitch = p->srcPtr.pitch;
srcPtr = p->srcPtr.ptr;
dstPtr = p->dstPtr.ptr;
ySize = p->srcPtr.ysize;
dstWidthInbytes = p->dstPtr.pitch;
}
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
hc::completion_future marker;
try {
for (int i = 0; i < depth; i++) {
for(int j = 0; j < height; j++) {
// TODO: p->srcPos or p->dstPos are not 0.
unsigned char* src = (unsigned char*)srcPtr + i*ySize*srcPitch + j*srcPitch;
unsigned char* dst = (unsigned char*)dstPtr + i*height*dstWidthInbytes + j*dstWidthInbytes;
stream->locked_copySync(dst, src, widthInBytes, p->kind);
}
}
} catch (ihipException ex) {
e = ex._code;
}
} else {
return ihipLogStatus(hipErrorUnknown);
e = hipErrorInvalidValue;
}
try {
for (int i = 0; i < p->extent.depth; i++) {
for(int j = 0; j < p->extent.height; j++) {
// TODO: p->srcPos or p->dstPos are not 0.
unsigned char* src = (unsigned char*)p->srcPtr.ptr + i*p->srcPtr.ysize*p->srcPtr.pitch + j*p->srcPtr.pitch;
unsigned char* dst = (unsigned char*)p->dstArray->data + i*p->dstArray->height*p->dstArray->width*byteSize + j*p->dstArray->width*byteSize;
stream->locked_copySync(dst, src, p->extent.width*byteSize, p->kind);
}
}
}
catch (ihipException &ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
namespace
{
template<
+2 -2
Fájl megtekintése
@@ -623,7 +623,7 @@ hipError_t hipBindTextureToArray(textureReference* tex,
HIP_INIT_API(tex, array, desc);
hipError_t hip_status = hipSuccess;
// TODO: hipReadModeElementType is default.
hip_status = ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
hip_status = ihipBindTextureToArrayImpl(array->textureType, hipReadModeElementType,
array, *desc, tex);
return ihipLogStatus(hip_status);
}
@@ -742,7 +742,7 @@ hipError_t hipTexRefSetArray ( textureReference* tex, hipArray_const_t array, u
HIP_INIT_API(tex, array, flags);
hipError_t hip_status = hipSuccess;
hip_status = ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
hip_status = ihipBindTextureToArrayImpl(array->textureType, hipReadModeElementType,
array, array->desc,tex );
return ihipLogStatus(hip_status);
}
@@ -89,9 +89,8 @@ namespace hip_impl
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
}
@@ -16,7 +16,6 @@
template<typename T>
__global__ void axpy(T a, T *x, T *y) {
// CHECK: y[hipThreadIdx_x] = a * x[hipThreadIdx_x];
y[threadIdx.x] = a * x[threadIdx.x];
}
@@ -1,242 +0,0 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
// in CUDA 3.2.
//
// Devices of compute capability 1.x will run the kernels one after another
// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <stdio.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// This is a kernel that does no real work but runs at least for a specified number of clocks
__global__ void clock_block(clock_t *d_o, clock_t clock_count)
{
unsigned int start_clock = (unsigned int) clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
unsigned int end_clock = (unsigned int) clock();
// The code below should work like
// this (thanks to modular arithmetics):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32 then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N)
{
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x; i < N; i+= blockDim.x)
{
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
syncthreads();
for (int i=16; i>0; i/=2)
{
if (threadIdx.x < i)
{
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
syncthreads();
}
d_clocks[0] = s_clocks[0];
}
int main(int argc, char **argv)
{
int nkernels = 8; // number of concurrent kernels
int nstreams = nkernels + 1; // use one more stream than concurrent kernel
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
float kernel_time = 10; // time the kernel should run in ms
float elapsed_time; // timing variables
int cuda_device = 0;
printf("[%s] - Starting...\n", argv[0]);
// get number of kernels if overridden on the command line
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
{
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
nstreams = nkernels + 1;
}
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
// CHECK: hipDeviceProp_t deviceProp;
cudaDeviceProp deviceProp;
// CHECK: checkCudaErrors(hipGetDevice(&cuda_device));
checkCudaErrors(cudaGetDevice(&cuda_device));
// CHECK: checkCudaErrors(hipGetDeviceProperties(&deviceProp, cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
if ((deviceProp.concurrentKernels == 0))
{
printf("> GPU does not support concurrent kernel execution\n");
printf(" CUDA kernel runs will be serialized\n");
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// allocate host memory
clock_t *a = 0; // pointer to the array data in host memory
// CHECK: checkCudaErrors(hipHostMalloc((void **)&a, nbytes));
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
// allocate device memory
clock_t *d_a = 0; // pointers to data and init value in the device memory
// CHECK: checkCudaErrors(hipMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
// CHECK: hipStream_t *streams = (hipStream_t *) malloc(nstreams * sizeof(hipStream_t));
// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
// CHECK: checkCudaErrors(hipStreamCreate(&(streams[i])));
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// CHECK: hipEvent_t start_event, stop_event;
// create CUDA event handles
cudaEvent_t start_event, stop_event;
// CHECK: checkCudaErrors(hipEventCreate(&start_event));
// CHECK: checkCudaErrors(hipEventCreate(&stop_event));
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// the events are used for synchronization only and hence do not need to record timings
// this also makes events not introduce global sync points when recorded which is critical to get overlap
// CHECK: hipEvent_t *kernelEvent;
// CHECK: kernelEvent = (hipEvent_t *) malloc(nkernels * sizeof(hipEvent_t));
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
for (int i = 0; i < nkernels; i++)
{
// CHECK: checkCudaErrors(hipEventCreateWithFlags(&(kernelEvent[i]), hipEventDisableTiming));
checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}
//////////////////////////////////////////////////////////////////////
// time execution with nkernels streams
clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
// the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
// CHECK: hipEventRecord(start_event, 0);
cudaEventRecord(start_event, 0);
// queue nkernels in separate streams and record when they are done
for (int i=0; i<nkernels; ++i)
{
// CHECK: hipLaunchKernelGGL(clock_block, dim3(1), dim3(1), 0, streams[i], &d_a[i], time_clocks);
clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
total_clocks += time_clocks;
// CHECK: checkCudaErrors(hipEventRecord(kernelEvent[i], streams[i]));
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
// make the last stream wait for the kernel event to be recorded
// CHECK: checkCudaErrors(hipStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
}
// queue a sum kernel and a copy back to host in the last stream.
// the commands in this stream get dispatched as soon as all the kernel events have been recorded
// CHECK: hipLaunchKernelGGL(sum, dim3(1), dim3(32), 0, streams[nstreams-1], d_a, nkernels);
// CHECK: checkCudaErrors(hipMemcpyAsync(a, d_a, sizeof(clock_t), hipMemcpyDeviceToHost, streams[nstreams-1]));
sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
// at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
// in this sample we just wait until the GPU is done
// CHECK: checkCudaErrors(hipEventRecord(stop_event, 0));
// CHECK: checkCudaErrors(hipEventSynchronize(stop_event));
// CHECK: checkCudaErrors(hipEventElapsedTime(&elapsed_time, start_event, stop_event));
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
bool bTestResult = (a[0] > total_clocks);
// release resources
for (int i = 0; i < nkernels; i++)
{
// CHECK: hipStreamDestroy(streams[i]);
// CHECK: hipEventDestroy(kernelEvent[i]);
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
free(streams);
free(kernelEvent);
// CHECK: hipEventDestroy(start_event);
// CHECK: hipEventDestroy(stop_event);
// CHECK: hipHostFree(a);
// CHECK: hipFree(d_a);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
if (!bTestResult)
{
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
@@ -38,7 +38,6 @@ if(status != cudaSuccess) { \
}
__global__ void Inc1(float *Ad, float *Bd){
// CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < 1 ){
for(int i=0;i<ITER;i++){
@@ -51,7 +50,6 @@ __global__ void Inc1(float *Ad, float *Bd){
}
__global__ void Inc2(float *Ad, float *Bd){
// CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < 1024){
for(int i=0;i<ITER;i++){
@@ -0,0 +1,6 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
// CHECK: #include <hip/hip_runtime.h>
#include <cuda.h>
// CHECK-NOT: #include<cuda_runtime.h>
#include <cuda_runtime.h>
@@ -0,0 +1,6 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
// CHECK: #include <hip/hip_runtime.h>
#include <cuda_runtime.h>
// CHECK-NOT: #include<cuda.h>
#include <cuda.h>
@@ -0,0 +1,10 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
// CHECK: #pragma once
// CHECK-NEXT: #include <hip/hip_runtime.h>
#pragma once
// CHECK-NOT: #include <hip/hip_runtime.h>
int main(int argc, char* argv[]) {
return 0;
}
@@ -0,0 +1,12 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
// CHECK: #include <hip/hip_runtime.h>
// CHECK-NEXT: #include <stdio.h>
// CHECK-NEXT: #include <iostream>
#include <stdio.h>
#include <iostream>
// CHECK-NOT: #include <hip/hip_runtime.h>
int main(int argc, char* argv[]) {
return 0;
}
@@ -0,0 +1,12 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
// CHECK: #pragma once
// CHECK-NEXT: #include <hip/hip_runtime.h>
#pragma once
// CHECK-NOT: #include <hip/hip_runtime.h>
#include <stdio.h>
int main(int argc, char* argv[]) {
return 0;
}
+14 -5
Fájl megtekintése
@@ -13,7 +13,7 @@ import lit.util
config.name = 'hipify'
# suffixes: CUDA source is only supported
config.suffixes = ['.cu']
config.suffixes = ['.cu','.cuh','.cpp','.c','.hpp','.h']
# testFormat: The test format to use to interpret tests.
config.test_format = lit.formats.ShTest()
@@ -44,8 +44,17 @@ if obj_root is not None:
path = os.path.pathsep.join((llvm_tools_dir, config.environment['PATH']))
config.environment['PATH'] = path
config.substitutions.append(("hipify", obj_root+"/hipify-clang"))
hipify_path = obj_root
clang_args = "-x cuda -v --cuda-gpu-arch=sm_30 --cuda-path='%s'"
# Clang args for CUDA...
config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30 -isystem%s/samples/common/inc" % (config.cuda_root, config.cuda_root)))
config.substitutions.append(("%run_test", config.test_source_root + "/run_test.sh"))
if sys.platform in ['win32']:
run_test_ext = ".bat"
hipify_path += "/" + config.build_type
clang_args += " -isystem'%s'/common/inc -std=c++14"
else:
run_test_ext = ".sh"
clang_args += " -isystem'%s'/samples/common/inc"
config.substitutions.append(("%cuda_args", clang_args % (config.cuda_root, config.cuda_sdk_root)))
config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"'))
config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"'))
@@ -1,8 +1,23 @@
import sys
import os
config.llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@"
config.obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
config.cuda_root = "@CUDA_TOOLKIT_ROOT_DIR@"
if sys.platform in ['win32']:
config.cuda_sdk_root = "@CUDA_SDK_ROOT_DIR@"
if not config.cuda_sdk_root or config.cuda_sdk_root == "CUDA_SDK_ROOT_DIR-NOTFOUND":
cuda_version = "@CUDA_VERSION@"
cuda_version = cuda_version.replace('.','_')
config.cuda_samples_root = os.environ.get('NVCUDASAMPLES' + cuda_version + '_ROOT')
if not config.cuda_samples_root:
lit_config.fatal('No CUDA Samples dir set! Please set CUDA_SDK_ROOT_DIR.')
config.cuda_sdk_root = config.cuda_samples_root
config.build_type = "@CMAKE_BUILD_TYPE@"
if not config.build_type:
config.build_type = "Debug"
else:
config.cuda_sdk_root = config.cuda_root
# Support substitution of the tools and libs dirs with user parameters. This is
# used when we can't determine the tool dir at configuration time.
@@ -0,0 +1,18 @@
@echo off
setlocal
for %%i in (FileCheck.exe) do set FILE_CHECK=%%~$PATH:i
if not defined FILE_CHECK (echo Error: FileCheck.exe not found in PATH. && exit /b 1)
set HIPIFY=%1
set IN_FILE=%2
set TMP_FILE=%3
set all_args=%*
call set clang_args=%%all_args:*%4=%%
set clang_args=%4%clang_args%
%HIPIFY% -o=%TMP_FILE% %IN_FILE% -- %clang_args%
if errorlevel 1 (echo Error: hipify-clang.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
%FILE_CHECK% %IN_FILE% -input-file=%TMP_FILE%
if errorlevel 1 (echo Error: FileCheck.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
@@ -13,16 +13,5 @@ shift 3
# Remaining args are the ones to forward to clang proper.
# Time for the classic insane little trick for making colour output work.
# A self-deleting shell-script that does the thing we want to do...
TMP_SCRIPT=$(mktemp)
cat << EOF > $TMP_SCRIPT
set -o errexit
set -o xtrace
rm $TMP_SCRIPT
$HIPIFY -o=$TMP_FILE $IN_FILE -- $@ && cat $TMP_FILE | sed -Ee 's|//.+|// |g' | FileCheck $IN_FILE
EOF
chmod a+x $TMP_SCRIPT
# Run the script via socat, spawning a virtual terminal and propagating exit code, and hence failure.
socat -du EXEC:$TMP_SCRIPT,pty,stderr STDOUT
@@ -41,8 +41,6 @@ template <typename T>
__global__ void
vector_square(T *C_d, const T *A_d, size_t N)
{
// CHECK: size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
// CHECK: size_t stride = hipBlockDim_x * hipGridDim_x;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../test_common.cpp NVCC_OPTIONS --Wno-deprecated-declarations
* RUN: %t
* HIT_END
*/
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../test_common.cpp NVCC_OPTIONS --Wno-deprecated-declarations
* RUN: %t
* HIT_END
*/
@@ -16,7 +16,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTI
THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/* HIT_START
* BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc
* BUILD: %t %s
* RUN: %t
* HIT_END
*/
@@ -397,32 +397,30 @@ int main(int argc, char *argv[])
if (gpuCount < 2)
{
printf("P2P application requires atleast 2 gpu devices\n");
return 0;
}
} else {
if (p_tests & 0x100) {
testPeerHostToDevice(false/*useAsyncCopy*/);
}
testPeerHostToDevice(true/*useAsyncCopy*/);
if (p_tests & 0x100) {
testPeerHostToDevice(false/*useAsyncCopy*/);
}
testPeerHostToDevice(true/*useAsyncCopy*/);
if (p_tests & 0x1) {
enablePeerFirst(false/*useAsyncCopy*/);
}
if (p_tests & 0x1) {
enablePeerFirst(false/*useAsyncCopy*/);
}
if (p_tests & 0x2) {
allocMemoryFirst(false/*useAsyncCopy*/);
}
if (p_tests & 0x2) {
allocMemoryFirst(false/*useAsyncCopy*/);
}
if (p_tests & 0x4) {
simpleNegative();
}
if (p_tests & 0x4) {
simpleNegative();
if (p_tests & 0x8) {
enablePeerFirst(true/*useAsyncCopy*/);
}
if (p_tests & 0x10) {
allocMemoryFirst(true/*useAsyncCopy*/);
}
}
if (p_tests & 0x8) {
enablePeerFirst(true/*useAsyncCopy*/);
}
if (p_tests & 0x10) {
allocMemoryFirst(true/*useAsyncCopy*/);
}
passed();
}
@@ -23,7 +23,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t EXCLUDE_HIP_PLATFORM nvcc
* RUN: %t
* HIT_END
*/
@@ -23,7 +23,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
@@ -32,7 +32,7 @@ THE SOFTWARE.
int main()
{
hipDevice_t device;
size_t Nbytes = N*sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
@@ -69,8 +69,8 @@ int main()
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes));
HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes));
hipLaunchKernel(
HipTest::vectorADD,
@@ -82,7 +82,7 @@ int main()
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -32,7 +32,6 @@ THE SOFTWARE.
int main()
{
hipDevice_t device;
size_t Nbytes = N*sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
@@ -70,8 +69,8 @@ int main()
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));
hipLaunchKernel(
HipTest::vectorADD,
@@ -83,7 +82,7 @@ int main()
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
HIPCHECK(hipStreamSynchronize(s));
HIPCHECK(hipDeviceSynchronize());