Merge pull request #224 from ChrisKitching/tests
[HIPIFY] Make the automated tests more useful
Tento commit je obsažen v:
@@ -12,11 +12,13 @@ endif()
|
||||
|
||||
option(HIPIFY_CLANG_TESTS "Build the tests for hipify-clang, if lit is installed" ON)
|
||||
|
||||
# Disable the tests if `lit` is not installed.
|
||||
# Disable the tests if `lit` or `FileCheck` is not installed.
|
||||
find_program(LIT_COMMAND lit)
|
||||
if (NOT LIT_COMMAND)
|
||||
find_program(FILECHECK_COMMAND FileCheck)
|
||||
find_program(SOCAT_COMMAND socat)
|
||||
if (NOT LIT_COMMAND OR NOT FILECHECK_COMMAND OR NOT SOCAT_COMMAND)
|
||||
set(HIPIFY_CLANG_TESTS OFF CACHE INTERNAL "")
|
||||
message(STATUS "hipify-clang's tests are not being built because `lit` is not installed.")
|
||||
message(STATUS "hipify-clang's tests are not being built because `lit`,`FileCheck` or `socat` could not be found.")
|
||||
endif()
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR})
|
||||
@@ -75,9 +77,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DI
|
||||
install(TARGETS hipify-clang DESTINATION bin)
|
||||
|
||||
if (HIPIFY_CLANG_TESTS)
|
||||
# tests
|
||||
find_package(PythonInterp 2.7 REQUIRED EXACT)
|
||||
|
||||
# Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
|
||||
# value of --cuda-path for the test runs.
|
||||
find_package(CUDA REQUIRED)
|
||||
|
||||
configure_file(
|
||||
${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
|
||||
@@ -87,7 +92,8 @@ if (HIPIFY_CLANG_TESTS)
|
||||
add_lit_testsuite(test-hipify "Running HIPify regression tests"
|
||||
${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang
|
||||
PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
|
||||
DEPENDS hipify-clang lit
|
||||
ARGS -v
|
||||
DEPENDS hipify-clang
|
||||
)
|
||||
|
||||
add_custom_target(test-hipify-clang)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
// RUN: hipify "%s" -o=%t --
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
#include <iostream>
|
||||
|
||||
__global__ void axpy(float a, float* x, float* y) {
|
||||
// RUN: sh -c "test `grep -c -F 'y[hipThreadIdx_x] = a * x[hipThreadIdx_x];' %t` -eq 2"
|
||||
// CHECK: y[hipThreadIdx_x] = a * x[hipThreadIdx_x];
|
||||
y[threadIdx.x] = a * x[threadIdx.x];
|
||||
}
|
||||
|
||||
@@ -17,21 +17,25 @@ int main(int argc, char* argv[]) {
|
||||
// Copy input data to device.
|
||||
float* device_x;
|
||||
float* device_y;
|
||||
// RUN: sh -c "test `grep -c -F 'hipMalloc(&device_x, kDataLen * sizeof(float));' %t` -eq 2"
|
||||
|
||||
// CHECK: hipMalloc(&device_x, kDataLen * sizeof(float));
|
||||
cudaMalloc(&device_x, kDataLen * sizeof(float));
|
||||
// RUN: sh -c "test `grep -c -F 'hipMalloc(&device_y, kDataLen * sizeof(float));' %t` -eq 2"
|
||||
|
||||
// CHECK: hipMalloc(&device_y, kDataLen * sizeof(float));
|
||||
cudaMalloc(&device_y, kDataLen * sizeof(float));
|
||||
// RUN: sh -c "test `grep -c -F 'hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);' %t` -eq 2"
|
||||
|
||||
// CHECK: hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);
|
||||
cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
// Launch the kernel.
|
||||
// RUN: sh -c "test `grep -c -F 'hipLaunchKernel(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);' %t` -eq 2"
|
||||
// CHECK: hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
|
||||
axpy<<<1, kDataLen>>>(a, device_x, device_y);
|
||||
|
||||
// Copy output data to host.
|
||||
// RUN: sh -c "test `grep -c -F 'hipDeviceSynchronize();' %t` -eq 2"
|
||||
// CHECK: hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
// RUN: sh -c "test `grep -c -F 'hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);' %t` -eq 2"
|
||||
|
||||
// CHECK: hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
|
||||
// Print the results.
|
||||
@@ -39,7 +43,7 @@ int main(int argc, char* argv[]) {
|
||||
std::cout << "y[" << i << "] = " << host_y[i] << "\n";
|
||||
}
|
||||
|
||||
// RUN: sh -c "test `grep -c -F 'hipDeviceReset();' %t` -eq 2"
|
||||
// CHECK: hipDeviceReset();
|
||||
cudaDeviceReset();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,240 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
/*
|
||||
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
|
||||
// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
|
||||
// in CUDA 3.2.
|
||||
//
|
||||
// Devices of compute capability 1.x will run the kernels one after another
|
||||
// Devices of compute capability 2.0 or higher can overlap the kernels
|
||||
//
|
||||
#include <stdio.h>
|
||||
#include <helper_functions.h>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
// This is a kernel that does no real work but runs at least for a specified number of clocks
|
||||
__global__ void clock_block(clock_t *d_o, clock_t clock_count)
|
||||
{
|
||||
unsigned int start_clock = (unsigned int) clock();
|
||||
|
||||
clock_t clock_offset = 0;
|
||||
|
||||
while (clock_offset < clock_count)
|
||||
{
|
||||
unsigned int end_clock = (unsigned int) clock();
|
||||
|
||||
// The code below should work like
|
||||
// this (thanks to modular arithmetics):
|
||||
//
|
||||
// clock_offset = (clock_t) (end_clock > start_clock ?
|
||||
// end_clock - start_clock :
|
||||
// end_clock + (0xffffffffu - start_clock));
|
||||
//
|
||||
// Indeed, let m = 2^32 then
|
||||
// end - start = end + m - start (mod m).
|
||||
|
||||
clock_offset = (clock_t)(end_clock - start_clock);
|
||||
}
|
||||
|
||||
d_o[0] = clock_offset;
|
||||
}
|
||||
|
||||
|
||||
// Single warp reduction kernel
|
||||
__global__ void sum(clock_t *d_clocks, int N)
|
||||
{
|
||||
__shared__ clock_t s_clocks[32];
|
||||
|
||||
clock_t my_sum = 0;
|
||||
|
||||
for (int i = threadIdx.x; i < N; i+= blockDim.x)
|
||||
{
|
||||
my_sum += d_clocks[i];
|
||||
}
|
||||
|
||||
s_clocks[threadIdx.x] = my_sum;
|
||||
syncthreads();
|
||||
|
||||
for (int i=16; i>0; i/=2)
|
||||
{
|
||||
if (threadIdx.x < i)
|
||||
{
|
||||
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
|
||||
}
|
||||
|
||||
syncthreads();
|
||||
}
|
||||
|
||||
d_clocks[0] = s_clocks[0];
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int nkernels = 8; // number of concurrent kernels
|
||||
int nstreams = nkernels + 1; // use one more stream than concurrent kernel
|
||||
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
|
||||
float kernel_time = 10; // time the kernel should run in ms
|
||||
float elapsed_time; // timing variables
|
||||
int cuda_device = 0;
|
||||
|
||||
printf("[%s] - Starting...\n", argv[0]);
|
||||
|
||||
// get number of kernels if overridden on the command line
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
|
||||
{
|
||||
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
|
||||
nstreams = nkernels + 1;
|
||||
}
|
||||
|
||||
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
|
||||
cuda_device = findCudaDevice(argc, (const char **)argv);
|
||||
|
||||
cudaDeviceProp deviceProp;
|
||||
// CHECK: checkCudaErrors(hipGetDevice(&cuda_device));
|
||||
checkCudaErrors(cudaGetDevice(&cuda_device));
|
||||
|
||||
// CHECK: checkCudaErrors(hipGetDeviceProperties(&deviceProp, cuda_device));
|
||||
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
|
||||
|
||||
if ((deviceProp.concurrentKernels == 0))
|
||||
{
|
||||
printf("> GPU does not support concurrent kernel execution\n");
|
||||
printf(" CUDA kernel runs will be serialized\n");
|
||||
}
|
||||
|
||||
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
|
||||
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
|
||||
|
||||
// allocate host memory
|
||||
clock_t *a = 0; // pointer to the array data in host memory
|
||||
// CHECK: checkCudaErrors(hipHostMalloc((void **)&a, nbytes));
|
||||
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
|
||||
|
||||
// allocate device memory
|
||||
clock_t *d_a = 0; // pointers to data and init value in the device memory
|
||||
// CHECK: checkCudaErrors(hipMalloc((void **)&d_a, nbytes));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
|
||||
|
||||
// CHECK: hipStream_t *streams = (hipStream_t *) malloc(nstreams * sizeof(hipStream_t));
|
||||
// allocate and initialize an array of stream handles
|
||||
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
|
||||
|
||||
for (int i = 0; i < nstreams; i++)
|
||||
{
|
||||
// CHECK: checkCudaErrors(hipStreamCreate(&(streams[i])));
|
||||
checkCudaErrors(cudaStreamCreate(&(streams[i])));
|
||||
}
|
||||
|
||||
// create CUDA event handles
|
||||
cudaEvent_t start_event, stop_event;
|
||||
|
||||
// CHECK: checkCudaErrors(hipEventCreate(&start_event));
|
||||
// CHECK: checkCudaErrors(hipEventCreate(&stop_event));
|
||||
checkCudaErrors(cudaEventCreate(&start_event));
|
||||
checkCudaErrors(cudaEventCreate(&stop_event));
|
||||
|
||||
// the events are used for synchronization only and hence do not need to record timings
|
||||
// this also makes events not introduce global sync points when recorded which is critical to get overlap
|
||||
|
||||
// CHECK: hipEvent_t *kernelEvent;
|
||||
// CHECK: kernelEvent = (hipEvent_t *) malloc(nkernels * sizeof(hipEvent_t));
|
||||
cudaEvent_t *kernelEvent;
|
||||
kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
|
||||
|
||||
for (int i = 0; i < nkernels; i++)
|
||||
{
|
||||
// CHECK: checkCudaErrors(hipEventCreateWithFlags(&(kernelEvent[i]), hipEventDisableTiming));
|
||||
checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// time execution with nkernels streams
|
||||
clock_t total_clocks = 0;
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
// the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
|
||||
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
|
||||
#else
|
||||
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
|
||||
#endif
|
||||
|
||||
// CHECK: hipEventRecord(start_event, 0);
|
||||
cudaEventRecord(start_event, 0);
|
||||
|
||||
// queue nkernels in separate streams and record when they are done
|
||||
for (int i=0; i<nkernels; ++i)
|
||||
{
|
||||
// CHECK: hipLaunchKernelGGL(clock_block, dim3(1), dim3(1), 0, streams[i], &d_a[i], time_clocks);
|
||||
clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
|
||||
total_clocks += time_clocks;
|
||||
|
||||
// CHECK: checkCudaErrors(hipEventRecord(kernelEvent[i], streams[i]));
|
||||
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
|
||||
|
||||
// make the last stream wait for the kernel event to be recorded
|
||||
// CHECK: checkCudaErrors(hipStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
|
||||
checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
|
||||
}
|
||||
|
||||
// queue a sum kernel and a copy back to host in the last stream.
|
||||
// the commands in this stream get dispatched as soon as all the kernel events have been recorded
|
||||
// CHECK: hipLaunchKernelGGL(sum, dim3(1), dim3(32), 0, streams[nstreams-1], d_a, nkernels);
|
||||
// CHECK: checkCudaErrors(hipMemcpyAsync(a, d_a, sizeof(clock_t), hipMemcpyDeviceToHost, streams[nstreams-1]));
|
||||
sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
|
||||
checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
|
||||
|
||||
// at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
|
||||
|
||||
// in this sample we just wait until the GPU is done
|
||||
// CHECK: checkCudaErrors(hipEventRecord(stop_event, 0));
|
||||
// CHECK: checkCudaErrors(hipEventSynchronize(stop_event));
|
||||
// CHECK: checkCudaErrors(hipEventElapsedTime(&elapsed_time, start_event, stop_event));
|
||||
checkCudaErrors(cudaEventRecord(stop_event, 0));
|
||||
checkCudaErrors(cudaEventSynchronize(stop_event));
|
||||
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
|
||||
|
||||
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
|
||||
printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
|
||||
printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
|
||||
|
||||
bool bTestResult = (a[0] > total_clocks);
|
||||
|
||||
// release resources
|
||||
for (int i = 0; i < nkernels; i++)
|
||||
{
|
||||
// CHECK: hipStreamDestroy(streams[i]);
|
||||
// CHECK: hipEventDestroy(kernelEvent[i]);
|
||||
cudaStreamDestroy(streams[i]);
|
||||
cudaEventDestroy(kernelEvent[i]);
|
||||
}
|
||||
|
||||
free(streams);
|
||||
free(kernelEvent);
|
||||
|
||||
// CHECK: hipEventDestroy(start_event);
|
||||
// CHECK: hipEventDestroy(stop_event);
|
||||
// CHECK: hipHostFree(a);
|
||||
// CHECK: hipFree(d_a);
|
||||
cudaEventDestroy(start_event);
|
||||
cudaEventDestroy(stop_event);
|
||||
cudaFreeHost(a);
|
||||
cudaFree(d_a);
|
||||
|
||||
if (!bTestResult)
|
||||
{
|
||||
printf("Test failed!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
printf("Test passed\n");
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include<cuda.h>
|
||||
#include<cuda_runtime.h>
|
||||
#include<iostream>
|
||||
#include<unistd.h>
|
||||
#include<stdio.h>
|
||||
#include<malloc.h>
|
||||
|
||||
#define LEN 1024
|
||||
#define SIZE LEN * sizeof(float)
|
||||
#define ITER 1024*1024
|
||||
|
||||
// CHECK: if(status != hipSuccess) {
|
||||
#define check(msg, status){ \
|
||||
if(status != cudaSuccess) { \
|
||||
printf("%s failed. \n", #msg); \
|
||||
} \
|
||||
}
|
||||
|
||||
__global__ void Inc1(float *Ad, float *Bd){
|
||||
// CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx < 1 ){
|
||||
for(int i=0;i<ITER;i++){
|
||||
Ad[tx] = Ad[tx] + 1.0f;
|
||||
for(int j=0;j<256;j++){
|
||||
Bd[tx] = Ad[tx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Inc2(float *Ad, float *Bd){
|
||||
// CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx < 1024){
|
||||
for(int i=0;i<ITER;i++){
|
||||
Ad[tx] = Ad[tx] + 1.0f;
|
||||
for(int j=0;j<256;j++){
|
||||
Bd[tx] = Ad[tx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(){
|
||||
float *A, *Ad, *Bd;
|
||||
A = new float[LEN];
|
||||
for(int i=0;i<LEN;i++){
|
||||
A[i] = 0.0f;
|
||||
}
|
||||
|
||||
// CHECK: hipError_t status;
|
||||
cudaError_t status;
|
||||
|
||||
// CHECK: status = hipHostRegister(A, SIZE, hipHostRegisterMapped);
|
||||
status = cudaHostRegister(A, SIZE, cudaHostRegisterMapped);
|
||||
check("Registering A",status);
|
||||
|
||||
// CHECK: hipHostGetDevicePointer(&Ad, A, 0);
|
||||
cudaHostGetDevicePointer(&Ad, A, 0);
|
||||
|
||||
// CHECK: hipMalloc((void**) &Bd, SIZE);
|
||||
cudaMalloc((void**) &Bd, SIZE);
|
||||
dim3 dimGrid(LEN/512,1,1);
|
||||
dim3 dimBlock(512,1,1);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(Inc1, dim3(dimGrid), dim3(dimBlock), 0, 0, Ad, Bd);
|
||||
Inc1<<<dimGrid, dimBlock>>>(Ad, Bd);
|
||||
sleep(3);
|
||||
A[0] = -(ITER*1.0f);
|
||||
std::cout<<"Same cache line before completion: \t"<< A[0]<<std::endl;
|
||||
|
||||
// CHECK: hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
std::cout<<"Same cache line after completion: \t"<< A[0]<<std::endl;
|
||||
|
||||
for(int i=0;i<LEN;i++){
|
||||
A[i] = 0.0f;
|
||||
}
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(Inc2, dim3(dimGrid), dim3(dimBlock), 0, 0, Ad, Bd);
|
||||
Inc2<<<dimGrid, dimBlock>>>(Ad, Bd);
|
||||
sleep(3);
|
||||
A[0] = -(ITER*1.0f);
|
||||
std::cout<<"Diff cache line before completion: \t"<<A[0]<<std::endl;
|
||||
|
||||
// CHECK: hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
std::cout<<"Diff cache line after completion: \t"<<A[0]<<std::endl;
|
||||
}
|
||||
@@ -46,3 +46,6 @@ if obj_root is not None:
|
||||
|
||||
config.substitutions.append(("hipify", obj_root+"/hipify-clang"))
|
||||
|
||||
# Clang args for CUDA...
|
||||
config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30 -isystem%s/samples/common/inc" % (config.cuda_root, config.cuda_root)))
|
||||
config.substitutions.append(("%run_test", config.test_source_root + "/run_test.sh"))
|
||||
|
||||
@@ -2,6 +2,7 @@ import sys
|
||||
|
||||
config.llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@"
|
||||
config.obj_root = "@BINARY_DIR@"
|
||||
config.cuda_root = "@CUDA_TOOLKIT_ROOT_DIR@"
|
||||
|
||||
# Support substitution of the tools and libs dirs with user parameters. This is
|
||||
# used when we can't determine the tool dir at configuration time.
|
||||
|
||||
Spustitelný soubor
+28
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -o errexit
|
||||
|
||||
# Run a single LIT test file in a magical way that preserves colour output, to work around
|
||||
# a known flaw in lit.
|
||||
|
||||
# Capture lit substitutions
|
||||
HIPIFY=$1
|
||||
IN_FILE=$2
|
||||
TMP_FILE=$3
|
||||
shift 3
|
||||
|
||||
# Remaining args are the ones to forward to clang proper.
|
||||
|
||||
# Time for the classic insane little trick for making colour output work.
|
||||
# A self-deleting shell-script that does the thing we want to do...
|
||||
TMP_SCRIPT=$(mktemp)
|
||||
cat << EOF > $TMP_SCRIPT
|
||||
set -o errexit
|
||||
set -o xtrace
|
||||
rm $TMP_SCRIPT
|
||||
$HIPIFY -o=$TMP_FILE $IN_FILE -- $@ && cat $TMP_FILE | sed -Ee 's|//.+|// |g' | FileCheck $IN_FILE
|
||||
EOF
|
||||
chmod a+x $TMP_SCRIPT
|
||||
|
||||
# Run the script via socat, spawning a virtual terminal and propagating exit code, and hence failure.
|
||||
socat -du EXEC:$TMP_SCRIPT,pty,stderr STDOUT
|
||||
@@ -0,0 +1,114 @@
|
||||
// RUN: %run_test hipify "%s" "%t" %cuda_args
|
||||
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
cudaError_t error = cmd;\
|
||||
if (error != cudaSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
}\
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Square each element in the array A and write to array C.
|
||||
*/
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vector_square(T *C_d, const T *A_d, size_t N)
|
||||
{
|
||||
// CHECK: size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
// CHECK: size_t stride = hipBlockDim_x * hipGridDim_x;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
float *A_d, *C_d;
|
||||
float *A_h, *C_h;
|
||||
size_t N = 1000000;
|
||||
size_t Nbytes = N * sizeof(float);
|
||||
|
||||
// CHECK: hipDeviceProp_t props;
|
||||
cudaDeviceProp props;
|
||||
|
||||
// CHECK: CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
|
||||
CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
|
||||
printf ("info: running on device %s\n", props.name);
|
||||
|
||||
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
A_h = (float*)malloc(Nbytes);
|
||||
|
||||
// CHECK: CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
|
||||
C_h = (float*)malloc(Nbytes);
|
||||
|
||||
// CHECK: CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
|
||||
// Fill with Phi + i
|
||||
for (size_t i=0; i<N; i++)
|
||||
{
|
||||
A_h[i] = 1.618f + i;
|
||||
}
|
||||
|
||||
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
// CHECK: CHECK(hipMalloc(&A_d, Nbytes));
|
||||
// CHECK: CHECK(hipMalloc(&C_d, Nbytes));
|
||||
CHECK(cudaMalloc(&A_d, Nbytes));
|
||||
CHECK(cudaMalloc(&C_d, Nbytes));
|
||||
|
||||
|
||||
printf ("info: copy Host2Device\n");
|
||||
// CHECK: CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
CHECK ( cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));
|
||||
|
||||
const unsigned blocks = 512;
|
||||
const unsigned threadsPerBlock = 256;
|
||||
|
||||
printf ("info: launch 'vector_square' kernel\n");
|
||||
// CHECK: hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
|
||||
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
|
||||
|
||||
printf ("info: copy Device2Host\n");
|
||||
// CHECK: CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
printf ("info: check result\n");
|
||||
for (size_t i=0; i<N; i++) {
|
||||
if (C_h[i] != A_h[i] * A_h[i]) {
|
||||
// CHECK: CHECK(hipErrorUnknown);
|
||||
CHECK(cudaErrorUnknown);
|
||||
}
|
||||
}
|
||||
printf ("PASSED!\n");
|
||||
}
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele