Files
rocm-systems/tests/src/hipMemcpyAsync.cpp
T
2016-02-17 21:22:07 -06:00

150 wiersze
4.6 KiB
C++

// Test under-development. Calls async mem-copy API, experiment with functionality.
#include "hip_runtime.h"
#include "test_common.h"
unsigned p_streams = 2;
void simpleNegTest()
{
printf ("testing: %s\n",__func__);
hipError_t e;
float *A_malloc, *A_pinned, *A_d;
size_t Nbytes = N*sizeof(float);
A_malloc = (float*)malloc(Nbytes);
HIPCHECK(hipMallocHost(&A_pinned, Nbytes));
HIPCHECK(hipMalloc(&A_d, Nbytes));
// Can't use default with async copy
e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL);
HIPASSERT (e==hipErrorInvalidMemcpyDirection);
// Not sure what happens here, the memory must be pinned.
e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL);
printf (" async memcpy of A_malloc to A_d. Result=%d\n", e);
//HIPASSERT (e==hipErrorInvalidValue);
}
//---
//Classic example showing how to overlap data transfer with compute.
//We divide the work into "chunks" and create a stream for each chunk.
//Each chunk then runs a H2D copy, followed by kernel execution, followed by D2H copyback.
//Work in separate streams is independent which enables concurrency.
// IN: nStreams : number of streams to use for the test
// IN :useNullStream - use NULL stream. Synchronizes everything.
// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D
// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H
void chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H)
{
size_t Nbytes = N*sizeof(int);
printf ("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ",__func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H);
printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0);
int *A_d, *B_d, *C_d;
int *A_h, *B_h, *C_h;
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
if (useNullStream) {
nStreams = 1;
stream[0] = NULL;
} else {
for (int i = 0; i < nStreams; ++i) {
HIPCHECK (hipStreamCreate(&stream[i]));
}
}
size_t workLeft = N;
size_t workPerStream = N / nStreams;
for (int i = 0; i < nStreams; ++i) {
size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
size_t workBytes = work * sizeof(int);
size_t offset = i*workPerStream;
if (useSyncMemcpyH2D) {
HIPCHECK ( hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice));
} else {
HIPCHECK ( hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i]));
HIPCHECK ( hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i]));
};
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work);
if (useSyncMemcpyD2H) {
HIPCHECK ( hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost));
} else {
HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
}
}
HIPCHECK (hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true);
};
//---
//Parse arguments specific to this test.
void parseMyArguments(int argc, char *argv[])
{
int more_argc = HipTest::parseStandardArguments(argc, argv, false);
// parse args for this test:
for (int i = 1; i < more_argc; i++) {
const char *arg = argv[i];
if (!strcmp(arg, "--streams")) {
if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
failed("Bad streams argument");
}
} else {
failed("Bad argument '%s'", arg);
}
};
};
int main(int argc, char *argv[])
{
HipTest::parseStandardArguments(argc, argv, true);
parseMyArguments(argc, argv);
printf ("info: set device to %d\n", p_gpuDevice);
HIPCHECK(hipSetDevice(p_gpuDevice));
simpleNegTest();
chunkedAsyncExample(p_streams, true, true, true); // Easy sync version
chunkedAsyncExample(p_streams, false, true, true); // Easy sync version
chunkedAsyncExample(p_streams, false, false, true); // Some async
chunkedAsyncExample(p_streams, false, false, false); // All async
passed();
}