524 baris
20 KiB
C++
524 baris
20 KiB
C++
/*
|
|
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#include <hip_test_kernels.hh>
|
|
#include <hip_test_checkers.hh>
|
|
#include <hip_test_common.hh>
|
|
#define GRIDSIZE 512
|
|
#define BLOCKSIZE 256
|
|
#define NUM_OF_STREAM 3
|
|
#define THREADS_PER_BLOCK 512
|
|
#define GRAPH_LAUNCH_ITERATIONS 1000
|
|
|
|
static __global__ void reduce(float* d_in, double* d_out) {
|
|
int myId = threadIdx.x + blockDim.x * blockIdx.x;
|
|
int tid = threadIdx.x;
|
|
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
|
|
if (tid < s) {
|
|
d_in[myId] += d_in[myId + s];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
if (tid == 0) {
|
|
d_out[blockIdx.x] = d_in[myId];
|
|
}
|
|
}
|
|
static __global__ void reduceFinal(double* d_in, double* d_out) {
|
|
int myId = threadIdx.x + blockDim.x * blockIdx.x;
|
|
int tid = threadIdx.x;
|
|
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
|
|
if (tid < s) {
|
|
d_in[myId] += d_in[myId + s];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
if (tid == 0) {
|
|
*d_out = d_in[myId];
|
|
}
|
|
}
|
|
|
|
static void init_input(float* a, size_t size) {
|
|
unsigned int seed = time(nullptr);
|
|
for (size_t i = 0; i < size; i++) {
|
|
a[i] = (HipTest::RAND_R(&seed) & 0xFF) / static_cast<float>(RAND_MAX);
|
|
}
|
|
}
|
|
|
|
static bool gPassed = true;
|
|
static void* gusrptr;
|
|
static void* ptr0xff = reinterpret_cast<void*>(0xffffffff);
|
|
static size_t NSize = GRIDSIZE * BLOCKSIZE;
|
|
static size_t Nbytes = NSize * sizeof(float);
|
|
|
|
typedef struct userDataStruct {
|
|
float* A_h;
|
|
float* C_h;
|
|
float* A_d;
|
|
float* C_d;
|
|
bool isPassed;
|
|
bool isOpCompleted;
|
|
} usrDataS;
|
|
|
|
// Common callback function.
|
|
static void Fn_validateSq(void* userData) {
|
|
REQUIRE(userData != nullptr);
|
|
usrDataS* ptrUsrData = reinterpret_cast<usrDataS*>(userData);
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
if (ptrUsrData->C_h[i] != (ptrUsrData->A_h[i] * ptrUsrData->A_h[i])) {
|
|
ptrUsrData->isPassed = false;
|
|
return;
|
|
}
|
|
}
|
|
ptrUsrData->isPassed = true;
|
|
}
|
|
|
|
// Test scenario 1
|
|
// simple scenario that validates passing userData to host function.
|
|
static void Fn_ChkUserdataPtr(void* userData) {
|
|
gPassed = true;
|
|
if (gusrptr != userData) {
|
|
gPassed = false;
|
|
}
|
|
}
|
|
|
|
TEST_CASE("Unit_hipLaunchHostFunc_basic") {
|
|
hipStream_t mystream;
|
|
HIP_CHECK(hipStreamCreate(&mystream));
|
|
gusrptr = ptr0xff;
|
|
gPassed = true;
|
|
HIP_CHECK(hipLaunchHostFunc(mystream, Fn_ChkUserdataPtr, gusrptr));
|
|
HIP_CHECK(hipStreamSynchronize(mystream));
|
|
HIP_CHECK(hipStreamDestroy(mystream));
|
|
REQUIRE(gPassed);
|
|
}
|
|
|
|
// Negative test scenario for hipLaunchHostFunc
|
|
TEST_CASE("Unit_hipLaunchHostFunc_Negative") {
|
|
hipStream_t mystream;
|
|
HIP_CHECK(hipStreamCreate(&mystream));
|
|
|
|
SECTION("Pass nullptr as function") {
|
|
REQUIRE(hipLaunchHostFunc(mystream, nullptr, 0) == hipErrorInvalidValue);
|
|
}
|
|
HIP_CHECK(hipStreamDestroy(mystream));
|
|
}
|
|
|
|
// Local Function
|
|
static void launchOperationOnStrm(usrDataS* usrDataptr, hipStream_t stream) {
|
|
usrDataptr->isPassed = false;
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr->A_d)), Nbytes, stream));
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr->C_d)), Nbytes, stream));
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr->A_d, usrDataptr->A_h, Nbytes, hipMemcpyHostToDevice, stream));
|
|
hipLaunchKernelGGL((HipTest::vector_square), dim3(GRIDSIZE), dim3(BLOCKSIZE), 0, stream,
|
|
usrDataptr->A_d, usrDataptr->C_d, NSize);
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr->C_h, usrDataptr->C_d, Nbytes, hipMemcpyDeviceToHost, stream));
|
|
HIP_CHECK(hipLaunchHostFunc(stream, Fn_validateSq, reinterpret_cast<void*>(usrDataptr)));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr->A_d), stream));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr->C_d), stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
REQUIRE(usrDataptr->isPassed);
|
|
}
|
|
|
|
// Test scenario 2
|
|
// scenario that validates the host launch function on 3 different streams,
|
|
// created stream, default/null stream and hipStreamPerThread.
|
|
TEST_CASE("Unit_hipLaunchHostFunc_streams") {
|
|
hipStream_t stream[NUM_OF_STREAM];
|
|
HIP_CHECK(hipStreamCreate(&stream[0]));
|
|
stream[1] = 0; // Null stream
|
|
stream[2] = hipStreamPerThread;
|
|
usrDataS* usrDataptr = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr != nullptr);
|
|
usrDataptr->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->A_h != nullptr);
|
|
usrDataptr->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->C_h != nullptr);
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr->A_h[i] = 21.0f;
|
|
}
|
|
for (int idx = 0; idx < NUM_OF_STREAM; idx++) {
|
|
launchOperationOnStrm(usrDataptr, stream[idx]);
|
|
}
|
|
HIP_CHECK(hipStreamDestroy(stream[0]));
|
|
free(usrDataptr->A_h);
|
|
free(usrDataptr->C_h);
|
|
free(usrDataptr);
|
|
}
|
|
|
|
// Test scenario 3
|
|
// test case to validate hipLaunchHostFunc for multi stream scenario.
|
|
// create 2 different streams and call hipLaunchHostFunc, stream synchronize.
|
|
static void Fn_validateMul_stream(void* userData) {
|
|
REQUIRE(userData != nullptr);
|
|
usrDataS* ptrUsrData = reinterpret_cast<usrDataS*>(userData);
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
if (ptrUsrData->C_h[i] != (ptrUsrData->A_h[i] * ptrUsrData->A_h[i])) {
|
|
ptrUsrData->isPassed = false;
|
|
return;
|
|
}
|
|
}
|
|
ptrUsrData->isPassed = true;
|
|
}
|
|
|
|
TEST_CASE("Unit_hipLaunchHostFunc_multistreams") {
|
|
hipStream_t mystream1, mystream2;
|
|
HIP_CHECK(hipStreamCreateWithFlags(&mystream1, hipStreamNonBlocking));
|
|
HIP_CHECK(hipStreamCreateWithFlags(&mystream2, hipStreamNonBlocking));
|
|
usrDataS* usrDataptr1 = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr1 != nullptr);
|
|
usrDataS* usrDataptr2 = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr2 != nullptr);
|
|
usrDataptr1->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr1->A_h != nullptr);
|
|
usrDataptr1->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr1->C_h != nullptr);
|
|
// input data
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr1->A_h[i] = 11.0f;
|
|
}
|
|
usrDataptr1->isPassed = false;
|
|
usrDataptr2->isPassed = false;
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr1->A_d)), Nbytes, mystream1));
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr1->C_d)), Nbytes, mystream1));
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr1->A_d, usrDataptr1->A_h, Nbytes, hipMemcpyHostToDevice, mystream1));
|
|
const unsigned blocks = GRIDSIZE;
|
|
const unsigned threadsPerBlock = BLOCKSIZE;
|
|
hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), dim3(threadsPerBlock), 0, mystream1,
|
|
usrDataptr1->A_d, usrDataptr1->C_d, NSize);
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr1->C_h, usrDataptr1->C_d, Nbytes, hipMemcpyDeviceToHost, mystream1));
|
|
HIP_CHECK(
|
|
hipLaunchHostFunc(mystream1, Fn_validateMul_stream, reinterpret_cast<void*>(usrDataptr1)));
|
|
// launch kernel function for mystream2
|
|
usrDataptr2->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr2->A_h != nullptr);
|
|
usrDataptr2->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr2->C_h != nullptr);
|
|
// input data
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr2->A_h[i] = 9.0f;
|
|
}
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr2->A_d)), Nbytes, mystream2));
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr2->C_d)), Nbytes, mystream2));
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr2->A_d, usrDataptr2->A_h, Nbytes, hipMemcpyHostToDevice, mystream2));
|
|
hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), dim3(threadsPerBlock), 0, mystream2,
|
|
usrDataptr2->A_d, usrDataptr2->C_d, NSize);
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr2->C_h, usrDataptr2->C_d, Nbytes, hipMemcpyDeviceToHost, mystream2));
|
|
HIP_CHECK(
|
|
hipLaunchHostFunc(mystream2, Fn_validateMul_stream, reinterpret_cast<void*>(usrDataptr2)));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr1->A_d), mystream1));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr1->C_d), mystream1));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr2->A_d), mystream2));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr2->C_d), mystream2));
|
|
HIP_CHECK(hipStreamSynchronize(mystream1));
|
|
HIP_CHECK(hipStreamSynchronize(mystream2));
|
|
HIP_CHECK(hipStreamDestroy(mystream1));
|
|
HIP_CHECK(hipStreamDestroy(mystream2));
|
|
REQUIRE(usrDataptr1->isPassed);
|
|
REQUIRE(usrDataptr2->isPassed);
|
|
free(usrDataptr1->A_h);
|
|
free(usrDataptr1->C_h);
|
|
free(usrDataptr2->A_h);
|
|
free(usrDataptr2->C_h);
|
|
free(usrDataptr2);
|
|
free(usrDataptr1);
|
|
}
|
|
|
|
// Test scenario 4
|
|
// test case to validate hipLaunchHostFunc for the kernel,
|
|
// validate hipLaunchHostFunc after kernel launch.
|
|
static void Fn_Completion_state(void* userData) {
|
|
REQUIRE(userData != nullptr);
|
|
usrDataS* ptrUsrData = reinterpret_cast<usrDataS*>(userData);
|
|
ptrUsrData->isOpCompleted = true;
|
|
}
|
|
|
|
TEST_CASE("Unit_hipLaunchHostFunc_KernelHost") {
|
|
hipStream_t stream1, stream2, stream3;
|
|
HIP_CHECK(hipStreamCreate(&stream1));
|
|
HIP_CHECK(hipStreamCreate(&stream2));
|
|
HIP_CHECK(hipStreamCreate(&stream3));
|
|
usrDataS* usrDataptr = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr != nullptr);
|
|
usrDataptr->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->A_h != nullptr);
|
|
usrDataptr->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->C_h != nullptr);
|
|
// input data
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr->A_h[i] = 7.0f;
|
|
}
|
|
usrDataptr->isOpCompleted = false;
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr->A_d)), Nbytes, stream1));
|
|
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&(usrDataptr->C_d)), Nbytes, stream1));
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr->A_d, usrDataptr->A_h, Nbytes, hipMemcpyHostToDevice, stream1));
|
|
HIP_CHECK(hipLaunchHostFunc(stream1, Fn_Completion_state, reinterpret_cast<void*>(usrDataptr)));
|
|
while (!usrDataptr->isOpCompleted) {
|
|
std::this_thread::sleep_for(std::chrono::microseconds(100000));
|
|
} // Sleep for 100 ms*/
|
|
usrDataptr->isOpCompleted = false;
|
|
const unsigned blocks = GRIDSIZE;
|
|
const unsigned threadsPerBlock = BLOCKSIZE;
|
|
hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks), dim3(threadsPerBlock), 0, stream2,
|
|
usrDataptr->A_d, usrDataptr->C_d, NSize);
|
|
HIP_CHECK(hipLaunchHostFunc(stream2, Fn_Completion_state, reinterpret_cast<void*>(usrDataptr)));
|
|
while (!usrDataptr->isOpCompleted) {
|
|
std::this_thread::sleep_for(std::chrono::microseconds(100000));
|
|
} // Sleep for 100 ms*/
|
|
usrDataptr->isOpCompleted = false;
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(usrDataptr->C_h, usrDataptr->C_d, Nbytes, hipMemcpyDeviceToHost, stream3));
|
|
HIP_CHECK(hipLaunchHostFunc(stream2, Fn_Completion_state, reinterpret_cast<void*>(usrDataptr)));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr->A_d), stream3));
|
|
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(usrDataptr->C_d), stream3));
|
|
while (!usrDataptr->isOpCompleted) {
|
|
std::this_thread::sleep_for(std::chrono::microseconds(100000));
|
|
} // Sleep for 100 ms*/
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
if (usrDataptr->C_h[i] != (usrDataptr->A_h[i] * usrDataptr->A_h[i])) {
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
HIP_CHECK(hipStreamSynchronize(stream3));
|
|
HIP_CHECK(hipStreamDestroy(stream3));
|
|
HIP_CHECK(hipStreamDestroy(stream2));
|
|
HIP_CHECK(hipStreamDestroy(stream1));
|
|
free(usrDataptr->A_h);
|
|
free(usrDataptr->C_h);
|
|
free(usrDataptr);
|
|
}
|
|
|
|
// Test scenario 5
|
|
// scenario that validates the host launch function on multi device
|
|
// environment.
|
|
TEST_CASE("Unit_hipLaunchHostFunc_multidevice", "[multigpu]") {
|
|
int num_devices;
|
|
HIP_CHECK(hipGetDeviceCount(&num_devices));
|
|
if (num_devices < 2) {
|
|
SUCCEED("Skipping the testcases as numDevices < 2");
|
|
return;
|
|
}
|
|
usrDataS* usrDataptr = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr != nullptr);
|
|
usrDataptr->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->A_h != nullptr);
|
|
usrDataptr->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->C_h != nullptr);
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr->A_h[i] = 21.0f;
|
|
}
|
|
for (int dev = 0; dev < num_devices; dev++) {
|
|
HIP_CHECK(hipSetDevice(dev));
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
launchOperationOnStrm(usrDataptr, stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
free(usrDataptr->A_h);
|
|
free(usrDataptr->C_h);
|
|
free(usrDataptr);
|
|
}
|
|
|
|
// Test scenario 6
|
|
// scenario that validates the host launch function on created
|
|
// stream with same priority.
|
|
TEST_CASE("Unit_hipLaunchHostFunc_Samepriority") {
|
|
int priority = 0;
|
|
unsigned int flags = 0;
|
|
usrDataS* usrDataptr = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr != nullptr);
|
|
usrDataptr->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->A_h != nullptr);
|
|
usrDataptr->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->C_h != nullptr);
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr->A_h[i] = 21.0f;
|
|
}
|
|
for (int idx = 0; idx < NUM_OF_STREAM; idx++) {
|
|
hipStream_t stream[NUM_OF_STREAM];
|
|
HIP_CHECK(hipStreamCreateWithPriority(&stream[idx], flags, priority));
|
|
launchOperationOnStrm(usrDataptr, stream[idx]);
|
|
HIP_CHECK(hipStreamDestroy(stream[idx]));
|
|
}
|
|
free(usrDataptr->A_h);
|
|
free(usrDataptr->C_h);
|
|
free(usrDataptr);
|
|
}
|
|
|
|
// Test scenario 7
|
|
// scenario that validates the host launch function on
|
|
// created stream with different priority.
|
|
TEST_CASE("Unit_hipLaunchHostFunc_Diffpriority") {
|
|
int priority;
|
|
int priority_low{};
|
|
int priority_high{};
|
|
unsigned int flags = 0;
|
|
HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_low, &priority_high));
|
|
int numOfPriorities = priority_low - priority_high;
|
|
const float arr_size = numOfPriorities + 1;
|
|
hipStream_t* stream = reinterpret_cast<hipStream_t*>(malloc(arr_size * sizeof(hipStream_t)));
|
|
stream[0] = 0;
|
|
int count = 1;
|
|
// Create a stream for each of the priority levels
|
|
for (priority = priority_high; priority < priority_low; priority++) {
|
|
HIP_CHECK(hipStreamCreateWithPriority(&stream[count++], flags, priority));
|
|
}
|
|
usrDataS* usrDataptr = reinterpret_cast<usrDataS*>(malloc(sizeof(usrDataS)));
|
|
REQUIRE(usrDataptr != nullptr);
|
|
usrDataptr->A_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->A_h != nullptr);
|
|
usrDataptr->C_h = reinterpret_cast<float*>(malloc(Nbytes));
|
|
REQUIRE(usrDataptr->C_h != nullptr);
|
|
for (size_t i = 0; i < NSize; i++) {
|
|
usrDataptr->A_h[i] = 11.0f;
|
|
}
|
|
for (int idx = 0; idx < arr_size; idx++) {
|
|
launchOperationOnStrm(usrDataptr, stream[idx]);
|
|
}
|
|
count = 1;
|
|
for (priority = priority_high; priority < priority_low; priority++) {
|
|
HIP_CHECK(hipStreamDestroy(stream[count++]));
|
|
}
|
|
free(stream);
|
|
free(usrDataptr->A_h);
|
|
free(usrDataptr->C_h);
|
|
free(usrDataptr);
|
|
}
|
|
|
|
// Test scenario 8
|
|
// create a graph by using hipGraphsUsingStreamCapture and call host function.
|
|
|
|
typedef struct callBackData {
|
|
const char* fn_name;
|
|
double* data;
|
|
} callBackData_t;
|
|
double result_gpu = 0.0;
|
|
void myHostNodeCallback(void* data) {
|
|
static int iter = 0;
|
|
iter++;
|
|
// Check status of GPU after stream operations are done
|
|
callBackData_t* tmp = reinterpret_cast<callBackData_t*>(data);
|
|
// checkCudaErrors(tmp->status);
|
|
double* result = reinterpret_cast<double*>(tmp->data);
|
|
const char* function = reinterpret_cast<const char*>(tmp->fn_name);
|
|
if (iter == GRAPH_LAUNCH_ITERATIONS)
|
|
printf("[%s] Host callback final reduced sum = %lf\n", function, *result);
|
|
result_gpu = *result;
|
|
*result = 0.0; // reset the result
|
|
}
|
|
|
|
TEST_CASE("Unit_hipLaunchHostFunc_Graph") {
|
|
size_t size = 1 << 12;
|
|
size_t maxBlocks = 512;
|
|
float *inputVec_d = NULL, *inputVec_h = NULL;
|
|
double *outputVec_d = NULL, *result_d;
|
|
inputVec_h = reinterpret_cast<float*>(malloc(sizeof(float) * size));
|
|
HIP_CHECK(hipMalloc(&inputVec_d, sizeof(float) * size));
|
|
HIP_CHECK(hipMalloc(&outputVec_d, sizeof(double) * maxBlocks));
|
|
HIP_CHECK(hipMalloc(&result_d, sizeof(double)));
|
|
init_input(inputVec_h, size);
|
|
hipStream_t stream1, stream2, stream3, streamForGraph;
|
|
hipEvent_t forkStreamEvent, memsetEvent1, memsetEvent2;
|
|
hipGraph_t graph;
|
|
double result_h = 0.0;
|
|
HIP_CHECK(hipStreamCreate(&stream1));
|
|
HIP_CHECK(hipStreamCreate(&stream2));
|
|
HIP_CHECK(hipStreamCreate(&stream3));
|
|
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
|
HIP_CHECK(hipEventCreate(&forkStreamEvent));
|
|
HIP_CHECK(hipEventCreate(&memsetEvent1));
|
|
HIP_CHECK(hipEventCreate(&memsetEvent2));
|
|
auto start = std::chrono::high_resolution_clock::now();
|
|
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
|
|
HIP_CHECK(hipEventRecord(forkStreamEvent, stream1));
|
|
HIP_CHECK(hipStreamWaitEvent(stream2, forkStreamEvent, 0));
|
|
HIP_CHECK(hipStreamWaitEvent(stream3, forkStreamEvent, 0));
|
|
HIP_CHECK(
|
|
hipMemcpyAsync(inputVec_d, inputVec_h, sizeof(float) * size, hipMemcpyDefault, stream1));
|
|
HIP_CHECK(hipMemsetAsync(outputVec_d, 0, sizeof(double) * maxBlocks, stream2));
|
|
HIP_CHECK(hipEventRecord(memsetEvent1, stream2));
|
|
HIP_CHECK(hipMemsetAsync(result_d, 0, sizeof(double), stream3));
|
|
HIP_CHECK(hipEventRecord(memsetEvent2, stream3));
|
|
HIP_CHECK(hipStreamWaitEvent(stream1, memsetEvent1, 0));
|
|
hipLaunchKernelGGL(reduce, dim3(size / THREADS_PER_BLOCK, 1, 1), dim3(THREADS_PER_BLOCK, 1, 1), 0,
|
|
stream1, inputVec_d, outputVec_d);
|
|
HIP_CHECK(hipStreamWaitEvent(stream1, memsetEvent2, 0));
|
|
hipLaunchKernelGGL(reduceFinal, dim3(1, 1, 1), dim3(THREADS_PER_BLOCK, 1, 1), 0, stream1,
|
|
outputVec_d, result_d);
|
|
HIP_CHECK(hipMemcpyAsync(&result_h, result_d, sizeof(double), hipMemcpyDefault, stream1));
|
|
|
|
callBackData_t hostFnData;
|
|
hostFnData.data = &result_h;
|
|
hostFnData.fn_name = "hipGraphsUsingStreamCapture";
|
|
hipHostFn_t fn = myHostNodeCallback;
|
|
HIP_CHECK(hipLaunchHostFunc(stream1, fn, &hostFnData));
|
|
|
|
HIP_CHECK(hipStreamEndCapture(stream1, &graph));
|
|
hipGraphNode_t* nodes = NULL;
|
|
size_t numNodes = 0;
|
|
HIP_CHECK(hipGraphGetNodes(graph, nodes, &numNodes));
|
|
printf(
|
|
"\nNum of nodes in the graph created using stream"
|
|
"capture API = %zu\n",
|
|
numNodes);
|
|
HIP_CHECK(hipGraphGetRootNodes(graph, nodes, &numNodes));
|
|
printf("root nodes in the graph created using stream capture API = %zu\n", numNodes);
|
|
hipGraphExec_t graphExec;
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
|
|
auto start1 = std::chrono::high_resolution_clock::now();
|
|
for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
|
|
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
|
}
|
|
HIP_CHECK(hipStreamSynchronize(streamForGraph));
|
|
auto stop = std::chrono::high_resolution_clock::now();
|
|
auto WithInit = std::chrono::duration<double, std::milli>(stop - start);
|
|
auto WithoutInit = std::chrono::duration<double, std::milli>(stop - start1);
|
|
std::cout << "Time taken for hipGraphsUsingStreamCapture with Init: "
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(WithInit).count()
|
|
<< " milliseconds without Init:"
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(WithoutInit).count()
|
|
<< " milliseconds " << std::endl;
|
|
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipEventDestroy(forkStreamEvent));
|
|
HIP_CHECK(hipEventDestroy(memsetEvent1));
|
|
HIP_CHECK(hipEventDestroy(memsetEvent2));
|
|
HIP_CHECK(hipStreamDestroy(stream1));
|
|
HIP_CHECK(hipStreamDestroy(stream2));
|
|
HIP_CHECK(hipStreamDestroy(stream3));
|
|
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
|
double result_h_cpu = 0.0;
|
|
for (size_t i = 0; i < size; i++) {
|
|
result_h_cpu += inputVec_h[i];
|
|
}
|
|
REQUIRE(result_h_cpu == result_gpu);
|
|
HIP_CHECK(hipFree(inputVec_d));
|
|
HIP_CHECK(hipFree(outputVec_d));
|
|
HIP_CHECK(hipFree(result_d));
|
|
free(inputVec_h);
|
|
}
|