2
0
Ficheiros
rocm-systems/projects/hip-tests/catch/unit/memory/hipMallocAsync.cc
T

654 linhas
23 KiB
C++

/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mempool_common.hh"
#include <limits>
#pragma clang diagnostic ignored "-Wunused-parameter"
static bool thread_results[NUMBER_OF_THREADS];
static constexpr auto NUM_ELM{1024 * 1024};
static constexpr int streamPerAsic = 2;
/**
* @addtogroup hipMallocAsync hipMallocAsync
* @{
* @ingroup StreamOTest
* `hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream)`
* - Allocates memory with stream ordered semantics
*/
/**
* Test Description
* ------------------------
* - Basic test to verify proper allocation and stream ordering of hipMallocAsync when one
* memory allocation is performed.
* Test source
* ------------------------
* - /unit/memory/hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Basic_OneAlloc") {
MallocMemPoolAsync_OneAlloc(
[](void** dev_ptr, size_t size, hipMemPool_t mem_pool, hipStream_t stream) {
return hipMallocAsync(dev_ptr, size, stream);
},
MemPools::dev_default);
}
/**
* Test Description
* ------------------------
* - Basic test to verify proper allocation and stream ordering of hipMallocAsync when two
* memory allocations are performed.
* Test source
* ------------------------
* - /unit/memory/hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Basic_TwoAllocs") {
MallocMemPoolAsync_TwoAllocs(
[](void** dev_ptr, size_t size, hipMemPool_t mem_pool, hipStream_t stream) {
return hipMallocAsync(dev_ptr, size, stream);
},
MemPools::dev_default);
}
/**
* Test Description
* ------------------------
* - Basic test to verify that memory allocated with hipMallocAsync can be properly reused.
* Test source
* ------------------------
* - /unit/memory/hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Basic_Reuse") {
MallocMemPoolAsync_Reuse([](void** dev_ptr, size_t size, hipMemPool_t mem_pool,
hipStream_t stream) { return hipMallocAsync(dev_ptr, size, stream); },
MemPools::dev_default);
}
/**
* Test Description
* ------------------------
* - Test to verify hipMallocAsync behavior with invalid arguments:
* -# Nullptr dev_ptr
* -# Invalid stream handle
* -# Size is max size_t
*
* Test source
* ------------------------
* - /unit/memory/hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Negative_Parameters") {
int device_id = 0;
HIP_CHECK(hipSetDevice(device_id));
checkMempoolSupported(0)
int* p = nullptr;
size_t max_size = std::numeric_limits<size_t>::max();
size_t alloc_size = 1024;
MemPoolGuard mempool(MemPools::dev_default, device_id);
StreamGuard stream(Streams::created);
SECTION("dev_ptr is nullptr") {
HIP_CHECK_ERROR(hipMallocAsync(nullptr, alloc_size, stream.stream()), hipErrorInvalidValue);
}
SECTION("Size is max size_t") {
HIP_CHECK_ERROR(hipMallocAsync(reinterpret_cast<void**>(&p), max_size, stream.stream()),
hipErrorOutOfMemory);
}
}
/**
* Common function to allocate memory using hipMallocAsync API through a stream,
* launch kernel and perform vectorADD and validate results. Free memory using
* hipFreeAsync.
*/
static bool checkMallocAsync(hipStream_t stream) {
streamMemAllocTest testObj(NUM_ELM);
// Create host buffer with test data.
testObj.createHostBufferWithData();
// Allocate device memory.
testObj.allocFromDefMempool(stream);
// Transfer data to it asyncronously on stream.
testObj.transferToMempool(stream);
// Execute kernel and transfer result back to host asynchronously on stream.
testObj.runKernel(stream);
testObj.transferFromMempool(stream);
// Free Buffer Asynchronously on stream.
testObj.freeDevBuf(stream);
HIP_CHECK(hipStreamSynchronize(stream));
// verify and validate
REQUIRE(true == testObj.validateResult());
// Destroy resources
testObj.freeHostBuf();
return true;
}
/**
* Test Description
* ------------------------
* - Test case to perform basic scenario.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_basic") {
checkMempoolSupported(0)
// create a stream
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
REQUIRE(true == checkMallocAsync(stream));
HIP_CHECK(hipStreamDestroy(stream));
}
/**
* Test Description
* ------------------------
* - Test case to perform multi stream, allocate memory using
* hipMallocAsync API for a stream1 and stream2, launch kernel and
* perform vectorADD, synchronize stream1 and stream2 and validate
* results. Free memory using hipFreeAsync.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Multistream_Concurrent") {
checkMempoolSupported(0) streamMemAllocTest testObj1(NUM_ELM), testObj2(NUM_ELM);
// create multiple streams
hipStream_t stream1, stream2;
HIP_CHECK(hipStreamCreate(&stream1));
HIP_CHECK(hipStreamCreate(&stream2));
// Create host buffer with test data.
testObj1.createHostBufferWithData();
testObj2.createHostBufferWithData();
// Allocate device memory and transfer data to it asyncronously on streams.
testObj1.allocFromDefMempool(stream1);
testObj2.allocFromDefMempool(stream2);
testObj1.transferToMempool(stream1);
testObj2.transferToMempool(stream2);
// Execute kernel and transfer result back to host asynchronously on streams.
testObj1.runKernel(stream1);
testObj2.runKernel(stream2);
testObj1.transferFromMempool(stream1);
testObj2.transferFromMempool(stream2);
// Free Buffer Asynchronously on streams.
testObj1.freeDevBuf(stream1);
testObj2.freeDevBuf(stream2);
// synchronize both stream1 and stream2
HIP_CHECK(hipStreamSynchronize(stream1));
HIP_CHECK(hipStreamSynchronize(stream2));
// verify and validate
REQUIRE(true == testObj1.validateResult());
REQUIRE(true == testObj2.validateResult());
// Destroy resources
HIP_CHECK(hipStreamDestroy(stream1));
HIP_CHECK(hipStreamDestroy(stream2));
testObj1.freeHostBuf();
testObj2.freeHostBuf();
}
/**
* Test Description
* ------------------------
* - Allocate memory using hipMallocAsync API through stream1 and record event1,
* allocate event1 to stream2 and put stream2 to wait, launch kernel through
* stream2 and perform vectorADD and validate results. Free memory using hipFreeAsync.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_StreamEvent_CrissCross") {
checkMempoolSupported(0) streamMemAllocTest testObj1(NUM_ELM), testObj2(NUM_ELM);
// create two streams.
hipStream_t stream1, stream2;
HIP_CHECK(hipStreamCreate(&stream1));
HIP_CHECK(hipStreamCreate(&stream2));
// create an event
hipEvent_t event1 = nullptr, event2 = nullptr;
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
// Create host buffer with test data.
testObj1.createHostBufferWithData();
testObj2.createHostBufferWithData();
// Allocate device memory and transfer data to it asyncronously on streams.
testObj1.allocFromDefMempool(stream1);
testObj2.allocFromDefMempool(stream2);
testObj1.transferToMempool(stream1);
testObj2.transferToMempool(stream2);
// create event record
HIP_CHECK(hipEventRecord(event1, stream1));
HIP_CHECK(hipEventRecord(event2, stream2));
HIP_CHECK(hipStreamWaitEvent(stream2, event1, 0));
HIP_CHECK(hipStreamWaitEvent(stream1, event2, 0));
// Execute kernel and transfer result back to host asynchronously on streams.
testObj1.runKernel(stream2);
testObj2.runKernel(stream1);
testObj1.transferFromMempool(stream2);
testObj2.transferFromMempool(stream1);
// Free Buffer Asynchronously on streams.
testObj1.freeDevBuf(stream2);
testObj2.freeDevBuf(stream1);
// Wait for stream2.
HIP_CHECK(hipStreamSynchronize(stream2));
HIP_CHECK(hipStreamSynchronize(stream1));
// verify and validate
REQUIRE(true == testObj1.validateResult());
REQUIRE(true == testObj2.validateResult());
// Destroy resources
HIP_CHECK(hipStreamDestroy(stream1));
HIP_CHECK(hipStreamDestroy(stream2));
HIP_CHECK(hipEventDestroy(event1));
HIP_CHECK(hipEventDestroy(event2));
testObj1.freeHostBuf();
testObj2.freeHostBuf();
}
/**
* Test Description
* ------------------------
* - Test case to perform multi device scenario, get number of devices available
* and call checkMallocAsync function for each device available.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Multidevice", "[multigpu]") {
int num_devices;
HIP_CHECK(hipGetDeviceCount(&num_devices));
for (int i = 0; i < num_devices; i++) {
checkMempoolSupported(i) HIP_CHECK(hipSetDevice(i));
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
REQUIRE(true == checkMallocAsync(stream));
HIP_CHECK(hipStreamDestroy(stream));
}
}
/**
* Test Description
* ------------------------
* - Queue the following commands hipMallocAsync, transfer data
* to it asynchrously, launch Kernel, transfer results back to host
* asynchronously and free buffer async in streams across all GPUs.
* The execution in of the queued commands must happen concurrently.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
#if HT_AMD
static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
HIP_CHECK(hipSetDevice(idx));
// Create host buffer with test data.
testObj->createHostBufferWithData();
// Allocate device memory and transfer data to it asyncronously on stream.
testObj->allocFromDefMempool(strm);
testObj->transferToMempool(strm);
// Execute kernel and transfer result back to host asynchronously on stream.
testObj->runKernel(strm);
testObj->transferFromMempool(strm);
// Free Buffer Asynchronously on stream.
testObj->freeDevBuf(strm);
}
TEST_CASE("Unit_hipMallocAsync_Multidevice_Concurrent", "[multigpu]") {
int num_devices;
HIP_CHECK(hipGetDeviceCount(&num_devices));
checkIfMultiDev(num_devices) hipStream_t* stream_buf = new hipStream_t[num_devices];
std::vector<streamMemAllocTest*> tesObjBuf;
// Allocate resources in each device
for (int idx = 0; idx < num_devices; idx++) {
checkMempoolSupported(idx) HIP_CHECK(hipSetDevice(idx));
HIP_CHECK(hipStreamCreate(&stream_buf[idx]));
streamMemAllocTest* testObj = new streamMemAllocTest(NUM_ELM);
tesObjBuf.push_back(testObj);
}
// Queue commands in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
test.join();
}
// Wait for the streams
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
HIP_CHECK(hipStreamSynchronize(stream_buf[idx]));
// verify and validate
REQUIRE(true == tesObjBuf[idx]->validateResult());
}
// Deallocate resources in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
// Destroy resources
tesObjBuf[idx]->freeHostBuf();
HIP_CHECK(hipStreamDestroy(stream_buf[idx]));
delete tesObjBuf[idx];
}
delete[] stream_buf;
}
/**
* Test Description
* ------------------------
* - Queue the following commands hipMallocAsync, transfer data
* to it asynchrously, launch Kernel, transfer results back to host
* asynchronously and free buffer async in streams across all GPUs
* using multiple streams per GPU.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_Multidevice_MultiStream", "[multigpu]") {
int num_devices;
HIP_CHECK(hipGetDeviceCount(&num_devices));
checkIfMultiDev(num_devices)
// 2 stream per ASIC
hipStream_t* stream_buf = new hipStream_t[streamPerAsic * num_devices];
std::vector<streamMemAllocTest*> tesObjBuf;
// Allocate resources in each device
for (int idx = 0; idx < num_devices; idx++) {
checkMempoolSupported(idx) HIP_CHECK(hipSetDevice(idx));
HIP_CHECK(hipStreamCreate(&stream_buf[streamPerAsic * idx]));
HIP_CHECK(hipStreamCreate(&stream_buf[streamPerAsic * idx + 1]));
streamMemAllocTest* testObj1 = new streamMemAllocTest(NUM_ELM);
tesObjBuf.push_back(testObj1);
streamMemAllocTest* testObj2 = new streamMemAllocTest(NUM_ELM);
tesObjBuf.push_back(testObj2);
}
// Queue commands in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
stream_buf[streamPerAsic * idx], idx);
std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
stream_buf[streamPerAsic * idx + 1], idx);
test1.join();
test2.join();
}
// Wait for the streams
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
HIP_CHECK(hipStreamSynchronize(stream_buf[streamPerAsic * idx]));
HIP_CHECK(hipStreamSynchronize(stream_buf[streamPerAsic * idx + 1]));
// verify and validate
REQUIRE(true == tesObjBuf[streamPerAsic * idx]->validateResult());
REQUIRE(true == tesObjBuf[streamPerAsic * idx + 1]->validateResult());
}
// Deallocate resources in each device
for (int idx = 0; idx < num_devices; idx++) {
HIP_CHECK(hipSetDevice(idx));
// Destroy resources
tesObjBuf[streamPerAsic * idx]->freeHostBuf();
tesObjBuf[streamPerAsic * idx + 1]->freeHostBuf();
HIP_CHECK(hipStreamDestroy(stream_buf[streamPerAsic * idx]));
HIP_CHECK(hipStreamDestroy(stream_buf[streamPerAsic * idx + 1]));
delete tesObjBuf[streamPerAsic * idx];
delete tesObjBuf[streamPerAsic * idx + 1];
}
delete[] stream_buf;
}
#endif
/**
* Test Description
* ------------------------
* - Assign device memory using hipMalloc, launch kernel and perform
* vector square and validate. Free memory using hipFreeAsync API.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_ByUsinghipMalloc") {
checkMempoolSupported(0) size_t byte_size = NUM_ELM * sizeof(float);
// create a stream
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
float *A_h, *C_h;
float *A_d, *C_d;
// assign memory to host pointers
A_h = reinterpret_cast<float*>(malloc(byte_size));
C_h = reinterpret_cast<float*>(malloc(byte_size));
// set data to host
for (int i = 0; i < NUM_ELM; i++) {
A_h[i] = 7.0f;
C_h[i] = 0;
}
// assign memory to device pointers
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&A_d), byte_size));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&C_d), byte_size));
HIP_CHECK(hipMemcpyAsync(A_d, A_h, byte_size, hipMemcpyHostToDevice, stream));
hipLaunchKernelGGL(HipTest::vector_square, dim3(NUM_ELM / THREADS_PER_BLOCK),
dim3(THREADS_PER_BLOCK), 0, stream, static_cast<const float*>(A_d), C_d,
NUM_ELM);
HIP_CHECK(hipMemcpyAsync(C_h, C_d, byte_size, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A_d), stream));
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(C_d), stream));
HIP_CHECK(hipStreamSynchronize(stream));
// verify and validate
for (int i = 0; i < NUM_ELM; i++) {
REQUIRE(C_h[i] == (A_h[i] * A_h[i]));
}
HIP_CHECK(hipStreamDestroy(stream));
free(A_h);
free(C_h);
}
/**
* Test Description
* ------------------------
* - Assign device memory using hipMallocAsync, launch kernel and perform
* vector square and validate. Free memory using hipFree API.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_ByUsinghipFree") {
size_t byte_size = NUM_ELM * sizeof(float);
checkMempoolSupported(0)
// create a stream
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
float *A_h, *C_h;
float *A_d, *C_d;
// assign memory to host pointers
A_h = reinterpret_cast<float*>(malloc(byte_size));
C_h = reinterpret_cast<float*>(malloc(byte_size));
// set data to host
for (int i = 0; i < NUM_ELM; i++) {
A_h[i] = 5.0f;
C_h[i] = 0;
}
// assign memory to device pointers
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&A_d), byte_size, stream));
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&C_d), byte_size, stream));
HIP_CHECK(hipMemcpyAsync(A_d, A_h, byte_size, hipMemcpyHostToDevice, stream));
hipLaunchKernelGGL(HipTest::vector_square, dim3(NUM_ELM / THREADS_PER_BLOCK),
dim3(THREADS_PER_BLOCK), 0, stream, static_cast<const float*>(A_d), C_d,
NUM_ELM);
HIP_CHECK(hipMemcpyAsync(C_h, C_d, byte_size, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipFree(reinterpret_cast<void*>(A_d)));
HIP_CHECK(hipFree(reinterpret_cast<void*>(C_d)));
// verify and validate
for (int i = 0; i < NUM_ELM; i++) {
REQUIRE(C_h[i] == (A_h[i] * A_h[i]));
}
HIP_CHECK(hipStreamDestroy(stream));
free(A_h);
free(C_h);
}
/**
* Test Description
* ------------------------
* - Test case to check hipMallocAsync allocation and usage in multiple
* threads. Each thread will use a local stream.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
static void threadTestLocalStream(int threadNum) {
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
thread_results[threadNum] = checkMallocAsync(stream);
HIP_CHECK(hipStreamDestroy(stream));
}
static bool testhipMallocAsyncMThreadLocalStrm() {
std::vector<std::thread> tests;
// Spawn the test threads
for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) {
thread_results[idx] = false;
tests.push_back(std::thread(threadTestLocalStream, idx));
}
// Wait for all threads to complete
for (std::thread& t : tests) {
t.join();
}
// Wait for thread
bool status = true;
for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) {
status = status & thread_results[idx];
}
return status;
}
TEST_CASE("Unit_hipMallocAsync_MThread_ThreadLocalStream") {
checkMempoolSupported(0) REQUIRE(true == testhipMallocAsyncMThreadLocalStrm());
}
/**
* Test Description
* ------------------------
* - Test case to check hipMallocAsync allocation and usage in multiple
* threads. Threads will use a common shared stream.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
static void threadTestCommonStream(int threadNum, hipStream_t stream) {
thread_results[threadNum] = checkMallocAsync(stream);
}
static bool testhipMallocAsyncMThreadLocalStrm(hipStream_t stream) {
std::vector<std::thread> tests;
// Spawn the test threads
for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) {
thread_results[idx] = false;
tests.push_back(std::thread(threadTestCommonStream, idx, stream));
}
// Wait for all threads to complete
for (std::thread& t : tests) {
t.join();
}
// Wait for thread
bool status = true;
for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) {
status = status & thread_results[idx];
}
return status;
}
TEST_CASE("Unit_hipMallocAsync_MThread_ThreadSharedStream") {
checkMempoolSupported(0) hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
REQUIRE(true == testhipMallocAsyncMThreadLocalStrm(stream));
HIP_CHECK(hipStreamDestroy(stream));
}
/**
* Test Description
* ------------------------
* - Test case to check MallocAsync functionality on user created stream,
* null stream and hipstreamperthread concurrently. launch kernel and wait
* for all streams to complete and validate results.
* ------------------------
* - catch\unit\memory\hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 6.2
*/
TEST_CASE("Unit_hipMallocAsync_DefaultStreams_Concurrent") {
checkMempoolSupported(0) streamMemAllocTest testObj[3] = {
streamMemAllocTest(NUM_ELM), streamMemAllocTest(NUM_ELM), streamMemAllocTest(NUM_ELM)};
// create multiple streams
hipStream_t stream[3];
HIP_CHECK(hipStreamCreate(&stream[0]));
stream[1] = 0; // Null stream
stream[2] = hipStreamPerThread;
// Queue operations on the 3 streams
for (int idx = 0; idx < 3; idx++) {
// Create host buffer with test data.
testObj[idx].createHostBufferWithData();
// Allocate device memory and transfer data to it asyncronously on stream.
testObj[idx].allocFromDefMempool(stream[idx]);
testObj[idx].transferToMempool(stream[idx]);
// Execute kernel and transfer result back to host asynchronously on stream.
testObj[idx].runKernel(stream[idx]);
testObj[idx].transferFromMempool(stream[idx]);
// Free Buffer Asynchronously on stream.
testObj[idx].freeDevBuf(stream[idx]);
}
// Wait for the 3 streams
for (int idx = 0; idx < 3; idx++) {
HIP_CHECK(hipStreamSynchronize(stream[idx]));
// verify and validate
REQUIRE(true == testObj[idx].validateResult());
// Destroy resources
testObj[idx].freeHostBuf();
}
HIP_CHECK(hipStreamDestroy(stream[0]));
}
/**
* End doxygen group StreamOTest.
* @}
*/