/* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "mempool_common.hh" #include #pragma clang diagnostic ignored "-Wunused-parameter" static bool thread_results[NUMBER_OF_THREADS]; static constexpr auto NUM_ELM{1024 * 1024}; static constexpr int streamPerAsic = 2; /** * @addtogroup hipMallocAsync hipMallocAsync * @{ * @ingroup StreamOTest * `hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream)` * - Allocates memory with stream ordered semantics */ /** * Test Description * ------------------------ * - Basic test to verify proper allocation and stream ordering of hipMallocAsync when one * memory allocation is performed. * Test source * ------------------------ * - /unit/memory/hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Basic_OneAlloc") { MallocMemPoolAsync_OneAlloc( [](void** dev_ptr, size_t size, hipMemPool_t mem_pool, hipStream_t stream) { return hipMallocAsync(dev_ptr, size, stream); }, MemPools::dev_default); } /** * Test Description * ------------------------ * - Basic test to verify proper allocation and stream ordering of hipMallocAsync when two * memory allocations are performed. * Test source * ------------------------ * - /unit/memory/hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Basic_TwoAllocs") { MallocMemPoolAsync_TwoAllocs( [](void** dev_ptr, size_t size, hipMemPool_t mem_pool, hipStream_t stream) { return hipMallocAsync(dev_ptr, size, stream); }, MemPools::dev_default); } /** * Test Description * ------------------------ * - Basic test to verify that memory allocated with hipMallocAsync can be properly reused. * Test source * ------------------------ * - /unit/memory/hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Basic_Reuse") { MallocMemPoolAsync_Reuse([](void** dev_ptr, size_t size, hipMemPool_t mem_pool, hipStream_t stream) { return hipMallocAsync(dev_ptr, size, stream); }, MemPools::dev_default); } /** * Test Description * ------------------------ * - Test to verify hipMallocAsync behavior with invalid arguments: * -# Nullptr dev_ptr * -# Invalid stream handle * -# Size is max size_t * * Test source * ------------------------ * - /unit/memory/hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Negative_Parameters") { int device_id = 0; HIP_CHECK(hipSetDevice(device_id)); checkMempoolSupported(0) int* p = nullptr; size_t max_size = std::numeric_limits::max(); size_t alloc_size = 1024; MemPoolGuard mempool(MemPools::dev_default, device_id); StreamGuard stream(Streams::created); SECTION("dev_ptr is nullptr") { HIP_CHECK_ERROR(hipMallocAsync(nullptr, alloc_size, stream.stream()), hipErrorInvalidValue); } SECTION("Size is max size_t") { HIP_CHECK_ERROR(hipMallocAsync(reinterpret_cast(&p), max_size, stream.stream()), hipErrorOutOfMemory); } } /** * Common function to allocate memory using hipMallocAsync API through a stream, * launch kernel and perform vectorADD and validate results. Free memory using * hipFreeAsync. */ static bool checkMallocAsync(hipStream_t stream) { streamMemAllocTest testObj(NUM_ELM); // Create host buffer with test data. testObj.createHostBufferWithData(); // Allocate device memory. testObj.allocFromDefMempool(stream); // Transfer data to it asyncronously on stream. testObj.transferToMempool(stream); // Execute kernel and transfer result back to host asynchronously on stream. testObj.runKernel(stream); testObj.transferFromMempool(stream); // Free Buffer Asynchronously on stream. testObj.freeDevBuf(stream); HIP_CHECK(hipStreamSynchronize(stream)); // verify and validate REQUIRE(true == testObj.validateResult()); // Destroy resources testObj.freeHostBuf(); return true; } /** * Test Description * ------------------------ * - Test case to perform basic scenario. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_basic") { checkMempoolSupported(0) // create a stream hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); REQUIRE(true == checkMallocAsync(stream)); HIP_CHECK(hipStreamDestroy(stream)); } /** * Test Description * ------------------------ * - Test case to perform multi stream, allocate memory using * hipMallocAsync API for a stream1 and stream2, launch kernel and * perform vectorADD, synchronize stream1 and stream2 and validate * results. Free memory using hipFreeAsync. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Multistream_Concurrent") { checkMempoolSupported(0) streamMemAllocTest testObj1(NUM_ELM), testObj2(NUM_ELM); // create multiple streams hipStream_t stream1, stream2; HIP_CHECK(hipStreamCreate(&stream1)); HIP_CHECK(hipStreamCreate(&stream2)); // Create host buffer with test data. testObj1.createHostBufferWithData(); testObj2.createHostBufferWithData(); // Allocate device memory and transfer data to it asyncronously on streams. testObj1.allocFromDefMempool(stream1); testObj2.allocFromDefMempool(stream2); testObj1.transferToMempool(stream1); testObj2.transferToMempool(stream2); // Execute kernel and transfer result back to host asynchronously on streams. testObj1.runKernel(stream1); testObj2.runKernel(stream2); testObj1.transferFromMempool(stream1); testObj2.transferFromMempool(stream2); // Free Buffer Asynchronously on streams. testObj1.freeDevBuf(stream1); testObj2.freeDevBuf(stream2); // synchronize both stream1 and stream2 HIP_CHECK(hipStreamSynchronize(stream1)); HIP_CHECK(hipStreamSynchronize(stream2)); // verify and validate REQUIRE(true == testObj1.validateResult()); REQUIRE(true == testObj2.validateResult()); // Destroy resources HIP_CHECK(hipStreamDestroy(stream1)); HIP_CHECK(hipStreamDestroy(stream2)); testObj1.freeHostBuf(); testObj2.freeHostBuf(); } /** * Test Description * ------------------------ * - Allocate memory using hipMallocAsync API through stream1 and record event1, * allocate event1 to stream2 and put stream2 to wait, launch kernel through * stream2 and perform vectorADD and validate results. Free memory using hipFreeAsync. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_StreamEvent_CrissCross") { checkMempoolSupported(0) streamMemAllocTest testObj1(NUM_ELM), testObj2(NUM_ELM); // create two streams. hipStream_t stream1, stream2; HIP_CHECK(hipStreamCreate(&stream1)); HIP_CHECK(hipStreamCreate(&stream2)); // create an event hipEvent_t event1 = nullptr, event2 = nullptr; HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); // Create host buffer with test data. testObj1.createHostBufferWithData(); testObj2.createHostBufferWithData(); // Allocate device memory and transfer data to it asyncronously on streams. testObj1.allocFromDefMempool(stream1); testObj2.allocFromDefMempool(stream2); testObj1.transferToMempool(stream1); testObj2.transferToMempool(stream2); // create event record HIP_CHECK(hipEventRecord(event1, stream1)); HIP_CHECK(hipEventRecord(event2, stream2)); HIP_CHECK(hipStreamWaitEvent(stream2, event1, 0)); HIP_CHECK(hipStreamWaitEvent(stream1, event2, 0)); // Execute kernel and transfer result back to host asynchronously on streams. testObj1.runKernel(stream2); testObj2.runKernel(stream1); testObj1.transferFromMempool(stream2); testObj2.transferFromMempool(stream1); // Free Buffer Asynchronously on streams. testObj1.freeDevBuf(stream2); testObj2.freeDevBuf(stream1); // Wait for stream2. HIP_CHECK(hipStreamSynchronize(stream2)); HIP_CHECK(hipStreamSynchronize(stream1)); // verify and validate REQUIRE(true == testObj1.validateResult()); REQUIRE(true == testObj2.validateResult()); // Destroy resources HIP_CHECK(hipStreamDestroy(stream1)); HIP_CHECK(hipStreamDestroy(stream2)); HIP_CHECK(hipEventDestroy(event1)); HIP_CHECK(hipEventDestroy(event2)); testObj1.freeHostBuf(); testObj2.freeHostBuf(); } /** * Test Description * ------------------------ * - Test case to perform multi device scenario, get number of devices available * and call checkMallocAsync function for each device available. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Multidevice", "[multigpu]") { int num_devices; HIP_CHECK(hipGetDeviceCount(&num_devices)); for (int i = 0; i < num_devices; i++) { checkMempoolSupported(i) HIP_CHECK(hipSetDevice(i)); hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); REQUIRE(true == checkMallocAsync(stream)); HIP_CHECK(hipStreamDestroy(stream)); } } /** * Test Description * ------------------------ * - Queue the following commands hipMallocAsync, transfer data * to it asynchrously, launch Kernel, transfer results back to host * asynchronously and free buffer async in streams across all GPUs. * The execution in of the queued commands must happen concurrently. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ #if HT_AMD static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) { HIP_CHECK(hipSetDevice(idx)); // Create host buffer with test data. testObj->createHostBufferWithData(); // Allocate device memory and transfer data to it asyncronously on stream. testObj->allocFromDefMempool(strm); testObj->transferToMempool(strm); // Execute kernel and transfer result back to host asynchronously on stream. testObj->runKernel(strm); testObj->transferFromMempool(strm); // Free Buffer Asynchronously on stream. testObj->freeDevBuf(strm); } TEST_CASE("Unit_hipMallocAsync_Multidevice_Concurrent", "[multigpu]") { int num_devices; HIP_CHECK(hipGetDeviceCount(&num_devices)); checkIfMultiDev(num_devices) hipStream_t* stream_buf = new hipStream_t[num_devices]; std::vector tesObjBuf; // Allocate resources in each device for (int idx = 0; idx < num_devices; idx++) { checkMempoolSupported(idx) HIP_CHECK(hipSetDevice(idx)); HIP_CHECK(hipStreamCreate(&stream_buf[idx])); streamMemAllocTest* testObj = new streamMemAllocTest(NUM_ELM); tesObjBuf.push_back(testObj); } // Queue commands in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx); test.join(); } // Wait for the streams for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); HIP_CHECK(hipStreamSynchronize(stream_buf[idx])); // verify and validate REQUIRE(true == tesObjBuf[idx]->validateResult()); } // Deallocate resources in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); // Destroy resources tesObjBuf[idx]->freeHostBuf(); HIP_CHECK(hipStreamDestroy(stream_buf[idx])); delete tesObjBuf[idx]; } delete[] stream_buf; } /** * Test Description * ------------------------ * - Queue the following commands hipMallocAsync, transfer data * to it asynchrously, launch Kernel, transfer results back to host * asynchronously and free buffer async in streams across all GPUs * using multiple streams per GPU. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_Multidevice_MultiStream", "[multigpu]") { int num_devices; HIP_CHECK(hipGetDeviceCount(&num_devices)); checkIfMultiDev(num_devices) // 2 stream per ASIC hipStream_t* stream_buf = new hipStream_t[streamPerAsic * num_devices]; std::vector tesObjBuf; // Allocate resources in each device for (int idx = 0; idx < num_devices; idx++) { checkMempoolSupported(idx) HIP_CHECK(hipSetDevice(idx)); HIP_CHECK(hipStreamCreate(&stream_buf[streamPerAsic * idx])); HIP_CHECK(hipStreamCreate(&stream_buf[streamPerAsic * idx + 1])); streamMemAllocTest* testObj1 = new streamMemAllocTest(NUM_ELM); tesObjBuf.push_back(testObj1); streamMemAllocTest* testObj2 = new streamMemAllocTest(NUM_ELM); tesObjBuf.push_back(testObj2); } // Queue commands in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx], stream_buf[streamPerAsic * idx], idx); std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1], stream_buf[streamPerAsic * idx + 1], idx); test1.join(); test2.join(); } // Wait for the streams for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); HIP_CHECK(hipStreamSynchronize(stream_buf[streamPerAsic * idx])); HIP_CHECK(hipStreamSynchronize(stream_buf[streamPerAsic * idx + 1])); // verify and validate REQUIRE(true == tesObjBuf[streamPerAsic * idx]->validateResult()); REQUIRE(true == tesObjBuf[streamPerAsic * idx + 1]->validateResult()); } // Deallocate resources in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); // Destroy resources tesObjBuf[streamPerAsic * idx]->freeHostBuf(); tesObjBuf[streamPerAsic * idx + 1]->freeHostBuf(); HIP_CHECK(hipStreamDestroy(stream_buf[streamPerAsic * idx])); HIP_CHECK(hipStreamDestroy(stream_buf[streamPerAsic * idx + 1])); delete tesObjBuf[streamPerAsic * idx]; delete tesObjBuf[streamPerAsic * idx + 1]; } delete[] stream_buf; } #endif /** * Test Description * ------------------------ * - Assign device memory using hipMalloc, launch kernel and perform * vector square and validate. Free memory using hipFreeAsync API. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_ByUsinghipMalloc") { checkMempoolSupported(0) size_t byte_size = NUM_ELM * sizeof(float); // create a stream hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); float *A_h, *C_h; float *A_d, *C_d; // assign memory to host pointers A_h = reinterpret_cast(malloc(byte_size)); C_h = reinterpret_cast(malloc(byte_size)); // set data to host for (int i = 0; i < NUM_ELM; i++) { A_h[i] = 7.0f; C_h[i] = 0; } // assign memory to device pointers HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), byte_size)); HIP_CHECK(hipMalloc(reinterpret_cast(&C_d), byte_size)); HIP_CHECK(hipMemcpyAsync(A_d, A_h, byte_size, hipMemcpyHostToDevice, stream)); hipLaunchKernelGGL(HipTest::vector_square, dim3(NUM_ELM / THREADS_PER_BLOCK), dim3(THREADS_PER_BLOCK), 0, stream, static_cast(A_d), C_d, NUM_ELM); HIP_CHECK(hipMemcpyAsync(C_h, C_d, byte_size, hipMemcpyDeviceToHost, stream)); HIP_CHECK(hipFreeAsync(reinterpret_cast(A_d), stream)); HIP_CHECK(hipFreeAsync(reinterpret_cast(C_d), stream)); HIP_CHECK(hipStreamSynchronize(stream)); // verify and validate for (int i = 0; i < NUM_ELM; i++) { REQUIRE(C_h[i] == (A_h[i] * A_h[i])); } HIP_CHECK(hipStreamDestroy(stream)); free(A_h); free(C_h); } /** * Test Description * ------------------------ * - Assign device memory using hipMallocAsync, launch kernel and perform * vector square and validate. Free memory using hipFree API. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_ByUsinghipFree") { size_t byte_size = NUM_ELM * sizeof(float); checkMempoolSupported(0) // create a stream hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); float *A_h, *C_h; float *A_d, *C_d; // assign memory to host pointers A_h = reinterpret_cast(malloc(byte_size)); C_h = reinterpret_cast(malloc(byte_size)); // set data to host for (int i = 0; i < NUM_ELM; i++) { A_h[i] = 5.0f; C_h[i] = 0; } // assign memory to device pointers HIP_CHECK(hipMallocAsync(reinterpret_cast(&A_d), byte_size, stream)); HIP_CHECK(hipMallocAsync(reinterpret_cast(&C_d), byte_size, stream)); HIP_CHECK(hipMemcpyAsync(A_d, A_h, byte_size, hipMemcpyHostToDevice, stream)); hipLaunchKernelGGL(HipTest::vector_square, dim3(NUM_ELM / THREADS_PER_BLOCK), dim3(THREADS_PER_BLOCK), 0, stream, static_cast(A_d), C_d, NUM_ELM); HIP_CHECK(hipMemcpyAsync(C_h, C_d, byte_size, hipMemcpyDeviceToHost, stream)); HIP_CHECK(hipStreamSynchronize(stream)); HIP_CHECK(hipFree(reinterpret_cast(A_d))); HIP_CHECK(hipFree(reinterpret_cast(C_d))); // verify and validate for (int i = 0; i < NUM_ELM; i++) { REQUIRE(C_h[i] == (A_h[i] * A_h[i])); } HIP_CHECK(hipStreamDestroy(stream)); free(A_h); free(C_h); } /** * Test Description * ------------------------ * - Test case to check hipMallocAsync allocation and usage in multiple * threads. Each thread will use a local stream. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ static void threadTestLocalStream(int threadNum) { hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); thread_results[threadNum] = checkMallocAsync(stream); HIP_CHECK(hipStreamDestroy(stream)); } static bool testhipMallocAsyncMThreadLocalStrm() { std::vector tests; // Spawn the test threads for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) { thread_results[idx] = false; tests.push_back(std::thread(threadTestLocalStream, idx)); } // Wait for all threads to complete for (std::thread& t : tests) { t.join(); } // Wait for thread bool status = true; for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) { status = status & thread_results[idx]; } return status; } TEST_CASE("Unit_hipMallocAsync_MThread_ThreadLocalStream") { checkMempoolSupported(0) REQUIRE(true == testhipMallocAsyncMThreadLocalStrm()); } /** * Test Description * ------------------------ * - Test case to check hipMallocAsync allocation and usage in multiple * threads. Threads will use a common shared stream. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ static void threadTestCommonStream(int threadNum, hipStream_t stream) { thread_results[threadNum] = checkMallocAsync(stream); } static bool testhipMallocAsyncMThreadLocalStrm(hipStream_t stream) { std::vector tests; // Spawn the test threads for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) { thread_results[idx] = false; tests.push_back(std::thread(threadTestCommonStream, idx, stream)); } // Wait for all threads to complete for (std::thread& t : tests) { t.join(); } // Wait for thread bool status = true; for (int idx = 0; idx < NUMBER_OF_THREADS; idx++) { status = status & thread_results[idx]; } return status; } TEST_CASE("Unit_hipMallocAsync_MThread_ThreadSharedStream") { checkMempoolSupported(0) hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); REQUIRE(true == testhipMallocAsyncMThreadLocalStrm(stream)); HIP_CHECK(hipStreamDestroy(stream)); } /** * Test Description * ------------------------ * - Test case to check MallocAsync functionality on user created stream, * null stream and hipstreamperthread concurrently. launch kernel and wait * for all streams to complete and validate results. * ------------------------ * - catch\unit\memory\hipMallocAsync.cc * Test requirements * ------------------------ * - HIP_VERSION >= 6.2 */ TEST_CASE("Unit_hipMallocAsync_DefaultStreams_Concurrent") { checkMempoolSupported(0) streamMemAllocTest testObj[3] = { streamMemAllocTest(NUM_ELM), streamMemAllocTest(NUM_ELM), streamMemAllocTest(NUM_ELM)}; // create multiple streams hipStream_t stream[3]; HIP_CHECK(hipStreamCreate(&stream[0])); stream[1] = 0; // Null stream stream[2] = hipStreamPerThread; // Queue operations on the 3 streams for (int idx = 0; idx < 3; idx++) { // Create host buffer with test data. testObj[idx].createHostBufferWithData(); // Allocate device memory and transfer data to it asyncronously on stream. testObj[idx].allocFromDefMempool(stream[idx]); testObj[idx].transferToMempool(stream[idx]); // Execute kernel and transfer result back to host asynchronously on stream. testObj[idx].runKernel(stream[idx]); testObj[idx].transferFromMempool(stream[idx]); // Free Buffer Asynchronously on stream. testObj[idx].freeDevBuf(stream[idx]); } // Wait for the 3 streams for (int idx = 0; idx < 3; idx++) { HIP_CHECK(hipStreamSynchronize(stream[idx])); // verify and validate REQUIRE(true == testObj[idx].validateResult()); // Destroy resources testObj[idx].freeHostBuf(); } HIP_CHECK(hipStreamDestroy(stream[0])); } /** * End doxygen group StreamOTest. * @} */