/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/*
Test Case Description:
   1) This testcase verifies the basic scenario - supported on all devices
*/

/*
NOTE(review): this file was recovered from a copy in which line breaks and all text between
angle brackets had been stripped. The items below were reconstructed and must be confirmed
against version control:
  - the eight angle-bracket #include targets that followed "mempool_common.hh" (only the
    headers the visible code appears to need are listed here);
  - the notifiedKernel launch configurations (assumed to be <<<32, blocks, 0, stream>>>);
  - the reinterpret_cast target types (void** for allocation out-parameters, void* for frees).
notifiedKernel is not defined in this file; it is presumably provided by an included header.
*/

#include "mempool_common.hh"

#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_kernels.hh>

#include <chrono>
#include <cstdint>
#include <thread>

// Pool properties shared by every test that creates an explicit memory pool.
static hipMemPoolProps kPoolProps;

/**
 * Fills kPoolProps with the settings used for explicitly created pools in this file:
 * pinned allocation type, no shareable handle type, located on device 0.
 */
void initMemPoolProps() {
  kPoolProps.allocType = hipMemAllocationTypePinned;
  kPoolProps.handleTypes = hipMemHandleTypeNone;
  kPoolProps.location.type = hipMemLocationTypeDevice;
  kPoolProps.location.id = 0;
  kPoolProps.win32SecurityAttributes = nullptr;
}

/*
   This testcase verifies HIP Mem Pool API basic scenario - supported on all devices.
   It calls each memory pool entry point once and only checks the returned status.
 */
TEST_CASE("Unit_hipMemPoolApi_Basic") {
  int mem_pool_support = 0;
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
  if (!mem_pool_support) {
    SUCCEED("Runtime doesn't support Memory Pool. Skip the test case.");
    return;
  }

  int numElements = 64 * 1024 * 1024;
  float *A = nullptr, *B = nullptr;

  // Round-trip the default pool through the get/set device pool APIs.
  hipMemPool_t mem_pool = nullptr;
  int device = 0;
  HIP_CHECK(hipDeviceGetDefaultMemPool(&mem_pool, device));
  HIP_CHECK(hipDeviceSetMemPool(device, mem_pool));
  HIP_CHECK(hipDeviceGetMemPool(&mem_pool, device));

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&A), numElements * sizeof(float), stream));
  INFO("hipMallocAsync result: " << A);

  HIP_CHECK(hipFreeAsync(A, stream));

  // Reset the default memory pool usage for the following tests
  hipMemPoolAttr attr = hipMemPoolAttrUsedMemHigh;
  std::uint64_t value64 = 0;
  HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value64));

  size_t min_bytes_to_hold = 1024 * 1024;
  HIP_CHECK(hipMemPoolTrimTo(mem_pool, min_bytes_to_hold));

  attr = hipMemPoolReuseFollowEventDependencies;
  int value = 0;
  HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value));

  hipMemAccessDesc desc_list = {{hipMemLocationTypeDevice, 0}, hipMemAccessFlagsProtReadWrite};
  int count = 1;
  HIP_CHECK(hipMemPoolSetAccess(mem_pool, &desc_list, count));

  hipMemAccessFlags flags = hipMemAccessFlagsProtNone;
  hipMemLocation location = {hipMemLocationTypeDevice, 0};
  HIP_CHECK(hipMemPoolGetAccess(&flags, mem_pool, &location));

  // From here on mem_pool refers to a newly created pool, not the default one.
  initMemPoolProps();
  HIP_CHECK(hipMemPoolCreate(&mem_pool, &kPoolProps));
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                   mem_pool, stream));
  // NOTE(review): B is never freed explicitly; the pool is destroyed while the allocation is
  // still outstanding - confirm this is intended.
  HIP_CHECK(hipMemPoolDestroy(mem_pool));
  HIP_CHECK(hipStreamDestroy(stream));
}

/*
   Allocates two buffers from an explicit pool, frees the large one while a kernel is still
   running on it, and then validates the default reuse attributes and the reserved/used memory
   counters of the pool.
 */
TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
  int mem_pool_support = 0;
  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
  if (!mem_pool_support) {
    SUCCEED("Runtime doesn't support Memory Pool. Skip the test case.");
    return;
  }

  // Host flag that the test sets to 1 to let the kernel loop exit.
  unsigned int* notified = nullptr;
  HIP_CHECK(hipHostMalloc(&notified, sizeof(unsigned int)));
  *notified = 0;

  initMemPoolProps();
  hipMemPool_t mem_pool;
  HIP_CHECK(hipMemPoolCreate(&mem_pool, &kPoolProps));

  float *B, *C;
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  size_t numElements = 8 * 1024 * 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                   mem_pool, stream));

  numElements = 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&C), numElements * sizeof(float),
                                   mem_pool, stream));

  int blocks = 1024;
  hipMemPoolAttr attr;
  notifiedKernel<<<32, blocks, 0, stream>>>(B, notified);

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));

  attr = hipMemPoolAttrReservedMemCurrent;
  std::uint64_t res_before_sync = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &res_before_sync));

  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  *notified = 1;  // Notify kernel loop to exit

  HIP_CHECK(hipStreamSynchronize(stream));

  std::uint64_t res_after_sync = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &res_after_sync));
  // Sync must release memory to OS
  REQUIRE(res_after_sync <= res_before_sync);

  int value = 0;
  attr = hipMemPoolReuseFollowEventDependencies;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value));
  // Default enabled
  REQUIRE(1 == value);

  attr = hipMemPoolReuseAllowOpportunistic;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value));
  // Default enabled
  REQUIRE(1 == value);

  attr = hipMemPoolReuseAllowInternalDependencies;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value));
  // Default enabled
  REQUIRE(1 == value);

  attr = hipMemPoolAttrReleaseThreshold;
  std::uint64_t value64 = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Default is 0
  REQUIRE(0 == value64);

  attr = hipMemPoolAttrReservedMemHigh;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // The high watermark must be at least the current reserved size
  REQUIRE(value64 >= res_after_sync);

  attr = hipMemPoolAttrUsedMemCurrent;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the current usage query works - just the small buffer is left
  REQUIRE(sizeof(float) * 1024 == value64);

  attr = hipMemPoolAttrUsedMemHigh;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the high watermark usage works - both buffers must be reported
  REQUIRE(sizeof(float) * (8 * 1024 * 1024 + 1024) == value64);

  // NOTE(review): the pool is destroyed before C (allocated from it) is freed - confirm this
  // ordering is intended.
  HIP_CHECK(hipMemPoolDestroy(mem_pool));
  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(C), stream));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipHostFree(notified));
}

/*
   Validates hipMemPoolTrimTo together with hipMemPoolAttrReleaseThreshold: a trim while the
   kernel is still running must not change the reserved size, a stream sync must not release
   memory below the threshold, and a trim after the sync must shrink the pool.
 */
TEST_CASE("Unit_hipMemPoolApi_BasicTrim") {
  int mem_pool_support = 0;
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
  if (!mem_pool_support) {
    SUCCEED("Runtime doesn't support Memory Pool. Skip the test case.");
    return;
  }

  // Host flag that the test sets to 1 to let the kernel loop exit.
  unsigned int* notified = nullptr;
  HIP_CHECK(hipHostMalloc(&notified, sizeof(unsigned int)));
  *notified = 0;

  initMemPoolProps();
  hipMemPool_t mem_pool;
  HIP_CHECK(hipMemPoolCreate(&mem_pool, &kPoolProps));

  float *B, *C;
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  size_t numElements = 8 * 1024 * 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                   mem_pool, stream));

  numElements = 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&C), numElements * sizeof(float),
                                   mem_pool, stream));

  int blocks = 2;
  notifiedKernel<<<32, blocks, 0, stream>>>(B, notified);

  hipMemPoolAttr attr;
  attr = hipMemPoolAttrReleaseThreshold;
  // The pool must hold 128MB
  std::uint64_t threshold = 128 * 1024 * 1024;
  HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &threshold));

  // Not a real free, since kernel isn't done
  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));

  // Get reserved memory before trim
  attr = hipMemPoolAttrReservedMemCurrent;
  std::uint64_t res_before_trim = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &res_before_trim));

  size_t min_bytes_to_hold = sizeof(float) * 1024;
  HIP_CHECK(hipMemPoolTrimTo(mem_pool, min_bytes_to_hold));

  std::uint64_t res_after_trim = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &res_after_trim));
  // Trim must be a nop because execution isn't done
  REQUIRE(res_before_trim == res_after_trim);

  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  *notified = 1;  // Notify kernel loop to exit

  HIP_CHECK(hipStreamSynchronize(stream));

  std::uint64_t res_after_sync = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &res_after_sync));
  // Since hipMemPoolAttrReleaseThreshold is 128 MB sync does nothing to the freed memory
  REQUIRE(res_after_trim == res_after_sync);

  HIP_CHECK(hipMemPoolTrimTo(mem_pool, min_bytes_to_hold));

  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &res_after_trim));
  // Validate memory after real trim. The pool must hold less memory than before
  REQUIRE(res_after_trim < res_after_sync);

  attr = hipMemPoolAttrReleaseThreshold;
  std::uint64_t value64 = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the threshold query works
  REQUIRE(threshold == value64);

  attr = hipMemPoolAttrUsedMemCurrent;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the current usage query works - just the small buffer is left
  REQUIRE(sizeof(float) * 1024 == value64);

  attr = hipMemPoolAttrUsedMemHigh;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the high watermark usage works - both buffers must be reported
  REQUIRE(sizeof(float) * (8 * 1024 * 1024 + 1024) == value64);

  // NOTE(review): the pool is destroyed before C (allocated from it) is freed - confirm this
  // ordering is intended.
  HIP_CHECK(hipMemPoolDestroy(mem_pool));
  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(C), stream));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipHostFree(notified));
}

/*
   Validates that a buffer freed on a stream is handed back by the next same-size allocation
   on that stream from the same explicit pool, and that the used-memory counters track it.
 */
TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
  int mem_pool_support = 0;
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
  if (!mem_pool_support) {
    SUCCEED("Runtime doesn't support Memory Pool. Skip the test case.");
    return;
  }

  // Host flag that the test sets to 1 to let the kernel loop exit.
  unsigned int* notified = nullptr;
  HIP_CHECK(hipHostMalloc(&notified, sizeof(unsigned int)));
  *notified = 0;

  initMemPoolProps();
  hipMemPool_t mem_pool;
  HIP_CHECK(hipMemPoolCreate(&mem_pool, &kPoolProps));

  float *A, *B, *C;
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  size_t numElements = 8 * 1024 * 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&A), numElements * sizeof(float),
                                   mem_pool, stream));

  numElements = 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&C), numElements * sizeof(float),
                                   mem_pool, stream));

  int blocks = 2;
  notifiedKernel<<<32, blocks, 0, stream>>>(A, notified);

  hipMemPoolAttr attr;
  // Not a real free, since kernel isn't done
  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));

  numElements = 8 * 1024 * 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                   mem_pool, stream));
  // Runtime must reuse the pointer
  REQUIRE(A == B);

  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  *notified = 1;  // Notify kernel loop to exit

  // Make a sync before the second kernel launch to make sure memory B isn't gone
  HIP_CHECK(hipStreamSynchronize(stream));

  // Second kernel launch with new memory
  *notified = 0;
  notifiedKernel<<<32, blocks, 0, stream>>>(B, notified);

  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  *notified = 1;  // Notify kernel loop to exit

  HIP_CHECK(hipStreamSynchronize(stream));

  attr = hipMemPoolAttrUsedMemCurrent;
  std::uint64_t value64 = 0;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the current usage reports both buffers
  REQUIRE(sizeof(float) * (8 * 1024 * 1024 + 1024) == value64);

  attr = hipMemPoolAttrUsedMemHigh;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the high watermark usage works - both buffers must be reported
  REQUIRE(sizeof(float) * (8 * 1024 * 1024 + 1024) == value64);

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));

  attr = hipMemPoolAttrUsedMemCurrent;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the current usage reports just one buffer, because the above free doesn't hold
  // memory
  REQUIRE(sizeof(float) * 1024 == value64);

  // NOTE(review): the pool is destroyed before C (allocated from it) is freed - confirm this
  // ordering is intended.
  HIP_CHECK(hipMemPoolDestroy(mem_pool));
  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(C), stream));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipHostFree(notified));
}

/*
   Validates cross-stream reuse under hipMemPoolReuseAllowOpportunistic. Each section frees
   buffer A on stream1 and then allocates buffer B of the same size on stream2, checking
   whether the runtime hands back the same pointer.
 */
TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
  int mem_pool_support = 0;
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
  if (!mem_pool_support) {
    SUCCEED("Runtime doesn't support Memory Pool. Skip the test case.");
    return;
  }

  // One host exit flag per stream/kernel.
  unsigned int *notified1 = nullptr, *notified2 = nullptr;
  HIP_CHECK(hipHostMalloc(&notified1, sizeof(unsigned int)));
  HIP_CHECK(hipHostMalloc(&notified2, sizeof(unsigned int)));
  *notified1 = 0;
  *notified2 = 0;

  initMemPoolProps();
  hipMemPool_t mem_pool;
  HIP_CHECK(hipMemPoolCreate(&mem_pool, &kPoolProps));

  hipMemPoolAttr attr;
  int blocks = 2;

  float *A, *B, *C;
  hipStream_t stream1, stream2;

  // Create 2 async non-blocking streams
  HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking));
  HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamNonBlocking));

  size_t numElements = 1024;
  HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&C), numElements * sizeof(float),
                                   mem_pool, stream1));

  int value = 0;

  SECTION("Disallow Opportunistic - No Reuse") {
    numElements = 8 * 1024 * 1024;
    HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&A), numElements * sizeof(float),
                                     mem_pool, stream1));

    // Disable all default pool states
    attr = hipMemPoolReuseFollowEventDependencies;
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
    attr = hipMemPoolReuseAllowOpportunistic;
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
    attr = hipMemPoolReuseAllowInternalDependencies;
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));

    // Run kernel in the first stream
    notifiedKernel<<<32, blocks, 0, stream1>>>(A, notified1);

    // Not a real free, since kernel isn't done
    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream1));

    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    *notified1 = 1;  // Notify kernel loop to exit

    // Sleep for 1 second; the GPU should be idle by now
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));

    numElements = 8 * 1024 * 1024;
    // Allocate memory for the second stream
    HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                     mem_pool, stream2));
    // Without Opportunistic state runtime must allocate another buffer
    REQUIRE(A != B);

    // Run kernel with the new memory in the second stream
    notifiedKernel<<<32, blocks, 0, stream2>>>(B, notified2);

    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    *notified2 = 1;  // Notify kernel loop to exit

    HIP_CHECK(hipStreamSynchronize(stream1));
    HIP_CHECK(hipStreamSynchronize(stream2));

    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream2));
  }

  SECTION("Allow Opportunistic - Reuse") {
    numElements = 8 * 1024 * 1024;
    HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&A), numElements * sizeof(float),
                                     mem_pool, stream1));

    value = 1;
    attr = hipMemPoolReuseAllowOpportunistic;
    // Enable Opportunistic
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));

    // Run kernel in the first stream
    notifiedKernel<<<32, blocks, 0, stream1>>>(A, notified1);

    // Not a real free, since kernel isn't done
    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream1));

    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    *notified1 = 1;  // Notify kernel loop to exit

    // Sleep for 1 second; the GPU should be idle by now
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));

    numElements = 8 * 1024 * 1024;
    // Allocate memory for the second stream
    HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                     mem_pool, stream2));
    // With Opportunistic state runtime will reuse freed buffer A
    REQUIRE(A == B);

    // Run kernel with the new memory in the second stream
    notifiedKernel<<<32, blocks, 0, stream2>>>(B, notified2);

    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    *notified2 = 1;  // Notify kernel loop to exit

    HIP_CHECK(hipStreamSynchronize(stream1));
    HIP_CHECK(hipStreamSynchronize(stream2));

    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream2));
  }

  SECTION("Allow Opportunistic - No Reuse") {
    numElements = 8 * 1024 * 1024;
    HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&A), numElements * sizeof(float),
                                     mem_pool, stream1));

    value = 1;
    attr = hipMemPoolReuseAllowOpportunistic;
    // Enable Opportunistic
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));

    // Run kernel in the first stream
    notifiedKernel<<<32, blocks, 0, stream1>>>(A, notified1);

    // Not a real free, since kernel isn't done
    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream1));

    numElements = 8 * 1024 * 1024;
    // Allocate memory for the second stream
    HIP_CHECK(hipMallocFromPoolAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float),
                                     mem_pool, stream2));
    // With Opportunistic state runtime can't reuse freed buffer A, because it's still busy with
    // the kernel
    REQUIRE(A != B);

    // Run kernel with the new memory in the second stream
    notifiedKernel<<<32, blocks, 0, stream2>>>(B, notified2);

    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    *notified1 = 1;  // Notify kernel loop to exit
    *notified2 = 1;  // Notify kernel loop to exit

    HIP_CHECK(hipStreamSynchronize(stream1));
    HIP_CHECK(hipStreamSynchronize(stream2));

    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream2));
  }

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(C), stream1));
  HIP_CHECK(hipMemPoolDestroy(mem_pool));
  HIP_CHECK(hipStreamDestroy(stream1));
  HIP_CHECK(hipStreamDestroy(stream2));
  HIP_CHECK(hipHostFree(notified1));
  HIP_CHECK(hipHostFree(notified2));
}

/*
   Same reuse scenario as Unit_hipMemPoolApi_BasicReuse, but allocating through hipMallocAsync
   and querying the counters of the device's default memory pool.
 */
TEST_CASE("Unit_hipMemPoolApi_Default") {
  int mem_pool_support = 0;
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
  if (!mem_pool_support) {
    SUCCEED("Runtime doesn't support Memory Pool. Skip the test case.");
    return;
  }

  // Host flag that the test sets to 1 to let the kernel loop exit.
  unsigned int* notified = nullptr;
  HIP_CHECK(hipHostMalloc(&notified, sizeof(unsigned int)));
  *notified = 0;

  hipMemPool_t mem_pool;
  HIP_CHECK(hipDeviceGetDefaultMemPool(&mem_pool, 0));

  float *A, *B, *C;
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  size_t numElements = 8 * 1024 * 1024;
  HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&A), numElements * sizeof(float), stream));

  numElements = 1024;
  HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&C), numElements * sizeof(float), stream));

  int blocks = 2;
  notifiedKernel<<<32, blocks, 0, stream>>>(A, notified);

  hipMemPoolAttr attr;
  // Not a real free, since kernel isn't done
  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));

  numElements = 8 * 1024 * 1024;
  HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&B), numElements * sizeof(float), stream));
  // Runtime must reuse the pointer
  REQUIRE(A == B);

  // Make a sync before the second kernel launch to make sure memory B isn't gone
  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  *notified = 1;  // Notify kernel loop to exit
  HIP_CHECK(hipStreamSynchronize(stream));

  // Second kernel launch with new memory
  *notified = 0;
  notifiedKernel<<<32, blocks, 0, stream>>>(B, notified);

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));

  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  *notified = 1;  // Notify kernel loop to exit

  HIP_CHECK(hipStreamSynchronize(stream));

  std::uint64_t value64 = 0;
  attr = hipMemPoolAttrReservedMemCurrent;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the current reserved is at least allocation size of buffer C (4KB)
  REQUIRE(sizeof(float) * 1024 <= value64);

  attr = hipMemPoolAttrUsedMemHigh;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the high watermark usage works - both buffers must be reported
  REQUIRE(sizeof(float) * (8 * 1024 * 1024 + 1024) == value64);

  attr = hipMemPoolAttrUsedMemCurrent;
  HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attr, &value64));
  // Make sure the current usage reports just one buffer, because the above free doesn't hold
  // memory
  REQUIRE(sizeof(float) * 1024 == value64);

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(C), stream));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipHostFree(notified));
}