From aa5f494517eb2ff9df33ab366e8d6be0386cfd23 Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Fri, 5 Aug 2022 16:21:15 +0530 Subject: [PATCH] SWDEV-312831 - Adding device side malloc/new tests (#2635) Following tests are added in this patch 1) Functional Tests 2) Stress Tests 3) Multiprocess Tests Change-Id: Ifd2310fe8f5d555a9af9b87c296fdf6dd4a0a552 [ROCm/hip-tests commit: de0eb210235116369a4fb7c7d7400aff18bee85d] --- .../hip-tests/catch/multiproc/CMakeLists.txt | 1 + .../catch/multiproc/deviceAllocationMproc.cc | 319 ++++ .../hip-tests/catch/stress/CMakeLists.txt | 1 + .../stress/deviceallocation/CMakeLists.txt | 8 + .../Stress_deviceAllocationStress.cc | 487 ++++++ .../catch/unit/deviceLib/CMakeLists.txt | 7 + .../hip-tests/catch/unit/deviceLib/defs.h | 36 + .../catch/unit/deviceLib/deviceAllocCommon.h | 132 ++ .../catch/unit/deviceLib/deviceAllocation.cc | 1494 +++++++++++++++++ .../catch/unit/deviceLib/kerDevAllocMultCO.cc | 39 + .../unit/deviceLib/kerDevAllocSingleKer.cc | 57 + .../catch/unit/deviceLib/kerDevFreeMultCO.cc | 47 + .../catch/unit/deviceLib/kerDevWriteMultCO.cc | 36 + 13 files changed, 2664 insertions(+) create mode 100644 projects/hip-tests/catch/multiproc/deviceAllocationMproc.cc create mode 100644 projects/hip-tests/catch/stress/deviceallocation/CMakeLists.txt create mode 100644 projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc create mode 100644 projects/hip-tests/catch/unit/deviceLib/defs.h create mode 100644 projects/hip-tests/catch/unit/deviceLib/deviceAllocCommon.h create mode 100644 projects/hip-tests/catch/unit/deviceLib/deviceAllocation.cc create mode 100644 projects/hip-tests/catch/unit/deviceLib/kerDevAllocMultCO.cc create mode 100644 projects/hip-tests/catch/unit/deviceLib/kerDevAllocSingleKer.cc create mode 100644 projects/hip-tests/catch/unit/deviceLib/kerDevFreeMultCO.cc create mode 100644 projects/hip-tests/catch/unit/deviceLib/kerDevWriteMultCO.cc diff --git a/projects/hip-tests/catch/multiproc/CMakeLists.txt b/projects/hip-tests/catch/multiproc/CMakeLists.txt index c8da597abd..128384ea3a 100644 --- a/projects/hip-tests/catch/multiproc/CMakeLists.txt +++ b/projects/hip-tests/catch/multiproc/CMakeLists.txt @@ -14,6 +14,7 @@ set(LINUX_TEST_SRC hipMemCoherencyTstMProc.cc hipIpcEventHandle.cc hipIpcMemAccessTest.cc + deviceAllocationMproc.cc ) # the last argument linker libraries is required for this test but optional to the function diff --git a/projects/hip-tests/catch/multiproc/deviceAllocationMproc.cc b/projects/hip-tests/catch/multiproc/deviceAllocationMproc.cc new file mode 100644 index 0000000000..0958cb8781 --- /dev/null +++ b/projects/hip-tests/catch/multiproc/deviceAllocationMproc.cc @@ -0,0 +1,319 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include + +#ifdef __linux__ +#include +#include +#include +#include +#endif + +#define SIZE 2097152 +// GPU threads +#define BLOCKSIZE 512 +#define GRIDSIZE 256 + +__device__ static char* dev_common_ptr = nullptr; + +/** + * This kernel allocates a memory chunk using malloc(). + */ +static __global__ void kerTestDeviceMalloc(size_t size) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + if (myId == 0) { + dev_common_ptr = reinterpret_cast (malloc(size)); + if (dev_common_ptr == nullptr) { + printf("Device Allocation Failed! \n"); + return; + } + } +} + +/** + * This kernel writes to the memory location allocated in kernel + * kerTestDeviceMalloc or kerTestDeviceNew. + */ +static __global__ void kerTestDeviceWrite() { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + if (dev_common_ptr == nullptr) { + printf("Device Allocation Failed! \n"); + return; + } + *(dev_common_ptr + myId) = SCHAR_MAX; +} + +/** + * This kernel frees the memory chunk allocated in kernel + * kerTestDeviceMalloc using free(). + */ +static __global__ void kerTestDeviceFree(int *result) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + if (myId == 0) { + if (dev_common_ptr != nullptr) { + *result = 1; + for (int idx = 0; idx < (BLOCKSIZE*GRIDSIZE); idx++) { + if (*(dev_common_ptr + myId) != SCHAR_MAX) { + *result = 0; + break; + } + } + free(dev_common_ptr); + } else { + *result = 0; + } + } +} + +/** + * This kernel allocates a memory chunk using new operator. + */ +static __global__ void kerTestDeviceNew(size_t size) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + if (myId == 0) { + dev_common_ptr = new char[size]; + if (dev_common_ptr == nullptr) { + printf("Device Allocation Failed! \n"); + return; + } + } +} + +/** + * This kernel frees the memory chunk allocated in kernel + * kerTestDeviceNew using delete operator. + */ +static __global__ void kerTestDeviceDelete(int *result) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + if (myId == 0) { + if (dev_common_ptr != nullptr) { + *result = 1; + for (int idx = 0; idx < (BLOCKSIZE*GRIDSIZE); idx++) { + if (*(dev_common_ptr + myId) != SCHAR_MAX) { + *result = 0; + break; + } + } + delete[] dev_common_ptr; + } else { + *result = 0; + } + } +} + +/** + * Test device malloc()/new in both Parent and Child Process. + * Allocate SIZE bytes in both parent and child process. Verify + * the allocated size in both parent and child process. + */ +static bool testDeviceAllocMulProc(bool testmalloc) { + int fd[2]; + pid_t childpid; + bool testResult = false; + size_t avail = 0, tot = 0; + // create pipe descriptors + pipe(fd); + // fork process + childpid = fork(); + if (childpid > 0) { // Parent + close(fd[1]); + // Allocate in parent + if (testmalloc) { + kerTestDeviceMalloc<<<1, 1>>>(SIZE); + } else { + kerTestDeviceNew<<<1, 1>>>(SIZE); + } + HIP_CHECK(hipDeviceSynchronize()); + // Check allocated memory size + HIP_CHECK(hipMemGetInfo(&avail, &tot)); + if ((tot - avail) < SIZE) { + return false; + } + // parent will wait to read the device cnt + read(fd[0], &testResult, sizeof(testResult)); + // close the read-descriptor + close(fd[0]); + // wait for child exit + wait(NULL); + // At this point the child process exits. + // Ensure that device memory allocated from child is freed. + HIP_CHECK(hipMemGetInfo(&avail, &tot)); + if ((tot - avail) < SIZE) { + testResult = false; + } + } else if (!childpid) { // Child + // Wait for hipDeviceSetLimit() completion in parent. + close(fd[0]); + // Allocate in child + if (testmalloc) { + kerTestDeviceMalloc<<<1, 1>>>(SIZE); + } else { + kerTestDeviceNew<<<1, 1>>>(SIZE); + } + HIP_CHECK(hipDeviceSynchronize()); + // Check allocated memory size + HIP_CHECK(hipMemGetInfo(&avail, &tot)); + if ((tot - avail) < SIZE) { + testResult = false; + } else { + testResult = true; + } + // send the value on the write-descriptor: + write(fd[1], &testResult, sizeof(testResult)); + // close the write descriptor: + close(fd[1]); + exit(0); + } + return testResult; +} + +/** + * Test device malloc()/new, write and free()/delete[] + * from both Parent and Child Process. From both Parent and + * Child Process invoke the kernel to allocate memory, the + * kernel to write to the allocated memory and a third kernel + * to verify the memory contents and free it. + */ +static bool testDeviceMemMulProc(bool testmalloc) { + int fd[2]; + bool testResult = false; + pid_t childpid; + int testResultChild = 0; + size_t size = BLOCKSIZE*GRIDSIZE; + // create pipe descriptors + pipe(fd); + // fork process + childpid = fork(); + if (childpid > 0) { // Parent + close(fd[1]); + int *result_d{nullptr}, *result_h{nullptr}; + HIP_CHECK(hipMalloc(&result_d, sizeof(int))); + result_h = reinterpret_cast (malloc(sizeof(int))); + REQUIRE(result_h != nullptr); + // Allocate in parent + if (testmalloc) { + kerTestDeviceMalloc<<<1, 1>>>(size); + } else { + kerTestDeviceNew<<<1, 1>>>(size); + } + // Write + kerTestDeviceWrite<<>>(); + // Free + if (testmalloc) { + kerTestDeviceFree<<<1, 1>>>(result_d); + } else { + kerTestDeviceDelete<<<1, 1>>>(result_d); + } + HIP_CHECK(hipDeviceSynchronize()); + *result_h = 0; + HIP_CHECK(hipMemcpy(result_h, result_d, sizeof(int), + hipMemcpyDefault)); + if (*result_h == 0) { + testResult = false; + } else { + testResult = true; + } + // parent will wait to read the device cnt + read(fd[0], &testResultChild, sizeof(int)); + if (testResultChild == 0) { + testResult &= false; + } else { + testResult &= true; + } + // close the read-descriptor + close(fd[0]); + hipFree(result_d); + free(result_h); + // wait for child exit + wait(NULL); + } else if (!childpid) { // Child + // Wait for hipDeviceSetLimit() completion in parent. + close(fd[0]); + int *result_d{nullptr}, *result_h{nullptr}; + HIP_CHECK(hipMalloc(&result_d, sizeof(int))); + result_h = reinterpret_cast (malloc(sizeof(int))); + REQUIRE(result_h != nullptr); + // Allocate in child + if (testmalloc) { + kerTestDeviceMalloc<<<1, 1>>>(size); + } else { + kerTestDeviceNew<<<1, 1>>>(size); + } + // Write + kerTestDeviceWrite<<>>(); + // Free + if (testmalloc) { + kerTestDeviceFree<<<1, 1>>>(result_d); + } else { + kerTestDeviceDelete<<<1, 1>>>(result_d); + } + HIP_CHECK(hipDeviceSynchronize()); + *result_h = 0; + HIP_CHECK(hipMemcpy(result_h, result_d, sizeof(int), + hipMemcpyDefault)); + // send the value on the write-descriptor: + write(fd[1], result_h, sizeof(int)); + // close the write descriptor: + close(fd[1]); + hipFree(result_d); + free(result_h); + exit(0); + } + return testResult; +} + +/** + * Multiprocess device side malloc test. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_MultProcess") { + auto res = testDeviceAllocMulProc(true); + REQUIRE(res == true); +} + +/** + * Multiprocess device side new test. + */ +TEST_CASE("Unit_deviceAllocation_New_MultProcess") { + auto res = testDeviceAllocMulProc(false); + REQUIRE(res == true); +} + +/** + * Multiprocess device side malloc, write and free test. + */ +TEST_CASE("Unit_deviceAllocation_MallocFree_MultProcess") { + auto res = testDeviceMemMulProc(true); + REQUIRE(res == true); +} + +/** + * Multiprocess device side new, write and delete test. + */ +TEST_CASE("Unit_deviceAllocation_NewDelete_MultProcess") { + auto res = testDeviceMemMulProc(false); + REQUIRE(res == true); +} diff --git a/projects/hip-tests/catch/stress/CMakeLists.txt b/projects/hip-tests/catch/stress/CMakeLists.txt index 7ecabc21e1..f69313b245 100644 --- a/projects/hip-tests/catch/stress/CMakeLists.txt +++ b/projects/hip-tests/catch/stress/CMakeLists.txt @@ -6,3 +6,4 @@ if(HIP_PLATFORM MATCHES "amd") add_subdirectory(printf) add_subdirectory(stream) endif() +add_subdirectory(deviceallocation) diff --git a/projects/hip-tests/catch/stress/deviceallocation/CMakeLists.txt b/projects/hip-tests/catch/stress/deviceallocation/CMakeLists.txt new file mode 100644 index 0000000000..ee7eb2a580 --- /dev/null +++ b/projects/hip-tests/catch/stress/deviceallocation/CMakeLists.txt @@ -0,0 +1,8 @@ +# Common Tests - Test independent of all platforms +set(TEST_SRC + Stress_deviceAllocationStress.cc +) + +hip_add_exe_to_target(NAME devalloc + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME stress_test) diff --git a/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc b/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc new file mode 100644 index 0000000000..e60bf66bb4 --- /dev/null +++ b/projects/hip-tests/catch/stress/deviceallocation/Stress_deviceAllocationStress.cc @@ -0,0 +1,487 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include +#include +// Size Macros +#define MEMORY_CHUNK_SIZE (1024*1024) +#define MEMORY_CHUNK_SIZE_ODD (1025*1025) +#define MAXIMUM_CHUNKS (256*1024) +// Subtest Macros +#define NO_ALLOCATION_ONHOST 0 +#define ALLOCATE_ONHOST_HIPMALLOCMANAGED 1 +#define ALLOCATE_ONHOST_HIPMALLOC 2 +// Test Type Macros +#define TEST_MALLOC_FREE 1 +#define TEST_NEW_DELETE 2 +// GPU threads +#define BLOCKSIZE 512 +#define GRIDSIZE 512 +// Test parameters +// Two different loops +#define NUM_OF_LOOP_SINGLE_KER 100000 +#define NUM_OF_LOOP_MULTIPLE_KER 20000 + +// The following flag is defined for platforms (nvidia) +// which honors device memory limit. For AMD this flag +// is disabled and defect is raised. +#if HT_NVIDIA +#define HT_HONORS_DEVICEMEMORY_LIMIT +#endif + +#ifdef HT_HONORS_DEVICEMEMORY_LIMIT +__device__ static char* dev_mem_glob[MAXIMUM_CHUNKS]; +#endif +__device__ static int* dev_mem[GRIDSIZE]; +__device__ static int* dev_common_ptr; + +#ifdef HT_HONORS_DEVICEMEMORY_LIMIT +/** + * This kernel checks kernel allocation of size more than available + * memory. + */ +static __global__ void kerTestDynamicAllocNeg(int test_type, + size_t perThreadSize, + int *ret) { + // Allocate + char* ptr = nullptr; + printf("Memory to allocate in GPU = %zu \n", perThreadSize); + if (test_type == TEST_MALLOC_FREE) { + ptr = reinterpret_cast (malloc(perThreadSize)); + } else { + ptr = new char[perThreadSize]; + } + printf("Allocation Done \n"); + if (ptr == nullptr) { + printf("Allocation Failed. PASSED! \n"); + *ret = 0; + return; + } else { + // Free memory + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } + *ret = -1; + } +} + +/** + * This kernel allocates memory till nullptr is returned. + */ +static __global__ void kerAllocTillExhaust(int test_type, + size_t *total_allocated_mem, + size_t mem_chunk_size) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 of block 0 + if (0 == myId) { + for (int idx = 0; idx < MAXIMUM_CHUNKS; idx++) { + dev_mem_glob[idx] = nullptr; + } + int idx = 0; + if (test_type == TEST_MALLOC_FREE) { + do { + dev_mem_glob[idx] = + reinterpret_cast (malloc(mem_chunk_size)); + if (idx >= MAXIMUM_CHUNKS) { + break; + } + } while (dev_mem_glob[idx++] != nullptr); + } else { + do { + dev_mem_glob[idx] = + reinterpret_cast (new char[mem_chunk_size]); + if (idx >= MAXIMUM_CHUNKS) { + break; + } + } while (dev_mem_glob[idx++] != nullptr); + } + idx = 0; + *total_allocated_mem = 0; + while ((dev_mem_glob[idx] != nullptr) && + (idx < MAXIMUM_CHUNKS)) { + *total_allocated_mem = *total_allocated_mem + mem_chunk_size; + idx++; + } + } +} + +/** + * This kernel deletes the memory. + */ +static __global__ void kerFreeAll(int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + if (0 == myId) { + if (test_type == TEST_MALLOC_FREE) { + int idx = 0; + while (dev_mem_glob[idx] != nullptr) { + free(dev_mem_glob[idx++]); + if (idx >= MAXIMUM_CHUNKS) { + break; + } + } + } else { + int idx = 0; + while (dev_mem_glob[idx] != nullptr) { + delete[] (dev_mem_glob[idx++]); + if (idx >= MAXIMUM_CHUNKS) { + break; + } + } + } + } +} +#endif +/** + * This kernel allocates memory once in thread 0 of each block and + * access this memory in all threads of the block. The memory is + * finally deleted in last thread of each block. + */ +static __global__ void kerBlockLevelMemoryAllocation(int *outputBuf, + int test_type) { + int myThreadId = threadIdx.x, lastThreadId = (blockDim.x - 1); + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 + if (0 == myThreadId) { + if (test_type == TEST_MALLOC_FREE) { + dev_mem[blockIdx.x] = + reinterpret_cast (malloc(blockDim.x*sizeof(int))); + } else { + dev_mem[blockIdx.x] = + reinterpret_cast (new int[blockDim.x]); + } + } + // All threads wait at this barrier + __syncthreads(); + // Check allocated memory in all threads in block before access + if (dev_mem[blockIdx.x] == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + int *ptr = reinterpret_cast (dev_mem[blockIdx.x]); + // Copy to buffer + ptr[myThreadId] = myId; + // All threads wait + __syncthreads(); + // Copy memory to host and free the memory in thread + if (lastThreadId == myThreadId) { + for (size_t idx = 0; idx < blockDim.x; idx++) { + outputBuf[idx + blockDim.x * blockIdx.x] = ptr[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } + } +} + +/** + * This kernel allocates memory in one thread. + */ +static __global__ void kerAlloc(int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 of block 0 + if (0 == myId) { + if (test_type == TEST_MALLOC_FREE) { + dev_common_ptr = + reinterpret_cast (malloc(blockDim.x*gridDim.x*sizeof(int))); + } else { + dev_common_ptr = + reinterpret_cast (new int[blockDim.x*gridDim.x]); + } + } +} + +/** + * This kernel writes to memory allocated in . + */ +static __global__ void kerWrite() { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Check allocated memory in all threads in block before access + if (dev_common_ptr == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + // Copy to buffer + dev_common_ptr[myId] = myId; +} + +/** + * This kernel copies the contents of memory allocated in + * to host and deletes the memory from thread 0. + */ +static __global__ void kerFree(int *outputBuf, int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Check allocated memory in all threads in block before access + if (dev_common_ptr == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + if (0 == myId) { + for (size_t idx = 0; idx < (blockDim.x*gridDim.x); idx++) { + outputBuf[idx] = dev_common_ptr[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(dev_common_ptr); + } else { + delete[] dev_common_ptr; + } + } +} + +#ifdef HT_HONORS_DEVICEMEMORY_LIMIT +/** + * Local function: Launch kerAllocTillExhaust<<<>>> and + * kerFreeAll<<<>>> to test memory allocation till all device + * memory is exhausted. + */ +static bool TestAllocationOfAllAvailableMemory(int test_type, + int category, size_t mem_chunk_size) { + size_t avail1 = 0, avail2 = 0, tot = 0; + constexpr size_t host_alloc = 2147483648; // 2 GB + HIP_CHECK(hipMemGetInfo(&avail1, &tot)); +#if HT_NVIDIA + HIP_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, avail1)); +#endif + size_t *tot_alloc_mem_d = nullptr, *tot_alloc_mem_h = nullptr; + tot_alloc_mem_h = + reinterpret_cast (malloc(sizeof(size_t))); + REQUIRE(nullptr != tot_alloc_mem_h); + HIP_CHECK(hipMalloc(&tot_alloc_mem_d, sizeof(size_t))); + REQUIRE(nullptr != tot_alloc_mem_d); + char *devptrHost = nullptr; + if (category == ALLOCATE_ONHOST_HIPMALLOCMANAGED) { + HIP_CHECK(hipMallocManaged(&devptrHost, host_alloc)); + } else if (category == ALLOCATE_ONHOST_HIPMALLOC) { + HIP_CHECK(hipMalloc(&devptrHost, host_alloc)); + } + HIP_CHECK(hipMemGetInfo(&avail1, &tot)); + INFO("Total available memory " << tot); + INFO("Available memory before allocation " << avail1); + // Launch Test Kernel + kerAllocTillExhaust<<<1, 1>>>(test_type, tot_alloc_mem_d, + mem_chunk_size); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(tot_alloc_mem_h, tot_alloc_mem_d, + sizeof(size_t), hipMemcpyDefault)); + HIP_CHECK(hipMemGetInfo(&avail2, &tot)); + kerFreeAll<<<1, 1>>>(test_type); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + bool bPassed = false; + INFO("Available memory after allocation " << avail2); + if (category == NO_ALLOCATION_ONHOST) { + size_t allocated_dev_mem = (tot - avail2); + if (allocated_dev_mem >= *tot_alloc_mem_h) { + bPassed = true; + } + } else if ((category == ALLOCATE_ONHOST_HIPMALLOCMANAGED) || + (category == ALLOCATE_ONHOST_HIPMALLOC)) { + size_t allocated_dev_mem = (tot - avail2 - host_alloc); + if (allocated_dev_mem >= *tot_alloc_mem_h) { + bPassed = true; + } + hipFree(devptrHost); + } + hipFree(tot_alloc_mem_d); + free(tot_alloc_mem_h); + return bPassed; +} +#endif +/** + * Local function: Launch kerBlockLevelMemoryAllocation<<<>>> + * in a loop to stress test allocation and deallocation. + */ +static bool TestMemoryAllocationInLoop(int test_type, + bool isMultikernel = false) { + int *outputVec_d{nullptr}, *outputVec_h{nullptr}; + int arraysize = (BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast (malloc(sizeof(int) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize))); + bool bPassed = true; + // Launch Test Kernel + int max_index = 0; + if (isMultikernel) { + max_index = NUM_OF_LOOP_MULTIPLE_KER; + } else { + max_index = NUM_OF_LOOP_SINGLE_KER; + } + for (int idx = 0; idx < max_index; idx++) { + if (isMultikernel) { + kerAlloc<<>>(test_type); + kerWrite<<>>(); + kerFree<<>>(outputVec_d, test_type); + } else { + kerBlockLevelMemoryAllocation<<>>(outputVec_d, + test_type); + } + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, + hipMemcpyDefault)); + bPassed = true; + for (int idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != idx) { + bPassed = false; + break; + } + } + if (!bPassed) break; + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +#ifdef HT_HONORS_DEVICEMEMORY_LIMIT +/** + * Scenario: Test malloc till nullptr is returned using even chunksize. + */ +TEST_CASE("Stress_deviceAllocation_malloc_Even") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_MALLOC_FREE, + NO_ALLOCATION_ONHOST, MEMORY_CHUNK_SIZE)); +} + +/** + * Scenario: Test malloc till nullptr is returned using odd chunksize. + */ +TEST_CASE("Stress_deviceAllocation_malloc_Odd") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_MALLOC_FREE, + NO_ALLOCATION_ONHOST, MEMORY_CHUNK_SIZE_ODD)); +} + +/** + * Scenario: Test new till nullptr is returned using even chunksize. + */ +TEST_CASE("Stress_deviceAllocation_new_Even") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_NEW_DELETE, + NO_ALLOCATION_ONHOST, MEMORY_CHUNK_SIZE)); +} + +/** + * Scenario: Test new till nullptr is returned using odd chunksize. + */ +TEST_CASE("Stress_deviceAllocation_new_Odd") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_NEW_DELETE, + NO_ALLOCATION_ONHOST, MEMORY_CHUNK_SIZE_ODD)); +} + +/** + * Scenario: This test checks device allocation using malloc till nullptr + * is returned. Device memory is also allocated using hipmallocmanaged + * from host. + */ +TEST_CASE("Stress_deviceAllocation_malloc_hipmallocmanaged") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_MALLOC_FREE, + ALLOCATE_ONHOST_HIPMALLOCMANAGED, MEMORY_CHUNK_SIZE)); +} + +/** + * Scenario: This test checks device allocation using new till nullptr + * is returned. Device memory is also allocated using hipmallocmanaged + * from host. + */ +TEST_CASE("Stress_deviceAllocation_new_hipmallocmanaged") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_NEW_DELETE, + ALLOCATE_ONHOST_HIPMALLOCMANAGED, MEMORY_CHUNK_SIZE)); +} + +/** + * Scenario: This test checks device allocation using malloc till nullptr + * is returned. Device memory is also allocated using hipmalloc from host. + */ +TEST_CASE("Stress_deviceAllocation_malloc_hipmalloc") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_MALLOC_FREE, + ALLOCATE_ONHOST_HIPMALLOC, MEMORY_CHUNK_SIZE)); +} + +/** + * Scenario: This test checks device allocation using new till nullptr + * is returned. Device memory is also allocated using hipmalloc from host. + */ +TEST_CASE("Stress_deviceAllocation_new_hipmalloc") { + REQUIRE(true == TestAllocationOfAllAvailableMemory(TEST_NEW_DELETE, + ALLOCATE_ONHOST_HIPMALLOC, MEMORY_CHUNK_SIZE)); +} + +/** + * Scenario: This test validates device allocation negative scenario + * when size > available memory. + */ +TEST_CASE("Stress_deviceAllocation_Negative") { + int *ret_d{nullptr}, *ret_h{nullptr}; + size_t avail = 0, tot = 0; + HIP_CHECK(hipMemGetInfo(&avail, &tot)); + printf("Available Memory in GPU = %zu \n", avail); + ret_h = reinterpret_cast (malloc(sizeof(int))); + REQUIRE(ret_h != nullptr); + HIP_CHECK(hipMalloc(&ret_d, (sizeof(int)))); + SECTION("Test allocation with malloc") { + kerTestDynamicAllocNeg<<<1, 1>>>(TEST_MALLOC_FREE, (avail + 1), ret_d); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipMemcpy(ret_h, ret_d, sizeof(int), hipMemcpyDefault)); + REQUIRE(0 == *ret_h); + } + + SECTION("Test allocation with new") { + kerTestDynamicAllocNeg<<<1, 1>>>(TEST_NEW_DELETE, (avail + 1), ret_d); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipMemcpy(ret_h, ret_d, sizeof(int), hipMemcpyDefault)); + REQUIRE(0 == *ret_h); + } + hipFree(ret_d); + free(ret_h); +} +#endif +/** + * Scenario: This test performs stress test of malloc/free in a loop + * using single kernel. + */ +TEST_CASE("Stress_deviceAllocation_malloc_loop_singlekernel") { + REQUIRE(true == TestMemoryAllocationInLoop(TEST_MALLOC_FREE, false)); +} + +/** + * Scenario: This test performs stress test of new/delete in a loop + * using single kernel. + */ +TEST_CASE("Stress_deviceAllocation_new_loop_singlekernel") { + REQUIRE(true == TestMemoryAllocationInLoop(TEST_NEW_DELETE, false)); +} + +/** + * Scenario: This test performs stress test of malloc/free in a loop + * using multiple kernel. + */ +TEST_CASE("Stress_deviceAllocation_malloc_loop_multkernel") { + REQUIRE(true == TestMemoryAllocationInLoop(TEST_MALLOC_FREE, true)); +} + +/** + * Scenario: This test performs stress test of new/delete in a loop + * using multiple kernel. + */ +TEST_CASE("Stress_deviceAllocation_new_loop_multkernel") { + REQUIRE(true == TestMemoryAllocationInLoop(TEST_NEW_DELETE, true)); +} diff --git a/projects/hip-tests/catch/unit/deviceLib/CMakeLists.txt b/projects/hip-tests/catch/unit/deviceLib/CMakeLists.txt index 89027dcbf5..5805e11b5e 100644 --- a/projects/hip-tests/catch/unit/deviceLib/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/deviceLib/CMakeLists.txt @@ -13,6 +13,7 @@ set(TEST_SRC syncthreadsand.cc syncthreadscount.cc syncthreadsor.cc + deviceAllocation.cc ) # AMD only tests @@ -43,6 +44,11 @@ set(AMD_ARCH_SPEC_TEST_SRC unsafeAtomicAdd_NonCoherent_withunsafeflag.cc ) +add_custom_target(kerDevAllocMultCO.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${CMAKE_CURRENT_SOURCE_DIR}/kerDevAllocMultCO.cc -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/deviceLib/kerDevAllocMultCO.code -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I${CMAKE_CURRENT_SOURCE_DIR}/../../include) +add_custom_target(kerDevWriteMultCO.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${CMAKE_CURRENT_SOURCE_DIR}/kerDevWriteMultCO.cc -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/deviceLib/kerDevWriteMultCO.code -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I${CMAKE_CURRENT_SOURCE_DIR}/../../include) +add_custom_target(kerDevFreeMultCO.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${CMAKE_CURRENT_SOURCE_DIR}/kerDevFreeMultCO.cc -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/deviceLib/kerDevFreeMultCO.code -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I${CMAKE_CURRENT_SOURCE_DIR}/../../include) +add_custom_target(kerDevAllocSingleKer.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${CMAKE_CURRENT_SOURCE_DIR}/kerDevAllocSingleKer.cc -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/deviceLib/kerDevAllocSingleKer.code -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I${CMAKE_CURRENT_SOURCE_DIR}/../../include) + if(HIP_PLATFORM MATCHES "amd") if (DEFINED OFFLOAD_ARCH_STR) string(FIND ${OFFLOAD_ARCH_STR} "gfx90a" ARCH_CHECK) @@ -78,3 +84,4 @@ elseif(HIP_PLATFORM MATCHES "nvidia") TEST_TARGET_NAME build_tests COMPILE_OPTIONS --Wno-deprecated-declarations) endif() +add_dependencies(build_tests kerDevAllocMultCO.code kerDevWriteMultCO.code kerDevFreeMultCO.code kerDevAllocSingleKer.code) diff --git a/projects/hip-tests/catch/unit/deviceLib/defs.h b/projects/hip-tests/catch/unit/deviceLib/defs.h new file mode 100644 index 0000000000..a1b6a5efdf --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/defs.h @@ -0,0 +1,36 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#pragma once + +#define INTERNAL_BUFFER_SIZE 8 +// Test Type +#define TEST_MALLOC_FREE 1 +#define TEST_NEW_DELETE 2 +// Kernel Params +#define BLOCKSIZE 64 +#define GRIDSIZE 32 +// Code Obj +#define DEV_ALLOC_SINGKER_COBJ "kerDevAllocSingleKer.code" +#define DEV_ALLOC_SINGKER_COBJ_FUNC "ker_TestDynamicAllocInAllThreads_CodeObj" +#define DEV_ALLOC_MULCOBJ "kerDevAllocMultCO.code" +#define DEV_WRITE_MULCOBJ "kerDevWriteMultCO.code" +#define DEV_FREE_MULCOBJ "kerDevFreeMultCO.code" +#define DEV_ALLOC_MULCODEOBJ_ALLOC "ker_Alloc_MultCodeObj" +#define DEV_ALLOC_MULCODEOBJ_WRITE "ker_Write_MultCodeObj" +#define DEV_ALLOC_MULCODEOBJ_FREE "ker_Free_MultCodeObj" diff --git a/projects/hip-tests/catch/unit/deviceLib/deviceAllocCommon.h b/projects/hip-tests/catch/unit/deviceLib/deviceAllocCommon.h new file mode 100644 index 0000000000..d5448ddce4 --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/deviceAllocCommon.h @@ -0,0 +1,132 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once +#include +#include +#include +#include +#include +#include "./defs.h" + +static __device__ int* deviceAlloc(int test_type); +static __device__ void deviceWrite(int myId, int *devmem); +static __device__ void deviceFree(int *outputBuf, int *devmem, + int test_type, int myId); + +/** + * Allocation base and derived class to test dynamic allocation. + */ +class baseAlloc{ + public: + virtual __device__ int* alloc(size_t size) = 0; + virtual __device__ void free(int* ptr) = 0; +}; + +class derivedAlloc: public baseAlloc{ + public: + virtual __device__ int* alloc(size_t size) { + return new int[size]; + } + virtual __device__ void free(int* ptr) { + delete ptr; + } +}; + +/** + * Allocation Structure to test dynamic allocation. + */ +struct deviceAllocFunc{ + int* (*alloc)(int); + void (*write)(int, int*); + void (*free)(int*, int*, int, int); +}; + +/** + * Simple Structure to test dynamic allocation. + */ +struct simpleStruct{ + int32_t i; + double d; + float f; + int16_t s; + char c; + int32_t iarr[INTERNAL_BUFFER_SIZE]; + bool operator!=(const struct simpleStruct &inpStr) { + if ((i != inpStr.i) || (d != inpStr.d) || + (f != inpStr.f) || (s != inpStr.s) || (c != inpStr.c)) { + return true; + } + for (int32_t idx = 0; idx < INTERNAL_BUFFER_SIZE; idx++) { + if (iarr[idx] != inpStr.iarr[idx]) { + return true; + } + } + return false; + } +}; + +/** + * Simple Structure containing thread information + */ +struct threadInfo{ + int threadid; + int blockid; + int32_t ival; + double dval; + float fval; + int16_t sval; + char cval; +}; + +/** + * C/C++ Union + */ +union testInfoUnion{ + int32_t ival; + double dval; + float fval; + int16_t sval; + char cval; +}; + +/** + * Complex (nested) Structure to test dynamic allocation using malloc. + */ +struct complexStructure{ + struct threadInfo *sthreadInfo; + __device__ void alloc_internal_members(int test_type, size_t size) { + sthreadInfo = nullptr; + if (test_type == TEST_MALLOC_FREE) { + sthreadInfo = reinterpret_cast( + malloc(size*sizeof(struct threadInfo))); + } else { + sthreadInfo = new struct threadInfo[size]; + } + } + + __device__ void free_internal_members(int test_type) { + if (test_type == TEST_MALLOC_FREE) { + free(sthreadInfo); + } else { + delete[] sthreadInfo; + } + sthreadInfo = nullptr; + } +}; diff --git a/projects/hip-tests/catch/unit/deviceLib/deviceAllocation.cc b/projects/hip-tests/catch/unit/deviceLib/deviceAllocation.cc new file mode 100644 index 0000000000..4982e4f976 --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/deviceAllocation.cc @@ -0,0 +1,1494 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "deviceAllocCommon.h" + +__device__ static void* dev_mem_glob; +__device__ struct deviceAllocFunc allocfunc{&deviceAlloc, &deviceWrite, + &deviceFree}; +__device__ class derivedAlloc classalloc; +constexpr auto num_threads = 5; +static bool thread_results[num_threads]; +__device__ static void* dev_ptr[num_threads][GRIDSIZE]; + +/** + * This kernel allocates and deallocates in every thread + * of every block. + */ +template +static __global__ void kerTestDynamicAllocInAllThread(T *outputBuf, + int test_type, T value, size_t perThreadSize) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + size_t size = 0; + T* ptr = nullptr; + if (test_type == TEST_MALLOC_FREE) { + size = perThreadSize * sizeof(T); + ptr = reinterpret_cast (malloc(size)); + } else { + size = perThreadSize; + ptr = new T[perThreadSize]; + } + if (ptr == nullptr) { + printf("Device Allocation in thread %d Failed! \n", myId); + return; + } + // Set memory + for (size_t idx = 0; idx < perThreadSize; idx++) { + ptr[idx] = value; + } + // Copy to output buffer + for (size_t idx = 0; idx < perThreadSize; idx++) { + outputBuf[myId*perThreadSize + idx] = ptr[idx]; + } + // Free memory + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } +} + +/** + * This kernel allocates and deallocates using virtual functions in every + * thread of every block. + */ +static __global__ void kerTestDynamicAllocVirtualFunc(int *outputBuf, + size_t perThreadSize) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + baseAlloc *palloc = &classalloc; + // Allocate + int* ptr = palloc->alloc(perThreadSize); + + if (ptr == nullptr) { + printf("Device Allocation in thread %d Failed! \n", myId); + return; + } + // Set memory + for (size_t idx = 0; idx < perThreadSize; idx++) { + ptr[idx] = myId; + } + // Copy to output buffer + for (size_t idx = 0; idx < perThreadSize; idx++) { + outputBuf[myId*perThreadSize + idx] = ptr[idx]; + } + // Free memory + palloc->free(ptr); +} + +/** + * This kernel allocates memory in one thread, + * access/modifies it in all threads of block and copies + * data to host and frees the memory in another thread. + */ +template +static __global__ void kerTestAccessInAllThreadsInBlock(T *outputBuf, + int test_type, T value, int host_thr_idx) { + int myThreadId = threadIdx.x, lastThreadId = (blockDim.x - 1); + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 + if (0 == myThreadId) { + if (test_type == TEST_MALLOC_FREE) { + dev_ptr[host_thr_idx][blockIdx.x] = + reinterpret_cast (malloc(blockDim.x*sizeof(T))); + } else { + dev_ptr[host_thr_idx][blockIdx.x] = + reinterpret_cast (new T[blockDim.x]); + } + } + // All threads wait at this barrier + __syncthreads(); + // Check allocated memory in all threads in block before access + if (dev_ptr[host_thr_idx][blockIdx.x] == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + T *ptr = reinterpret_cast (dev_ptr[host_thr_idx][blockIdx.x]); + // Copy to buffer + ptr[myThreadId] = value; + // All threads wait + __syncthreads(); + // Copy memory to host and free the memory in thread + if (lastThreadId == myThreadId) { + for (size_t idx = 0; idx < blockDim.x; idx++) { + outputBuf[idx + blockDim.x * blockIdx.x] = ptr[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } + } +} + +/** + * This kernel allocates a nested structure per block in one thread, + * access/modifies it in all threads of block and copies + * data to host and frees the memory in another thread. + */ +static __global__ void kerTestAccessInAllThreads_CmplxStr(int test_type, + int *result) { + int myThreadId = threadIdx.x; + int lastThreadId = (blockDim.x - 1); + int myBlockId = blockIdx.x; + int myGid = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 + if (0 == myThreadId) { + if (test_type == TEST_MALLOC_FREE) { + dev_ptr[0][blockIdx.x] = + reinterpret_cast (malloc(sizeof(struct complexStructure))); + } else { + dev_ptr[0][blockIdx.x] = + reinterpret_cast (new struct complexStructure); + } + struct complexStructure *ptr = + reinterpret_cast (dev_ptr[0][blockIdx.x]); + ptr->alloc_internal_members(test_type, BLOCKSIZE); + } + // All threads wait at this barrier + __syncthreads(); + // Check allocated memory in all threads in block before access + if (dev_ptr[0][blockIdx.x] == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myGid); + return; + } + struct complexStructure *ptr = + reinterpret_cast (dev_ptr[0][blockIdx.x]); + if (ptr->sthreadInfo == nullptr) { + printf("Structure Allocation Failed in thread = %d \n", myGid); + return; + } + // Copy to buffer + ptr->sthreadInfo[myThreadId].threadid = myThreadId; + ptr->sthreadInfo[myThreadId].blockid = myBlockId; + ptr->sthreadInfo[myThreadId].ival = INT_MAX; + ptr->sthreadInfo[myThreadId].dval = DBL_MAX; + ptr->sthreadInfo[myThreadId].fval = FLT_MAX; + ptr->sthreadInfo[myThreadId].sval = SHRT_MAX; + ptr->sthreadInfo[myThreadId].cval = SCHAR_MAX; + // All threads wait + __syncthreads(); + // Copy memory to host and free the memory in thread + if (lastThreadId == myThreadId) { + int match = 1; + for (int idx = 0; idx < BLOCKSIZE; idx++) { + if ((ptr->sthreadInfo[idx].threadid != idx) || + (ptr->sthreadInfo[idx].blockid != myBlockId) || + (ptr->sthreadInfo[idx].ival != INT_MAX) || + (ptr->sthreadInfo[idx].dval != DBL_MAX) || + (ptr->sthreadInfo[idx].fval != FLT_MAX) || + (ptr->sthreadInfo[idx].sval != SHRT_MAX) || + (ptr->sthreadInfo[idx].cval != SCHAR_MAX)) { + match = 0; + break; + } + } + result[blockIdx.x] = match; + ptr->free_internal_members(test_type); + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete ptr; + } + } +} + +/** + * This kernel allocates a union per block in one thread, + * access/modifies it in all threads of block and copies + * data to host and frees the memory in another thread. + */ +static __global__ void kerTestAccessInAllThreadsForUnion( + testInfoUnion *outputBuf, int test_type) { + int myThreadId = threadIdx.x, lastThreadId = (blockDim.x - 1); + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 + if (0 == myThreadId) { + if (test_type == TEST_MALLOC_FREE) { + dev_ptr[0][blockIdx.x] = + reinterpret_cast (malloc(blockDim.x*sizeof(testInfoUnion))); + } else { + dev_ptr[0][blockIdx.x] = + reinterpret_cast (new testInfoUnion[blockDim.x]); + } + } + // All threads wait at this barrier + __syncthreads(); + // Check allocated memory in all threads in block before access + if (dev_ptr[0][blockIdx.x] == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + testInfoUnion *ptr = + reinterpret_cast (dev_ptr[0][blockIdx.x]); + // Copy to buffer + switch (myId % 5) { + case 0: ptr[myThreadId].ival = INT_MAX; break; + case 1: ptr[myThreadId].dval = DBL_MAX; break; + case 2: ptr[myThreadId].fval = FLT_MAX; break; + case 3: ptr[myThreadId].sval = SHRT_MAX; break; + case 4: ptr[myThreadId].cval = SCHAR_MAX; break; + } + // All threads wait + __syncthreads(); + // Copy memory to host and free the memory in thread + if (lastThreadId == myThreadId) { + for (size_t idx = 0; idx < blockDim.x; idx++) { + outputBuf[idx + blockDim.x * blockIdx.x] = ptr[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } + } +} + +/** + * This kernel allocates memory in one thread. + */ +template +static __global__ void kerAlloc(int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 of block 0 + if (0 == myId) { + if (test_type == TEST_MALLOC_FREE) { + dev_mem_glob = + reinterpret_cast (malloc(blockDim.x*gridDim.x*sizeof(T))); + } else { + dev_mem_glob = + reinterpret_cast (new T[blockDim.x*gridDim.x]); + } + } +} + +/** + * This kernel writes to memory allocated in . + */ +template +static __global__ void kerWrite(T value) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Check allocated memory in all threads in block before access + if (dev_mem_glob == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + T *ptr = reinterpret_cast (dev_mem_glob); + // Copy to buffer + ptr[myId] = value; +} + +/** + * This kernel copies the contents of memory allocated in + * to host and deletes the memory from thread 0. + */ +template +static __global__ void kerFree(T *outputBuf, int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Check allocated memory in all threads in block before access + if (dev_mem_glob == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + + T *ptr = reinterpret_cast (dev_mem_glob); + if (0 == myId) { + for (size_t idx = 0; idx < (blockDim.x*gridDim.x); idx++) { + outputBuf[idx] = ptr[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } + } +} + +/** + * This device function allocates memory in one thread. + */ +static __device__ int* deviceAlloc(int test_type) { + int *ptr = nullptr; + if (test_type == TEST_MALLOC_FREE) { + ptr = + reinterpret_cast (malloc(INTERNAL_BUFFER_SIZE*sizeof(int))); + } else { + ptr = + reinterpret_cast (new int[INTERNAL_BUFFER_SIZE]); + } + return ptr; +} + +/** + * This device function writes to memory allocated in deviceAlloc(). + */ +static __device__ void deviceWrite(int myId, int *devmem) { + // Check allocated memory in all threads in block before access + if (devmem == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + // Copy to buffer + for (size_t idx = 0; idx < INTERNAL_BUFFER_SIZE; idx++) { + devmem[idx] = myId; + } +} + +/** + * This device function copies the contents of memory allocated + * in deviceAlloc() to host and deletes the memory from thread 0. + */ +static __device__ void deviceFree(int *outputBuf, int *devmem, + int test_type, int myId) { + // Check allocated memory in all threads in block before access + if (devmem == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + for (size_t idx = 0; idx < INTERNAL_BUFFER_SIZE; idx++) { + outputBuf[myId*INTERNAL_BUFFER_SIZE + idx] = devmem[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(devmem); + } else { + delete[] devmem; + } +} + +/** + * This kernel invokes __device__ allocation functions via pointers. + */ +static __global__ void kerTestAllocationUsingDevFunc(int *outputBuf, + int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + struct deviceAllocFunc *func = &allocfunc; + int *dev_ptr = nullptr; + dev_ptr = func->alloc(test_type); + func->write(myId, dev_ptr); + func->free(outputBuf, dev_ptr, test_type, myId); +} + +/** + * Local function: Allocate local and device memory from host, + * launches kerTestDynamicAllocInAllThread<<<>>> and copies data back + * to host to validate. + */ +template +static bool TestAllocInAllThread(int test_type, + T value, size_t sizeBufferPerThread) { + T *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (sizeBufferPerThread * BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast (malloc(sizeof(T) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize))); + // Launch Test Kernel + kerTestDynamicAllocInAllThread<<>>( + outputVec_d, test_type, value, sizeBufferPerThread); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != value) { + bPassed = false; + break; + } + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Local function: Allocate local and device memory from host, + * launches kerTestAccessInAllThreadsInBlock<<<>>> and copies data back + * to host to validate. + */ +template +static bool TestMemoryAccessInAllThread(int test_type, int thread_idx) { + T *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (BLOCKSIZE * GRIDSIZE); + T data_value = std::numeric_limits::max(); + outputVec_h = reinterpret_cast (malloc(sizeof(T) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize))); + // Launch Test Kernel + kerTestAccessInAllThreadsInBlock<<>>(outputVec_d, + test_type, data_value, thread_idx); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != data_value) { + bPassed = false; + break; + } + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Local function: Launch kerAlloc<<<>>> + */ +template +static void runTestMemoryAccessInAllThread(int test_type, int thread_idx) { + thread_results[thread_idx] = TestMemoryAccessInAllThread(test_type, + thread_idx); +} + +/** + * Local function: Launch kerAlloc<<<>>>, kerWrite<<<>>> and kerFree<<<>>> + * to test kernel allocated memory access across multiple kernels and multiple + * streams. + */ +template +static bool TestMemoryAcrossMulKernels(int test_type, + bool multistream = false) { + T *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (BLOCKSIZE * GRIDSIZE); + T data_value = std::numeric_limits::max(); + outputVec_h = reinterpret_cast (malloc(sizeof(T) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize))); + // Launch Test Kernel + if (multistream) { + hipStream_t stream1, stream2, stream3; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + kerAlloc<<>>(test_type); + HIP_CHECK(hipStreamSynchronize(stream1)); + kerWrite<<>>(data_value); + HIP_CHECK(hipStreamSynchronize(stream2)); + kerFree<<>>(outputVec_d, test_type); + HIP_CHECK(hipStreamSynchronize(stream3)); + HIP_CHECK(hipStreamDestroy(stream1)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream3)); + } else { + kerAlloc<<>>(test_type); + kerWrite<<>>(data_value); + kerFree<<>>(outputVec_d, test_type); + HIP_CHECK(hipDeviceSynchronize()); + } + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != data_value) { + bPassed = false; + break; + } + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Local function: Launch kerAlloc<<<>>> + */ +template +static void runKerAlloc(int test_type) { + kerAlloc<<>>(test_type); +} + +/** + * Local function: Launch kerWrite<<<>>> + */ +template +static void runKerWrite(T data_value) { + kerWrite<<>>(data_value); +} + +/** + * Local function: Launch kerFree<<<>>> + */ +template +static void runKerFree(T *outputVec_d, int test_type) { + kerFree<<>>(outputVec_d, test_type); +} + +/** + * Local function: Launch kerAlloc<<<>>>, kerWrite<<<>>> and kerFree<<<>>> + * across multiple threads. + */ +template +static bool TestDevMemAllocMulKerMulThrd(int test_type) { + T *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (BLOCKSIZE * GRIDSIZE); + T data_value = std::numeric_limits::max(); + outputVec_h = reinterpret_cast (malloc(sizeof(T) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize))); + // Launch all Test Kernel threads + std::thread threadAlloc(runKerAlloc, test_type); + threadAlloc.join(); + std::thread threadWrite(runKerWrite, data_value); + threadWrite.join(); + std::thread threadFree(runKerFree, outputVec_d, test_type); + threadFree.join(); + // Wait for all kernels in device + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != data_value) { + bPassed = false; + break; + } + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} +/** + * Local function: Allocate local and device memory from host, + * launches kerTestAccessInAllThreads_CmplxStr<<<>>> and copies data back + * to host to validate. + */ +static bool TestMemoryAccessInAllThread_CmplxStr(int test_type) { + int *result_d{nullptr}, *result_h{nullptr}; + size_t arraysize = BLOCKSIZE; + result_h = reinterpret_cast (malloc(sizeof(int) * arraysize)); + REQUIRE(result_h != nullptr); + HIP_CHECK(hipMalloc(&result_d, (sizeof(int) * arraysize))); + HIP_CHECK(hipMemset(result_d, 0, (sizeof(int) * arraysize))); + // Launch Test Kernel + kerTestAccessInAllThreads_CmplxStr<<>>( + test_type, result_d); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(result_h, result_d, sizeof(int) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < GRIDSIZE; idx++) { + if (result_h[idx] != 1) { + bPassed = false; + break; + } + } + hipFree(result_d); + free(result_h); + return bPassed; +} + +/** + * Local function: Allocate host and device memory of type union, + * launches kerTestAccessInAllThreadsForUnion<<<>>> and copies data back + * to host to validate. + */ +static bool TestMemoryAccessInAllThread_Union(int test_type) { + testInfoUnion *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast + (malloc(sizeof(testInfoUnion) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, + (sizeof(testInfoUnion) * arraysize))); + // Launch Test Kernel + kerTestAccessInAllThreadsForUnion<<>>(outputVec_d, + test_type); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, + sizeof(testInfoUnion) * arraysize, hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + switch (idx % 5) { + case 0: + if (outputVec_h[idx].ival != INT_MAX) { + bPassed = false; + } + break; + case 1: + if (outputVec_h[idx].dval != DBL_MAX) { + bPassed = false; + } + break; + case 2: + if (outputVec_h[idx].fval != FLT_MAX) { + bPassed = false; + } + break; + case 3: + if (outputVec_h[idx].sval != SHRT_MAX) { + bPassed = false; + } + break; + case 4: + if (outputVec_h[idx].cval != SCHAR_MAX) { + bPassed = false; + } + break; + } + if (bPassed == false) break; + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Local function: Allocate local and device memory from host, + * launches ker_TestDynamicAllocInAllThreads_CodeObj<<<>>> and + * copies data back to host to validate. + */ +static bool TestAlloc_Load_SingleKer_AllocFree(int test_type, + int value, size_t sizeBufferPerThread) { + int *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (sizeBufferPerThread * BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast (malloc(sizeof(int) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize))); + // Launch Test Kernel + hipModule_t Module; + hipFunction_t Function; + HIP_CHECK(hipModuleLoad(&Module, DEV_ALLOC_SINGKER_COBJ)); + HIP_CHECK(hipModuleGetFunction(&Function, Module, + DEV_ALLOC_SINGKER_COBJ_FUNC)); + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + struct { + void* _Output_d; + int _test_type; + int _value; + size_t _size; + } args; + args._Output_d = reinterpret_cast(outputVec_d); + args._test_type = test_type; + args._value = value; + args._size = sizeBufferPerThread; + size_t size = sizeof(args); + + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END}; + HIP_CHECK(hipModuleLaunchKernel(Function, GRIDSIZE, 1, 1, + BLOCKSIZE, 1, 1, 0, + stream, NULL, + reinterpret_cast(&config))); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != value) { + bPassed = false; + break; + } + } + HIP_CHECK(hipModuleUnload(Module)); + HIP_CHECK(hipStreamDestroy(stream)); + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Local function: Allocate local and device memory from host, + * launches ker_Alloc_MultCodeObj<<<>>>, ker_Write_MultCodeObj<<<>>> and + * ker_Free_MultCodeObj<<<>>> copies data back to host to validate. + */ +static bool TestAlloc_Load_MultKernels(int test_type, + int value) { + int *outputVec_d{nullptr}, *outputVec_h{nullptr}; + int **dev_addr{nullptr}; + size_t arraysize = (BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast (malloc(sizeof(int) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize))); + HIP_CHECK(hipMalloc(&dev_addr, (sizeof(int*)))); + // Launch Test Kernel + hipModule_t ModuleAlloc, ModuleWrite, ModuleFree; + hipFunction_t FunctionAlloc, FunctionAcess, FunctionFree; + // Load ker_Alloc_MultCodeObj + HIP_CHECK(hipModuleLoad(&ModuleAlloc, DEV_ALLOC_MULCOBJ)); + HIP_CHECK(hipModuleLoad(&ModuleWrite, DEV_WRITE_MULCOBJ)); + HIP_CHECK(hipModuleLoad(&ModuleFree, DEV_FREE_MULCOBJ)); + HIP_CHECK(hipModuleGetFunction(&FunctionAlloc, ModuleAlloc, + DEV_ALLOC_MULCODEOBJ_ALLOC)); + // Load ker_Write_MultCodeObj + HIP_CHECK(hipModuleGetFunction(&FunctionAcess, ModuleWrite, + DEV_ALLOC_MULCODEOBJ_WRITE)); + // Load ker_Free_MultCodeObj + HIP_CHECK(hipModuleGetFunction(&FunctionFree, ModuleFree, + DEV_ALLOC_MULCODEOBJ_FREE)); + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + struct { + void **__dev_addr; + int _test_type; + } args1; + args1.__dev_addr = reinterpret_cast(dev_addr); + args1._test_type = test_type; + size_t size1 = sizeof(args1); + + void* config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args1, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1, + HIP_LAUNCH_PARAM_END}; + struct { + void **__dev_addr; + int _value; + } args2; + args2.__dev_addr = reinterpret_cast(dev_addr); + args2._value = value; + size_t size2 = sizeof(args2); + + void* config2[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args2, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size2, + HIP_LAUNCH_PARAM_END}; + struct { + void* _output; + void **__dev_addr; + int _test_type; + } args3; + args3._output = reinterpret_cast(outputVec_d); + args3.__dev_addr = reinterpret_cast(dev_addr); + args3._test_type = test_type; + size_t size3 = sizeof(args3); + + void* config3[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args3, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size3, + HIP_LAUNCH_PARAM_END}; + // Launch ker_Alloc_MultCodeObj + HIP_CHECK(hipModuleLaunchKernel(FunctionAlloc, GRIDSIZE, 1, 1, + BLOCKSIZE, 1, 1, 0, + stream, NULL, + reinterpret_cast(&config1))); + // Launch ker_Write_MultCodeObj + HIP_CHECK(hipModuleLaunchKernel(FunctionAcess, GRIDSIZE, 1, 1, + BLOCKSIZE, 1, 1, 0, + stream, NULL, + reinterpret_cast(&config2))); + // Launch ker_Free_MultCodeObj + HIP_CHECK(hipModuleLaunchKernel(FunctionFree, GRIDSIZE, 1, 1, + BLOCKSIZE, 1, 1, 0, + stream, NULL, + reinterpret_cast(&config3))); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != value) { + bPassed = false; + break; + } + } + HIP_CHECK(hipModuleUnload(ModuleAlloc)); + HIP_CHECK(hipModuleUnload(ModuleWrite)); + HIP_CHECK(hipModuleUnload(ModuleFree)); + HIP_CHECK(hipStreamDestroy(stream)); + hipFree(dev_addr); + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Local function: Launch kerAlloc<<<>>>, kerWrite<<<>>> and kerFree<<<>>> + * to test kernel allocated memory access across multiple kernels using + * hipGraph. + */ +template +static bool TestMemoryAcrossMulKernelsUsingGraph(int test_type) { + T *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (BLOCKSIZE * GRIDSIZE); + T data_value = std::numeric_limits::max(); + outputVec_h = reinterpret_cast (malloc(sizeof(T) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize))); + // Launch Test Kernels using graph + hipGraph_t graph; + hipStream_t streamForGraph; + hipGraphExec_t graphExec; + HIP_CHECK(hipStreamCreate(&streamForGraph)); + HIP_CHECK(hipGraphCreate(&graph, 0)); + // Create Allocation Kernel Node + hipGraphNode_t kernelnode_1; + hipKernelNodeParams kernelNodeParams1{}; + void* kernelArgs1[] = {reinterpret_cast(&test_type)}; + kernelNodeParams1.func = reinterpret_cast(kerAlloc); + kernelNodeParams1.gridDim = dim3(GRIDSIZE); + kernelNodeParams1.blockDim = dim3(BLOCKSIZE); + kernelNodeParams1.sharedMemBytes = 0; + kernelNodeParams1.kernelParams = reinterpret_cast(kernelArgs1); + kernelNodeParams1.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph, nullptr, 0, + &kernelNodeParams1)); + // Create Write Kernel Node + hipGraphNode_t kernelnode_2; + hipKernelNodeParams kernelNodeParams2{}; + void* kernelArgs2[] = {reinterpret_cast(&data_value)}; + kernelNodeParams2.func = reinterpret_cast(kerWrite); + kernelNodeParams2.gridDim = dim3(GRIDSIZE); + kernelNodeParams2.blockDim = dim3(BLOCKSIZE); + kernelNodeParams2.sharedMemBytes = 0; + kernelNodeParams2.kernelParams = reinterpret_cast(kernelArgs2); + kernelNodeParams2.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph, nullptr, 0, + &kernelNodeParams2)); + // Create Free Kernel Node + hipGraphNode_t kernelnode_3; + hipKernelNodeParams kernelNodeParams3{}; + void* kernelArgs3[] = + {&outputVec_d, reinterpret_cast(&test_type)}; + kernelNodeParams3.func = reinterpret_cast(kerFree); + kernelNodeParams3.gridDim = dim3(GRIDSIZE); + kernelNodeParams3.blockDim = dim3(BLOCKSIZE); + kernelNodeParams3.sharedMemBytes = 0; + kernelNodeParams3.kernelParams = reinterpret_cast(kernelArgs3); + kernelNodeParams3.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_3, graph, nullptr, 0, + &kernelNodeParams3)); + // Create Memcpy Node + hipGraphNode_t memcpyD2H; + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, + outputVec_h, outputVec_d, (sizeof(T) * arraysize), + hipMemcpyDeviceToHost)); + // Create dependencies for graph + HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode_1, + &kernelnode_2, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode_2, + &kernelnode_3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode_3, + &memcpyD2H, 1)); + // Instantiate and launch the graphs + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); + HIP_CHECK(hipStreamSynchronize(streamForGraph)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != data_value) { + bPassed = false; + break; + } + } + HIP_CHECK(hipStreamDestroy(streamForGraph)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} +/** + * Local function: Allocate local and device memory from host, + * launches kerTestAllocationUsingDevFunc<<<>>> and copies data back + * to host to validate. + */ +static bool TestAllocInDeviceFunc(int test_type) { + int *outputVec_d{nullptr}, *outputVec_h{nullptr}; + size_t arraysize = (INTERNAL_BUFFER_SIZE * BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast (malloc(sizeof(int) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize))); + // Launch Test Kernel + kerTestAllocationUsingDevFunc<<>>(outputVec_d, + test_type); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != (idx / INTERNAL_BUFFER_SIZE)) { + bPassed = false; + break; + } + } + hipFree(outputVec_d); + free(outputVec_h); + return bPassed; +} + +/** + * Scenario: This test validates device allocation and deallocation + * using malloc/free in every gpu thread and block for primitive data + * types like char, short, int etc. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_PrimitiveDataType") { + constexpr size_t sizePerThread = 128; + + // malloc()/free() tests + SECTION("Test char datatype allocation with malloc") { + REQUIRE(true == TestAllocInAllThread(TEST_MALLOC_FREE, + SCHAR_MAX, sizePerThread)); + } + + SECTION("Test short datatype allocation with malloc") { + REQUIRE(true == TestAllocInAllThread(TEST_MALLOC_FREE, + SHRT_MAX, sizePerThread)); + } + + SECTION("Test int datatype allocation with malloc") { + REQUIRE(true == TestAllocInAllThread(TEST_MALLOC_FREE, + INT_MAX, sizePerThread)); + } + + SECTION("Test float datatype allocation with malloc") { + REQUIRE(true == TestAllocInAllThread(TEST_MALLOC_FREE, + FLT_MAX, sizePerThread)); + } + + SECTION("Test double datatype allocation with malloc") { + REQUIRE(true == TestAllocInAllThread(TEST_MALLOC_FREE, + DBL_MAX, sizePerThread)); + } +} + +/** + * Scenario: This test validates device allocation and deallocation + * using new/delete in every gpu thread and block for primitive data + * types like char, short, int etc. + */ +TEST_CASE("Unit_deviceAllocation_New_PerThread_PrimitiveDataType") { + constexpr size_t sizePerThread = 128; + + // new/delete tests + SECTION("Test char datatype allocation with new") { + REQUIRE(true == TestAllocInAllThread(TEST_NEW_DELETE, + SCHAR_MAX, sizePerThread)); + } + + SECTION("Test short datatype allocation with new") { + REQUIRE(true == TestAllocInAllThread(TEST_NEW_DELETE, + SHRT_MAX, sizePerThread)); + } + + SECTION("Test int datatype allocation with new") { + REQUIRE(true == TestAllocInAllThread(TEST_NEW_DELETE, + INT_MAX, sizePerThread)); + } + + SECTION("Test float datatype allocation with new") { + REQUIRE(true == TestAllocInAllThread(TEST_NEW_DELETE, + FLT_MAX, sizePerThread)); + } + + SECTION("Test double datatype allocation with new") { + REQUIRE(true == TestAllocInAllThread(TEST_NEW_DELETE, + DBL_MAX, sizePerThread)); + } +} + +/** + * Scenario: This test validates device allocation and deallocation + * using malloc/free in every gpu thread and block for structure. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_StructDataType") { + constexpr size_t sizePerThread = 64; + struct simpleStruct sampleStr{INT_MAX, DBL_MAX, FLT_MAX, SHRT_MAX, + SCHAR_MAX, {1, 2, 3, 4, 5, 6, 7, 8}}; + REQUIRE(true == TestAllocInAllThread(TEST_MALLOC_FREE, + sampleStr, sizePerThread)); +} + +/** + * Scenario: This test validates device allocation and deallocation + * using new/delete in every gpu thread and block for structure. + */ +TEST_CASE("Unit_deviceAllocation_New_PerThread_StructDataType") { + constexpr size_t sizePerThread = 64; + struct simpleStruct sampleStr{INT_MAX, DBL_MAX, FLT_MAX, SHRT_MAX, + SCHAR_MAX, {1, 2, 3, 4, 5, 6, 7, 8}}; + REQUIRE(true == TestAllocInAllThread(TEST_NEW_DELETE, + sampleStr, sizePerThread)); +} + +/** + * Scenario: This test validates device memory allocation and free + * in 1 thread and access in block for different primitive types like + * char, short, int etc. + */ +TEST_CASE("Unit_deviceAllocation_InOneThread_AccessInAllThreads") { + // malloc()/free() tests + SECTION("Test char datatype allocation with malloc") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_MALLOC_FREE, 0)); + } + + SECTION("Test short datatype allocation with malloc") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_MALLOC_FREE, 0)); + } + + SECTION("Test int datatype allocation with malloc") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_MALLOC_FREE, 0)); + } + + SECTION("Test float datatype allocation with malloc") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_MALLOC_FREE, 0)); + } + + SECTION("Test double datatype allocation with malloc") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_MALLOC_FREE, 0)); + } + + // new/delete tests + SECTION("Test char datatype allocation with new") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_NEW_DELETE, 0)); + } + + SECTION("Test short datatype allocation with new") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_NEW_DELETE, 0)); + } + + SECTION("Test int datatype allocation with new") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_NEW_DELETE, 0)); + } + + SECTION("Test float datatype allocation with new") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_NEW_DELETE, 0)); + } + + SECTION("Test double datatype allocation with new") { + REQUIRE(true == TestMemoryAccessInAllThread(TEST_NEW_DELETE, 0)); + } +} + +/** + * Scenario: This test validates device allocation malloc, access and free + * across multiple kernels for different primitive types like char, short, + * int etc. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_AcrossKernels") { + // malloc()/free() tests + SECTION("Test char datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE)); + } + + SECTION("Test short datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE)); + } + + SECTION("Test int datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE)); + } + + SECTION("Test float datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE)); + } + + SECTION("Test double datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE)); + } +} + +/** + * Scenario: This test validates device new, access and delete + * across multiple kernels for different primitive types like char, short, + * int etc. + */ +TEST_CASE("Unit_deviceAllocation_New_AcrossKernels") { + // new/delete tests + SECTION("Test char datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE)); + } + + SECTION("Test short datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE)); + } + + SECTION("Test int datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE)); + } + + SECTION("Test float datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE)); + } + + SECTION("Test double datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE)); + } +} + +/** + * Scenarios: + * A) This test validates device allocation malloc, access and free + * across multiple kernels for nested structure. + * B) This test also validates memory allocation and deallocation through + * __device__ functions. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_ComplexDataType") { + // malloc()/free() tests + REQUIRE(true == TestMemoryAccessInAllThread_CmplxStr(TEST_MALLOC_FREE)); +} + +/** + * Scenario: + * A) This test validates device allocation malloc, access and free + * across multiple kernels for nested structure. + * B) This test also validates memory allocation and deallocation through + * __device__ functions. + */ +TEST_CASE("Unit_deviceAllocation_New_ComplexDataType") { + // new/delete tests + REQUIRE(true == TestMemoryAccessInAllThread_CmplxStr(TEST_NEW_DELETE)); +} + +/** + * Scenario: This test validates device allocation malloc, access and free + * across multiple kernels for Union data type. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_UnionType") { + // malloc()/free() tests + REQUIRE(true == TestMemoryAccessInAllThread_Union(TEST_MALLOC_FREE)); +} + +/** + * Scenario: This test validates device allocation new, access and delete + * across multiple kernels for Union data type. + */ +TEST_CASE("Unit_deviceAllocation_New_UnionType") { + // new/delete tests + REQUIRE(true == TestMemoryAccessInAllThread_Union(TEST_NEW_DELETE)); +} + +/** + * Scenario: This test validates device allocation and deallocation + * using malloc/free in every gpu thread and block using Single + * Code Object kernel. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_SingleCodeObj") { + constexpr size_t sizePerThread = 128; + + REQUIRE(true == TestAlloc_Load_SingleKer_AllocFree(TEST_MALLOC_FREE, + INT_MAX, sizePerThread)); +} + +/** + * Scenario: This test validates device allocation and deallocation + * using new/delete in every gpu thread and block using Single + * Code Object kernel. + */ +TEST_CASE("Unit_deviceAllocation_New_SingleCodeObj") { + constexpr size_t sizePerThread = 128; + + REQUIRE(true == TestAlloc_Load_SingleKer_AllocFree(TEST_NEW_DELETE, + INT_MAX, sizePerThread)); +} + +#if HT_NVIDIA +/** + * Scenario: This test validates device allocation and deallocation + * using malloc/free in multikernel and multistream environment. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_MultKerMultStrm") { + // malloc()/free() tests + SECTION("Test char datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE, + true)); + } + + SECTION("Test short datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE, + true)); + } + + SECTION("Test int datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE, + true)); + } + + SECTION("Test float datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE, + true)); + } + + SECTION("Test double datatype allocation with malloc") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_MALLOC_FREE, + true)); + } +} + +/** + * Scenario: This test validates device allocation and deallocation + * using new/delete in multikernel and multistream environment. + */ +TEST_CASE("Unit_deviceAllocation_New_PerThread_MultKerMultStrm") { + // new/delete tests + SECTION("Test char datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE, + true)); + } + + SECTION("Test short datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE, + true)); + } + + SECTION("Test int datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE, + true)); + } + + SECTION("Test float datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE, + true)); + } + + SECTION("Test double datatype allocation with new") { + REQUIRE(true == TestMemoryAcrossMulKernels(TEST_NEW_DELETE, + true)); + } +} +#endif + +/** + * Scenario: This test validates device allocation and deallocation + * using malloc/free in graph. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_Graph") { + // malloc()/free() tests + SECTION("Test char datatype allocation with malloc") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_MALLOC_FREE)); + } + + SECTION("Test short datatype allocation with malloc") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_MALLOC_FREE)); + } + + SECTION("Test int datatype allocation with malloc") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_MALLOC_FREE)); + } + + SECTION("Test float datatype allocation with malloc") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_MALLOC_FREE)); + } + + SECTION("Test double datatype allocation with malloc") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_MALLOC_FREE)); + } +} + +/** + * Scenario: This test validates device allocation and deallocation + * using new/delete in graph. + */ +TEST_CASE("Unit_deviceAllocation_New_PerThread_Graph") { + // new/delete tests + SECTION("Test char datatype allocation with new") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_NEW_DELETE)); + } + + SECTION("Test short datatype allocation with new") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_NEW_DELETE)); + } + + SECTION("Test int datatype allocation with new") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_NEW_DELETE)); + } + + SECTION("Test float datatype allocation with new") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_NEW_DELETE)); + } + + SECTION("Test double datatype allocation with new") { + REQUIRE(true == + TestMemoryAcrossMulKernelsUsingGraph(TEST_NEW_DELETE)); + } +} + +/** + * Scenario: This test validates device allocation malloc, access and free + * using pointers to device functions. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_DeviceFunc") { + // malloc/free tests + REQUIRE(true == TestAllocInDeviceFunc(TEST_MALLOC_FREE)); +} + +/** + * Scenario: This test validates device allocation new, access and delete + * using pointers to device functions. + */ +TEST_CASE("Unit_deviceAllocation_New_DeviceFunc") { + // new/delete tests + REQUIRE(true == TestAllocInDeviceFunc(TEST_NEW_DELETE)); +} + +/** + * Scenario: This test validates device allocation using vitual functions + */ +TEST_CASE("Unit_deviceAllocation_VirtualFunction") { + int *outputVec_d{nullptr}, *outputVec_h{nullptr}; + constexpr size_t sizeBufferPerThread = 8; + size_t arraysize = (sizeBufferPerThread * BLOCKSIZE * GRIDSIZE); + outputVec_h = reinterpret_cast (malloc(sizeof(int) * arraysize)); + REQUIRE(outputVec_h != nullptr); + HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize))); + // Launch Test Kernel + kerTestDynamicAllocVirtualFunc<<>>( + outputVec_d, sizeBufferPerThread); + HIP_CHECK(hipDeviceSynchronize()); + // Copy to host buffer + HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, + hipMemcpyDefault)); + bool bPassed = true; + for (size_t idx = 0; idx < arraysize; idx++) { + if (outputVec_h[idx] != (idx / sizeBufferPerThread)) { + bPassed = false; + break; + } + } + REQUIRE(true == bPassed); + hipFree(outputVec_d); + free(outputVec_h); +} + +/** + * Scenario: This test validates device allocation malloc, access and free + * across multiple kernels launched using threads. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_MulKernels_MulThreads") { + // malloc()/free() tests + SECTION("Test char datatype allocation with malloc") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_MALLOC_FREE)); + } + + SECTION("Test short datatype allocation with malloc") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_MALLOC_FREE)); + } + + SECTION("Test int datatype allocation with malloc") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_MALLOC_FREE)); + } + + SECTION("Test float datatype allocation with malloc") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_MALLOC_FREE)); + } + + SECTION("Test double datatype allocation with malloc") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_MALLOC_FREE)); + } +} + +/** + * Scenario: This test validates device new, access and delete + * across multiple kernels launched using threads. + */ +TEST_CASE("Unit_deviceAllocation_New_MulKernels_MulThreads") { + // new/delete tests + SECTION("Test char datatype allocation with new") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_NEW_DELETE)); + } + + SECTION("Test short datatype allocation with new") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_NEW_DELETE)); + } + + SECTION("Test int datatype allocation with new") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_NEW_DELETE)); + } + + SECTION("Test float datatype allocation with new") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_NEW_DELETE)); + } + + SECTION("Test double datatype allocation with new") { + REQUIRE(true == TestDevMemAllocMulKerMulThrd(TEST_NEW_DELETE)); + } +} + +#if HT_AMD +// Scenarios Unit_deviceAllocation_Malloc_SingKernels_MulThreads and +// are failing on NVIDIA platform. +/** + * Scenario: This test validates device allocation malloc, access and free + * in a single kernel launched using threads. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_SingKernels_MulThreads") { + // malloc()/free() tests + std::vector tests; + // Spawn the test threads + for (int idx = 0; idx < num_threads; idx++) { + thread_results[idx] = false; + tests.push_back(std::thread(runTestMemoryAccessInAllThread, + TEST_MALLOC_FREE, idx)); + } + // Wait for all threads to complete + for (std::thread &t : tests) { + t.join(); + } + // Verify All Results + for (int idx = 0; idx < num_threads; idx++) { + REQUIRE(thread_results[idx]); + } +} + +/** + * Scenario: This test validates device new, access and delete + * in a single kernel launched using threads. + */ +TEST_CASE("Unit_deviceAllocation_New_SingKernels_MulThreads") { + // new/delete tests + std::vector tests; + // Spawn the test threads + for (int idx = 0; idx < num_threads; idx++) { + thread_results[idx] = false; + tests.push_back(std::thread(runTestMemoryAccessInAllThread, + TEST_NEW_DELETE, idx)); + } + // Wait for all threads to complete + for (std::thread &t : tests) { + t.join(); + } + // Verify All Results + for (int idx = 0; idx < num_threads; idx++) { + REQUIRE(thread_results[idx]); + } +} +#endif + +/** + * Scenario: This test validates Allocation and Deallocation in multiple + * code object kernels defined in different source files. + */ +TEST_CASE("Unit_deviceAllocation_Malloc_MulCodeObj") { + REQUIRE(true == TestAlloc_Load_MultKernels(TEST_MALLOC_FREE, + INT_MAX)); +} + +/** + * Scenario: This test validates Allocation and Deallocation in multiple + * code object kernels defined in different source files. + */ +TEST_CASE("Unit_deviceAllocation_New_MulCodeObj") { + REQUIRE(true == TestAlloc_Load_MultKernels(TEST_NEW_DELETE, + INT_MAX)); +} diff --git a/projects/hip-tests/catch/unit/deviceLib/kerDevAllocMultCO.cc b/projects/hip-tests/catch/unit/deviceLib/kerDevAllocMultCO.cc new file mode 100644 index 0000000000..f84aabc3c2 --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/kerDevAllocMultCO.cc @@ -0,0 +1,39 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" +#include "./defs.h" + +/** + * This kernel allocates memory in thread 0. + */ +extern "C" __global__ void ker_Alloc_MultCodeObj(int **dev_mem, + int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate memory in thread 0 of block 0 + if (0 == myId) { + if (test_type == TEST_MALLOC_FREE) { + *dev_mem = + reinterpret_cast (malloc(blockDim.x*gridDim.x*sizeof(int))); + } else { + *dev_mem = + reinterpret_cast (new int[blockDim.x*gridDim.x]); + } + } +} diff --git a/projects/hip-tests/catch/unit/deviceLib/kerDevAllocSingleKer.cc b/projects/hip-tests/catch/unit/deviceLib/kerDevAllocSingleKer.cc new file mode 100644 index 0000000000..4cda6ce3aa --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/kerDevAllocSingleKer.cc @@ -0,0 +1,57 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" +#include "./defs.h" +/** + * This kernel allocates and deallocates memory in every thread. + */ +extern "C" __global__ void ker_TestDynamicAllocInAllThreads_CodeObj( + int *outputBuf, int test_type, int value, + size_t perThreadSize) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Allocate + size_t size = 0; + int* ptr = nullptr; + if (test_type == TEST_MALLOC_FREE) { + size = perThreadSize * sizeof(int); + ptr = reinterpret_cast (malloc(size)); + } else { + size = perThreadSize; + ptr = new int[perThreadSize]; + } + if (ptr == nullptr) { + printf("Device Allocation in thread %d Failed! \n", myId); + return; + } + // Set memory + for (size_t idx = 0; idx < perThreadSize; idx++) { + ptr[idx] = value; + } + // Copy to output buffer + for (size_t idx = 0; idx < perThreadSize; idx++) { + outputBuf[myId*perThreadSize + idx] = ptr[idx]; + } + // Free memory + if (test_type == TEST_MALLOC_FREE) { + free(ptr); + } else { + delete[] ptr; + } +} diff --git a/projects/hip-tests/catch/unit/deviceLib/kerDevFreeMultCO.cc b/projects/hip-tests/catch/unit/deviceLib/kerDevFreeMultCO.cc new file mode 100644 index 0000000000..571ed5d6f6 --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/kerDevFreeMultCO.cc @@ -0,0 +1,47 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" +#include "./defs.h" + +/** + * This kernel copies the contents of memory allocated in + * ker_Alloc_MultCodeObj<<<>>> to host and deletes the memory + * from thread 0. + */ +extern "C" __global__ void ker_Free_MultCodeObj(int *outputBuf, + int **dev_mem, int test_type) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Check allocated memory in all threads in block before access + if (*dev_mem == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + + if (0 == myId) { + for (size_t idx = 0; idx < (blockDim.x*gridDim.x); idx++) { + outputBuf[idx] = (*dev_mem)[idx]; + } + if (test_type == TEST_MALLOC_FREE) { + free(*dev_mem); + } else { + delete[] (*dev_mem); + } + } +} diff --git a/projects/hip-tests/catch/unit/deviceLib/kerDevWriteMultCO.cc b/projects/hip-tests/catch/unit/deviceLib/kerDevWriteMultCO.cc new file mode 100644 index 0000000000..c1a1876ce5 --- /dev/null +++ b/projects/hip-tests/catch/unit/deviceLib/kerDevWriteMultCO.cc @@ -0,0 +1,36 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" +#include "./defs.h" + +/** + * This kernel writes to memory allocated in ker_Alloc_MultCodeObj<<<>>>. + */ +extern "C" __global__ void ker_Write_MultCodeObj(int **dev_mem, + int value) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + // Check allocated memory in all threads in block before access + if (*dev_mem == nullptr) { + printf("Device Allocation Failed in thread = %d \n", myId); + return; + } + // Copy to buffer + (*dev_mem)[myId] = value; +}