rocm-systems/projects/hip-tests/catch/unit/deviceLib/deviceAllocation.cc

/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include "deviceAllocCommon.h"

__device__ static void* dev_mem_glob;
__device__ struct deviceAllocFunc allocfunc{&deviceAlloc, &deviceWrite, &deviceFree};
__device__ class derivedAlloc classalloc;
constexpr auto num_threads = 5;
static bool thread_results[num_threads];
__device__ static void* dev_ptr[num_threads][GRIDSIZE];

/**
 * This kernel allocates and deallocates in every thread
 * of every block.
 */
template <typename T> static __global__ void kerTestDynamicAllocInAllThread(T* outputBuf,
                                                                            int test_type, T value,
                                                                            size_t perThreadSize) {
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  // Allocate
  size_t size = 0;
  T* ptr = nullptr;
  if (test_type == TEST_MALLOC_FREE) {
    size = perThreadSize * sizeof(T);
    ptr = reinterpret_cast<T*>(malloc(size));
  } else {
    size = perThreadSize;
    ptr = new T[perThreadSize];
  }
  if (ptr == nullptr) {
    printf("Device Allocation in thread %d Failed! \n", myId);
    return;
  }
  // Set memory
  for (size_t idx = 0; idx < perThreadSize; idx++) {
    ptr[idx] = value;
  }
  // Copy to output buffer
  for (size_t idx = 0; idx < perThreadSize; idx++) {
    outputBuf[myId * perThreadSize + idx] = ptr[idx];
  }
  // Free memory
  if (test_type == TEST_MALLOC_FREE) {
    free(ptr);
  } else {
    delete[] ptr;
  }
}

/**
 * This kernel allocates and deallocates using virtual functions in every
 * thread of every block.
 */
static __global__ void kerTestDynamicAllocVirtualFunc(int* outputBuf, size_t perThreadSize) {
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  baseAlloc* palloc = &classalloc;
  // Allocate
  int* ptr = palloc->alloc(perThreadSize);

  if (ptr == nullptr) {
    printf("Device Allocation in thread %d Failed! \n", myId);
    return;
  }
  // Set memory
  for (size_t idx = 0; idx < perThreadSize; idx++) {
    ptr[idx] = myId;
  }
  // Copy to output buffer
  for (size_t idx = 0; idx < perThreadSize; idx++) {
    outputBuf[myId * perThreadSize + idx] = ptr[idx];
  }
  // Free memory
  palloc->free(ptr);
}

/**
 * This kernel allocates memory in one thread,
 * access/modifies it in all threads of block and copies
 * data to host and frees the memory in another thread.
 */
template <typename T>
static __global__ void kerTestAccessInAllThreadsInBlock(T* outputBuf, int test_type, T value,
                                                        int host_thr_idx) {
  int myThreadId = threadIdx.x, lastThreadId = (blockDim.x - 1);
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  // Allocate memory in thread 0
  if (0 == myThreadId) {
    if (test_type == TEST_MALLOC_FREE) {
      dev_ptr[host_thr_idx][blockIdx.x] = reinterpret_cast<void*>(malloc(blockDim.x * sizeof(T)));
    } else {
      dev_ptr[host_thr_idx][blockIdx.x] = reinterpret_cast<void*>(new T[blockDim.x]);
    }
  }
  // All threads wait at this barrier
  __syncthreads();
  // Check allocated memory in all threads in block before access
  if (dev_ptr[host_thr_idx][blockIdx.x] == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myId);
    return;
  }
  T* ptr = reinterpret_cast<T*>(dev_ptr[host_thr_idx][blockIdx.x]);
  // Copy to buffer
  ptr[myThreadId] = value;
  // All threads wait
  __syncthreads();
  // Copy memory to host and free the memory in thread <blockDim.x - 1>
  if (lastThreadId == myThreadId) {
    for (size_t idx = 0; idx < blockDim.x; idx++) {
      outputBuf[idx + blockDim.x * blockIdx.x] = ptr[idx];
    }
    if (test_type == TEST_MALLOC_FREE) {
      free(ptr);
    } else {
      delete[] ptr;
    }
  }
}

/**
 * This kernel allocates a nested structure per block in one thread,
 * access/modifies it in all threads of block and copies
 * data to host and frees the memory in another thread.
 */
static __global__ void kerTestAccessInAllThreads_CmplxStr(int test_type, int* result) {
  int myThreadId = threadIdx.x;
  int lastThreadId = (blockDim.x - 1);
  int myBlockId = blockIdx.x;
  int myGid = threadIdx.x + blockDim.x * blockIdx.x;
  // Allocate memory in thread 0
  if (0 == myThreadId) {
    if (test_type == TEST_MALLOC_FREE) {
      dev_ptr[0][blockIdx.x] = reinterpret_cast<void*>(malloc(sizeof(struct complexStructure)));
    } else {
      dev_ptr[0][blockIdx.x] = reinterpret_cast<void*>(new struct complexStructure);
    }
    struct complexStructure* ptr =
        reinterpret_cast<struct complexStructure*>(dev_ptr[0][blockIdx.x]);
    ptr->alloc_internal_members(test_type, BLOCKSIZE);
  }
  // All threads wait at this barrier
  __syncthreads();
  // Check allocated memory in all threads in block before access
  if (dev_ptr[0][blockIdx.x] == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myGid);
    return;
  }
  struct complexStructure* ptr = reinterpret_cast<struct complexStructure*>(dev_ptr[0][blockIdx.x]);
  if (ptr->sthreadInfo == nullptr) {
    printf("Structure Allocation Failed in thread = %d \n", myGid);
    return;
  }
  // Copy to buffer
  ptr->sthreadInfo[myThreadId].threadid = myThreadId;
  ptr->sthreadInfo[myThreadId].blockid = myBlockId;
  ptr->sthreadInfo[myThreadId].ival = INT_MAX;
  ptr->sthreadInfo[myThreadId].dval = DBL_MAX;
  ptr->sthreadInfo[myThreadId].fval = FLT_MAX;
  ptr->sthreadInfo[myThreadId].sval = SHRT_MAX;
  ptr->sthreadInfo[myThreadId].cval = SCHAR_MAX;
  // All threads wait
  __syncthreads();
  // Copy memory to host and free the memory in thread <blockDim.x - 1>
  if (lastThreadId == myThreadId) {
    int match = 1;
    for (int idx = 0; idx < BLOCKSIZE; idx++) {
      if ((ptr->sthreadInfo[idx].threadid != idx) || (ptr->sthreadInfo[idx].blockid != myBlockId) ||
          (ptr->sthreadInfo[idx].ival != INT_MAX) || (ptr->sthreadInfo[idx].dval != DBL_MAX) ||
          (ptr->sthreadInfo[idx].fval != FLT_MAX) || (ptr->sthreadInfo[idx].sval != SHRT_MAX) ||
          (ptr->sthreadInfo[idx].cval != SCHAR_MAX)) {
        match = 0;
        break;
      }
    }
    result[blockIdx.x] = match;
    ptr->free_internal_members(test_type);
    if (test_type == TEST_MALLOC_FREE) {
      free(ptr);
    } else {
      delete ptr;
    }
  }
}

/**
 * This kernel allocates a union per block in one thread,
 * access/modifies it in all threads of block and copies
 * data to host and frees the memory in another thread.
 */
static __global__ void kerTestAccessInAllThreadsForUnion(testInfoUnion* outputBuf, int test_type) {
  int myThreadId = threadIdx.x, lastThreadId = (blockDim.x - 1);
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  // Allocate memory in thread 0
  if (0 == myThreadId) {
    if (test_type == TEST_MALLOC_FREE) {
      dev_ptr[0][blockIdx.x] = reinterpret_cast<void*>(malloc(blockDim.x * sizeof(testInfoUnion)));
    } else {
      dev_ptr[0][blockIdx.x] = reinterpret_cast<void*>(new testInfoUnion[blockDim.x]);
    }
  }
  // All threads wait at this barrier
  __syncthreads();
  // Check allocated memory in all threads in block before access
  if (dev_ptr[0][blockIdx.x] == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myId);
    return;
  }
  testInfoUnion* ptr = reinterpret_cast<testInfoUnion*>(dev_ptr[0][blockIdx.x]);
  // Copy to buffer
  switch (myId % 5) {
    case 0:
      ptr[myThreadId].ival = INT_MAX;
      break;
    case 1:
      ptr[myThreadId].dval = DBL_MAX;
      break;
    case 2:
      ptr[myThreadId].fval = FLT_MAX;
      break;
    case 3:
      ptr[myThreadId].sval = SHRT_MAX;
      break;
    case 4:
      ptr[myThreadId].cval = SCHAR_MAX;
      break;
  }
  // All threads wait
  __syncthreads();
  // Copy memory to host and free the memory in thread <blockDim.x - 1>
  if (lastThreadId == myThreadId) {
    for (size_t idx = 0; idx < blockDim.x; idx++) {
      outputBuf[idx + blockDim.x * blockIdx.x] = ptr[idx];
    }
    if (test_type == TEST_MALLOC_FREE) {
      free(ptr);
    } else {
      delete[] ptr;
    }
  }
}

/**
 * This kernel allocates memory in one thread.
 */
template <typename T> static __global__ void kerAlloc(int test_type) {
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  // Allocate memory in thread 0 of block 0
  if (0 == myId) {
    if (test_type == TEST_MALLOC_FREE) {
      dev_mem_glob = reinterpret_cast<void*>(malloc(blockDim.x * gridDim.x * sizeof(T)));
    } else {
      dev_mem_glob = reinterpret_cast<void*>(new T[blockDim.x * gridDim.x]);
    }
  }
}

/**
 * This kernel writes to memory allocated in <kerAlloc>.
 */
template <typename T> static __global__ void kerWrite(T value) {
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  // Check allocated memory in all threads in block before access
  if (dev_mem_glob == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myId);
    return;
  }
  T* ptr = reinterpret_cast<T*>(dev_mem_glob);
  // Copy to buffer
  ptr[myId] = value;
}

/**
 * This kernel copies the contents of memory allocated in <kerAlloc>
 * to host and deletes the memory from thread 0.
 */
template <typename T> static __global__ void kerFree(T* outputBuf, int test_type) {
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  // Check allocated memory in all threads in block before access
  if (dev_mem_glob == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myId);
    return;
  }

  T* ptr = reinterpret_cast<T*>(dev_mem_glob);
  if (0 == myId) {
    for (size_t idx = 0; idx < (blockDim.x * gridDim.x); idx++) {
      outputBuf[idx] = ptr[idx];
    }
    if (test_type == TEST_MALLOC_FREE) {
      free(ptr);
    } else {
      delete[] ptr;
    }
  }
}

/**
 * This device function allocates memory in one thread.
 */
static __device__ int* deviceAlloc(int test_type) {
  int* ptr = nullptr;
  if (test_type == TEST_MALLOC_FREE) {
    ptr = reinterpret_cast<int*>(malloc(INTERNAL_BUFFER_SIZE * sizeof(int)));
  } else {
    ptr = reinterpret_cast<int*>(new int[INTERNAL_BUFFER_SIZE]);
  }
  return ptr;
}

/**
 * This device function writes to memory allocated in deviceAlloc().
 */
static __device__ void deviceWrite(int myId, int* devmem) {
  // Check allocated memory in all threads in block before access
  if (devmem == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myId);
    return;
  }
  // Copy to buffer
  for (size_t idx = 0; idx < INTERNAL_BUFFER_SIZE; idx++) {
    devmem[idx] = myId;
  }
}

/**
 * This device function copies the contents of memory allocated
 * in deviceAlloc() to host and deletes the memory from thread 0.
 */
static __device__ void deviceFree(int* outputBuf, int* devmem, int test_type, int myId) {
  // Check allocated memory in all threads in block before access
  if (devmem == nullptr) {
    printf("Device Allocation Failed in thread = %d \n", myId);
    return;
  }
  for (size_t idx = 0; idx < INTERNAL_BUFFER_SIZE; idx++) {
    outputBuf[myId * INTERNAL_BUFFER_SIZE + idx] = devmem[idx];
  }
  if (test_type == TEST_MALLOC_FREE) {
    free(devmem);
  } else {
    delete[] devmem;
  }
}

/**
 * This kernel invokes __device__ allocation functions via pointers.
 */
static __global__ void kerTestAllocationUsingDevFunc(int* outputBuf, int test_type) {
  int myId = threadIdx.x + blockDim.x * blockIdx.x;
  struct deviceAllocFunc* func = &allocfunc;
  int* dev_ptr = nullptr;
  dev_ptr = func->alloc(test_type);
  func->write(myId, dev_ptr);
  func->free(outputBuf, dev_ptr, test_type, myId);
}

/**
 * Local function: Allocate local and device memory from host,
 * launches kerTestDynamicAllocInAllThread<<<>>> and copies data back
 * to host to validate.
 */
template <typename T>
static bool TestAllocInAllThread(int test_type, T value, size_t sizeBufferPerThread) {
  T *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (sizeBufferPerThread * BLOCKSIZE * GRIDSIZE);
  outputVec_h = reinterpret_cast<T*>(malloc(sizeof(T) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize)));
  // Launch Test Kernel
  kerTestDynamicAllocInAllThread<T>
      <<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, test_type, value, sizeBufferPerThread);
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Local function: Allocate local and device memory from host,
 * launches kerTestAccessInAllThreadsInBlock<<<>>> and copies data back
 * to host to validate.
 */
template <typename T> static bool TestMemoryAccessInAllThread(int test_type, int thread_idx) {
  T *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (BLOCKSIZE * GRIDSIZE);
  T data_value = std::numeric_limits<T>::max();
  outputVec_h = reinterpret_cast<T*>(malloc(sizeof(T) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize)));
  // Launch Test Kernel
  kerTestAccessInAllThreadsInBlock<T>
      <<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, test_type, data_value, thread_idx);
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != data_value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Local function: Launch kerAlloc<<<>>>
 */
template <typename T> static void runTestMemoryAccessInAllThread(int test_type, int thread_idx) {
  thread_results[thread_idx] = TestMemoryAccessInAllThread<T>(test_type, thread_idx);
}

/**
 * Local function: Launch kerAlloc<<<>>>, kerWrite<<<>>> and kerFree<<<>>>
 * to test kernel allocated memory access across multiple kernels and multiple
 * streams.
 */
template <typename T>
static bool TestMemoryAcrossMulKernels(int test_type, bool multistream = false) {
  T *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (BLOCKSIZE * GRIDSIZE);
  T data_value = std::numeric_limits<T>::max();
  outputVec_h = reinterpret_cast<T*>(malloc(sizeof(T) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize)));
  // Launch Test Kernel
  if (multistream) {
    hipStream_t stream1, stream2, stream3;
    HIP_CHECK(hipStreamCreate(&stream1));
    HIP_CHECK(hipStreamCreate(&stream2));
    HIP_CHECK(hipStreamCreate(&stream3));
    kerAlloc<T><<<GRIDSIZE, BLOCKSIZE, 0, stream1>>>(test_type);
    HIP_CHECK(hipStreamSynchronize(stream1));
    kerWrite<T><<<GRIDSIZE, BLOCKSIZE, 0, stream2>>>(data_value);
    HIP_CHECK(hipStreamSynchronize(stream2));
    kerFree<T><<<GRIDSIZE, BLOCKSIZE, 0, stream3>>>(outputVec_d, test_type);
    HIP_CHECK(hipStreamSynchronize(stream3));
    HIP_CHECK(hipStreamDestroy(stream1));
    HIP_CHECK(hipStreamDestroy(stream2));
    HIP_CHECK(hipStreamDestroy(stream3));
  } else {
    kerAlloc<T><<<GRIDSIZE, BLOCKSIZE>>>(test_type);
    kerWrite<T><<<GRIDSIZE, BLOCKSIZE>>>(data_value);
    kerFree<T><<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, test_type);
    HIP_CHECK(hipDeviceSynchronize());
  }
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != data_value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Local function: Launch kerAlloc<<<>>>
 */
template <typename T> static void runKerAlloc(int test_type) {
  kerAlloc<T><<<GRIDSIZE, BLOCKSIZE>>>(test_type);
}

/**
 * Local function: Launch kerWrite<<<>>>
 */
template <typename T> static void runKerWrite(T data_value) {
  kerWrite<T><<<GRIDSIZE, BLOCKSIZE>>>(data_value);
}

/**
 * Local function: Launch kerFree<<<>>>
 */
template <typename T> static void runKerFree(T* outputVec_d, int test_type) {
  kerFree<T><<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, test_type);
}

/**
 * Local function: Launch kerAlloc<<<>>>, kerWrite<<<>>> and kerFree<<<>>>
 * across multiple threads.
 */
template <typename T> static bool TestDevMemAllocMulKerMulThrd(int test_type) {
  T *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (BLOCKSIZE * GRIDSIZE);
  T data_value = std::numeric_limits<T>::max();
  outputVec_h = reinterpret_cast<T*>(malloc(sizeof(T) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize)));
  // Launch all Test Kernel threads
  std::thread threadAlloc(runKerAlloc<T>, test_type);
  threadAlloc.join();
  std::thread threadWrite(runKerWrite<T>, data_value);
  threadWrite.join();
  std::thread threadFree(runKerFree<T>, outputVec_d, test_type);
  threadFree.join();
  // Wait for all kernels in device
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(T) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != data_value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}
/**
 * Local function: Allocate local and device memory from host,
 * launches kerTestAccessInAllThreads_CmplxStr<<<>>> and copies data back
 * to host to validate.
 */
static bool TestMemoryAccessInAllThread_CmplxStr(int test_type) {
  int *result_d{nullptr}, *result_h{nullptr};
  size_t arraysize = GRIDSIZE;
  result_h = reinterpret_cast<int*>(malloc(sizeof(int) * arraysize));
  REQUIRE(result_h != nullptr);
  HIP_CHECK(hipMalloc(&result_d, (sizeof(int) * arraysize)));
  HIP_CHECK(hipMemset(result_d, 0, (sizeof(int) * arraysize)));
  // Launch Test Kernel
  kerTestAccessInAllThreads_CmplxStr<<<GRIDSIZE, BLOCKSIZE>>>(test_type, result_d);
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(result_h, result_d, sizeof(int) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < GRIDSIZE; idx++) {
    if (result_h[idx] != 1) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipFree(result_d));
  free(result_h);
  return bPassed;
}

/**
 * Local function: Allocate host and device memory of type union,
 * launches kerTestAccessInAllThreadsForUnion<<<>>> and copies data back
 * to host to validate.
 */
static bool TestMemoryAccessInAllThread_Union(int test_type) {
  testInfoUnion *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (BLOCKSIZE * GRIDSIZE);
  outputVec_h = reinterpret_cast<testInfoUnion*>(malloc(sizeof(testInfoUnion) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(testInfoUnion) * arraysize)));
  // Launch Test Kernel
  kerTestAccessInAllThreadsForUnion<<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, test_type);
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(
      hipMemcpy(outputVec_h, outputVec_d, sizeof(testInfoUnion) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    switch (idx % 5) {
      case 0:
        if (outputVec_h[idx].ival != INT_MAX) {
          bPassed = false;
        }
        break;
      case 1:
        if (outputVec_h[idx].dval != DBL_MAX) {
          bPassed = false;
        }
        break;
      case 2:
        if (outputVec_h[idx].fval != FLT_MAX) {
          bPassed = false;
        }
        break;
      case 3:
        if (outputVec_h[idx].sval != SHRT_MAX) {
          bPassed = false;
        }
        break;
      case 4:
        if (outputVec_h[idx].cval != SCHAR_MAX) {
          bPassed = false;
        }
        break;
    }
    if (bPassed == false) break;
  }
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Local function: Allocate local and device memory from host,
 * launches ker_TestDynamicAllocInAllThreads_CodeObj<<<>>> and
 * copies data back to host to validate.
 */
static bool TestAlloc_Load_SingleKer_AllocFree(int test_type, int value,
                                               size_t sizeBufferPerThread) {
  int *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (sizeBufferPerThread * BLOCKSIZE * GRIDSIZE);
  outputVec_h = reinterpret_cast<int*>(malloc(sizeof(int) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize)));
  // Launch Test Kernel
  hipModule_t Module;
  hipFunction_t Function;
  HIP_CHECK(hipModuleLoad(&Module, DEV_ALLOC_SINGKER_COBJ));
  HIP_CHECK(hipModuleGetFunction(&Function, Module, DEV_ALLOC_SINGKER_COBJ_FUNC));
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  struct {
    void* _Output_d;
    int _test_type;
    int _value;
    size_t _size;
  } args;
  args._Output_d = reinterpret_cast<void*>(outputVec_d);
  args._test_type = test_type;
  args._value = value;
  args._size = sizeBufferPerThread;
  size_t size = sizeof(args);

  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
                    HIP_LAUNCH_PARAM_END};
  HIP_CHECK(hipModuleLaunchKernel(Function, GRIDSIZE, 1, 1, BLOCKSIZE, 1, 1, 0, stream, NULL,
                                  reinterpret_cast<void**>(&config)));
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipModuleUnload(Module));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Local function: Allocate local and device memory from host,
 * launches ker_Alloc_MultCodeObj<<<>>>, ker_Write_MultCodeObj<<<>>> and
 * ker_Free_MultCodeObj<<<>>> copies data back to host to validate.
 */
static bool TestAlloc_Load_MultKernels(int test_type, int value) {
  int *outputVec_d{nullptr}, *outputVec_h{nullptr};
  int** dev_addr{nullptr};
  size_t arraysize = (BLOCKSIZE * GRIDSIZE);
  outputVec_h = reinterpret_cast<int*>(malloc(sizeof(int) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize)));
  HIP_CHECK(hipMalloc(&dev_addr, (sizeof(int*))));
  // Launch Test Kernel
  hipModule_t ModuleAlloc, ModuleWrite, ModuleFree;
  hipFunction_t FunctionAlloc, FunctionAcess, FunctionFree;
  // Load ker_Alloc_MultCodeObj
  HIP_CHECK(hipModuleLoad(&ModuleAlloc, DEV_ALLOC_MULCOBJ));
  HIP_CHECK(hipModuleLoad(&ModuleWrite, DEV_WRITE_MULCOBJ));
  HIP_CHECK(hipModuleLoad(&ModuleFree, DEV_FREE_MULCOBJ));
  HIP_CHECK(hipModuleGetFunction(&FunctionAlloc, ModuleAlloc, DEV_ALLOC_MULCODEOBJ_ALLOC));
  // Load ker_Write_MultCodeObj
  HIP_CHECK(hipModuleGetFunction(&FunctionAcess, ModuleWrite, DEV_ALLOC_MULCODEOBJ_WRITE));
  // Load ker_Free_MultCodeObj
  HIP_CHECK(hipModuleGetFunction(&FunctionFree, ModuleFree, DEV_ALLOC_MULCODEOBJ_FREE));
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  struct {
    void** __dev_addr;
    int _test_type;
  } args1;
  args1.__dev_addr = reinterpret_cast<void**>(dev_addr);
  args1._test_type = test_type;
  size_t size1 = sizeof(args1);

  void* config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args1, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1,
                     HIP_LAUNCH_PARAM_END};
  struct {
    void** __dev_addr;
    int _value;
  } args2;
  args2.__dev_addr = reinterpret_cast<void**>(dev_addr);
  args2._value = value;
  size_t size2 = sizeof(args2);

  void* config2[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args2, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size2,
                     HIP_LAUNCH_PARAM_END};
  struct {
    void* _output;
    void** __dev_addr;
    int _test_type;
  } args3;
  args3._output = reinterpret_cast<void*>(outputVec_d);
  args3.__dev_addr = reinterpret_cast<void**>(dev_addr);
  args3._test_type = test_type;
  size_t size3 = sizeof(args3);

  void* config3[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args3, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size3,
                     HIP_LAUNCH_PARAM_END};
  // Launch ker_Alloc_MultCodeObj
  HIP_CHECK(hipModuleLaunchKernel(FunctionAlloc, GRIDSIZE, 1, 1, BLOCKSIZE, 1, 1, 0, stream, NULL,
                                  reinterpret_cast<void**>(&config1)));
  // Launch ker_Write_MultCodeObj
  HIP_CHECK(hipModuleLaunchKernel(FunctionAcess, GRIDSIZE, 1, 1, BLOCKSIZE, 1, 1, 0, stream, NULL,
                                  reinterpret_cast<void**>(&config2)));
  // Launch ker_Free_MultCodeObj
  HIP_CHECK(hipModuleLaunchKernel(FunctionFree, GRIDSIZE, 1, 1, BLOCKSIZE, 1, 1, 0, stream, NULL,
                                  reinterpret_cast<void**>(&config3)));
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipModuleUnload(ModuleAlloc));
  HIP_CHECK(hipModuleUnload(ModuleWrite));
  HIP_CHECK(hipModuleUnload(ModuleFree));
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(dev_addr));
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Local function: Launch kerAlloc<<<>>>, kerWrite<<<>>> and kerFree<<<>>>
 * to test kernel allocated memory access across multiple kernels using
 * hipGraph.
 */
template <typename T> static bool TestMemoryAcrossMulKernelsUsingGraph(int test_type) {
  T *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (BLOCKSIZE * GRIDSIZE);
  T data_value = std::numeric_limits<T>::max();
  outputVec_h = reinterpret_cast<T*>(malloc(sizeof(T) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(T) * arraysize)));
  // Launch Test Kernels using graph
  hipGraph_t graph;
  hipStream_t streamForGraph;
  hipGraphExec_t graphExec;
  HIP_CHECK(hipStreamCreate(&streamForGraph));
  HIP_CHECK(hipGraphCreate(&graph, 0));
  // Create Allocation Kernel Node
  hipGraphNode_t kernelnode_1;
  hipKernelNodeParams kernelNodeParams1{};
  void* kernelArgs1[] = {reinterpret_cast<void*>(&test_type)};
  kernelNodeParams1.func = reinterpret_cast<void*>(kerAlloc<T>);
  kernelNodeParams1.gridDim = dim3(GRIDSIZE);
  kernelNodeParams1.blockDim = dim3(BLOCKSIZE);
  kernelNodeParams1.sharedMemBytes = 0;
  kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams1.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph, nullptr, 0, &kernelNodeParams1));
  // Create Write Kernel Node
  hipGraphNode_t kernelnode_2;
  hipKernelNodeParams kernelNodeParams2{};
  void* kernelArgs2[] = {reinterpret_cast<void*>(&data_value)};
  kernelNodeParams2.func = reinterpret_cast<void*>(kerWrite<T>);
  kernelNodeParams2.gridDim = dim3(GRIDSIZE);
  kernelNodeParams2.blockDim = dim3(BLOCKSIZE);
  kernelNodeParams2.sharedMemBytes = 0;
  kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
  kernelNodeParams2.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph, nullptr, 0, &kernelNodeParams2));
  // Create Free Kernel Node
  hipGraphNode_t kernelnode_3;
  hipKernelNodeParams kernelNodeParams3{};
  void* kernelArgs3[] = {&outputVec_d, reinterpret_cast<void*>(&test_type)};
  kernelNodeParams3.func = reinterpret_cast<void*>(kerFree<T>);
  kernelNodeParams3.gridDim = dim3(GRIDSIZE);
  kernelNodeParams3.blockDim = dim3(BLOCKSIZE);
  kernelNodeParams3.sharedMemBytes = 0;
  kernelNodeParams3.kernelParams = reinterpret_cast<void**>(kernelArgs3);
  kernelNodeParams3.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernelnode_3, graph, nullptr, 0, &kernelNodeParams3));
  // Create Memcpy Node
  hipGraphNode_t memcpyD2H;
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, outputVec_h, outputVec_d,
                                    (sizeof(T) * arraysize), hipMemcpyDeviceToHost));
  // Create dependencies for graph
  HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode_1, &kernelnode_2, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode_2, &kernelnode_3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode_3, &memcpyD2H, 1));
  // Instantiate and launch the graphs
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
  HIP_CHECK(hipStreamSynchronize(streamForGraph));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != data_value) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipStreamDestroy(streamForGraph));
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}
/**
 * Local function: Allocate local and device memory from host,
 * launches kerTestAllocationUsingDevFunc<<<>>> and copies data back
 * to host to validate.
 */
static bool TestAllocInDeviceFunc(int test_type) {
  int *outputVec_d{nullptr}, *outputVec_h{nullptr};
  size_t arraysize = (INTERNAL_BUFFER_SIZE * BLOCKSIZE * GRIDSIZE);
  outputVec_h = reinterpret_cast<int*>(malloc(sizeof(int) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize)));
  // Launch Test Kernel
  kerTestAllocationUsingDevFunc<<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, test_type);
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != (idx / INTERNAL_BUFFER_SIZE)) {
      bPassed = false;
      break;
    }
  }
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
  return bPassed;
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using malloc/free in every gpu thread and block for primitive data
 * types like char, short, int etc.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_PrimitiveDataType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  constexpr size_t sizePerThread = 128;

  // malloc()/free() tests
  SECTION("Test char datatype allocation with malloc") {
    REQUIRE(true == TestAllocInAllThread<char>(TEST_MALLOC_FREE, SCHAR_MAX, sizePerThread));
  }

  SECTION("Test short datatype allocation with malloc") {
    REQUIRE(true == TestAllocInAllThread<int16_t>(TEST_MALLOC_FREE, SHRT_MAX, sizePerThread));
  }

  SECTION("Test int datatype allocation with malloc") {
    REQUIRE(true == TestAllocInAllThread<int32_t>(TEST_MALLOC_FREE, INT_MAX, sizePerThread));
  }

  SECTION("Test float datatype allocation with malloc") {
    REQUIRE(true == TestAllocInAllThread<float>(TEST_MALLOC_FREE, FLT_MAX, sizePerThread));
  }

  SECTION("Test double datatype allocation with malloc") {
    REQUIRE(true == TestAllocInAllThread<double>(TEST_MALLOC_FREE, DBL_MAX, sizePerThread));
  }
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using new/delete in every gpu thread and block for primitive data
 * types like char, short, int etc.
 */
TEST_CASE("Unit_deviceAllocation_New_PerThread_PrimitiveDataType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  constexpr size_t sizePerThread = 128;

  // new/delete tests
  SECTION("Test char datatype allocation with new") {
    REQUIRE(true == TestAllocInAllThread<char>(TEST_NEW_DELETE, SCHAR_MAX, sizePerThread));
  }

  SECTION("Test short datatype allocation with new") {
    REQUIRE(true == TestAllocInAllThread<int16_t>(TEST_NEW_DELETE, SHRT_MAX, sizePerThread));
  }

  SECTION("Test int datatype allocation with new") {
    REQUIRE(true == TestAllocInAllThread<int32_t>(TEST_NEW_DELETE, INT_MAX, sizePerThread));
  }

  SECTION("Test float datatype allocation with new") {
    REQUIRE(true == TestAllocInAllThread<float>(TEST_NEW_DELETE, FLT_MAX, sizePerThread));
  }

  SECTION("Test double datatype allocation with new") {
    REQUIRE(true == TestAllocInAllThread<double>(TEST_NEW_DELETE, DBL_MAX, sizePerThread));
  }
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using malloc/free in every gpu thread and block for structure.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_StructDataType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  constexpr size_t sizePerThread = 64;
  struct simpleStruct sampleStr{INT_MAX,  DBL_MAX,   FLT_MAX,
                                SHRT_MAX, SCHAR_MAX, {1, 2, 3, 4, 5, 6, 7, 8}};
  REQUIRE(true ==
          TestAllocInAllThread<struct simpleStruct>(TEST_MALLOC_FREE, sampleStr, sizePerThread));
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using new/delete in every gpu thread and block for structure.
 */
TEST_CASE("Unit_deviceAllocation_New_PerThread_StructDataType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  constexpr size_t sizePerThread = 64;
  struct simpleStruct sampleStr{INT_MAX,  DBL_MAX,   FLT_MAX,
                                SHRT_MAX, SCHAR_MAX, {1, 2, 3, 4, 5, 6, 7, 8}};
  REQUIRE(true ==
          TestAllocInAllThread<struct simpleStruct>(TEST_NEW_DELETE, sampleStr, sizePerThread));
}

/**
 * Scenario: This test validates device memory allocation and free
 * in 1 thread and access in block for different primitive types like
 * char, short, int etc.
 */
TEST_CASE("Unit_deviceAllocation_InOneThread_AccessInAllThreads") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }

  // malloc()/free() tests
  SECTION("Test char datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAccessInAllThread<char>(TEST_MALLOC_FREE, 0));
  }

  SECTION("Test short datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAccessInAllThread<int16_t>(TEST_MALLOC_FREE, 0));
  }

  SECTION("Test int datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAccessInAllThread<int32_t>(TEST_MALLOC_FREE, 0));
  }

  SECTION("Test float datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAccessInAllThread<float>(TEST_MALLOC_FREE, 0));
  }

  SECTION("Test double datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAccessInAllThread<double>(TEST_MALLOC_FREE, 0));
  }

  // new/delete tests
  SECTION("Test char datatype allocation with new") {
    REQUIRE(true == TestMemoryAccessInAllThread<char>(TEST_NEW_DELETE, 0));
  }

  SECTION("Test short datatype allocation with new") {
    REQUIRE(true == TestMemoryAccessInAllThread<int16_t>(TEST_NEW_DELETE, 0));
  }

  SECTION("Test int datatype allocation with new") {
    REQUIRE(true == TestMemoryAccessInAllThread<int32_t>(TEST_NEW_DELETE, 0));
  }

  SECTION("Test float datatype allocation with new") {
    REQUIRE(true == TestMemoryAccessInAllThread<float>(TEST_NEW_DELETE, 0));
  }

  SECTION("Test double datatype allocation with new") {
    REQUIRE(true == TestMemoryAccessInAllThread<double>(TEST_NEW_DELETE, 0));
  }
}

/**
 * Scenario: This test validates device allocation malloc, access and free
 * across multiple kernels for different primitive types like char, short,
 * int etc.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_AcrossKernels") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // malloc()/free() tests
  SECTION("Test char datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<char>(TEST_MALLOC_FREE));
  }

  SECTION("Test short datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int16_t>(TEST_MALLOC_FREE));
  }

  SECTION("Test int datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int32_t>(TEST_MALLOC_FREE));
  }

  SECTION("Test float datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<float>(TEST_MALLOC_FREE));
  }

  SECTION("Test double datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<double>(TEST_MALLOC_FREE));
  }
}

/**
 * Scenario: This test validates device new, access and delete
 * across multiple kernels for different primitive types like char, short,
 * int etc.
 */
TEST_CASE("Unit_deviceAllocation_New_AcrossKernels") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  SECTION("Test char datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<char>(TEST_NEW_DELETE));
  }

  SECTION("Test short datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int16_t>(TEST_NEW_DELETE));
  }

  SECTION("Test int datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int32_t>(TEST_NEW_DELETE));
  }

  SECTION("Test float datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<float>(TEST_NEW_DELETE));
  }

  SECTION("Test double datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<double>(TEST_NEW_DELETE));
  }
}

/**
 * Scenarios:
 * A) This test validates device allocation malloc, access and free
 * across multiple kernels for nested structure.
 * B) This test also validates memory allocation and deallocation through
 * __device__ functions.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_ComplexDataType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }

  // malloc()/free() tests
  REQUIRE(true == TestMemoryAccessInAllThread_CmplxStr(TEST_MALLOC_FREE));
}

/**
 * Scenario:
 * A) This test validates device allocation malloc, access and free
 * across multiple kernels for nested structure.
 * B) This test also validates memory allocation and deallocation through
 * __device__ functions.
 */
TEST_CASE("Unit_deviceAllocation_New_ComplexDataType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  REQUIRE(true == TestMemoryAccessInAllThread_CmplxStr(TEST_NEW_DELETE));
}

/**
 * Scenario: This test validates device allocation malloc, access and free
 * across multiple kernels for Union data type.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_UnionType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // malloc()/free() tests
  REQUIRE(true == TestMemoryAccessInAllThread_Union(TEST_MALLOC_FREE));
}

/**
 * Scenario: This test validates device allocation new, access and delete
 * across multiple kernels for Union data type.
 */
TEST_CASE("Unit_deviceAllocation_New_UnionType") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  REQUIRE(true == TestMemoryAccessInAllThread_Union(TEST_NEW_DELETE));
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using malloc/free in every gpu thread and block using Single
 * Code Object kernel.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_SingleCodeObj") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  constexpr size_t sizePerThread = 128;

  REQUIRE(true == TestAlloc_Load_SingleKer_AllocFree(TEST_MALLOC_FREE, INT_MAX, sizePerThread));
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using new/delete in every gpu thread and block using Single
 * Code Object kernel.
 */
TEST_CASE("Unit_deviceAllocation_New_SingleCodeObj") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  constexpr size_t sizePerThread = 128;

  REQUIRE(true == TestAlloc_Load_SingleKer_AllocFree(TEST_NEW_DELETE, INT_MAX, sizePerThread));
}

#if HT_NVIDIA
/**
 * Scenario: This test validates device allocation and deallocation
 * using malloc/free in multikernel and multistream environment.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_MultKerMultStrm") {
  // malloc()/free() tests
  SECTION("Test char datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<char>(TEST_MALLOC_FREE, true));
  }

  SECTION("Test short datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int16_t>(TEST_MALLOC_FREE, true));
  }

  SECTION("Test int datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int32_t>(TEST_MALLOC_FREE, true));
  }

  SECTION("Test float datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<float>(TEST_MALLOC_FREE, true));
  }

  SECTION("Test double datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernels<double>(TEST_MALLOC_FREE, true));
  }
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using new/delete in multikernel and multistream environment.
 */
TEST_CASE("Unit_deviceAllocation_New_PerThread_MultKerMultStrm") {
  // new/delete tests
  SECTION("Test char datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<char>(TEST_NEW_DELETE, true));
  }

  SECTION("Test short datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int16_t>(TEST_NEW_DELETE, true));
  }

  SECTION("Test int datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<int32_t>(TEST_NEW_DELETE, true));
  }

  SECTION("Test float datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<float>(TEST_NEW_DELETE, true));
  }

  SECTION("Test double datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernels<double>(TEST_NEW_DELETE, true));
  }
}
#endif

/**
 * Scenario: This test validates device allocation and deallocation
 * using malloc/free in graph.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_PerThread_Graph") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // malloc()/free() tests
  SECTION("Test char datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<char>(TEST_MALLOC_FREE));
  }

  SECTION("Test short datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<int16_t>(TEST_MALLOC_FREE));
  }

  SECTION("Test int datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<int32_t>(TEST_MALLOC_FREE));
  }

  SECTION("Test float datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<float>(TEST_MALLOC_FREE));
  }

  SECTION("Test double datatype allocation with malloc") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<double>(TEST_MALLOC_FREE));
  }
}

/**
 * Scenario: This test validates device allocation and deallocation
 * using new/delete in graph.
 */
TEST_CASE("Unit_deviceAllocation_New_PerThread_Graph") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  SECTION("Test char datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<char>(TEST_NEW_DELETE));
  }

  SECTION("Test short datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<int16_t>(TEST_NEW_DELETE));
  }

  SECTION("Test int datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<int32_t>(TEST_NEW_DELETE));
  }

  SECTION("Test float datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<float>(TEST_NEW_DELETE));
  }

  SECTION("Test double datatype allocation with new") {
    REQUIRE(true == TestMemoryAcrossMulKernelsUsingGraph<double>(TEST_NEW_DELETE));
  }
}

/**
 * Scenario: This test validates device allocation malloc, access and free
 * using pointers to device functions.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_DeviceFunc") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // malloc/free tests
  REQUIRE(true == TestAllocInDeviceFunc(TEST_MALLOC_FREE));
}

/**
 * Scenario: This test validates device allocation new, access and delete
 * using pointers to device functions.
 */
TEST_CASE("Unit_deviceAllocation_New_DeviceFunc") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  REQUIRE(true == TestAllocInDeviceFunc(TEST_NEW_DELETE));
}

/**
 * Scenario: This test validates device allocation using vitual functions
 */
TEST_CASE("Unit_deviceAllocation_VirtualFunction") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  int *outputVec_d{nullptr}, *outputVec_h{nullptr};
  constexpr size_t sizeBufferPerThread = 8;
  size_t arraysize = (sizeBufferPerThread * BLOCKSIZE * GRIDSIZE);
  outputVec_h = reinterpret_cast<int*>(malloc(sizeof(int) * arraysize));
  REQUIRE(outputVec_h != nullptr);
  HIP_CHECK(hipMalloc(&outputVec_d, (sizeof(int) * arraysize)));
  // Launch Test Kernel
  kerTestDynamicAllocVirtualFunc<<<GRIDSIZE, BLOCKSIZE>>>(outputVec_d, sizeBufferPerThread);
  HIP_CHECK(hipDeviceSynchronize());
  // Copy to host buffer
  HIP_CHECK(hipMemcpy(outputVec_h, outputVec_d, sizeof(int) * arraysize, hipMemcpyDefault));
  bool bPassed = true;
  for (size_t idx = 0; idx < arraysize; idx++) {
    if (outputVec_h[idx] != (idx / sizeBufferPerThread)) {
      bPassed = false;
      break;
    }
  }
  REQUIRE(true == bPassed);
  HIP_CHECK(hipFree(outputVec_d));
  free(outputVec_h);
}

/**
 * Scenario: This test validates device allocation malloc, access and free
 * across multiple kernels launched using threads.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_MulKernels_MulThreads") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // malloc()/free() tests
  SECTION("Test char datatype allocation with malloc") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<char>(TEST_MALLOC_FREE));
  }

  SECTION("Test short datatype allocation with malloc") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<int16_t>(TEST_MALLOC_FREE));
  }

  SECTION("Test int datatype allocation with malloc") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<int32_t>(TEST_MALLOC_FREE));
  }

  SECTION("Test float datatype allocation with malloc") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<float>(TEST_MALLOC_FREE));
  }

  SECTION("Test double datatype allocation with malloc") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<double>(TEST_MALLOC_FREE));
  }
}

/**
 * Scenario: This test validates device new, access and delete
 * across multiple kernels launched using threads.
 */
TEST_CASE("Unit_deviceAllocation_New_MulKernels_MulThreads") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  SECTION("Test char datatype allocation with new") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<char>(TEST_NEW_DELETE));
  }

  SECTION("Test short datatype allocation with new") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<int16_t>(TEST_NEW_DELETE));
  }

  SECTION("Test int datatype allocation with new") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<int32_t>(TEST_NEW_DELETE));
  }

  SECTION("Test float datatype allocation with new") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<float>(TEST_NEW_DELETE));
  }

  SECTION("Test double datatype allocation with new") {
    REQUIRE(true == TestDevMemAllocMulKerMulThrd<double>(TEST_NEW_DELETE));
  }
}

#if HT_AMD
// Scenarios Unit_deviceAllocation_Malloc_SingKernels_MulThreads and
// are failing on NVIDIA platform.
/**
 * Scenario: This test validates device allocation malloc, access and free
 * in a single kernel launched using threads.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_SingKernels_MulThreads") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // malloc()/free() tests
  std::vector<std::thread> tests;
  // Spawn the test threads
  for (int idx = 0; idx < num_threads; idx++) {
    thread_results[idx] = false;
    tests.push_back(std::thread(runTestMemoryAccessInAllThread<int32_t>, TEST_MALLOC_FREE, idx));
  }
  // Wait for all threads to complete
  for (std::thread& t : tests) {
    t.join();
  }
  // Verify All Results
  for (int idx = 0; idx < num_threads; idx++) {
    REQUIRE(thread_results[idx]);
  }
}

/**
 * Scenario: This test validates device new, access and delete
 * in a single kernel launched using threads.
 */
TEST_CASE("Unit_deviceAllocation_New_SingKernels_MulThreads") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  // new/delete tests
  std::vector<std::thread> tests;
  // Spawn the test threads
  for (int idx = 0; idx < num_threads; idx++) {
    thread_results[idx] = false;
    tests.push_back(std::thread(runTestMemoryAccessInAllThread<int32_t>, TEST_NEW_DELETE, idx));
  }
  // Wait for all threads to complete
  for (std::thread& t : tests) {
    t.join();
  }
  // Verify All Results
  for (int idx = 0; idx < num_threads; idx++) {
    REQUIRE(thread_results[idx]);
  }
}
#endif

/**
 * Scenario: This test validates Allocation and Deallocation in multiple
 * code object kernels defined in different source files.
 */
TEST_CASE("Unit_deviceAllocation_Malloc_MulCodeObj") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  REQUIRE(true == TestAlloc_Load_MultKernels(TEST_MALLOC_FREE, INT_MAX));
}

/**
 * Scenario: This test validates Allocation and Deallocation in multiple
 * code object kernels defined in different source files.
 */
TEST_CASE("Unit_deviceAllocation_New_MulCodeObj") {
  int pcieAtomic = 0;
  HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
  if (!pcieAtomic) {
    HipTest::HIP_SKIP_TEST("Device doesn't support pcie atomic, Skipped");
    return;
  }
  REQUIRE(true == TestAlloc_Load_MultKernels(TEST_NEW_DELETE, INT_MAX));
}