diff --git a/catch/hipTestMain/CMakeLists.txt b/catch/hipTestMain/CMakeLists.txt
index 2b3cb3f60c..5b62dbbb49 100644
--- a/catch/hipTestMain/CMakeLists.txt
+++ b/catch/hipTestMain/CMakeLists.txt
@@ -15,6 +15,7 @@ target_link_libraries(UnitTests PRIVATE UnitDeviceTests
                                         EventTest
                                         OccupancyTest
                                         DeviceTest
+                                        ModuleTest
                                         RTC
                                         stdc++fs)
 
@@ -36,6 +37,7 @@ target_link_libraries(ABMTests PRIVATE ABMAddKernels
                                        stdc++fs)
 
 catch_discover_tests(ABMTests PROPERTIES  SKIP_REGULAR_EXPRESSION "HIP_SKIP_THIS_TEST")
+add_dependencies(UnitTests module_kernels.code)
 
 add_dependencies(build_tests UnitTests ABMTests)
 
@@ -63,7 +65,7 @@ else()
     target_compile_options(StressTest PUBLIC -std=c++17)
 endif()
 if(HIP_PLATFORM MATCHES "amd")
-target_link_libraries(StressTest PRIVATE printf stream)
+target_link_libraries(StressTest PRIVATE printf stream module)
 endif()
 target_link_libraries(StressTest PRIVATE memory stdc++fs)
 add_dependencies(build_stress_test StressTest)
diff --git a/catch/include/hip_test_common.hh b/catch/include/hip_test_common.hh
index 9107740333..9ffe839739 100644
--- a/catch/include/hip_test_common.hh
+++ b/catch/include/hip_test_common.hh
@@ -1,6 +1,5 @@
 /*
-Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -23,6 +22,13 @@ THE SOFTWARE.
 #pragma once
 #include "hip_test_context.hh"
 #include <catch.hpp>
+#ifdef __linux__
+#include <sys/sysinfo.h>
+#elif defined(_WIN32)
+#include <windows.h>
+#endif
+
+
 
 #define HIP_PRINT_STATUS(status) INFO(hipGetErrorName(status) << " at line: " << __LINE__);
 
@@ -72,6 +78,27 @@ THE SOFTWARE.
     }
 
 
+#if HT_NVIDIA
+#define CTX_CREATE() \
+  hipCtx_t context;\
+  initHipCtx(&context);
+#define CTX_DESTROY() HIPCHECK(hipCtxDestroy(context));
+#define ARRAY_DESTROY(array) HIPCHECK(hipArrayDestroy(array));
+#define HIP_TEX_REFERENCE hipTexRef
+#define HIP_ARRAY hiparray
+static void initHipCtx(hipCtx_t *pcontext) {
+  HIPCHECK(hipInit(0));
+  hipDevice_t device;
+  HIPCHECK(hipDeviceGet(&device, 0));
+  HIPCHECK(hipCtxCreate(pcontext, 0, device));
+}
+#else
+#define CTX_CREATE()
+#define CTX_DESTROY()
+#define ARRAY_DESTROY(array) HIPCHECK(hipFreeArray(array));
+#define HIP_TEX_REFERENCE textureReference*
+#define HIP_ARRAY hipArray*
+#endif
 
 // Utility Functions
 namespace HipTest {
@@ -104,4 +131,34 @@ static inline unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlo
 
   return blocks;
 }
+// Get Free Memory from the system
+static size_t getMemoryAmount() {
+#if __linux__
+  struct sysinfo info;
+  sysinfo(&info);
+  return info.freeram / (1024 * 1024);  // MB
+#elif defined(_WIN32)
+  MEMORYSTATUSEX statex;
+  statex.dwLength = sizeof(statex);
+  GlobalMemoryStatusEx(&statex);
+  return (statex.ullAvailPhys / (1024 * 1024));  // MB
+#endif
+}
+
+static inline size_t getHostThreadCount(const size_t memPerThread = 200, const size_t maxThreads = 0) {
+  if (memPerThread == 0) return 0;
+  auto memAmount = getMemoryAmount();
+  const auto processor_count = std::thread::hardware_concurrency();
+  if (processor_count == 0 || memAmount == 0) return 0;
+  size_t thread_count = 0;
+  if ((processor_count * memPerThread) < memAmount)
+    thread_count = processor_count;
+  else
+    thread_count = reinterpret_cast<size_t>(memAmount / memPerThread);
+  if (maxThreads > 0) {
+    return (thread_count > maxThreads) ? maxThreads : thread_count;
+  }
+  return thread_count;
+}
+
 }
diff --git a/catch/include/hip_test_helper.hh b/catch/include/hip_test_helper.hh
index 1d0a57b776..08b6bec0c6 100644
--- a/catch/include/hip_test_helper.hh
+++ b/catch/include/hip_test_helper.hh
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/catch/include/hip_test_kernels.hh b/catch/include/hip_test_kernels.hh
index 04b00b5ad3..3805b4cbbc 100644
--- a/catch/include/hip_test_kernels.hh
+++ b/catch/include/hip_test_kernels.hh
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -72,6 +72,35 @@ __global__ void addCountReverse(const T* A_d, T* C_d, int64_t NELEM, int count)
   }
 }
 
+template<typename T>
+__device__ void waitKernel(uint64_t wait_sec, T clockrate) {
+  uint64_t start = clock64()/clockrate, cur;
+  do { cur = clock64()/clockrate-start;}while (cur < (wait_sec*1000));
+}
+
+template<typename T>
+__global__ void TwoSecKernel_GlobalVar(int globalvar, int clockrate) {
+  if (globalvar == 0x2222) {
+    globalvar = 0x3333;
+  }
+  waitKernel(2, clockrate);
+  if (globalvar != 0x3333) {
+    globalvar = 0x5555;
+  }
+}
+
+template<typename T>
+__global__ void FourSecKernel_GlobalVar(int globalvar, int clockrate) {
+  if (globalvar == 1) {
+    globalvar = 0x2222;
+  }
+  waitKernel(4, clockrate);
+  if (globalvar == 0x2222) {
+    globalvar = 0x4444;
+  }
+}
+
+
 template <typename T> __global__ void memsetReverse(T* C_d, T val, int64_t NELEM) {
   size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
   size_t stride = blockDim.x * gridDim.x;
diff --git a/catch/stress/CMakeLists.txt b/catch/stress/CMakeLists.txt
index 00e12f2c2c..04038e0b8a 100644
--- a/catch/stress/CMakeLists.txt
+++ b/catch/stress/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(memory)
+add_subdirectory(module)
 if(HIP_PLATFORM MATCHES "amd")
 add_subdirectory(printf)
 add_subdirectory(stream)
diff --git a/catch/stress/memory/CMakeLists.txt b/catch/stress/memory/CMakeLists.txt
index 5afd69ee11..4338216fad 100644
--- a/catch/stress/memory/CMakeLists.txt
+++ b/catch/stress/memory/CMakeLists.txt
@@ -2,6 +2,7 @@
 set(TEST_SRC
     memcpy.cc
     hipMemcpyMThreadMSize.cc
+    hipMemcpyBoundaryOffsetCheck.cc
 )
 
 # Create shared lib of all tests
diff --git a/catch/stress/memory/hipMemcpyBoundaryOffsetCheck.cc b/catch/stress/memory/hipMemcpyBoundaryOffsetCheck.cc
new file mode 100644
index 0000000000..b81e575bcf
--- /dev/null
+++ b/catch/stress/memory/hipMemcpyBoundaryOffsetCheck.cc
@@ -0,0 +1,344 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+This testcase verifies following scenarios
+3. Boundary checks with different sizes
+5. device offset scenario
+*/
+#include <hip_test_common.hh>
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include "sys/types.h"
+#include "sys/sysinfo.h"
+#endif
+static constexpr auto NUM_ELM{4*1024 * 1024};
+template<typename T>
+class DeviceMemory {
+ public:
+    explicit DeviceMemory(size_t numElements);
+    DeviceMemory() = delete;
+    ~DeviceMemory();
+    T* A_d() const { return _A_d + _offset; }
+    T* B_d() const { return _B_d + _offset; }
+    T* C_d() const { return _C_d + _offset; }
+    T* C_dd() const { return _C_dd + _offset; }
+    size_t maxNumElements() const { return _maxNumElements; }
+    void offset(int offset) { _offset = offset; }
+    int offset() const { return _offset; }
+ private:
+    T* _A_d;
+    T* _B_d;
+    T* _C_d;
+    T* _C_dd;
+    size_t _maxNumElements;
+    int _offset;
+};
+template <typename T>
+DeviceMemory<T>::DeviceMemory(size_t numElements) :
+                 _maxNumElements(numElements), _offset(0) {
+  T** np = nullptr;
+  HipTest::initArrays(&_A_d, &_B_d, &_C_d, np, np, np, numElements, 0);
+  size_t sizeElements = numElements * sizeof(T);
+  HIP_CHECK(hipMalloc(&_C_dd, sizeElements));
+}
+template <typename T>
+DeviceMemory<T>::~DeviceMemory() {
+  T* np = nullptr;
+  HipTest::freeArrays<T>(_A_d, _B_d, _C_d, np, np, np, 0);
+  HIP_CHECK(hipFree(_C_dd));
+  _C_dd = NULL;
+}
+template <typename T>
+class HostMemory {
+ public:
+    HostMemory(size_t numElements, bool usePinnedHost);
+    HostMemory() = delete;
+    void reset(size_t numElements, bool full = false);
+    ~HostMemory();
+    T* A_h() const { return _A_h + _offset; }
+    T* B_h() const { return _B_h + _offset; }
+    T* C_h() const { return _C_h + _offset; }
+    size_t maxNumElements() const { return _maxNumElements; }
+    void offset(int offset) { _offset = offset; }
+    int offset() const { return _offset; }
+    // Host arrays, secondary copy
+    T* A_hh;
+    T* B_hh;
+    bool _usePinnedHost;
+ private:
+    size_t _maxNumElements;
+    int _offset;
+    // Host arrays
+    T* _A_h;
+    T* _B_h;
+    T* _C_h;
+};
+  template <typename T>
+HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
+  : _usePinnedHost(usePinnedHost), _maxNumElements(numElements), _offset(0) {
+    T** np = nullptr;
+    HipTest::initArrays(np, np, np, &_A_h, &_B_h, &_C_h,
+                        numElements, usePinnedHost);
+    A_hh = NULL;
+    B_hh = NULL;
+    size_t sizeElements = numElements * sizeof(T);
+    if (usePinnedHost) {
+      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&A_hh), sizeElements,
+                              hipHostMallocDefault));
+      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&B_hh), sizeElements,
+                              hipHostMallocDefault));
+    } else {
+      A_hh = reinterpret_cast<T*>(malloc(sizeElements));
+      B_hh = reinterpret_cast<T*>(malloc(sizeElements));
+    }
+  }
+template <typename T>
+void HostMemory<T>::reset(size_t numElements, bool full) {
+  // Initialize the host data:
+  for (size_t i = 0; i < numElements; i++) {
+    (A_hh)[i] = 1097.0 + i;
+    (B_hh)[i] = 1492.0 + i;  // Phi
+    if (full) {
+      (_A_h)[i] = 3.146f + i;  // Pi
+      (_B_h)[i] = 1.618f + i;  // Phi
+    }
+  }
+}
+template <typename T>
+HostMemory<T>::~HostMemory() {
+  HipTest::freeArraysForHost(_A_h, _B_h, _C_h, _usePinnedHost);
+  if (_usePinnedHost) {
+    HIP_CHECK(hipHostFree(A_hh));
+    HIP_CHECK(hipHostFree(B_hh));
+  } else {
+    free(A_hh);
+    free(B_hh);
+  }
+}
+#ifdef _WIN32
+void memcpytest2_get_host_memory(size_t *free, size_t *total) {
+  MEMORYSTATUSEX status;
+  status.dwLength = sizeof(status);
+  GlobalMemoryStatusEx(&status);
+  // Windows doesn't allow allocating more than half of system memory to the gpu
+  // Since the runtime also needs space for its internal allocations,
+  // we should not try to allocate more than 40% of reported system memory,
+  // otherwise we can run into OOM issues.
+  *free = static_cast<size_t>(0.4 * status.ullAvailPhys);
+  *total = static_cast<size_t>(0.4 * status.ullTotalPhys);
+}
+#else
+struct sysinfo memInfo;
+void memcpytest2_get_host_memory(size_t  *free, size_t *total) {
+  sysinfo(&memInfo);
+  uint64_t freePhysMem = memInfo.freeram;
+  freePhysMem *= memInfo.mem_unit;
+  *free = freePhysMem;
+  uint64_t totalPhysMem = memInfo.totalram;
+  totalPhysMem *= memInfo.mem_unit;
+  *total = totalPhysMem;
+}
+#endif
+//---
+// Test many different kinds of memory copies.
+// The subroutine allocates memory , copies to device, runs a vector
+//  add kernel, copies back, and
+// checks the result.
+//
+// IN: numElements  controls the number of elements used for allocations.
+// IN: usePinnedHost : If true, allocate host with hipHostMalloc and is pinned
+// else allocate host
+// memory with malloc. IN: useHostToHost : If true, add an extra
+// host-to-host copy. IN:
+// useDeviceToDevice : If true, add an extra deviceto-device copy after
+// result is produced. IN:
+// useMemkindDefault : If true, use memkinddefault
+// (runtime figures out direction).  if false, use
+// explicit memcpy direction.
+//
+template <typename T>
+void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem,
+    size_t numElements, bool useHostToHost,
+    bool useDeviceToDevice, bool useMemkindDefault) {
+  size_t sizeElements = numElements * sizeof(T);
+  hmem->reset(numElements);
+  assert(numElements <= dmem->maxNumElements());
+  assert(numElements <= hmem->maxNumElements());
+  if (useHostToHost) {
+    // Do some extra host-to-host copies here to mix things up:
+    HIP_CHECK(hipMemcpy(hmem->A_hh, hmem->A_h(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToHost));
+    HIP_CHECK(hipMemcpy(hmem->B_hh, hmem->B_h(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToHost));
+    HIP_CHECK(hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+  } else {
+    HIP_CHECK(hipMemcpy(dmem->A_d(), hmem->A_h(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+  }
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
+      static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
+      dmem->C_d(), numElements);
+  if (useDeviceToDevice) {
+    // Do an extra device-to-device copy here to mix things up:
+    HIP_CHECK(hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToDevice));
+    // Destroy the original dmem->C_d():
+    HIP_CHECK(hipMemset(dmem->C_d(), 0x5A, sizeElements));
+    HIP_CHECK(hipMemcpy(hmem->C_h(), dmem->C_dd(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToHost));
+  } else {
+    HIP_CHECK(hipMemcpy(hmem->C_h(), dmem->C_d(), sizeElements,
+          useMemkindDefault ? hipMemcpyDefault : hipMemcpyDeviceToHost));
+  }
+  HIP_CHECK(hipDeviceSynchronize());
+  HipTest::checkVectorADD(hmem->A_h(), hmem->B_h(), hmem->C_h(), numElements);
+}
+// Try all the 16 possible combinations to memcpytest2 - usePinnedHost,
+// useHostToHost,
+// useDeviceToDevice, useMemkindDefault
+template <typename T>
+void memcpytest2_for_type(size_t numElements) {
+  DeviceMemory<T> memD(numElements);
+  HostMemory<T> memU(numElements, 0 /*usePinnedHost*/);
+  HostMemory<T> memP(numElements, 1 /*usePinnedHost*/);
+  for (int usePinnedHost = 0; usePinnedHost <= 1; usePinnedHost++) {
+    for (int useHostToHost = 0; useHostToHost <= 1; useHostToHost++) {
+      for (int useDeviceToDevice = 0; useDeviceToDevice <= 1;
+          useDeviceToDevice++) {
+        for (int useMemkindDefault = 0; useMemkindDefault <= 1;
+            useMemkindDefault++) {
+          memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU,
+              numElements, useHostToHost,
+              useDeviceToDevice, useMemkindDefault);
+        }
+      }
+    }
+  }
+}
+// Try many different sizes to memory copy.
+template <typename T>
+void memcpytest2_sizes(size_t maxElem = 0) {
+  int deviceId;
+  HIP_CHECK(hipGetDevice(&deviceId));
+  size_t free, total, freeCPU, totalCPU;
+  HIP_CHECK(hipMemGetInfo(&free, &total));
+  memcpytest2_get_host_memory(&freeCPU, &totalCPU);
+  if (maxElem == 0) {
+    // Use lesser maxElem if not enough host memory available
+    size_t maxElemGPU = free / sizeof(T) / 8;
+    size_t maxElemCPU = freeCPU / sizeof(T) / 8;
+    maxElem = maxElemGPU < maxElemCPU ? maxElemGPU : maxElemCPU;
+  }
+  HIP_CHECK(hipDeviceReset());
+  DeviceMemory<T> memD(maxElem);
+  HostMemory<T> memU(maxElem, 0 /*usePinnedHost*/);
+  HostMemory<T> memP(maxElem, 1 /*usePinnedHost*/);
+  for (size_t elem = 1; elem <= maxElem; elem *= 2) {
+    memcpytest2<T>(&memD, &memU, elem, 1, 1, 0);  // unpinned host
+    memcpytest2<T>(&memD, &memP, elem, 1, 1, 0);  // pinned host
+  }
+}
+// Try many different sizes to memory copy.
+template <typename T>
+void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) {
+  int deviceId;
+  HIP_CHECK(hipGetDevice(&deviceId));
+  size_t free, total;
+  HIP_CHECK(hipMemGetInfo(&free, &total));
+  HIP_CHECK(hipDeviceReset());
+  DeviceMemory<T> memD(maxElem);
+  HostMemory<T> memU(maxElem, 0 /*usePinnedHost*/);
+  HostMemory<T> memP(maxElem, 1 /*usePinnedHost*/);
+  size_t elem = maxElem / 2;
+  for (size_t offset = 0; offset < 512; offset++) {
+    assert(elem + offset < maxElem);
+    if (devOffsets) {
+      memD.offset(offset);
+    }
+    if (hostOffsets) {
+      memU.offset(offset);
+      memP.offset(offset);
+    }
+    memcpytest2<T>(&memD, &memU, elem, 1, 1, 0);  // unpinned host
+    memcpytest2<T>(&memD, &memP, elem, 1, 1, 0);  // pinned host
+  }
+  for (size_t offset = 512; offset < elem; offset *= 2) {
+    assert(elem + offset < maxElem);
+    if (devOffsets) {
+      memD.offset(offset);
+    }
+    if (hostOffsets) {
+      memU.offset(offset);
+      memP.offset(offset);
+    }
+    memcpytest2<T>(&memD, &memU, elem, 1, 1, 0);  // unpinned host
+    memcpytest2<T>(&memD, &memP, elem, 1, 1, 0);  // pinned host
+  }
+}
+// Create multiple threads to stress multi-thread locking behavior in the
+// allocation/deallocation/tracking logic:
+template <typename T>
+void multiThread_1(bool serialize, bool usePinnedHost) {
+  DeviceMemory<T> memD(NUM_ELM);
+  HostMemory<T> mem1(NUM_ELM, usePinnedHost);
+  HostMemory<T> mem2(NUM_ELM, usePinnedHost);
+  std::thread t1(memcpytest2<T>, &memD, &mem1, NUM_ELM, 0, 0, 0);
+  if (serialize) {
+    t1.join();
+  }
+  std::thread t2(memcpytest2<T>, &memD, &mem2, NUM_ELM, 0, 0, 0);
+  if (serialize) {
+    t2.join();
+  }
+}
+/*
+This testcase verfies the boundary checks of hipMemcpy API for different sizes
+*/
+TEST_CASE("Unit_hipMemcpy_BoundaryCheck") {
+  size_t maxElem = 32 * 1024 * 1024;
+  DeviceMemory<float> memD(maxElem);
+  HostMemory<float> memU(maxElem, 0 /*usePinnedHost*/);
+  HostMemory<float> memP(maxElem, 0 /*usePinnedHost*/);
+  memcpytest2<float>(&memD, &memU, 32 * 1024 * 1024, 0, 0, 0);
+  auto sizes = GENERATE(15 * 1024 * 1024, 16 * 1024 * 1024,
+                        16 * 1024 * 1024 + 16 * 1024,
+                        16 * 1024 * 1024 + 512 * 1024,
+                        17 * 1024 * 1024 + 1024,
+                        32 * 1024 * 1024);
+  memcpytest2<float>(&memD, &memP, sizes, 0, 0, 0);
+}
+
+/*
+This testcase verifies the device offsets
+*/
+TEMPLATE_TEST_CASE("Unit_hipMemcpy_DeviceOffsets", "", float, double) {
+  HIP_CHECK(hipDeviceReset());
+  size_t maxSize = 256 * 1024;
+  memcpytest2_offsets<TestType>(maxSize, true, false);
+  memcpytest2_offsets<TestType>(maxSize, false, true);
+}
diff --git a/catch/stress/module/CMakeLists.txt b/catch/stress/module/CMakeLists.txt
new file mode 100644
index 0000000000..17ebb212e6
--- /dev/null
+++ b/catch/stress/module/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Common Tests - Test independent of all platforms
+if(HIP_PLATFORM MATCHES "amd")
+set(TEST_SRC
+    hipExtModuleLaunchKernel_CornerTest.cc
+    hipModuleLaunchKernel_CornerTests.cc
+)
+else()
+set(TEST_SRC
+    hipModuleLaunchKernel_CornerTests.cc
+)
+endif()
+
+add_custom_target(kernels.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${HIP_COMMON_DIR}/tests/catch/stress/module/kernels.cc -o ${HIP_PATH}/catch/hipTestMain/kernels.code -I${HIP_PATH}/include/ -I${HIP_COMMON_DIR}/tests/catch/include)
+
+# Create shared lib of all tests
+add_library(module SHARED EXCLUDE_FROM_ALL ${TEST_SRC})
+
+# Add dependency on build_tests to build it on this custom target
+add_dependencies(build_stress_test module kernels.code)
diff --git a/catch/stress/module/hipExtModuleLaunchKernel_CornerTest.cc b/catch/stress/module/hipExtModuleLaunchKernel_CornerTest.cc
new file mode 100644
index 0000000000..05c17c7b9b
--- /dev/null
+++ b/catch/stress/module/hipExtModuleLaunchKernel_CornerTest.cc
@@ -0,0 +1,86 @@
+/*
+   Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   */
+
+/*
+Test Scenario
+hipExtModuleLaunchKernel API verifying Corner Scenarios for Grid and Block dimensions
+*/
+
+#include "hip_test_common.hh"
+#include "hip_test_kernels.hh"
+#include "hip/hip_ext.h"
+
+#define fileName "kernels.code"
+#define dummyKernel "EmptyKernel"
+
+struct gridblockDim {
+  unsigned int gridX;
+  unsigned int gridY;
+  unsigned int gridZ;
+  unsigned int blockX;
+  unsigned int blockY;
+  unsigned int blockZ;
+};
+
+/*
+This testcase verifies hipExtModuleLaunchKernel API Corner
+cases
+*/
+TEST_CASE("Stress_hipExtModuleLaunchKernel_CornerCases") {
+  hipModule_t Module;
+  hipFunction_t DummyKernel;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
+  constexpr auto gridblocksize{6};
+  struct {
+  } args;
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+  size_t size = sizeof(args);
+  void *config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                     HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                     HIP_LAUNCH_PARAM_END};
+  hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp, 0);
+  unsigned int maxblockX = deviceProp.maxThreadsDim[0];
+  unsigned int maxblockY = deviceProp.maxThreadsDim[1];
+  unsigned int maxblockZ = deviceProp.maxThreadsDim[2];
+  struct gridblockDim test[gridblocksize] = {{1, 1, 1, maxblockX, 1, 1},
+                                            {1, 1, 1, 1, maxblockY, 1},
+                                            {1, 1, 1, 1, 1, maxblockZ},
+                                            {UINT32_MAX, 1, 1, 1, 1, 1},
+                                            {1, UINT32_MAX, 1, 1, 1, 1},
+                                            {1, 1, UINT32_MAX, 1, 1, 1}};
+
+  // Launching kernel with corner cases in grid and block dimensions
+  for (int i = 0; i < gridblocksize; i++) {
+    HIP_CHECK(hipExtModuleLaunchKernel(DummyKernel,
+                                       test[i].gridX,
+                                       test[i].gridY,
+                                       test[i].gridZ,
+                                       test[i].blockX,
+                                       test[i].blockY,
+                                       test[i].blockZ,
+                                       0,
+                                       stream, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0));
+  }
+  HIP_CHECK(hipStreamDestroy(stream));
+}
diff --git a/catch/stress/module/hipModuleLaunchKernel_CornerTests.cc b/catch/stress/module/hipModuleLaunchKernel_CornerTests.cc
new file mode 100644
index 0000000000..f7bd866ceb
--- /dev/null
+++ b/catch/stress/module/hipModuleLaunchKernel_CornerTests.cc
@@ -0,0 +1,90 @@
+/*
+   Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   */
+
+/*
+Test Scenario
+hipModuleLaunchKernel API verifying Corner Scenarios for Grid and Block dimensions
+*/
+
+#include "hip_test_common.hh"
+#include "hip_test_kernels.hh"
+#include "hip/hip_ext.h"
+
+#define fileName "kernels.code"
+#define dummyKernel "EmptyKernel"
+
+struct gridblockDim {
+  unsigned int gridX;
+  unsigned int gridY;
+  unsigned int gridZ;
+  unsigned int blockX;
+  unsigned int blockY;
+  unsigned int blockZ;
+};
+
+/*
+This testcase verifies hipModuleLaunchKernel API Corner
+cases
+*/
+TEST_CASE("Stress_hipModuleLaunchKernel_CornerCases") {
+  HIP_CHECK(hipSetDevice(0));
+  hipStream_t stream1;
+  CTX_CREATE()
+  hipModule_t Module;
+  hipFunction_t DummyKernel;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
+  HIP_CHECK(hipStreamCreate(&stream1));
+
+  // Passing Max int value to block dimensions
+  hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp, 0);
+  unsigned int maxblockX = deviceProp.maxThreadsDim[0];
+  unsigned int maxblockY = deviceProp.maxThreadsDim[1];
+  unsigned int maxblockZ = deviceProp.maxThreadsDim[2];
+#if HT_NVIDIA
+  unsigned int maxgridX = deviceProp.maxGridSize[0];
+  unsigned int maxgridY = deviceProp.maxGridSize[1];
+  unsigned int maxgridZ = deviceProp.maxGridSize[2];
+#else
+  unsigned int maxgridX = UINT32_MAX;
+  unsigned int maxgridY = UINT32_MAX;
+  unsigned int maxgridZ = UINT32_MAX;
+#endif
+  struct gridblockDim test[6] = {{1, 1, 1, maxblockX, 1, 1},
+                                 {1, 1, 1, 1, maxblockY, 1},
+                                 {1, 1, 1, 1, 1, maxblockZ},
+                                 {maxgridX, 1, 1, 1, 1, 1},
+                                 {1, maxgridY, 1, 1, 1, 1},
+                                 {1, 1, maxgridZ, 1, 1, 1}};
+  for (int i = 0; i < 6; i++) {
+    HIP_CHECK(hipModuleLaunchKernel(DummyKernel,
+                                test[i].gridX,
+                                test[i].gridY,
+                                test[i].gridZ,
+                                test[i].blockX,
+                                test[i].blockY,
+                                test[i].blockZ,
+                                0,
+                                stream1, NULL, NULL));
+    }
+  HIP_CHECK(hipStreamDestroy(stream1));
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY();
+}
diff --git a/catch/stress/module/kernels.cc b/catch/stress/module/kernels.cc
new file mode 100644
index 0000000000..5a980c40e7
--- /dev/null
+++ b/catch/stress/module/kernels.cc
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include "hip/hip_runtime.h"
+
+extern "C" __global__ void EmptyKernel() {
+}
+
diff --git a/catch/unit/CMakeLists.txt b/catch/unit/CMakeLists.txt
index f0ae8b6b93..0d19d6d82b 100644
--- a/catch/unit/CMakeLists.txt
+++ b/catch/unit/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory(module)
 add_subdirectory(memory)
 add_subdirectory(deviceLib)
 add_subdirectory(stream)
diff --git a/catch/unit/module/CMakeLists.txt b/catch/unit/module/CMakeLists.txt
new file mode 100644
index 0000000000..7e5d6aeaf6
--- /dev/null
+++ b/catch/unit/module/CMakeLists.txt
@@ -0,0 +1,51 @@
+# Common Tests - Test independent of all platforms
+if(HIP_PLATFORM MATCHES "amd")
+set(TEST_SRC
+hipExtLaunchKernelGGL.cc
+hipExtModuleLaunchKernel.cc
+hipExtLaunchMultiKernelMultiDevice.cc
+hipModuleLaunchKernel.cc
+hipFuncSetCacheConfig.cc
+hipModuleUnload.cc
+hipFuncSetAttribute.cc
+hipModuleLoadData.cc
+hipFuncSetSharedMemConfig.cc
+hipManagedKeyword.cc
+hipModuleGetGlobal.cc
+hipFuncGetAttributes.cc
+hipModule.cc
+hipModuleLoadDataMultThreadOnMultGPU.cc
+hipModuleLoadDataMultThreaded.cc
+hipModuleLoadMultiThreaded.cc
+hipModuleLoadUnloadStress.cc
+hipModuleNegative.cc
+hipModuleOccupancyMaxPotentialBlockSize.cc
+hipModuleTexture2dDrv.cc
+hipOpenCLCOTest.cc
+)
+else()
+set(TEST_SRC
+hipModuleLaunchKernel.cc
+hipFuncSetCacheConfig.cc
+hipModuleUnload.cc
+hipFuncSetAttribute.cc
+hipModuleLoadData.cc
+hipFuncSetSharedMemConfig.cc
+hipManagedKeyword.cc
+hipModuleGetGlobal.cc
+hipFuncGetAttributes.cc
+hipModule.cc
+hipModuleLoadDataMultThreadOnMultGPU.cc
+hipModuleLoadDataMultThreaded.cc
+hipModuleLoadMultiThreaded.cc
+hipModuleLoadUnloadStress.cc
+hipModuleNegative.cc
+hipModuleOccupancyMaxPotentialBlockSize.cc
+)
+endif()
+
+add_custom_target(module_kernels.code COMMAND ${CMAKE_CXX_COMPILER} --genco ${HIP_COMMON_DIR}/tests/catch/unit/module/module_kernels.cc -o ${HIP_PATH}/catch/hipTestMain/module_kernels.code -I${HIP_PATH}/include/ -I${HIP_COMMON_DIR}/tests/catch/include)
+# Create shared lib of all tests
+add_library(ModuleTest SHARED EXCLUDE_FROM_ALL ${TEST_SRC})
+
+add_dependencies(build_tests ModuleTest module_kernels.code)
diff --git a/catch/unit/module/hipExtLaunchKernelGGL.cc b/catch/unit/module/hipExtLaunchKernelGGL.cc
new file mode 100755
index 0000000000..9ab420679e
--- /dev/null
+++ b/catch/unit/module/hipExtLaunchKernelGGL.cc
@@ -0,0 +1,129 @@
+/*
+   Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   */
+/*
+ * Test Scenarios
+   1. Verify kernel execution time of the particular kernel
+   2. Verify hipExtLaunchKernelGGL API by disabling time flag in event creation
+ */
+
+#include <hip_test_common.hh>
+#include <hip_test_kernels.hh>
+#include "hip/hip_ext.h"
+
+#define FOURSEC_KERNEL 4999
+#define TWOSEC_KERNEL  2999
+
+__device__ int globalvar = 1;
+__global__ void TwoSecKernel_GlobalVar(int clockrate) {
+  if (globalvar == 0x2222) {
+    globalvar = 0x3333;
+  }
+  HipTest::waitKernel(2, clockrate);
+  if (globalvar != 0x3333) {
+    globalvar = 0x5555;
+  }
+}
+
+__global__ void FourSecKernel_GlobalVar(int clockrate) {
+  if (globalvar == 1) {
+    globalvar = 0x2222;
+  }
+  HipTest::waitKernel(4, clockrate);
+  if (globalvar == 0x2222) {
+    globalvar = 0x4444;
+  }
+}
+
+
+/*
+ * In this Scenario, we create events by disabling the timing flag
+ * We then Launch the kernel using hipExtModuleLaunchKernel by passing
+ * disabled events and try to fetch kernel execution time using
+ * hipEventElapsedTime API which would fail as the flag is disabled.
+ */
+TEST_CASE("Unit_hipExtLaunchKernelGGL_TimeFlagDisabled") {
+  hipStream_t stream;
+  HIP_CHECK(hipSetDevice(0));
+  float time_2sec;
+  hipEvent_t  start_event, end_event;
+  int clkRate = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  // Event Creation and Launching kernels
+  HIP_CHECK(hipEventCreateWithFlags(&start_event,
+                                   hipEventDisableTiming));
+  HIP_CHECK(hipEventCreateWithFlags(&end_event,
+                                   hipEventDisableTiming));
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  hipExtLaunchKernelGGL(TwoSecKernel_GlobalVar, dim3(1), dim3(1), 0,
+                        stream, start_event, end_event, 0, clkRate);
+  HIP_CHECK(hipStreamSynchronize(stream));
+  REQUIRE(hipEventElapsedTime(&time_2sec, start_event, end_event)
+                              != hipSuccess);
+
+  // Destroying the events and streams
+  HIP_CHECK(hipStreamDestroy(stream));
+  HIP_CHECK(hipEventDestroy(start_event));
+  HIP_CHECK(hipEventDestroy(end_event));
+}
+/*
+ * Launching FourSecKernel and TwoSecKernel and then we try to
+ * get the event elapsed time of each kernel using the start and
+ * end events.The event elapsed time should return us the kernel
+ * execution time for that particular kernel
+*/
+TEST_CASE("Unit_hipExtLaunchKernelGGL_KernelTimeExecution") {
+  hipStream_t stream;
+  HIP_CHECK(hipSetDevice(0));
+  hipEvent_t  start_event1, end_event1, start_event2, end_event2;
+  float time_4sec, time_2sec;
+  int clkRate = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  // Creating streams and events
+  HIP_CHECK(hipEventCreate(&start_event1));
+  HIP_CHECK(hipEventCreate(&end_event1));
+  HIP_CHECK(hipEventCreate(&start_event2));
+  HIP_CHECK(hipEventCreate(&end_event2));
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Launching 4sec and 2sec kernels
+  hipExtLaunchKernelGGL(FourSecKernel_GlobalVar, dim3(1), dim3(1), 0,
+                         stream, start_event1, end_event1, 0, clkRate);
+  hipExtLaunchKernelGGL(TwoSecKernel_GlobalVar, dim3(1), dim3(1), 0,
+                          stream, start_event2, end_event2, 0, clkRate);
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  HIP_CHECK(hipEventElapsedTime(&time_4sec, start_event1, end_event1));
+  HIP_CHECK(hipEventElapsedTime(&time_2sec, start_event2, end_event2));
+
+  INFO("Expected Vs Actual: Kernel1-<" <<  FOURSEC_KERNEL << "Vs" << time_4sec
+       << "Kernel2-<" << TWOSEC_KERNEL << "Vs" << time_2sec);
+  // Verifying the kernel execution time
+  REQUIRE(time_4sec < static_cast<float>(FOURSEC_KERNEL));
+  REQUIRE(time_2sec < static_cast<float>(TWOSEC_KERNEL));
+
+  // Destroying streams and events
+  HIP_CHECK(hipStreamDestroy(stream));
+  HIP_CHECK(hipEventDestroy(start_event1));
+  HIP_CHECK(hipEventDestroy(end_event1));
+  HIP_CHECK(hipEventDestroy(start_event2));
+  HIP_CHECK(hipEventDestroy(end_event2));
+}
diff --git a/catch/unit/module/hipExtLaunchMultiKernelMultiDevice.cc b/catch/unit/module/hipExtLaunchMultiKernelMultiDevice.cc
new file mode 100644
index 0000000000..d83c425b3d
--- /dev/null
+++ b/catch/unit/module/hipExtLaunchMultiKernelMultiDevice.cc
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* This testfile verifies the basic functionality of
+   hipExtLaunchMultiKernelMultiDevice API.
+   It can be tested on single GPU or multi GPUs.
+*/
+
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include "hip/hip_runtime.h"
+
+#define MAX_GPUS 8
+#define NUM_KERNEL_ARGS 3
+
+/*
+This testcase verifies hipExtLaunchMultiKernelMultiDevice API for different
+datatypes where
+1. Intitialize device variables
+2. Initializing hipLaunchParams structure to pass it to
+   hipExtLaunchMultiKernelMultiDevice API
+3. Launches vector_square kernel which performs square of the variable
+4. Validates the result with the square of variable.
+*/
+
+TEMPLATE_TEST_CASE("Unit_hipExtLaunchMultiKernelMultiDevice_Basic", "", int
+                   , float, double) {
+  TestType *A_d[MAX_GPUS], *C_d[MAX_GPUS];
+  TestType *A_h, *C_h;
+  size_t N = 1000000;
+  size_t Nbytes = N * sizeof(TestType);
+  int nGpu = 0;
+
+  HIP_CHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+    SUCCEED("info: didn't find any GPU! Skipping the testcase");
+  } else {
+    if (nGpu > MAX_GPUS) {
+      nGpu = MAX_GPUS;
+    }
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, nullptr, &C_h, N, false);
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+
+    // Allocating and initializing device variables
+    hipStream_t stream[MAX_GPUS];
+    for (int i = 0; i < nGpu; i++) {
+      HIP_CHECK(hipSetDevice(i));
+      HIP_CHECK(hipStreamCreateWithFlags(&stream[i], hipStreamNonBlocking));
+      hipDeviceProp_t props;
+      HIP_CHECK(hipGetDeviceProperties(&props, i/*deviceID*/));
+      INFO("Running on bus 0x" << props.pciBusID << " " << props.name);
+      INFO("Allocate device mem " << 2*Nbytes/1024.0/1024.0);
+      HIP_CHECK(hipMalloc(&A_d[i], Nbytes));
+      HIP_CHECK(hipMalloc(&C_d[i], Nbytes));
+      HIP_CHECK(hipMemcpy(A_d[i], A_h, Nbytes, hipMemcpyHostToDevice));
+    }
+
+    hipLaunchParams *launchParamsList = reinterpret_cast<hipLaunchParams *>(
+        malloc(sizeof(hipLaunchParams)*nGpu));
+    void *args[MAX_GPUS * NUM_KERNEL_ARGS];
+
+    // Intializing the hipLaunchParams structure with device variables
+    // ,kernel and launching hipExtLaunchMultiKernelMultiDevice API
+    for (int i = 0; i < nGpu; i++) {
+      args[i * NUM_KERNEL_ARGS]     = &A_d[i];
+      args[i * NUM_KERNEL_ARGS + 1] = &C_d[i];
+      args[i * NUM_KERNEL_ARGS + 2] = &N;
+      launchParamsList[i].func  =
+        reinterpret_cast<void *>(HipTest::vector_square<TestType>);
+      launchParamsList[i].gridDim   = dim3(blocks);
+      launchParamsList[i].blockDim  = dim3(threadsPerBlock);
+      launchParamsList[i].sharedMem = 0;
+      launchParamsList[i].stream    = stream[i];
+      launchParamsList[i].args      = args + i * NUM_KERNEL_ARGS;
+    }
+
+    hipExtLaunchMultiKernelMultiDevice(launchParamsList, nGpu, 0);
+
+    // Validating the result
+    for (int j = 0; j < nGpu; j++) {
+      hipStreamSynchronize(stream[j]);
+      hipDeviceProp_t props;
+      HIP_CHECK(hipGetDeviceProperties(&props, j/*deviceID*/));
+      INFO("Checking result on bus " << props.pciBusID << props.name);
+
+      HIP_CHECK(hipSetDevice(j));
+      HIP_CHECK(hipMemcpy(C_h, C_d[j], Nbytes, hipMemcpyDeviceToHost));
+
+      for (size_t i = 0; i < N; i++)  {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+          INFO("validation failed " << C_h[i] << A_h[i]*A_h[i]);
+          REQUIRE(false);
+        }
+      }
+    }
+
+    // DeAllocating memory
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, nullptr, C_h, false);
+    for (int j = 0; j < nGpu; j++) {
+      HIP_CHECK(hipFree(A_d[j]));
+      HIP_CHECK(hipFree(C_d[j]));
+      HIP_CHECK(hipStreamDestroy(stream[j]));
+    }
+  }
+}
diff --git a/catch/unit/module/hipExtModuleLaunchKernel.cc b/catch/unit/module/hipExtModuleLaunchKernel.cc
new file mode 100755
index 0000000000..45db114135
--- /dev/null
+++ b/catch/unit/module/hipExtModuleLaunchKernel.cc
@@ -0,0 +1,433 @@
+/*
+   Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   */
+/* Test Scenarios
+  1. hipExtModuleLaunchKernel Negative Scenarios
+  2. hipExtModuleLaunchKernel API verifying the kernel execution time of a particular kernel.
+  3. hipExtModuleLaunchKernel API verifying the kernel execution time by disabling the time flag
+  4. hipModuleLaunchKernel Work Group tests =>
+     - (block.x * block.y * block.z) <= Work Group Size
+       where block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+     - (block.x * block.y * block.z) > Work Group Size
+       where block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+ */
+
+#include <math.h>
+#include "hip_test_common.hh"
+#include "hip_test_kernels.hh"
+#include "hip/hip_ext.h"
+
+#define fileName "module_kernels.code"
+#define matmulK "matmulK"
+#define SixteenSec "SixteenSecKernel"
+#define KernelandExtra "KernelandExtraParams"
+#define FourSec "FourSecKernel"
+#define TwoSec "TwoSecKernel"
+#define globalDevVar "deviceGlobal"
+#define dummyKernel "EmptyKernel"
+#define FOURSEC_KERNEL 4999
+#define TWOSEC_KERNEL  2999
+
+struct gridblockDim {
+  unsigned int gridX;
+  unsigned int gridY;
+  unsigned int gridZ;
+  unsigned int blockX;
+  unsigned int blockY;
+  unsigned int blockZ;
+};
+
+class ModuleLaunchKernel {
+  int N = 64;
+  int SIZE = N*N;
+  int *A, *B, *C;
+  hipDeviceptr_t *Ad, *Bd;
+  hipStream_t stream1, stream2;
+  hipEvent_t  start_event1, end_event1, start_event2, end_event2,
+              start_timingDisabled, end_timingDisabled;
+  hipModule_t Module;
+  hipDeviceptr_t deviceGlobal;
+  hipFunction_t MultKernel, SixteenSecKernel, FourSecKernel,
+  TwoSecKernel, KernelandExtraParamKernel, DummyKernel;
+  struct {
+    int clockRate;
+    void* _Ad;
+    void* _Bd;
+    void* _Cd;
+    int _n;
+  } args1, args2;
+  struct {
+  } args3;
+  size_t size1;
+  size_t size2;
+  size_t size3;
+  size_t deviceGlobalSize;
+ public :
+  void AllocateMemory();
+  void DeAllocateMemory();
+  void ModuleLoad();
+  void Module_Negative_tests();
+  void ExtModule_Negative_tests();
+  void Module_WorkGroup_Test();
+  void ExtModule_KernelExecutionTime();
+  void ExtModule_Disabled_Timingflag();
+};
+
+void ModuleLaunchKernel::AllocateMemory() {
+  A = new int[N*N*sizeof(int)];
+  B = new int[N*N*sizeof(int)];
+  for (int i=0; i < N; i++) {
+    for (int j=0; j < N; j++) {
+      A[i*N +j] = 1;
+      B[i*N +j] = 1;
+    }
+  }
+  HIP_CHECK(hipStreamCreate(&stream1));
+  HIP_CHECK(hipStreamCreate(&stream2));
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Ad),
+           SIZE*sizeof(int)));
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Bd),
+           SIZE*sizeof(int)));
+  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&C), SIZE*sizeof(int)));
+  HIP_CHECK(hipMemcpy(Ad, A, SIZE*sizeof(int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Bd, B, SIZE*sizeof(int), hipMemcpyHostToDevice));
+  int clkRate = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  args1._Ad = Ad;
+  args1._Bd = Bd;
+  args1._Cd = C;
+  args1._n  = N;
+  args1.clockRate = clkRate;
+  args2._Ad = NULL;
+  args2._Bd = NULL;
+  args2._Cd = NULL;
+  args2._n  = 0;
+  args2.clockRate = clkRate;
+  size1 = sizeof(args1);
+  size2 = sizeof(args2);
+  size3 = sizeof(args3);
+  HIP_CHECK(hipEventCreate(&start_event1));
+  HIP_CHECK(hipEventCreate(&end_event1));
+  HIP_CHECK(hipEventCreate(&start_event2));
+  HIP_CHECK(hipEventCreate(&end_event2));
+  HIP_CHECK(hipEventCreateWithFlags(&start_timingDisabled,
+                                   hipEventDisableTiming));
+  HIP_CHECK(hipEventCreateWithFlags(&end_timingDisabled,
+                                   hipEventDisableTiming));
+}
+
+void ModuleLaunchKernel::ModuleLoad() {
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&MultKernel, Module, matmulK));
+  HIP_CHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSec));
+  HIP_CHECK(hipModuleGetFunction(&KernelandExtraParamKernel,
+                                Module, KernelandExtra));
+  HIP_CHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSec));
+  HIP_CHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSec));
+  HIP_CHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
+  HIP_CHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
+                              Module, globalDevVar));
+}
+
+void ModuleLaunchKernel::DeAllocateMemory() {
+  HIP_CHECK(hipEventDestroy(start_event1));
+  HIP_CHECK(hipEventDestroy(end_event1));
+  HIP_CHECK(hipEventDestroy(start_event2));
+  HIP_CHECK(hipEventDestroy(end_event2));
+  HIP_CHECK(hipEventDestroy(start_timingDisabled));
+  HIP_CHECK(hipEventDestroy(end_timingDisabled));
+  HIP_CHECK(hipStreamDestroy(stream1));
+  HIP_CHECK(hipStreamDestroy(stream2));
+  delete[] A;
+  delete[] B;
+  HIP_CHECK(hipFree(Ad));
+  HIP_CHECK(hipFree(Bd));
+  HIP_CHECK(hipHostFree(C));
+  HIP_CHECK(hipModuleUnload(Module));
+}
+/*
+ * In this scenario,We launch the 4 sec kernel and 2 sec kernel
+ * and we fetch the event execution time of each kernel and it
+ * should not exceed the execution time of that particular kernel
+ */
+void ModuleLaunchKernel::ExtModule_KernelExecutionTime() {
+  HIP_CHECK(hipSetDevice(0));
+  AllocateMemory();
+  ModuleLoad();
+  float time_4sec, time_2sec;
+  void *config2[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args2,
+                     HIP_LAUNCH_PARAM_BUFFER_SIZE, &size2,
+                     HIP_LAUNCH_PARAM_END};
+
+  // Launching kernels
+  HIP_CHECK(hipExtModuleLaunchKernel(FourSecKernel, 1, 1, 1, 1, 1, 1, 0,
+                                     stream1,
+                                     NULL, reinterpret_cast<void**>(&config2),
+                                     start_event1, end_event1, 0));
+  HIP_CHECK(hipExtModuleLaunchKernel(TwoSecKernel, 1, 1, 1, 1, 1, 1, 0, stream1,
+                                     NULL, reinterpret_cast<void**>(&config2),
+                                     start_event2, end_event2, 0));
+  HIP_CHECK(hipStreamSynchronize(stream1));
+  HIP_CHECK(hipEventElapsedTime(&time_4sec, start_event1, end_event1));
+  HIP_CHECK(hipEventElapsedTime(&time_2sec, start_event2, end_event2));
+
+  INFO("Expected Vs Actual: Kernel1-<" <<  FOURSEC_KERNEL << "Vs" << time_4sec
+       << "Kernel2-<" << TWOSEC_KERNEL << "Vs" << time_2sec);
+  // Verifying the kernel execution time
+  REQUIRE(time_4sec < static_cast<float>(FOURSEC_KERNEL));
+  REQUIRE(time_2sec < static_cast<float>(TWOSEC_KERNEL));
+
+  DeAllocateMemory();
+}
+/*
+ * In this Scenario, we create events by disabling the timing flag
+ * We then Launch the kernel using hipExtModuleLaunchKernel by passing
+ * disabled events and try to fetch kernel execution time using
+ * hipEventElapsedTime API which would fail as the flag is disabled.
+ */
+void ModuleLaunchKernel::ExtModule_Disabled_Timingflag() {
+  // Allocating Memory and Loading kernel
+  AllocateMemory();
+  ModuleLoad();
+  float time_2sec;
+  void *config2[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args2,
+    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size2,
+    HIP_LAUNCH_PARAM_END};
+
+  // Launching Kernel
+  HIP_CHECK(hipExtModuleLaunchKernel(TwoSecKernel, 1, 1, 1, 1, 1, 1, 0, stream1,
+                                    NULL, reinterpret_cast<void**>(&config2),
+                                    start_timingDisabled,
+                                    end_timingDisabled, 0));
+  HIP_CHECK(hipStreamSynchronize(stream1));
+
+  REQUIRE(hipEventElapsedTime(&time_2sec, start_timingDisabled,
+          end_timingDisabled) != hipSuccess);
+
+  // DeAllocating the memory
+  DeAllocateMemory();
+}
+
+/*
+This testcase verifies negative scenarios of hipExtModuleLaunchKernel API
+*/
+void ModuleLaunchKernel::ExtModule_Negative_tests() {
+  HIP_CHECK(hipSetDevice(0));
+  // Allocating memeory and loading kernel
+  AllocateMemory();
+  ModuleLoad();
+  void *config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args1,
+                     HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1,
+                     HIP_LAUNCH_PARAM_END};
+  void *params[] = {Ad};
+
+  SECTION("Nullptr to kernel function") {
+    REQUIRE(hipExtModuleLaunchKernel(nullptr, 1, 1, 1, 1, 1, 1, 0,
+                                     stream1, NULL,
+                                     reinterpret_cast<void**>(&config1),
+                                      nullptr, nullptr, 0) != hipSuccess);
+  }
+
+  SECTION("Max int value to block dimensions") {
+    REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                     std::numeric_limits<uint32_t>::max(),
+                                     std::numeric_limits<uint32_t>::max(),
+                                     std::numeric_limits<uint32_t>::max(), 0,
+                                     stream1, NULL,
+                                     reinterpret_cast<void**>(&config1),
+                                     nullptr, nullptr, 0) != hipSuccess);
+  }
+
+    SECTION("Null values to all dimensions") {
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 0, 0, 0,
+                                       0,
+                                       0,
+                                       0, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Passing 0 for x dimension") {
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 0, 1, 1,
+                                       0,
+                                       1,
+                                       1, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Passing 0 for y dimension") {
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 0, 1,
+                                       1,
+                                       0,
+                                       1, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Passing 0 for Z dimension") {
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 0,
+                                       1,
+                                       1,
+                                       0, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Passing both kernel and extra params") {
+      REQUIRE(hipExtModuleLaunchKernel(KernelandExtraParamKernel, 1, 1, 1, 1,
+                                       1, 1, 0,
+                                       stream1,
+                                       reinterpret_cast<void**>(&params),
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Passing both than maxthreadsperblock to block dimensions") {
+      hipDeviceProp_t deviceProp;
+      hipGetDeviceProperties(&deviceProp, 0);
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                       deviceProp.maxThreadsPerBlock+1,
+                                       deviceProp.maxThreadsPerBlock+1,
+                                       deviceProp.maxThreadsPerBlock+1, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Block dimension x = Max alloweed + 1") {
+      hipDeviceProp_t deviceProp;
+      hipGetDeviceProperties(&deviceProp, 0);
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                       deviceProp.maxThreadsDim[0]+1,
+                                       1,
+                                       1, 0, stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Block dimension Y = Max alloweed + 1") {
+      hipDeviceProp_t deviceProp;
+      hipGetDeviceProperties(&deviceProp, 0);
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                       1,
+                                       deviceProp.maxThreadsDim[1]+1,
+                                       1, 0, stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Block dimension Z = Max alloweed + 1") {
+      hipDeviceProp_t deviceProp;
+      hipGetDeviceProperties(&deviceProp, 0);
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                       1,
+                                       1,
+                                       deviceProp.maxThreadsDim[2]+1, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config1),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    SECTION("Passing invalid config data in extra params") {
+      void *config3[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
+                         HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1,
+                         HIP_LAUNCH_PARAM_END};
+      REQUIRE(hipExtModuleLaunchKernel(MultKernel, 1, 1, 1, 1, 1, 1, 0,
+                                       stream1, NULL,
+                                       reinterpret_cast<void**>(&config3),
+                                       nullptr, nullptr, 0) != hipSuccess);
+    }
+
+    DeAllocateMemory();
+}
+
+void ModuleLaunchKernel::Module_WorkGroup_Test() {
+  // Allocate memory and load modules
+  AllocateMemory();
+  ModuleLoad();
+  void *config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args3,
+                     HIP_LAUNCH_PARAM_BUFFER_SIZE, &size3,
+                     HIP_LAUNCH_PARAM_END};
+  hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp, 0);
+  double cuberootVal =
+                cbrt(static_cast<double>(deviceProp.maxThreadsPerBlock));
+  uint32_t cuberoot_floor = floor(cuberootVal);
+  uint32_t cuberoot_ceil = ceil(cuberootVal);
+
+  // Scenario: (block.x * block.y * block.z) <= Work Group Size where
+  // block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+  HIP_CHECK(hipExtModuleLaunchKernel(DummyKernel,
+                            1, 1, 1,
+                            cuberoot_floor, cuberoot_floor, cuberoot_floor,
+                            0, stream1, NULL,
+                            reinterpret_cast<void**>(&config1),
+                            nullptr, nullptr, 0));
+
+  // Scenario: (block.x * block.y * block.z) > Work Group Size where
+  // block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+  REQUIRE(hipExtModuleLaunchKernel(DummyKernel,
+                            1, 1, 1,
+                            cuberoot_ceil, cuberoot_ceil, cuberoot_ceil+1,
+                            0, stream1, NULL,
+                            reinterpret_cast<void**>(&config1),
+                            nullptr, nullptr, 0) != hipSuccess);
+
+  // DeAllocating memory
+  DeAllocateMemory();
+}
+
+/*
+This testcase verifies the negative scenarios of
+hipExtModuleLaunchKernel API
+*/
+TEST_CASE("Unit_hipExtModuleLaunchKernel_Negative") {
+  ModuleLaunchKernel Ext_obj;
+  Ext_obj.ExtModule_Negative_tests();
+}
+
+/*
+This testcase verifies hipExtModuleLaunchKernel API by
+disabling the timing flag
+*/
+TEST_CASE("Unit_hipExtModuleLaunchKernel_TimingflagDisabled") {
+  ModuleLaunchKernel Ext_obj;
+  Ext_obj.ExtModule_Disabled_Timingflag();
+}
+
+/*
+This testcase verifies hipExtModuleLaunchKernel API kernel
+execution time
+*/
+TEST_CASE("Unit_hipExtModuleLaunchKernel_KernelExecutionTime") {
+  ModuleLaunchKernel Ext_obj;
+  Ext_obj.ExtModule_KernelExecutionTime();
+}
+
+/*
+This testcase verifies workgroup of hipExtModuleLaunchKernel API
+*/
+TEST_CASE("Unit_hipExtModuleLaunchKernel_WorkGroup") {
+  ModuleLaunchKernel Ext_obj;
+  Ext_obj.Module_WorkGroup_Test();
+}
diff --git a/catch/unit/module/hipFuncGetAttributes.cc b/catch/unit/module/hipFuncGetAttributes.cc
new file mode 100644
index 0000000000..5e21dc17c6
--- /dev/null
+++ b/catch/unit/module/hipFuncGetAttributes.cc
@@ -0,0 +1,163 @@
+
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+#include <utility>
+
+#define fileName "module_kernels.code"
+#define kernel_name "hello_world"
+
+namespace testhipFuncGetAttributesApi {
+__global__
+void fn(float* px, float* py) {
+  bool a[42];
+  __shared__ double b[69];
+  for (auto&& x : b) x = *py++;
+  for (auto&& x : a) x = *px++ > 0.0;
+  for (auto&& x : a) if (x) *--py = *--px;
+}
+template <int WGSIZE, int LDS>
+__launch_bounds__(WGSIZE, 1) __global__ void kernelfn(int *x) {
+  __shared__ int lds[LDS];
+  for (int i = 0; i < LDS; ++i) {
+    lds[i] = x[i];
+  }
+  x[LDS - 1] = lds[0] / lds[LDS - 1];
+}
+template <int WGSIZE, int LDS> bool test_Attributes_Values() {
+  bool TestPassed = true;
+  hipFuncAttributes attr{};
+  hipFuncGetAttributes(&attr,
+     reinterpret_cast<void const *>(kernelfn<WGSIZE, LDS>));
+  if (attr.maxThreadsPerBlock != WGSIZE) {
+    TestPassed = false;
+  }
+  if (attr.sharedSizeBytes != LDS * sizeof(int)) {
+    TestPassed = false;
+  }
+  return TestPassed;
+}
+}  // namespace testhipFuncGetAttributesApi
+/**
+ * hipFuncGetAttributes and hipModuleGetFunction functional tests
+ * Scenario1: Validates the value of attribute "maxThreadsPerBlock" should be non zero.
+ * Scenario2: Validates the value of attribute
+ *            "HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK" should be non zero.
+ */
+// scenario 1
+TEST_CASE("Unit_hipFuncGetAttributes_FuncTst") {
+  hipFuncAttributes attr{};
+  auto r = hipFuncGetAttributes(&attr,
+  reinterpret_cast<const void*>(&testhipFuncGetAttributesApi::fn));
+  REQUIRE_FALSE(r != hipSuccess);
+  REQUIRE_FALSE(attr.maxThreadsPerBlock == 0);
+}
+// scenario 2
+TEST_CASE("Unit_hipFuncGetAttribute_FuncTst") {
+  hipModule_t Module;
+  int attrib_val;
+  CTX_CREATE()
+  hipFunction_t Function;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+  auto r = hipFuncGetAttribute(&attrib_val,
+          HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Function);
+  REQUIRE_FALSE(r != hipSuccess);
+  REQUIRE_FALSE(attrib_val == 0);
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+/**
+ * hipFuncGetAttributes negative tests
+ * Scenario1: Validates returned error code for attr = nullptr
+ * Scenario2: Validates returned error code for function = nullptr
+ */
+TEST_CASE("Unit_hipFuncGetAttributes_NegTst") {
+  SECTION("attr is nullptr") {
+    REQUIRE_FALSE(hipSuccess == hipFuncGetAttributes(nullptr,
+    reinterpret_cast<const void*>(&testhipFuncGetAttributesApi::fn)));
+  }
+  SECTION("function is nullptr") {
+    hipFuncAttributes attr{};
+    REQUIRE_FALSE(hipSuccess == hipFuncGetAttributes(&attr, nullptr));
+  }
+}
+/**
+ * hipFuncGetAttribute negative tests
+ * Scenario1: Validates returned error code for attrib_val = nullptr
+ * Scenario2: Validates returned error code for attrib = invalid = 0xff
+ */
+TEST_CASE("Unit_hipFuncGetAttribute_NegTst") {
+  hipModule_t Module;
+  CTX_CREATE()
+  hipFunction_t Function;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+  SECTION("attr is nullptr") {
+    REQUIRE_FALSE(hipSuccess == hipFuncGetAttribute(nullptr,
+        HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Function));
+  }
+  SECTION("attr is invalid") {
+    int attrib_val;
+    REQUIRE_FALSE(hipSuccess == hipFuncGetAttribute(&attrib_val,
+           static_cast<hipFunction_attribute>(0xff), Function));
+  }
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+/**
+ * hipFuncGetAttributes
+ * Scenario4: Validates the value of attribute "maxThreadsPerBlock".
+ * Scenario5: Validates the value of attribute "sharedSizeBytes".
+ */
+TEST_CASE("Unit_hipFuncGetAttributes_AttrTest") {
+  bool TestPassed = true;
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<64, 64>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<128, 64>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<256, 64>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<512, 64>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<1024, 64>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<64, 128>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<128, 128>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<256, 128>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<512, 128>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<1024, 128>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<64, 256>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<128, 256>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<256, 256>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<512, 256>();
+  TestPassed &= testhipFuncGetAttributesApi::
+                test_Attributes_Values<1024, 256>();
+  REQUIRE(TestPassed);
+}
+
diff --git a/catch/unit/module/hipFuncSetAttribute.cc b/catch/unit/module/hipFuncSetAttribute.cc
new file mode 100644
index 0000000000..33602f9078
--- /dev/null
+++ b/catch/unit/module/hipFuncSetAttribute.cc
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hip_test_common.hh"
+
+__global__ void fn(float* px, float* py) {
+  bool a[42];
+  __shared__ double b[69];
+
+  for (auto&& x : b) x = *py++;
+  for (auto&& x : a) x = *px++ > 0.0;
+  for (auto&& x : a) if (x) *--py = *--px;
+}
+
+/*
+This testcases verifies the basic func of hipFuncSetAttribute API where
+we need to pass function that executes on device
+hipFuncAttributeMaxDynamicSharedMemorySize -->
+The sum of this value + sharedSizeBytes  should not exceed device attribute
+hipFuncAttributePreferredSharedMemoryCarveout -->
+Carving out the shared memory.
+*/
+TEST_CASE("Unit_hipFuncSetAttribute_Basic") {
+  HIP_CHECK(hipFuncSetAttribute(reinterpret_cast<const void*>(&fn),
+                                hipFuncAttributeMaxDynamicSharedMemorySize,
+                                0));
+  HIP_CHECK(hipFuncSetAttribute(reinterpret_cast<const void*>(&fn),
+                                hipFuncAttributePreferredSharedMemoryCarveout,
+                                0));
+}
diff --git a/catch/unit/module/hipFuncSetCacheConfig.cc b/catch/unit/module/hipFuncSetCacheConfig.cc
new file mode 100644
index 0000000000..cd5d58a8f6
--- /dev/null
+++ b/catch/unit/module/hipFuncSetCacheConfig.cc
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+__global__ void Empty_Kernel() {
+}
+
+/*
+This testcase verifies the basic funct of hipFuncSetCacheConfig API
+On GPU devices, where L1 and shared memory uses same resources
+This sets the preferred cache configuration for the kernel function
+In this testcases we are setting hipFuncCachePreferL1 where L1 is
+preferred more than shared memory
+*/
+TEST_CASE("Unit_hipFuncSetCacheConfig_Basic") {
+  hipFuncCache_t cacheConfig{hipFuncCachePreferL1};
+  HIP_CHECK(hipFuncSetCacheConfig(reinterpret_cast<void*>(Empty_Kernel),
+                                  cacheConfig));
+}
diff --git a/catch/unit/module/hipFuncSetSharedMemConfig.cc b/catch/unit/module/hipFuncSetSharedMemConfig.cc
new file mode 100644
index 0000000000..c9d65275b5
--- /dev/null
+++ b/catch/unit/module/hipFuncSetSharedMemConfig.cc
@@ -0,0 +1,107 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Test Description:
+// This test case verifies the working of hipFuncSetSharedMemConfig() api and
+// the flag parameter
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+
+__global__ void ReverseSeq(int *A, int *B, int N) {
+  extern __shared__ int SMem[];
+  int offset = threadIdx.x;
+  int MirrorVal = N - offset - 1;
+  SMem[offset] = A[offset];
+  __syncthreads();
+  B[offset] = SMem[MirrorVal];
+}
+/*
+This testcase verifies the basic functionality of hipFuncSetSharedMemConfig API
+by setting shared memory bank size
+
+1. hipSharedMemBankSizeDefault
+2. hipSharedMemBankSizeFourByte
+3. hipSharedMemBankSizeEightByte
+
+*/
+TEST_CASE("Unit_hipFuncSetSharedMemConfig_Basic") {
+  int *Ah{nullptr}, *RAh{nullptr}, NumElms = 128;
+  int *Ad{nullptr}, *RAd{nullptr};
+
+  HipTest::initArrays<int>(&Ad, &RAd, nullptr,
+                           &Ah, &RAh, nullptr, NumElms, false);
+  for (int i = 0; i < NumElms; ++i) {
+    Ah[i] = i;
+    RAh[i] = NumElms - i - 1;
+  }
+  HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemset(RAd, 0, NumElms * sizeof(int)));
+
+  // Testing hipFuncSetSharedMemConfig() with hipSharedMemBankSizeDefault flag
+  HIP_CHECK(hipFuncSetSharedMemConfig(reinterpret_cast<const void*>
+                                      (&ReverseSeq),
+                                      hipSharedMemBankSizeDefault));
+
+  // Kernel Launch with shared mem size of = NumElms * sizeof(int)
+  ReverseSeq<<<1, NumElms, NumElms * sizeof(int)>>>(Ad, RAd, NumElms);
+  memset(Ah, 0, NumElms * sizeof(int));
+
+  // Verifying the results
+  HIP_CHECK(hipMemcpy(Ah, RAd, NumElms * sizeof(int), hipMemcpyDeviceToHost));
+  for (int i = 0; i < NumElms; ++i) {
+    REQUIRE(Ah[i] == RAh[i]);
+  }
+
+  // Testing hipFuncSetSharedMemConfig() with hipSharedMemBankSizeFourBytes flg
+  HIP_CHECK(hipFuncSetSharedMemConfig(reinterpret_cast<const void*>
+                                      (&ReverseSeq),
+                                      hipSharedMemBankSizeFourByte));
+  HIP_CHECK(hipMemset(RAd, 0, NumElms * sizeof(int)));
+
+  // Kernel Launch with shared mem size of = NumElms * sizeof(int)
+  ReverseSeq<<<1, NumElms, NumElms * sizeof(int)>>>(Ad, RAd, NumElms);
+  memset(Ah, 0, NumElms * sizeof(int));
+
+  // Verifying the results
+  HIP_CHECK(hipMemcpy(Ah, RAd, NumElms * sizeof(int), hipMemcpyDeviceToHost));
+  for (int i = 0; i < NumElms; ++i) {
+    REQUIRE(Ah[i] == RAh[i]);
+  }
+
+  // Testing hipFuncSetSharedMemConfig() with hipSharedMemBankSizeEightBytes flg
+  HIP_CHECK(hipFuncSetSharedMemConfig(reinterpret_cast<const void*>
+                                      (&ReverseSeq),
+                                      hipSharedMemBankSizeEightByte));
+  HIP_CHECK(hipMemset(RAd, 0, NumElms * sizeof(int)));
+
+  // Kernel Launch with shared mem size of = NumElms * sizeof(int)
+  ReverseSeq<<<1, NumElms, NumElms * sizeof(int)>>>(Ad, RAd, NumElms);
+  memset(Ah, 0, NumElms * sizeof(int));
+
+  // Verifying the results
+  HIP_CHECK(hipMemcpy(Ah, RAd, NumElms * sizeof(int), hipMemcpyDeviceToHost));
+  for (int i = 0; i < NumElms; ++i) {
+    REQUIRE(Ah[i] == RAh[i]);
+  }
+
+  HipTest::freeArrays<int>(Ad, RAd, nullptr,
+                            Ah, RAh, nullptr, false);
+}
diff --git a/catch/unit/module/hipManagedKeyword.cc b/catch/unit/module/hipManagedKeyword.cc
new file mode 100644
index 0000000000..7ed9bbe630
--- /dev/null
+++ b/catch/unit/module/hipManagedKeyword.cc
@@ -0,0 +1,56 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+hipManagedKeyword API Scenario
+1. Test hipModuleLoad on multiple GPUs
+*/
+
+#include "hip_test_common.hh"
+#include "hip_test_kernels.hh"
+#include "hip_test_checkers.hh"
+
+#define MANAGED_VAR_INIT_VALUE 10
+#define fileName "module_kernels.code"
+
+TEST_CASE("Unit_hipMangedKeyword_ModuleLoadMultiGPU") {
+  int numDevices = 0, data;
+  hipDeviceptr_t x;
+  size_t xSize;
+  hipGetDeviceCount(&numDevices);
+  for (int i = 0; i < numDevices; i++) {
+    hipSetDevice(i);
+    CTX_CREATE()
+    hipModule_t Module;
+    HIP_CHECK(hipModuleLoad(&Module, fileName));
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "GPU_func"));
+    HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, 1, 1,
+                                    1, 0, 0, NULL, NULL));
+    hipDeviceSynchronize();
+    HIP_CHECK(hipModuleGetGlobal(reinterpret_cast<hipDeviceptr_t*>(&x),
+                                 &xSize, Module, "x"));
+    HIP_CHECK(hipMemcpyDtoH(&data, hipDeviceptr_t(x), xSize));
+    REQUIRE(data == (1 + MANAGED_VAR_INIT_VALUE));
+    HIP_CHECK(hipModuleUnload(Module));
+    CTX_DESTROY()
+  }
+}
diff --git a/catch/unit/module/hipModule.cc b/catch/unit/module/hipModule.cc
new file mode 100755
index 0000000000..9683eacee9
--- /dev/null
+++ b/catch/unit/module/hipModule.cc
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testcase verifies the hipModuleLoad API On
+1. Single code object
+2. Multi Target architecture code object
+*/
+#include <fstream>
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+#ifdef __linux__
+#include <unistd.h>
+#endif
+#define LEN 64
+#define SIZE (LEN << 2)
+#define COMMAND_LEN 256
+#define CODE_OBJ_SINGLEARCH "module_kernels.code"
+#define kernel_name "hello_world"
+#define CODE_OBJ_MULTIARCH "vcpy_kernel_multarch.code"
+
+/*
+This API loads the kernel function, Launches the kernel
+which copies one variable to another and validates both
+the device variables for the current GPU architecture
+*/
+void testCodeObjFile(const char *codeObjFile) {
+  float *A, *B;
+  float *Ad, *Bd;
+  HipTest::initArrays<float>(&Ad, &Bd, nullptr,
+                             &A, &B, nullptr, LEN, false);
+
+  HIP_CHECK(hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(Ad), A, SIZE));
+  HIP_CHECK(hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(Bd), B, SIZE));
+
+  hipModule_t Module;
+  hipFunction_t Function;
+  HIP_CHECK(hipModuleLoad(&Module, codeObjFile));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  struct {
+    void* _Ad;
+    void* _Bd;
+  } args;
+  args._Ad = reinterpret_cast<void*>(Ad);
+  args._Bd = reinterpret_cast<void*>(Bd);
+  size_t size = sizeof(args);
+
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                    HIP_LAUNCH_PARAM_END};
+  HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0,
+                                 stream, NULL,
+                                 reinterpret_cast<void**>(&config)));
+
+  HIP_CHECK(hipStreamDestroy(stream));
+
+  HIP_CHECK(hipMemcpyDtoH(B, reinterpret_cast<hipDeviceptr_t>(Bd), SIZE));
+
+  for (uint32_t i = 0; i < LEN; i++) {
+    REQUIRE(A[i] == B[i]);
+  }
+
+  HipTest::freeArrays<float>(Ad, Bd, nullptr,
+                             A, B, nullptr,
+                             false);
+  HIP_CHECK(hipModuleUnload(Module));
+}
+
+#ifdef __linux__
+/**
+ * Check if environment variable $ROCM_PATH is defined
+ *
+ */
+bool isRocmPathSet() {
+  FILE *fpipe;
+  char const *command = "echo $ROCM_PATH";
+  fpipe = popen(command, "r");
+
+  if (fpipe == nullptr) {
+    WARN("Unable to create command");
+    return false;
+  }
+  char command_op[COMMAND_LEN];
+  if (fgets(command_op, COMMAND_LEN, fpipe)) {
+    size_t len = strlen(command_op);
+    if (len > 1) {  // This is because fgets always adds newline character
+      pclose(fpipe);
+      return true;
+    }
+  }
+  pclose(fpipe);
+  return false;
+}
+#endif
+/*
+This testcase checks the hipModuleLoadData API for the
+current GPU architecture.
+*/
+TEST_CASE("Unit_hipModule_TestCodeObjFile") {
+  testCodeObjFile(CODE_OBJ_SINGLEARCH);
+}
+
+/*
+This testcases
+1. Creates kernel file and copies to tmp folder
+2. Checks for Rocm path and generates code file for
+   multiple target architectures.
+*/
+TEST_CASE("Unit_hipModule_MultiTargArchCodeObj") {
+#ifdef __linux__
+  char command[COMMAND_LEN];
+  hipDeviceProp_t props;
+  hipGetDeviceProperties(&props, 0);
+  // Hardcoding the codeobject lines in multiple string to avoid cpplint warning
+  std::string CodeObjL1 = "#include \"hip/hip_runtime.h\"\n";
+  std::string CodeObjL2 =
+       "extern \"C\" __global__ void hello_world(float* a, float* b) {\n";
+  std::string CodeObjL3 = "  int tx = hipThreadIdx_x;\n";
+  std::string CodeObjL4 = "  b[tx] = a[tx];\n";
+  std::string CodeObjL5 = "}";
+  // Creating the full code object string
+  static std::string CodeObj = CodeObjL1 + CodeObjL2 + CodeObjL3 +
+                               CodeObjL4 + CodeObjL5;
+  std::ofstream ofs("/tmp/vcpy_kernel.cpp", std::ofstream::out);
+  ofs << CodeObj;
+  ofs.close();
+  // Copy the file into current working location if not available
+  if (access("/tmp/vcpy_kernel.cpp", F_OK) == -1) {
+    INFO("Code Object File: /tmp/vcpy_kernel.cpp not found");
+    REQUIRE(true);
+  }
+  // Generate the command to generate multi architecture code object file
+  const char* hipcc_path = nullptr;
+  if (isRocmPathSet()) {
+    hipcc_path = "$ROCM_PATH/bin/hipcc";
+  } else {
+    hipcc_path = "/opt/rocm/bin/hipcc";
+  }
+  /* Putting these command parameters into a variable to shorten the string
+    literal length in order to avoid multiline string literal cpplint warning
+  */
+  const char* genco_option = "--offload-arch";
+  const char* input_codeobj = "/tmp/vcpy_kernel.cpp";
+  snprintf(command, COMMAND_LEN,
+  "%s --genco %s=gfx801,gfx802,gfx803,gfx900,gfx908,gfx1030,gfx90a,%s %s -o %s",
+  hipcc_path, genco_option, props.gcnArchName, input_codeobj,
+  CODE_OBJ_MULTIARCH);
+
+  system((const char*)command);
+  // Check if the code object file is created
+  snprintf(command, COMMAND_LEN, "./%s",
+           CODE_OBJ_MULTIARCH);
+
+  if (access(command, F_OK) == -1) {
+    INFO("Code Object File not found");
+    REQUIRE(true);
+  } else {
+    testCodeObjFile(CODE_OBJ_MULTIARCH);
+  }
+#else
+  SUCCEED("This test is skipped due to non linux environment");
+#endif
+}
diff --git a/catch/unit/module/hipModuleGetGlobal.cc b/catch/unit/module/hipModuleGetGlobal.cc
new file mode 100755
index 0000000000..e99e3a64f1
--- /dev/null
+++ b/catch/unit/module/hipModuleGetGlobal.cc
@@ -0,0 +1,120 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#include <hip_test_checkers.hh>
+#include <hip_test_kernels.hh>
+#include <fstream>
+#include <vector>
+
+#define LEN 64
+#define SIZE LEN * sizeof(float)
+#define ARRAY_SIZE 16
+#define fileName "module_kernels.code"
+
+/*
+This testcase verifies the basic functionality of hipModuleGetGlobal API
+1. Simple kernel
+2. Global variables
+*/
+TEST_CASE("Unit_hipModuleGetGlobal_Basic") {
+  float *A{nullptr}, *B{nullptr}, *Ad{nullptr}, *Bd{nullptr};
+  HipTest::initArrays<float>(&Ad, &Bd, nullptr, &A, &B, nullptr, LEN,
+                             false);
+  CTX_CREATE()
+  hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(Ad), A, SIZE);
+  hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(Bd), B, SIZE);
+  hipModule_t Module;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+
+  float myDeviceGlobal_h = 42.0;
+  hipDeviceptr_t deviceGlobal;
+  size_t deviceGlobalSize;
+  HIP_CHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
+                               Module, "myDeviceGlobal"));
+  HIP_CHECK(hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>(deviceGlobal),
+                          &myDeviceGlobal_h, deviceGlobalSize));
+  float myDeviceGlobalArray_h[ARRAY_SIZE];
+  hipDeviceptr_t myDeviceGlobalArray;
+  size_t myDeviceGlobalArraySize;
+
+  HIP_CHECK(hipModuleGetGlobal(reinterpret_cast<hipDeviceptr_t*>
+                               (&myDeviceGlobalArray),
+                               &myDeviceGlobalArraySize, Module,
+                               "myDeviceGlobalArray"));
+
+  for (int i = 0; i < ARRAY_SIZE; i++) {
+    myDeviceGlobalArray_h[i] = i * 1000.0f;
+    HIP_CHECK(hipMemcpyHtoD(reinterpret_cast<hipDeviceptr_t>
+                            (myDeviceGlobalArray),
+                            &myDeviceGlobalArray_h,
+                            myDeviceGlobalArraySize));
+  }
+
+  struct {
+    void* _Ad;
+    void* _Bd;
+  } args;
+
+  args._Ad = reinterpret_cast<void*>(Ad);
+  args._Bd = reinterpret_cast<void*>(Bd);
+  size_t size = sizeof(args);
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                    HIP_LAUNCH_PARAM_END};
+
+  SECTION("Testing with simple kernel") {
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "hello_world"));
+    HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0,
+                                    NULL,
+                                    reinterpret_cast<void**>(&config)));
+
+    hipMemcpyDtoH(B, hipDeviceptr_t(Bd), SIZE);
+
+    for (uint32_t i = 0; i < LEN; i++) {
+      REQUIRE(A[i] == B[i]);
+    }
+  }
+
+  SECTION("Testing global variables") {
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "test_globals"));
+    HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0,
+                                    NULL,
+                                    reinterpret_cast<void**>(&config)));
+
+    hipMemcpyDtoH(B, hipDeviceptr_t(Bd), SIZE);
+
+    for (uint32_t i = 0; i < LEN; i++) {
+      float expected = A[i] + myDeviceGlobal_h +
+        myDeviceGlobalArray_h[i % 16];
+      REQUIRE(expected == B[i]);
+    }
+  }
+
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+  HipTest::freeArrays<float>(Ad, Bd, nullptr,
+                             A, B, nullptr,
+                             false);
+}
diff --git a/catch/unit/module/hipModuleLaunchKernel.cc b/catch/unit/module/hipModuleLaunchKernel.cc
new file mode 100644
index 0000000000..341378aec0
--- /dev/null
+++ b/catch/unit/module/hipModuleLaunchKernel.cc
@@ -0,0 +1,246 @@
+/*
+   Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   */
+/* Test Scenarios
+  1. hipModuleLaunchKernel Negative Scenarios
+  2. hipModuleLaunchKernel Work Group tests =>
+     - (block.x * block.y * block.z) <= Work Group Size
+       where block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+     - (block.x * block.y * block.z) > Work Group Size
+       where block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+ */
+
+#include <math.h>
+#include <hip_test_common.hh>
+
+#define fileName "module_kernels.code"
+#define matmulK "matmulK"
+#define SixteenSec "SixteenSecKernel"
+#define KernelandExtra "KernelandExtraParams"
+#define FourSec "FourSecKernel"
+#define TwoSec "TwoSecKernel"
+#define dummyKernel "EmptyKernel"
+
+struct gridblockDim {
+  unsigned int gridX;
+  unsigned int gridY;
+  unsigned int gridZ;
+  unsigned int blockX;
+  unsigned int blockY;
+  unsigned int blockZ;
+};
+
+/*
+This testcase verifies the negative scenarios of
+hipModuleLaunchKernel API
+*/
+TEST_CASE("Unit_hipModuleLaunchKernel_Negative") {
+  HIP_CHECK(hipSetDevice(0));
+  struct {
+    void* _Ad;
+    void* _Bd;
+    void* _Cd;
+    int _n;
+  } args1;
+  args1._Ad = nullptr;
+  args1._Bd = nullptr;
+  args1._Cd = nullptr;
+  args1._n  = 0;
+  hipFunction_t MultKernel, KernelandExtraParamKernel;
+  size_t size1;
+  size1 = sizeof(args1);
+  hipModule_t Module;
+  hipStream_t stream1;
+  hipDeviceptr_t *Ad{nullptr};
+  CTX_CREATE()
+
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&MultKernel, Module, matmulK));
+  HIP_CHECK(hipModuleGetFunction(&KernelandExtraParamKernel,
+                                Module, KernelandExtra));
+  void *config1[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args1,
+                     HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1,
+                     HIP_LAUNCH_PARAM_END};
+  void *params[] = {Ad};
+  HIP_CHECK(hipStreamCreate(&stream1));
+  SECTION("Passing nullptr to kernel function") {
+    REQUIRE(hipModuleLaunchKernel(nullptr, 1, 1, 1, 1, 1, 1, 0,
+                                  stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Passing Max int value to block dim") {
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                  std::numeric_limits<uint32_t>::max(),
+                                  std::numeric_limits<uint32_t>::max(),
+                                  std::numeric_limits<uint32_t>::max(),
+                                  0, stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+
+  SECTION("Passing 0 to all value dim") {
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 0, 0, 0,
+                                  0,
+                                  0,
+                                  0, 0,
+                                  stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Passing 0 for X dim") {
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 0, 1, 1,
+                                  0,
+                                  1,
+                                  1, 0,
+                                  stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+
+  SECTION("Passing 0 for Y dim") {
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 0, 1,
+                                  1,
+                                  0,
+                                  1, 0,
+                                  stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Passing 0 for Z dim") {
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 0,
+                                  1,
+                                  1,
+                                  0, 0,
+                                  stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Passing both kernel and extra params") {
+    REQUIRE(hipModuleLaunchKernel(KernelandExtraParamKernel, 1, 1, 1, 1,
+                                  1, 1, 0, stream1,
+                                  reinterpret_cast<void**>(&params),
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Passing more than maxthreadsperblock to block dim") {
+    hipDeviceProp_t deviceProp;
+    hipGetDeviceProperties(&deviceProp, 0);
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                  deviceProp.maxThreadsPerBlock+1,
+                                  deviceProp.maxThreadsPerBlock+1,
+                                  deviceProp.maxThreadsPerBlock+1, 0,
+                                  stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Block dim X is more than max allowed") {
+    hipDeviceProp_t deviceProp;
+    hipGetDeviceProperties(&deviceProp, 0);
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                  deviceProp.maxThreadsDim[0]+1,
+                                  1,
+                                  1, 0, stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Block dim Y is more than max allowed") {
+    hipDeviceProp_t deviceProp;
+    hipGetDeviceProperties(&deviceProp, 0);
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                  1,
+                                  deviceProp.maxThreadsDim[1]+1,
+                                  1, 0, stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Block dim Z is more than max allowed") {
+    hipDeviceProp_t deviceProp;
+    hipGetDeviceProperties(&deviceProp, 0);
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                  1,
+                                  1,
+                                  deviceProp.maxThreadsDim[2]+1,
+                                  0, stream1, NULL,
+                                  reinterpret_cast<void**>(&config1))
+                                  != hipSuccess);
+  }
+
+  SECTION("Block invalid config to extra params") {
+    void *config3[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
+                       HIP_LAUNCH_PARAM_BUFFER_SIZE, &size1,
+                       HIP_LAUNCH_PARAM_END};
+    REQUIRE(hipModuleLaunchKernel(MultKernel, 1, 1, 1,
+                                  1, 1, 1, 0, stream1,
+                                  NULL,
+                                  reinterpret_cast<void**>(&config3))
+                                  != hipSuccess);
+  }
+
+  HIP_CHECK(hipStreamDestroy(stream1));
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+
+/*
+This testcase verifies the work group scenarios of
+hipModuleLaunchKernel API
+*/
+TEST_CASE("Unit_hipModuleLaunchKernel_WorkGroup") {
+  HIP_CHECK(hipSetDevice(0));
+  hipFunction_t DummyKernel;
+  hipModule_t Module;
+  hipStream_t stream1;
+  CTX_CREATE()
+
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
+  HIP_CHECK(hipStreamCreate(&stream1));
+  // Passing Max int value to block dimensions
+  hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp, 0);
+  double cuberootVal =
+              cbrt(static_cast<double>(deviceProp.maxThreadsPerBlock));
+  uint32_t cuberoot_floor = floor(cuberootVal);
+  uint32_t cuberoot_ceil = ceil(cuberootVal);
+  // Scenario: (block.x * block.y * block.z) <= Work Group Size where
+  // block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+  HIP_CHECK(hipModuleLaunchKernel(DummyKernel,
+                            1, 1, 1,
+                            cuberoot_floor, cuberoot_floor, cuberoot_floor,
+                            0, stream1, NULL, NULL));
+  // Scenario: (block.x * block.y * block.z) > Work Group Size where
+  // block.x < MaxBlockDimX , block.y < MaxBlockDimY and block.z < MaxBlockDimZ
+  REQUIRE(hipModuleLaunchKernel(DummyKernel,
+                            1, 1, 1,
+                            cuberoot_ceil, cuberoot_ceil, cuberoot_ceil + 1,
+                            0, stream1, NULL, NULL) != hipSuccess);
+  HIP_CHECK(hipStreamDestroy(stream1));
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
diff --git a/catch/unit/module/hipModuleLoadData.cc b/catch/unit/module/hipModuleLoadData.cc
new file mode 100644
index 0000000000..2102a10f49
--- /dev/null
+++ b/catch/unit/module/hipModuleLoadData.cc
@@ -0,0 +1,91 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+hipModuleLoadData scenarios
+
+1. Loads the kernel and the corresponding kernel function
+   which copies the data from one device variable to another.
+*/
+
+#include <fstream>
+#include <vector>
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+
+#define LEN 64
+#define SIZE LEN << 2
+#define FILENAME "module_kernels.code"
+#define kernel_name "hello_world"
+
+static std::vector<char> load_file() {
+  std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
+  std::streamsize fsize = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(fsize);
+  if (!file.read(buffer.data(), fsize)) {
+    INFO("could not open code object" << FILENAME);
+    REQUIRE(false);
+  }
+  return buffer;
+}
+
+
+TEST_CASE("Unit_hipModuleLoadData_Basic") {
+    auto buffer = load_file();
+    float *A{nullptr}, *B{nullptr}, *Ad{nullptr}, *Bd{nullptr};
+    HipTest::initArrays<float>(&Ad, &Bd, nullptr, &A, &B, nullptr,
+                               LEN, false);
+    HIP_CHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+    hipModule_t Module;
+    hipFunction_t Function{nullptr};
+
+    HIP_CHECK(hipModuleLoadData(&Module, &buffer[0]));
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    struct {
+      void* _Ad;
+      void* _Bd;
+    } args;
+    args._Ad = reinterpret_cast<void*>(Ad);
+    args._Bd = reinterpret_cast<void*>(Bd);
+    size_t size = sizeof(args);
+
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END};
+    HIP_CHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0,
+          stream, NULL, reinterpret_cast<void**>(&config)));
+
+    HIP_CHECK(hipStreamDestroy(stream));
+
+    HIP_CHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
+
+    for (uint32_t i = 0; i < LEN; i++) {
+      REQUIRE(A[i] == B[i]);
+    }
+    HipTest::freeArrays<float>(Ad, Bd, nullptr,
+                        A, B,
+                        nullptr, false);
+}
diff --git a/catch/unit/module/hipModuleLoadDataMultThreadOnMultGPU.cc b/catch/unit/module/hipModuleLoadDataMultThreadOnMultGPU.cc
new file mode 100644
index 0000000000..bfb780ca06
--- /dev/null
+++ b/catch/unit/module/hipModuleLoadDataMultThreadOnMultGPU.cc
@@ -0,0 +1,161 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testcase verifies the multithreaded scenario of
+hipModuleLoadData API on MultiGPU system
+*/
+
+#include <fstream>
+#include <vector>
+
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+
+#define LEN 64
+#define SIZE LEN << 2
+#define THREADS 8
+
+#define FILENAME "module_kernels.code"
+#define kernel_name "hello_world"
+
+/*
+This function reads the kernel code object file into buffer
+*/
+static std::vector<char> load_file() {
+  std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
+  std::streamsize fsize = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(fsize);
+  if (!file.read(buffer.data(), fsize)) {
+    INFO("could not open code object " << FILENAME);
+    REQUIRE(false);
+  }
+  return buffer;
+}
+
+/*
+Thread function
+1. Loads the module using hipModuleLoadData API
+2. Initializes 2 device variables.
+3. Launches kernel which copies one data into another.
+4. validates the result and returns it to the caller using
+   std::ref variable.
+*/
+static void run(const std::vector<char>& buffer, int deviceNo,
+                bool &testResult) {
+  hipSetDevice(deviceNo);
+  hipModule_t Module;
+  hipFunction_t Function;
+  float *A{nullptr}, *B{nullptr}, *Ad{nullptr}, *Bd{nullptr};
+  testResult = true;
+  HipTest::initArrays<float>(&Ad, &Bd, nullptr,
+                             &A, &B, nullptr,
+                             LEN, false);
+  HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
+  HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
+  HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  struct {
+    void* _Ad;
+    void* _Bd;
+  } args;
+  args._Ad = static_cast<void*>(Ad);
+  args._Bd = static_cast<void*>(Bd);
+  size_t size = sizeof(args);
+
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                    HIP_LAUNCH_PARAM_END};
+  HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN,
+                                 1, 1, 0, stream, NULL,
+                                 reinterpret_cast<void**>(&config)));
+
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  HIPCHECK(hipStreamDestroy(stream));
+
+  HIPCHECK(hipModuleUnload(Module));
+
+  HIPCHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
+
+  for (uint32_t i = 0; i < LEN; i++) {
+     REQUIRE(A[i] == B[i]);
+  }
+  HipTest::freeArrays<float>(Ad, Bd, nullptr,
+                             A, B, nullptr,
+                             false);
+}
+
+/*
+Thread class inherited from std::thread
+*/
+struct joinable_thread : std::thread {
+  template <class... Xs>
+  joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) {} // NOLINT
+
+  joinable_thread& operator=(joinable_thread&& other) = default;
+  joinable_thread(joinable_thread&& other)      = default;
+
+  ~joinable_thread() {
+    if (this->joinable())
+      this->join();
+  }
+};
+
+/*
+This API is triggered form the test case where in
+1. Creates the thread object.
+2. Loops through the number of GPUs and launches multiple threads.
+*/
+static void run_multi_threads(uint32_t n, const std::vector<char>& buffer) {
+  int numDevices = 0;
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  bool testResult = false;
+  std::vector<joinable_thread> threads;
+
+  for (int deviceNo=0; deviceNo < numDevices; ++deviceNo) {
+    for (uint32_t i = 0; i < n; i++) {
+    threads.emplace_back(std::thread{[&, buffer] {
+    run(buffer, deviceNo, std::ref(testResult));
+    }});
+    }
+  }
+}
+/*
+The testcase verifies the multithreaded funtionality on MGPU system
+1. Loads the kernel file by calling load_file API
+2. Gets the host thread count
+3. Creates multiple threads in parallel where in each thread initializes
+   2 device variables and loads the kernel using hipModuleLoadData API.
+   The kernel copies the data from one variable to another.Then the thread
+   validates both the variables.
+*/
+TEST_CASE("Unit_hipModuleLoadData_MGpuMultiThread") {
+  auto buffer = load_file();
+  auto file_size = buffer.size() / (1024 * 1024);
+  auto thread_count =  HipTest::getHostThreadCount(file_size + 10);
+  run_multi_threads(thread_count, buffer);
+}
diff --git a/catch/unit/module/hipModuleLoadDataMultThreaded.cc b/catch/unit/module/hipModuleLoadDataMultThreaded.cc
new file mode 100644
index 0000000000..183da4c007
--- /dev/null
+++ b/catch/unit/module/hipModuleLoadDataMultThreaded.cc
@@ -0,0 +1,164 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testcase verifies the multithreaded scenario of hipModuleLoadData API
+*/
+#include <fstream>
+#include <vector>
+
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+
+#define LEN 64
+#define SIZE LEN << 2
+#define THREADS 8
+#define MAX_THREADS 512
+
+#define FILENAME "module_kernels.code"
+#define kernel_name "hello_world"
+
+/*
+This function reads the kernel code object file into buffer
+*/
+std::vector<char> load_file() {
+  std::ifstream file(FILENAME, std::ios::binary | std::ios::ate);
+  std::streamsize fsize = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(fsize);
+  if (!file.read(buffer.data(), fsize)) {
+    INFO("could not open code object" << FILENAME);
+    REQUIRE(false);
+  }
+  return buffer;
+}
+
+/*
+Thread function
+1. Loads the module using hipModuleLoadData API
+2. Initializes 2 device variables.
+3. Launches kernel which copies one data into another.
+4. validates the result and returns it to the caller using
+   std::ref variable.
+*/
+void run(const std::vector<char>& buffer, bool &testResult) {
+  hipModule_t Module;
+  hipFunction_t Function;
+
+  float *A, *B, *Ad, *Bd;
+  testResult = true;
+  HipTest::initArrays<float>(&Ad, &Bd, nullptr,
+                             &A, &B, nullptr,
+                             LEN, false);
+
+
+  HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
+  HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
+  HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  struct {
+    void* _Ad;
+    void* _Bd;
+  } args;
+  args._Ad = static_cast<void*>(Ad);
+  args._Bd = static_cast<void*>(Bd);
+  size_t size = sizeof(args);
+
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+                    HIP_LAUNCH_PARAM_END};
+  HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1,
+                                 LEN, 1, 1, 0, stream,
+                                 NULL, reinterpret_cast<void**>(&config)));
+
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  HIPCHECK(hipStreamDestroy(stream));
+
+  HIPCHECK(hipModuleUnload(Module));
+
+  HIPCHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
+
+  for (uint32_t i = 0; i < LEN; i++) {
+    REQUIRE(A[i] == B[i]);
+  }
+
+  HipTest::freeArrays<float>(Ad, Bd, nullptr,
+                             A, B, nullptr,
+                             false);
+}
+
+/*
+Thread class inherited from std::thread
+*/
+struct joinable_thread : std::thread {
+  template <class... Xs>
+  joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) {} // NOLINT
+
+  joinable_thread& operator=(joinable_thread&& other) = default;
+  joinable_thread(joinable_thread&& other)      = default;
+
+  ~joinable_thread() {
+    if (this->joinable())
+      this->join();
+  }
+};
+
+/*
+This API is triggered form the test case where in
+1. Creates the thread object.
+2. Loops through the number of GPUs and launches multiple threads.
+*/
+void run_multi_threads(uint32_t n, const std::vector<char>& buffer) {
+  std::vector<joinable_thread> threads;
+  bool testResult = false;
+  for (uint32_t i = 0; i < n; i++) {
+    threads.emplace_back(std::thread{[&] {
+    run(buffer, std::ref(testResult));
+    }});
+  }
+}
+
+/*
+The testcase verifies the multithreaded funtionality
+1. Loads the kernel file by calling load_file API
+2. Gets the host thread count
+3. Creates multiple threads in parallel where in each thread initializes
+   2 device variables and loads the kernel using hipModuleLoadData API.
+   The kernel copies the data from one variable to another.Then the thread
+   validates both the variables.
+*/
+TEST_CASE("Unit_hipModuleLoadData_MultiThreaded") {
+  HIPCHECK(hipInit(0));
+  auto buffer = load_file();
+  auto file_size = buffer.size() / (1024 * 1024);
+  auto thread_count = HipTest::getHostThreadCount(file_size + 10);
+  if (thread_count == 0) {
+      INFO("Thread Count is zero");
+      REQUIRE(false);
+  }
+
+  run_multi_threads(thread_count, buffer);
+}
diff --git a/catch/unit/module/hipModuleLoadMultiThreaded.cc b/catch/unit/module/hipModuleLoadMultiThreaded.cc
new file mode 100644
index 0000000000..401892915e
--- /dev/null
+++ b/catch/unit/module/hipModuleLoadMultiThreaded.cc
@@ -0,0 +1,121 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testcase verifies hipModuleLoad API in multithreaded scenario 
+*/
+#include <stdio.h>
+#include "hip/hip_runtime.h"
+#if HT_AMD
+#include "hip/hip_ext.h"
+#endif
+#include <fstream>
+#include <algorithm>
+#include <atomic>
+#include <functional>
+#include <vector>
+#include <future>
+#define THREADS 8
+#define MAX_NUM_THREADS 128
+
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+
+#define NUM_GROUPS 1
+#define GROUP_SIZE 1
+#define WARMUP_RUN_COUNT 10
+#define TIMING_RUN_COUNT 100
+#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
+#define FILENAME "module_kernels.code"
+#define kernel_name "EmptyKernel"
+
+/*
+This thread function loads the kernel file , synchronizes the threads
+and Launches the kernel .
+*/
+void hipModuleLaunchKernel_enqueue(std::atomic_int* shared, int max_threads) {
+    // resources necessary for this thread
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreate(&stream));
+    hipModule_t module;
+    hipFunction_t function;
+
+    HIPCHECK(hipModuleLoad(&module, FILENAME));
+    HIPCHECK(hipModuleGetFunction(&function, module, kernel_name));
+
+    void* kernel_params = nullptr;
+
+    // synchronize all threads, before running
+    shared->fetch_add(1, std::memory_order_release);
+    while (max_threads != shared->load(std::memory_order_acquire)) {}
+
+    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
+        HIPCHECK(hipModuleLaunchKernel(function, 1, 1,
+                                       1, 1, 1, 1, 0, stream,
+                                       &kernel_params, nullptr));
+    }
+    HIPCHECK(hipModuleUnload(module));
+    HIPCHECK(hipStreamDestroy(stream));
+}
+
+/*
+thread pool class contains launching the threads using std::async API
+with future variable "threads".
+The start API Launches the threads and finish API waits for the
+thread execution to end.
+*/
+struct thread_pool {
+  explicit thread_pool(int total_threads) : max_threads(total_threads) {
+  }
+  void start(std::function<void(std::atomic_int*, int)> f) {
+    for (int i = 0; i < max_threads; ++i) {
+      threads.push_back(std::async(std::launch::async, f,
+                                   &shared, max_threads));
+    }
+  }
+  void finish() {
+    for (auto&&thread : threads) {
+      thread.get();
+    }
+    threads.clear();
+    shared = 0;
+  }
+  ~thread_pool() {
+    finish();
+  }
+ private:
+  std::atomic_int shared {0};
+  std::vector<char> buffer;
+  std::vector<std::future<void>> threads;
+  int max_threads = 1;
+};
+
+/*
+This testcase verifies the Multithreaded scenario of hipModule API
+where in threadpool object is created and the object invokes start API
+which launches multiple threads where each thread loads the kernel object
+using hipModuleLoad API and launches the kernel in parallel.
+*/
+TEST_CASE("Unit_hipModuleLoad_MultiThread") {
+    int max_threads = min(THREADS * std::thread::hardware_concurrency(),
+                          MAX_NUM_THREADS);
+    thread_pool task(max_threads);
+    task.start(hipModuleLaunchKernel_enqueue);
+    task.finish();
+}
diff --git a/catch/unit/module/hipModuleLoadUnloadStress.cc b/catch/unit/module/hipModuleLoadUnloadStress.cc
new file mode 100644
index 0000000000..46acdae29b
--- /dev/null
+++ b/catch/unit/module/hipModuleLoadUnloadStress.cc
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <cstddef>
+#include <vector>
+#include "hip_test_common.hh"
+
+#define TEST_ITERATIONS 1000
+#define CODEOBJ_FILE "module_kernels.code"
+/**
+ * Run Valgrind tool with these test cases to validate memory leakage.
+ * E.g. valgrind --leak-check=yes ./a.out
+ */
+
+/**
+ * Internal Function
+ */
+static std::vector<char> load_file() {
+  std::ifstream file(CODEOBJ_FILE, std::ios::binary | std::ios::ate);
+  std::streamsize fsize = file.tellg();
+  file.seekg(0, std::ios::beg);
+  std::vector<char> buffer(fsize);
+  if (!file.read(buffer.data(), fsize)) {
+    WARN("could not open code object " << CODEOBJ_FILE);
+  }
+  file.close();
+  return buffer;
+}
+/**
+ * Validates no memory leakage for hipModuleLoad
+ */
+TEST_CASE("Unit_hipModule_LoadUnloadStress") {
+  CTX_CREATE()
+  for (int count = 0; count < TEST_ITERATIONS; count++) {
+    hipModule_t Module;
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "testWeightedCopy"));
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+  CTX_DESTROY()
+}
+/**
+ * Validates no memory leakage for hipModuleLoadData
+ */
+TEST_CASE("Unit_hipModuleLoadData_LoadUnloadStress") {
+  CTX_CREATE()
+  auto buffer = load_file();
+  for (int count = 0; count < TEST_ITERATIONS; count++) {
+    hipModule_t Module;
+    HIP_CHECK(hipModuleLoadData(&Module, &buffer[0]));
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "testWeightedCopy"));
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+  CTX_DESTROY()
+}
+/**
+ * Validates no memory leakage for hipModuleLoadDataEx
+ */
+TEST_CASE("Unit_hipModuleLoadDataEx_UnloadStress") {
+  CTX_CREATE()
+  auto buffer = load_file();
+  for (int count = 0; count < TEST_ITERATIONS; count++) {
+    hipModule_t Module;
+    HIP_CHECK(hipModuleLoadDataEx(&Module, &buffer[0], 0,
+                                nullptr, nullptr));
+    hipFunction_t Function;
+    HIP_CHECK(hipModuleGetFunction(&Function, Module, "testWeightedCopy"));
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+  CTX_DESTROY()
+}
diff --git a/catch/unit/module/hipModuleNegative.cc b/catch/unit/module/hipModuleNegative.cc
new file mode 100644
index 0000000000..c43b507c21
--- /dev/null
+++ b/catch/unit/module/hipModuleNegative.cc
@@ -0,0 +1,274 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+This testcase verifies the negative scenarios of
+1. hipModuleLoad API
+2. hipModuleLoadData API
+3. hipModuleGetFunction API
+4. hipModuleGetGlobal API
+*/
+
+#include <ctime>
+#include <fstream>
+#include <cstddef>
+#include <vector>
+#include "hip_test_common.hh"
+
+#define FILENAME_NONEXST "sample_nonexst.code"
+#define FILENAME_EMPTY "emptyfile.code"
+#define FILENAME_RAND "rand_file.code"
+#define RANDOMFILE_LEN 2048
+#define CODEOBJ_FILE "module_kernels.code"
+#define KERNEL_NAME "hello_world"
+#define KERNEL_NAME_NONEXST "xyz"
+#define CODEOBJ_GLOBAL "module_kernels.code"
+#define DEVGLOB_VAR_NONEXIST "xyz"
+#define DEVGLOB_VAR "myDeviceGlobal"
+/**
+ * Internal Function
+ * Loads the kernel file into buffer
+ */
+std::vector<char> load_file(const char* filename) {
+  std::ifstream file(filename, std::ios::binary | std::ios::ate);
+  std::streamsize fsize = file.tellg();
+  file.seekg(0, std::ios::beg);
+  std::vector<char> buffer(fsize);
+  if (!file.read(buffer.data(), fsize)) {
+    INFO("could not open code object " <<  filename);
+  }
+  file.close();
+  return buffer;
+}
+
+/**
+ * Internal Function
+   Create Randome file
+ */
+void createRandomFile(const char* filename) {
+  std::ofstream outfile(filename, std::ios::binary);
+  char buf[RANDOMFILE_LEN];
+  unsigned int seed = 1;
+  for (int i = 0; i < RANDOMFILE_LEN; i++) {
+    buf[i] = rand_r(&seed) % 256;
+  }
+  outfile.write(buf, RANDOMFILE_LEN);
+  outfile.close();
+}
+
+/**
+ * Validates negative scenarios for hipModuleLoad API
+ */
+
+TEST_CASE("Unit_hipModuleLoad_Negative") {
+  CTX_CREATE()
+  hipModule_t Module;
+
+  SECTION("Nullptr to module") {
+    REQUIRE(hipModuleLoad(nullptr, CODEOBJ_FILE)
+            != hipSuccess);
+  }
+
+  SECTION("Nullptr to Fname") {
+    REQUIRE(hipModuleLoad(&Module, nullptr)
+            != hipSuccess);
+  }
+
+  SECTION("Empty fname") {
+  std::fstream fs;
+  fs.open(FILENAME_EMPTY, std::ios::out);
+  fs.close();
+  REQUIRE(hipModuleLoad(&Module, FILENAME_EMPTY)
+                        != hipSuccess);
+  }
+
+  SECTION("Binary file with random number") {
+    createRandomFile(FILENAME_RAND);
+    REQUIRE(hipModuleLoad(&Module, FILENAME_RAND)
+                          != hipSuccess);
+    remove(FILENAME_RAND);
+  }
+
+  SECTION("Non Existent file") {
+    REQUIRE(hipModuleLoad(&Module, FILENAME_NONEXST)
+                          != hipSuccess);
+  }
+
+  SECTION("Empty string to file name") {
+    REQUIRE(hipModuleLoad(&Module, "")
+                          != hipSuccess);
+  }
+
+  CTX_DESTROY()
+}
+
+/**
+ * Validates negative scenarios for hipModuleLoadData API
+ */
+TEST_CASE("Unit_hipModuleLoadData_Negative") {
+  CTX_CREATE()
+  hipModule_t Module;
+
+  SECTION("Nullptr to module") {
+    auto buffer = load_file(CODEOBJ_FILE);
+    REQUIRE(hipModuleLoadData(nullptr, &buffer[0])
+                              != hipSuccess);
+  }
+
+  SECTION("Nullptr to image") {
+    REQUIRE(hipModuleLoadData(&Module, nullptr)
+                              != hipSuccess);
+  }
+
+  SECTION("Random file to image") {
+    createRandomFile(FILENAME_RAND);
+    auto buffer = load_file(FILENAME_RAND);
+    REQUIRE(hipModuleLoadData(&Module, &buffer[0])
+                              != hipSuccess);
+  }
+
+  SECTION("Nullptr to Module") {
+  auto buffer = load_file(CODEOBJ_FILE);
+  REQUIRE(hipModuleLoadDataEx(nullptr, &buffer[0], 0, nullptr, nullptr)
+                              != hipSuccess);
+  }
+
+  SECTION("Nullptr to image") {
+    REQUIRE(hipModuleLoadDataEx(&Module, nullptr, 0, nullptr, nullptr)
+                                != hipSuccess);
+  }
+
+  SECTION("Random image file") {
+    // Create a binary file with random numbers
+    createRandomFile(FILENAME_RAND);
+    // Open the code object file and copy it in a buffer
+    auto buffer = load_file(FILENAME_RAND);
+    REQUIRE(hipModuleLoadDataEx(&Module, &buffer[0], 0, nullptr, nullptr)
+        != hipSuccess);
+  }
+
+  CTX_DESTROY()
+}
+
+/**
+ * Validates negative scenarios for hipModuleGetFunction API
+ */
+TEST_CASE("Unit_hipModuleGetFunction_Negative") {
+  CTX_CREATE()
+  hipFunction_t Function;
+  hipModule_t Module;
+
+  SECTION("Nullptr to function name") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    REQUIRE(hipModuleGetFunction(nullptr, Module, KERNEL_NAME) != hipSuccess);
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+
+  SECTION("Uninitialized module") {
+    REQUIRE(hipModuleGetFunction(&Function, Module, KERNEL_NAME) != hipSuccess);
+  }
+
+  SECTION("Non existing function kernel name") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    REQUIRE(hipModuleGetFunction(&Function, Module, KERNEL_NAME_NONEXST)
+                                 != hipSuccess);
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+
+  SECTION("Nullptr to kernel name") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    REQUIRE(hipModuleGetFunction(&Function, Module, nullptr) != hipSuccess);
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+#if HT_AMD
+  SECTION("Unloaded module") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    HIP_CHECK(hipModuleUnload(Module));
+    REQUIRE(hipModuleGetFunction(&Function, Module, KERNEL_NAME) != hipSuccess);
+  }
+#endif
+
+  SECTION("Empty string to kernel name") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    REQUIRE(hipModuleGetFunction(&Function, Module, "") != hipSuccess);
+    HIP_CHECK(hipModuleUnload(Module));
+  }
+
+  CTX_DESTROY()
+}
+
+/**
+ * Validates negative scenarios for hipModuleGetGlobal API
+ */
+TEST_CASE("Unit_hipModuleGetGlobal_Negative") {
+  CTX_CREATE()
+  hipModule_t Module;
+  hipDeviceptr_t deviceGlobal;
+  size_t deviceGlobalSize;
+
+  SECTION("Nullptr to varname") {
+    HIPCHECK(hipModuleLoad(&Module, CODEOBJ_GLOBAL));
+    REQUIRE(hipModuleGetGlobal(&deviceGlobal,
+                               &deviceGlobalSize, Module, nullptr)
+                               != hipSuccess);
+    HIPCHECK(hipModuleUnload(Module));
+  }
+
+  SECTION("Wrong variable name") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_GLOBAL));
+    REQUIRE(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
+                               Module, DEVGLOB_VAR_NONEXIST) != hipSuccess);
+    HIPCHECK(hipModuleUnload(Module));
+  }
+
+  SECTION("Empty string to module name") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_GLOBAL));
+    REQUIRE(hipModuleGetGlobal(&deviceGlobal,
+                               &deviceGlobalSize, Module, "") != hipSuccess);
+    HIPCHECK(hipModuleUnload(Module));
+  }
+
+#if HT_AMD
+  SECTION("Unloaded Module") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_GLOBAL));
+    HIP_CHECK(hipModuleUnload(Module));
+    REQUIRE(hipModuleGetGlobal(&deviceGlobal,
+                               &deviceGlobalSize, Module,
+                               DEVGLOB_VAR) != hipSuccess);
+  }
+
+  SECTION("Unload an Unloaded module") {
+    HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+    HIP_CHECK(hipModuleUnload(Module));
+    REQUIRE(hipModuleUnload(Module) != hipSuccess);
+  }
+
+  SECTION("Uninitialized module") {
+    REQUIRE(hipModuleGetGlobal(&deviceGlobal,
+                               &deviceGlobalSize, Module,
+                               DEVGLOB_VAR) != hipSuccess);
+  }
+  SECTION("Unload Uninitialized module") {
+    REQUIRE(hipModuleUnload(Module) != hipSuccess);
+  }
+#endif
+
+  CTX_DESTROY()
+}
diff --git a/catch/unit/module/hipModuleOccupancyMaxPotentialBlockSize.cc b/catch/unit/module/hipModuleOccupancyMaxPotentialBlockSize.cc
new file mode 100644
index 0000000000..c58b3978e6
--- /dev/null
+++ b/catch/unit/module/hipModuleOccupancyMaxPotentialBlockSize.cc
@@ -0,0 +1,267 @@
+
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+#include <limits>
+
+#define fileName "module_kernels.code"
+#define kernel_name "hello_world"
+/**
+ * hipModuleOccupancyMaxPotentialBlockSize and hipModuleOccupancyMaxPotentialBlockSizeWithFlags
+ * corner tests.
+ * Scenario1:
+ * Validates the value of gridSize, which should be always non zero +ve integer and blockSize
+ * range returned for dynSharedMemPerBlk = 0 and blockSizeLimit = 0.
+ * Scenario2:
+ * Validates the value of gridSize, which should be always non zero +ve integer and blockSize
+ * range returned for dynSharedMemPerBlk = devProp.sharedMemPerBlock and
+ * blockSizeLimit = devProp.maxThreadsPerBlock.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_FuncTst") {
+  // Initialize
+  hipDeviceProp_t devProp;
+  int gridSize = 0;
+  int blockSize = 0;
+  hipModule_t Module;
+  CTX_CREATE()
+  hipFunction_t Function;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+  // Scenario1
+  SECTION("without flag - gridSize when input params are 0") {
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize,
+            &blockSize, Function, 0, 0));
+  }
+  // Scenario2
+  SECTION("without flag - gridSize when input params are maximum") {
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize,
+             &blockSize, Function,
+             devProp.sharedMemPerBlock, devProp.maxThreadsPerBlock));
+  }
+  // Scenario1
+  SECTION("with flag - gridSize when input params are 0") {
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize,
+            &blockSize, Function, 0, 0, 0));
+  }
+  // Scenario2
+  SECTION("with flag - gridSize when input params are maximum") {
+    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize,
+            &blockSize, Function, devProp.sharedMemPerBlock,
+            devProp.maxThreadsPerBlock, 0));
+  }
+  // Check if blockSize doen't exceed maxThreadsPerBlock
+  REQUIRE_FALSE(gridSize <= 0);
+  REQUIRE_FALSE(blockSize <= 0);
+  REQUIRE_FALSE(blockSize > devProp.maxThreadsPerBlock);
+  // Un-initialize
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+/**
+ * hipModuleOccupancyMaxActiveBlocksPerMultiprocessor and
+ * hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags Corner tests.
+ * Scenario1:
+ * Validates numBlock value range is within expected limit when sharedMemPerBlock
+ * is 0.
+ * Scenario2:
+ * Validates numBlock value range is within expected limit when
+ * dynSharedMemPerBlk = devProp.sharedMemPerBlock.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_FuncTst") {
+  // Initialize
+  hipDeviceProp_t devProp;
+  int gridSize = 0;
+  int blockSize = 0;
+  int numBlock = 0;
+  hipModule_t Module;
+  CTX_CREATE()
+  hipFunction_t Function;
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize,
+          &blockSize, Function, 0, 0));
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+  // Scenario1
+  SECTION("without flag - gridSize when input params are 0") {
+    HIP_CHECK(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock,
+             Function, blockSize, 0));
+  }
+  // Scenario2
+  SECTION("without flag - gridSize when input params are maximum") {
+    HIP_CHECK(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock,
+             Function, blockSize, devProp.sharedMemPerBlock));
+  }
+  // Scenario1
+  SECTION("with flag - gridSize when input params are 0") {
+    HIP_CHECK(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+            &numBlock, Function, blockSize, 0, 0));
+  }
+  // Scenario2
+  SECTION("with flag - gridSize when input params are maximum") {
+    HIP_CHECK(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+            &numBlock, Function, blockSize, devProp.sharedMemPerBlock, 0))
+  }
+  // Check if numBlocks are within limits
+  int temp_val = (numBlock * blockSize);
+  REQUIRE_FALSE(numBlock <= 0);
+  REQUIRE_FALSE(temp_val > devProp.maxThreadsPerMultiProcessor);
+  // Un-initialize
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+/**
+ * hipModuleOccupancyMaxPotentialBlockSize negative tests.
+ * Scenario1: gridSize is nullptr.
+ * Scenario2: blocksize is nullptr.
+ * Scenario3: blockSizeLimit < 0.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_NegTst") {
+  int gridSize = 0;
+  int blockSize = 0;
+  hipModule_t Module;
+  hipFunction_t Function;
+  CTX_CREATE()
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+  // Scenario1
+  SECTION("without flag - gridSize is nullptr") {
+    REQUIRE_FALSE(hipSuccess == hipModuleOccupancyMaxPotentialBlockSize(
+                  nullptr, &blockSize, Function, 0, 0));
+  }
+  // Scenario2
+  SECTION("without flag - blocksize is nullptr") {
+    REQUIRE_FALSE(hipSuccess == hipModuleOccupancyMaxPotentialBlockSize(
+                  &gridSize, nullptr, Function, 0, 0));
+  }
+  // Scenario3
+  SECTION("without flag - blockSizeLimit is less than 0") {
+    hipDeviceProp_t devProp;
+    HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+#if HT_NVIDIA
+    REQUIRE_FALSE(hipSuccess == hipModuleOccupancyMaxPotentialBlockSize(
+                  &gridSize, &blockSize, Function, 0, -1));
+#else
+    // As discussed in SWDEV-269400
+    // with developers this difference in behavior between NVIDIA and AMD
+    // is retained.
+    REQUIRE_FALSE(hipSuccess != hipModuleOccupancyMaxPotentialBlockSize(
+                  &gridSize, &blockSize, Function, 0, -1));
+#endif
+  }
+  // Scenario1
+  SECTION("with flag - gridSize is nullptr") {
+    REQUIRE_FALSE(hipSuccess ==
+      hipModuleOccupancyMaxPotentialBlockSizeWithFlags(nullptr,
+      &blockSize, Function, 0, 0, 0));
+  }
+  // Scenario2
+  SECTION("with flag - blocksize is nullptr") {
+    REQUIRE_FALSE(hipSuccess ==
+      hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize,
+      nullptr, Function, 0, 0, 0));
+  }
+  // Scenario3
+  SECTION("with flag - blockSizeLimit is less than 0") {
+#if HT_NVIDIA
+    REQUIRE_FALSE(hipSuccess ==
+      hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize,
+      &blockSize, Function, 0, -1, 0));
+#else
+    // As discussed in SWDEV-269400
+    // with developers this difference in behavior between NVIDIA and AMD
+    // is retained.
+    REQUIRE_FALSE(hipSuccess !=
+      hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize,
+      &blockSize, Function, 0, -1, 0));
+#endif
+  }
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+/**
+ * hipModuleOccupancyMaxActiveBlocksPerMultiprocessor negative tests.
+ * Scenario1: numBlocks is nullptr.
+ * Scenario2: Check the behavior for blockSize < 0.
+ * Scenario3: Check error code returned for dynSharedMemPerBlk = 0 and blockSize = 0.
+ * Scenario4: dynSharedMemPerBlk = size_t numeric limit.
+ */
+TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_NegTst") {
+  int gridSize = 0;
+  int blockSize = 0;
+  int numBlocks = 0;
+  hipModule_t Module;
+  hipFunction_t Function;
+  CTX_CREATE()
+  HIP_CHECK(hipModuleLoad(&Module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize,
+                                                   Function, 0, 0));
+  // Scenario1
+  SECTION("without flag - numBlocks is nullptr") {
+    REQUIRE_FALSE(hipSuccess ==
+       hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(nullptr,
+       Function, blockSize, 0));
+  }
+  // Scenario3
+  SECTION("without flag - dynSharedMemPerBlk = 0 and blockSize = 0") {
+    REQUIRE_FALSE(hipSuccess ==
+       hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
+       Function, 0, 0));
+  }
+  // Scenario2
+  SECTION("without flag - blockSize is less than 0") {
+    REQUIRE_FALSE(hipSuccess ==
+       hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
+       Function, -1, 0));
+  }
+  // Scenario4
+  SECTION("without flag - dynSharedMemPerBlk = max_numerical_limit") {
+    REQUIRE_FALSE(hipSuccess ==
+       hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
+       Function, 0, std::numeric_limits<std::size_t>::max()));
+  }
+  // Scenario1
+  SECTION("with flag - numBlocks is nullptr") {
+    REQUIRE_FALSE(hipSuccess ==
+       hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(nullptr,
+       Function, blockSize, 0, 0));
+  }
+  // Scenario3
+  SECTION("with flag - dynSharedMemPerBlk = 0 and blockSize = 0") {
+    REQUIRE_FALSE(hipSuccess ==
+      hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks,
+      Function, 0, 0, 0));
+  }
+  // Scenario2
+  SECTION("with flag - blockSize is less than 0") {
+    REQUIRE_FALSE(hipSuccess ==
+      hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks,
+      Function, -1, 0, 0));
+  }
+  // Scenario4
+  SECTION("with flag - dynSharedMemPerBlk = max_numerical_limit") {
+    REQUIRE_FALSE(hipSuccess ==
+      hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks,
+      Function, 0, std::numeric_limits<std::size_t>::max(), 0));
+  }
+  HIP_CHECK(hipModuleUnload(Module));
+  CTX_DESTROY()
+}
+
diff --git a/catch/unit/module/hipModuleTexture2dDrv.cc b/catch/unit/module/hipModuleTexture2dDrv.cc
new file mode 100755
index 0000000000..2afa3e06fc
--- /dev/null
+++ b/catch/unit/module/hipModuleTexture2dDrv.cc
@@ -0,0 +1,561 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/*
+This testcase verifies the following scenarios of hipModuleGetTexRef API
+1. Negative
+2. Basic functionality using different data types
+3. Multiple streams
+4. MultiThreaded - MultStreamMultGPU
+5. MultiThreaded - SingleStreamMultGPU
+*/
+
+#include <fstream>
+#include <vector>
+#include <type_traits>
+#include <limits>
+#include <atomic>
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+
+#define CODEOBJ_FILE "module_kernels.code"
+#define NON_EXISTING_TEX_NAME "xyz"
+#define EMPTY_TEX_NAME ""
+#define GLOBAL_KERNEL_VAR "deviceGlobalFloat"
+#define TEX_REF "ftex"
+#define WIDTH 256
+#define HEIGHT 256
+#define MAX_STREAMS 4
+#define GRIDDIMX 16
+#define GRIDDIMY 16
+#define GRIDDIMZ 1
+#define BLOCKDIMZ 1
+#define MAX_GPU 16
+
+std::atomic<int> g_thTestPassed(1);
+
+
+/**
+ * Internal Functions
+ * Loads the kernel file
+ */
+static std::vector<char> load_file() {
+  std::ifstream file(CODEOBJ_FILE, std::ios::binary | std::ios::ate);
+  std::streamsize fsize = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(fsize);
+  if (!file.read(buffer.data(), fsize)) {
+    INFO("could not open code object " << CODEOBJ_FILE);
+    REQUIRE(false);
+  }
+  return buffer;
+}
+
+/*
+Initializes the array
+*/
+template<typename T>
+void allocInitArray(unsigned int width,
+                     unsigned int height,
+                     hipArray_Format format,
+                     HIP_ARRAY* array
+                     ) {
+  HIP_ARRAY_DESCRIPTOR desc;
+  desc.Format = format;
+  desc.NumChannels = 1;
+  desc.Width = width * sizeof(T);
+  desc.Height = height;
+  HIPCHECK(hipArrayCreate(array, &desc));
+}
+
+/*
+Copies buffer to array using hipMemcpyParam2D API
+*/
+template <class T, class T1> void copyBuffer2Array(unsigned int width,
+                                                   unsigned int height,
+                                                   T* hData,
+                                                   T1 array
+                                                   ) {
+  hip_Memcpy2D copyParam;
+  memset(&copyParam, 0, sizeof(copyParam));
+#if HT_NVIDIA
+  copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+  copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+  copyParam.dstArray = *array;
+#else
+  copyParam.dstMemoryType = hipMemoryTypeArray;
+  copyParam.srcMemoryType = hipMemoryTypeHost;
+  copyParam.dstArray = array;
+#endif
+  copyParam.srcHost = hData;
+  copyParam.srcPitch = width * sizeof(T);
+  copyParam.WidthInBytes = width * sizeof(T);
+  copyParam.Height = height;
+  HIPCHECK(hipMemcpyParam2D(&copyParam));
+}
+
+/*
+Assigns array to texture ref
+*/
+template <class T> void assignArray2TexRef(hipArray_Format format,
+                                           const char* texRefName,
+                                           hipModule_t Module,
+                                           T array
+                                           ) {
+  HIP_TEX_REFERENCE texref;
+#if HT_NVIDIA
+  HIPCHECK(hipModuleGetTexRef(&texref, Module, texRefName));
+  HIPCHECK(hipTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP));
+  HIPCHECK(hipTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP));
+  HIPCHECK(hipTexRefSetFilterMode(texref, HIP_TR_FILTER_MODE_POINT));
+  HIPCHECK(hipTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+  HIPCHECK(hipTexRefSetFormat(texref, format, 1));
+  HIPCHECK(hipTexRefSetArray(texref, *array, CU_TRSA_OVERRIDE_FORMAT));
+#else
+  HIPCHECK(hipModuleGetTexRef(&texref, Module, texRefName));
+  HIPCHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap));
+  HIPCHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap));
+  HIPCHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint));
+  HIPCHECK(hipTexRefSetFlags(texref, HIP_TRSF_READ_AS_INTEGER));
+  HIPCHECK(hipTexRefSetFormat(texref, format, 1));
+  HIPCHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
+#endif
+}
+
+template <class T> bool validateOutput(unsigned int width,
+                                       unsigned int height,
+                                       T* hData,
+                                       T* hOutputData) {
+  for (unsigned int i = 0; i < height; i++) {
+    for (unsigned int j = 0; j < width; j++) {
+      if (hData[i * width + j] != hOutputData[i * width + j]) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/**
+ * Validates texture functionality with multiple streams for hipModuleGetTexRef
+ *
+ */
+template <class T> bool testTexMultStream(const std::vector<char>& buffer,
+                                        hipArray_Format format,
+                                        const char* texRefName,
+                                        const char* kerFuncName,
+                                        unsigned int numOfStreams) {
+  bool TestPassed = true;
+  unsigned int width = WIDTH;
+  unsigned int height = HEIGHT;
+  unsigned int size = width * height * sizeof(T);
+  T* hData = reinterpret_cast<T*>(malloc(size));
+  CTX_CREATE()
+  HipTest::setDefaultData<T>(width * height, hData, nullptr, nullptr);
+
+  // Load Kernel File and create hipArray
+  hipModule_t Module;
+  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
+  HIP_ARRAY array;
+  allocInitArray<T>(width, height, format, &array);
+#if HT_NVIDIA
+  // Copy from hData to array using hipMemcpyParam2D
+  copyBuffer2Array<T, HIP_ARRAY*>(width, height, hData, &array);
+  // Get tex reference from the loaded kernel file
+  // Assign array to the tex reference
+  assignArray2TexRef<HIP_ARRAY*>(format, texRefName, Module, &array);
+#else
+  // Copy from hData to array using hipMemcpyParam2D
+  copyBuffer2Array<T, HIP_ARRAY>(width, height, hData, array);
+  // Get tex reference from the loaded kernel file
+  // Assign array to the tex reference
+  assignArray2TexRef<HIP_ARRAY>(format, texRefName, Module, array);
+#endif
+  hipFunction_t Function;
+  HIPCHECK(hipModuleGetFunction(&Function, Module, kerFuncName));
+
+  // Create Multiple Strings
+  hipStream_t streams[MAX_STREAMS]={0};
+  T* dData[MAX_STREAMS] = {NULL};
+  T* hOutputData[MAX_STREAMS] = {NULL};
+  if (numOfStreams > MAX_STREAMS) {
+    numOfStreams = MAX_STREAMS;
+  }
+  unsigned int totalStreamsCreated = 0;
+  for (unsigned int stream_num = 0; stream_num < numOfStreams; stream_num++) {
+    hOutputData[stream_num] = reinterpret_cast<T*>(malloc(size));
+    if (NULL == hOutputData[stream_num]) {
+      WARN("Failed to allocate using malloc in testTexMultStream");
+      TestPassed &= false;
+    }
+    HIPCHECK(hipStreamCreate(&streams[stream_num]));
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dData[stream_num]), size));
+    memset(hOutputData[stream_num], 0, size);
+    struct {
+      void* _Ad;
+      unsigned int _Bd;
+      unsigned int _Cd;
+    } args;
+    args._Ad = reinterpret_cast<void*>(dData[stream_num]);
+    args._Bd = width;
+    args._Cd = height;
+
+    size_t sizeTemp = sizeof(args);
+
+    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
+                      &args,
+                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                      &sizeTemp,
+                      HIP_LAUNCH_PARAM_END};
+
+    int temp1 = width / GRIDDIMX;
+    int temp2 = height / GRIDDIMY;
+    HIPCHECK(hipModuleLaunchKernel(Function, GRIDDIMX, GRIDDIMY, GRIDDIMZ,
+                                   temp1, temp2, BLOCKDIMZ, 0,
+                                   streams[stream_num],
+                                   NULL, reinterpret_cast<void**>(&config)));
+    totalStreamsCreated++;
+  }
+  // Check the kernel results separately
+  for (unsigned int stream_num = 0; stream_num < totalStreamsCreated;
+       stream_num++) {
+    HIPCHECK(hipStreamSynchronize(streams[stream_num]));
+    HIPCHECK(hipMemcpy(hOutputData[stream_num], dData[stream_num], size,
+                            hipMemcpyDeviceToHost));
+    TestPassed &= validateOutput<T>(width, height, hData,
+                                    hOutputData[stream_num]);
+  }
+  for (unsigned int i = 0; i < totalStreamsCreated; i++) {
+    HIPCHECK(hipFree(dData[i]));
+    HIPCHECK(hipStreamDestroy(streams[i]));
+    free(hOutputData[i]);
+  }
+  ARRAY_DESTROY(array)
+  HIPCHECK(hipModuleUnload(Module));
+  free(hData);
+  CTX_DESTROY()
+  return TestPassed;
+}
+
+/**
+ * Internal Thread Functions
+ *
+ */
+void launchSingleStreamMultGPU(int gpu, const std::vector<char>& buffer) {
+  bool TestPassed = true;
+  HIPCHECK(hipSetDevice(gpu));
+  TestPassed = testTexMultStream<float>(buffer,
+                                        HIP_AD_FORMAT_FLOAT,
+                                        "ftex",
+                                        "tex2dKernelFloat", 1);
+  g_thTestPassed &= static_cast<int>(TestPassed);
+}
+
+void launchMultStreamMultGPU(int gpu, const std::vector<char>& buffer) {
+  bool TestPassed = true;
+  HIPCHECK(hipSetDevice(gpu));
+  TestPassed = testTexMultStream<float>(buffer,
+                                        HIP_AD_FORMAT_FLOAT,
+                                        "ftex",
+                                        "tex2dKernelFloat", 3);
+  g_thTestPassed &= static_cast<int>(TestPassed);
+}
+/**
+ * Validates texture functionality with Multiple Streams on multuple GPU
+ * for hipModuleGetTexRef
+ *
+ */
+bool testTexMultStreamMultGPU(unsigned int numOfGPUs,
+                              const std::vector<char>& buffer) {
+  bool TestPassed = true;
+  std::thread T[MAX_GPU];
+
+  for (unsigned int gpu = 0; gpu < numOfGPUs; gpu++) {
+    T[gpu] = std::thread(launchMultStreamMultGPU, gpu, buffer);
+  }
+  for (unsigned int gpu = 0; gpu < numOfGPUs; gpu++) {
+    T[gpu].join();
+  }
+
+  if (g_thTestPassed) {
+    TestPassed = true;
+  } else {
+    TestPassed = false;
+  }
+  return TestPassed;
+}
+
+/**
+ * Validates texture functionality with Single Stream on multuple GPU
+ * for hipModuleGetTexRef
+ *
+ */
+bool testTexSingleStreamMultGPU(unsigned int numOfGPUs,
+                                const std::vector<char>& buffer) {
+  bool TestPassed = true;
+  std::thread T[MAX_GPU];
+
+  for (unsigned int gpu = 0; gpu < numOfGPUs; gpu++) {
+    T[gpu] = std::thread(launchSingleStreamMultGPU, gpu, buffer);
+  }
+  for (unsigned int gpu = 0; gpu < numOfGPUs; gpu++) {
+    T[gpu].join();
+  }
+
+  if (g_thTestPassed) {
+    TestPassed = true;
+  } else {
+    TestPassed = false;
+  }
+  return TestPassed;
+}
+
+/*
+This testcase verifies the negative scenarios of hipModuleGetTexRef API
+*/
+TEST_CASE("Unit_hipModuleGetTexRef_Negative") {
+  hipModule_t Module;
+  HIP_TEX_REFERENCE texref;
+  CTX_CREATE()
+  HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+
+  SECTION("TexRef as nullptr") {
+    REQUIRE(hipModuleGetTexRef(nullptr, Module, "tex") != hipSuccess);
+  }
+
+  SECTION("Name as nullptr") {
+    REQUIRE(hipModuleGetTexRef(&texref, Module, nullptr) != hipSuccess);
+  }
+
+  SECTION("Name as non existing TexName") {
+    REQUIRE(hipModuleGetTexRef(&texref, Module,
+                               NON_EXISTING_TEX_NAME) != hipSuccess);
+  }
+
+  SECTION("Empty tex name") {
+    REQUIRE(hipModuleGetTexRef(&texref, Module, EMPTY_TEX_NAME) != hipSuccess);
+  }
+#if HT_NVIDIA
+  SECTION("Name as Global kernel Var") {
+    REQUIRE(hipModuleGetTexRef(&texref, Module,
+                               GLOBAL_KERNEL_VAR) != hipSuccess);
+  }
+#endif
+
+  SECTION("Unload Module") {
+    HIP_CHECK(hipModuleUnload(Module));
+    REQUIRE(hipModuleGetTexRef(&texref, Module, TEX_REF) != hipSuccess);
+  }
+
+  CTX_DESTROY()
+}
+/**
+ * Validates texture type data functionality for hipModuleGetTexRef
+ * 1.Loads the code object file
+ * 2.Based on the template type texRefName,KernelFuncName and format are assigned.
+ * 3.Allocate array based on format.
+ * 4.Assigns array to texRef
+ * 5.Launches the kernel based on the template type which invokes text2D API
+     and copies the data to output variable.
+ * 6.Validates the data.
+ */
+TEMPLATE_TEST_CASE("Unit_hipModuleGetTexRef_Basic", "", int,
+                   char, uint16_t, float) {
+  bool TestPassed = true;
+  constexpr unsigned int width = WIDTH;
+  constexpr unsigned int height = HEIGHT;
+  constexpr unsigned int size = width * height * sizeof(TestType);
+  const char *texRefName, *kerFuncName;
+  hipArray_Format format;
+
+  TestType* hData = reinterpret_cast<TestType*>(malloc(size));
+  if (NULL == hData) {
+    INFO("Failed to allocate using malloc in testTexType.\n");
+    REQUIRE(false);
+  }
+  CTX_CREATE()
+  HipTest::setDefaultData<TestType>(width * height, hData, nullptr, nullptr);
+  // Load Kernel File and create hipArray
+  hipModule_t Module;
+  HIP_CHECK(hipModuleLoad(&Module, CODEOBJ_FILE));
+  HIP_ARRAY array;
+
+  if (std::is_same<TestType, char>::value) {
+    texRefName = "ctex";
+    kerFuncName = "tex2dKernelInt8";
+    format = HIP_AD_FORMAT_SIGNED_INT8;
+  } else if (std::is_same<TestType, uint16_t>::value)  {
+    texRefName = "stex";
+    kerFuncName = "tex2dKernelInt16";
+    format = HIP_AD_FORMAT_SIGNED_INT16;
+  } else if (std::is_same<TestType, int>::value)  {
+    texRefName = "itex";
+    kerFuncName = "tex2dKernelInt";
+    format = HIP_AD_FORMAT_SIGNED_INT32;
+  } else if (std::is_same<TestType, float>::value)  {
+    texRefName = "ftex";
+    kerFuncName = "tex2dKernelFloat";
+    format = HIP_AD_FORMAT_FLOAT;
+  }
+  allocInitArray<TestType>(width, height, format, &array);
+
+#if HT_NVIDIA
+  // Copy from hData to array using hipMemcpyParam2D
+  copyBuffer2Array<TestType, HIP_ARRAY*>(width, height, hData, &array);
+  // Get tex reference from the loaded kernel file
+  // Assign array to the tex reference
+  assignArray2TexRef<HIP_ARRAY*>(format, texRefName, Module, &array);
+#else
+  // Copy from hData to array using hipMemcpyParam2D
+  copyBuffer2Array<TestType, HIP_ARRAY>(width, height, hData, array);
+  // Get tex reference from the loaded kernel file
+  // Assign array to the tex reference
+  assignArray2TexRef<HIP_ARRAY>(format, texRefName, Module, array);
+#endif
+  hipFunction_t Function;
+  HIP_CHECK(hipModuleGetFunction(&Function, Module, kerFuncName));
+
+  TestType* dData = NULL;
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dData), size));
+
+  struct {
+    void* _Ad;
+    unsigned int _Bd;
+    unsigned int _Cd;
+  } args;
+  args._Ad = reinterpret_cast<void*>(dData);
+  args._Bd = width;
+  args._Cd = height;
+
+  size_t sizeTemp = sizeof(args);
+
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
+                    &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                    &sizeTemp,
+                    HIP_LAUNCH_PARAM_END};
+
+  int temp1 = width / GRIDDIMX;
+  int temp2 = height / GRIDDIMY;
+  HIP_CHECK(
+    hipModuleLaunchKernel(Function, GRIDDIMX, GRIDDIMY, GRIDDIMZ,
+                          temp1, temp2, BLOCKDIMZ, 0, 0,
+                          NULL, reinterpret_cast<void**>(&config)));
+  HIP_CHECK(hipDeviceSynchronize());
+  TestType* hOutputData = reinterpret_cast<TestType*>(malloc(size));
+  if (NULL == hOutputData) {
+    INFO("Failed to allocate using malloc in testTexType");
+    REQUIRE(false);
+  } else {
+    memset(hOutputData, 0, size);
+    HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost));
+    TestPassed = validateOutput<TestType>(width, height, hData, hOutputData);
+    REQUIRE(TestPassed);
+  }
+  free(hOutputData);
+  HIP_CHECK(hipFree(dData));
+  ARRAY_DESTROY(array)
+  HIP_CHECK(hipModuleUnload(Module));
+  free(hData);
+  CTX_DESTROY()
+}
+
+/*
+This testcase verifies hipModuleGetTexRef on multiple streams
+where
+ * 1..Loads the code object file
+ * 2.Allocate array and initializes it with hData
+ * 3.Assigns array to texRef
+   4.Creates multiple streams
+ * 4.Launches the kernel on each stream which invokes text2D API
+     and copies the data to output variable
+ * 5.Validates the hData with output data in each stream.
+*/
+TEST_CASE("Unit_hipModuleGetTexRef_TexMultStream") {
+  bool TestPassed = true;
+  auto buffer = load_file();
+  TestPassed = testTexMultStream<float>(buffer,
+                                        HIP_AD_FORMAT_FLOAT,
+                                        "ftex",
+                                        "tex2dKernelFloat",
+                                        MAX_STREAMS);
+  REQUIRE(TestPassed);
+}
+/*
+This testcase verifies hipModuleGetTexRef Multithreaded scenario on
+single stream and multi GPU machine.
+1. Gets the device count.
+2. Create the threads based on device count.
+3. Each thread calls the testTexMultStream which performs the same
+   above funtionality on single Stream
+4. The threads are executed in parallel and are joined later.
+
+This testcase ensures that the multi thread execution on single stream
+in parallel is successful
+*/
+TEST_CASE("Unit_hipModuleGetTexRef_MultiThreadTexSingleStreamMultiGPU") {
+  bool TestPassed = true;
+  // Testcase skipped on nvidia with CUDA API version 11.2,
+  // as hipModuleLoadData returning error code
+  // 'a PTX JIT compilation failed'(218), which is invalid
+  // behavior. Test passes with AMD and previous CUDA versions.
+#if HT_NVIDIA
+  INFO("Testcase skipped on CUDA version 11.2\n");
+  REQUIRE(true);
+#else
+  int gpu_cnt = 0;
+  auto buffer = load_file();
+  HIP_CHECK(hipGetDeviceCount(&gpu_cnt));
+  TestPassed = testTexSingleStreamMultGPU(gpu_cnt, buffer);
+  REQUIRE(TestPassed);
+#endif
+}
+
+
+/*
+This testcase verifies hipModuleGetTexRef Multithreaded scenario on
+single stream and multi GPU machine.
+1. Gets the device count.
+2. Create the threads based on device count.
+3. Each thread calls the testTexMultStream which performs the same
+   above funtionality on multiple Stream
+4. The threads are executed in parallel and are joined later.
+
+This testcase ensures that the multi thread execution on multiple streams
+in parallel is successful
+*/
+TEST_CASE("Unit_hipModuleGetTexRef_MultiThreadTexMultiStreamMultiGPU") {
+  bool TestPassed = true;
+  // Testcase skipped on nvidia with CUDA API version 11.2,
+  // as hipModuleLoadData returning error code
+  // 'a PTX JIT compilation failed'(218), which is invalid
+  // behavior. Test passes with AMD and previous CUDA versions.
+#if HT_NVIDIA
+  INFO("Testcase skipped on CUDA version 11.2\n");
+  REQUIRE(true);
+#else
+  int gpu_cnt = 0;
+  auto buffer = load_file();
+  HIP_CHECK(hipGetDeviceCount(&gpu_cnt));
+  TestPassed = testTexMultStreamMultGPU(gpu_cnt, buffer);
+  REQUIRE(TestPassed);
+#endif
+}
diff --git a/catch/unit/module/hipModuleUnload.cc b/catch/unit/module/hipModuleUnload.cc
new file mode 100644
index 0000000000..24c6575d1b
--- /dev/null
+++ b/catch/unit/module/hipModuleUnload.cc
@@ -0,0 +1,34 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+
+#include <hip_test_common.hh>
+
+#define fileName "module_kernels.code"
+/*
+This testcase verifies the basic functionality of hipModuleUnload API
+*/
+TEST_CASE("Unit_hipModuleUnload_Basic") {
+  CTX_CREATE()
+  hipModule_t module;
+  HIP_CHECK(hipModuleLoad(&module, fileName));
+  HIP_CHECK(hipModuleUnload(module));
+  CTX_DESTROY()
+}
diff --git a/catch/unit/module/hipOpenCLCOTest.cc b/catch/unit/module/hipOpenCLCOTest.cc
new file mode 100644
index 0000000000..91cfae767a
--- /dev/null
+++ b/catch/unit/module/hipOpenCLCOTest.cc
@@ -0,0 +1,229 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testcase reads the openCL kernel file and generate the the code object
+file which gets executed in HIP interface.
+This testcase verifies for the
+1. Current GPU architecture
+2. Code object version v3
+*/
+
+#ifdef __linux__
+#include <unistd.h>
+ #include <experimental/filesystem>
+#endif
+#include <string.h>
+#include "hip_test_common.hh"
+#include "hip_test_checkers.hh"
+
+#define OPENCL_OBJ_FILE "opencl_add.cc"
+#define HIP_CODEOBJ_FILE_DEFAULT "opencl_add.co"
+#define HIP_CODEOBJ_FILE_V3 "opencl_add_v3.co"
+#define COMMAND_LEN 256
+#define BUFFER_LEN 256
+
+
+#ifdef __linux__
+
+/* Check if environment variable $ROCM_PATH is defined */
+static bool isRocmPathSet() {
+  FILE *fpipe;
+  char const *command = "echo $ROCM_PATH";
+  fpipe = popen(command, "r");
+
+  if (fpipe == nullptr) {
+    WARN("Unable to create command");
+    return false;
+  }
+  char command_op[BUFFER_LEN];
+  if (fgets(command_op, BUFFER_LEN, fpipe)) {
+    size_t len = strlen(command_op);
+    if (len > 1) {  // This is because fgets always adds newline character
+      pclose(fpipe);
+      return true;
+    }
+  }
+  pclose(fpipe);
+  return false;
+}
+
+/* Gets the sramecc/xnack settings from rocm info */
+
+int getV3TargetIdFeature(char* feature, bool rocmPathSet) {
+  FILE *fpipe;
+  char command[COMMAND_LEN] = "";
+  const char *rocmpath = nullptr;
+  if (rocmPathSet) {
+    // For STG2 testing where /opt/rocm path is not present
+    rocmpath = "$ROCM_PATH/bin/rocminfo";
+  } else {
+    // Check if the rocminfo tool exists
+    rocmpath = "/opt/rocm/bin/rocminfo";
+  }
+  snprintf(command, COMMAND_LEN, "%s", rocmpath);
+  strncat(command, " | grep -m1 \"sramecc.:xnack.\"", COMMAND_LEN);
+  fpipe = popen(command, "r");
+
+  if (fpipe == nullptr) {
+    WARN("Unable to create command file");
+    return -1;
+  }
+  char command_op[BUFFER_LEN];
+  const char* pOpt1 = nullptr;
+  const char *pOpt2 = nullptr;
+  if (fgets(command_op, BUFFER_LEN, fpipe)) {
+    if (strstr(command_op, "sramecc+")) {
+      pOpt1 = "-msram-ecc";
+    } else if (strstr(command_op, "sramecc-")) {
+      pOpt1 = "-mno-sram-ecc";
+    } else {
+      pclose(fpipe);
+      return -1;
+    }
+    if (strstr(command_op, "xnack+")) {
+      pOpt2 = " -mxnack";
+    } else if (strstr(command_op, "xnack-")) {
+      pOpt2 = " -mno-xnack";
+    } else {
+      pclose(fpipe);
+      return -1;
+    }
+  } else {
+    printf("No sramecc/xnack settings found.\n");
+    pclose(fpipe);
+    return -1;
+  }
+  strncpy(feature, pOpt1, strlen(pOpt1));
+  strncat(feature, pOpt2, strlen(pOpt2));
+  pclose(fpipe);
+  return 0;
+}
+#endif
+/**
+ * Validates OpenCL Static Lds Code Object where
+ * 1. Tries to access opencl kernel file
+ * 2. Copies it to current folder
+ * 3. Tries to get RocmPath and execute the kernel file to
+      generate the code object file.code-object-version argument
+      specifies the code object version
+ * 4. Launch the kernel which copies one variable to another
+ * 5. Validates the result.
+ */
+TEST_CASE("Unit_hipModuleLoad_OpenCLStaticCodeObjV3") {
+#ifdef __linux__
+  auto codeobj_type = GENERATE(0, 1);
+  char command[COMMAND_LEN] = "";
+  char v3option[32] = "";
+  hipDeviceProp_t props;
+  hipGetDeviceProperties(&props, 0);
+  std::string path = std::experimental::filesystem::current_path();
+  WARN("path is " << path.c_str());
+  if (access("./opencl_add.cc", F_OK) == -1) {
+    system("cp ./../../../../hip-on-rocclr/tests/catch/unit/module/opencl_add.cc .");
+  }
+  // Generate the command to translate the OpenCL code object to hip code object
+  const char *pCodeObjVer = nullptr;
+  const char *pCodeObjFile = nullptr;
+  bool rocmPathSet = isRocmPathSet();
+  if (codeobj_type == 0) {
+    pCodeObjVer = "";
+    pCodeObjFile = HIP_CODEOBJ_FILE_DEFAULT;
+  } else {
+    pCodeObjVer = "-mcode-object-version=3";
+    if (-1 == getV3TargetIdFeature(v3option, rocmPathSet)) {
+      INFO("Error getting V3 Option. Skipping Test. \n");
+      REQUIRE(true);
+    }
+    pCodeObjFile = HIP_CODEOBJ_FILE_V3;
+  }
+  INFO("v3option "<< v3option);
+  /* The command string is created using multiple concatenation instead of one go
+    to avoid the following cpplint error:
+    " Multi-line string ("...") found.  This lint script doesn't do well with such strings,
+    and may give bogus warnings.  Use C++11 raw strings or concatenation instead."
+  */
+  if (rocmPathSet) {
+    // For STG2 testing where /opt/rocm path is not present
+    snprintf(command, COMMAND_LEN,
+    "$ROCM_PATH/llvm/bin/clang -target amdgcn-amd-amdhsa -x cl ");
+  } else {
+    snprintf(command, COMMAND_LEN,
+    "/opt/rocm/llvm/bin/clang -target amdgcn-amd-amdhsa -x cl ");
+  }
+  char command_temp[COMMAND_LEN] = "";
+  snprintf(command_temp, COMMAND_LEN,
+  "-include `find /opt/rocm* -name opencl-c.h` %s %s -mcpu=%s -o %s %s",
+  pCodeObjVer, v3option, props.gcnArchName, pCodeObjFile, OPENCL_OBJ_FILE);
+
+  strncat(command, command_temp, COMMAND_LEN);
+  INFO("command executed "<< command);
+
+  system((const char*)command);
+  // Check if the code object file is created
+  snprintf(command, COMMAND_LEN, "./%s",
+           pCodeObjFile);
+
+  if (access(command, F_OK) == -1) {
+    INFO("Code Object File not found \n");
+    REQUIRE(true);
+  }
+
+  hipDevice_t device;
+  hipModule_t Module;
+  hipFunction_t Function;
+  HIPCHECK(hipDeviceGet(&device, 0));
+  HIPCHECK(hipModuleLoad(&Module, pCodeObjFile));
+  HIPCHECK(hipModuleGetFunction(&Function, Module, "add"));
+  float *Ah, *Bh, *Ad, *Bd;
+  HipTest::initArrays<float>(&Ad, &Bd, nullptr, &Ah, &Bh, nullptr,
+                             BUFFER_LEN, false);
+
+  HIPCHECK(hipMemcpy(Ad, Ah, sizeof(float) * BUFFER_LEN,
+                     hipMemcpyHostToDevice));
+
+  struct {
+    void* _Bd;
+    void* _Ad;
+  } args;
+  args._Ad = static_cast<void*>(Ad);
+  args._Bd = static_cast<void*>(Bd);
+  size_t size = sizeof(args);
+
+  void *config[] = {
+    HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+    HIP_LAUNCH_PARAM_END
+  };
+
+  HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, BUFFER_LEN, 1, 1, 0, 0,
+                                 NULL, reinterpret_cast<void**>(&config)));
+  HIPCHECK(hipMemcpy(Bh, Bd, sizeof(float) * BUFFER_LEN,
+                     hipMemcpyDeviceToHost));
+
+  for (uint32_t i = 0; i < BUFFER_LEN; i++) {
+    REQUIRE(Ah[i] == Bh[i]);
+  }
+  HipTest::freeArrays<float>(Ad, Bd, nullptr,
+                             Ah, Bh, nullptr, false);
+#else
+  INFO("This test is skipped due to non linux environment.\n");
+  REQUIRE(true);
+#endif
+}
diff --git a/catch/unit/module/module_kernels.cc b/catch/unit/module/module_kernels.cc
new file mode 100644
index 0000000000..ccb9c204ea
--- /dev/null
+++ b/catch/unit/module/module_kernels.cc
@@ -0,0 +1,167 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include "hip/hip_runtime.h"
+
+#define GLOBAL_BUF_SIZE 2048
+#define ARRAY_SIZE (16)
+
+texture<float, 2, hipReadModeElementType> ftex;
+texture<int, 2, hipReadModeElementType> itex;
+texture<uint16_t, 2, hipReadModeElementType> stex;
+texture<char, 2, hipReadModeElementType> ctex;
+
+__device__ int deviceGlobal = 1;
+__managed__ int x = 10;
+__device__ float myDeviceGlobal;
+__device__ float myDeviceGlobalArray[16];
+
+
+__device__ float deviceGlobalFloat;
+__device__ int   deviceGlobalInt1;
+__device__ int   deviceGlobalInt2;
+__device__ uint16_t deviceGlobalShort;
+__device__ char  deviceGlobalChar;
+
+extern "C" __global__ void tex2dKernelFloat(float* outputData,
+                                       int width, int height) {
+  int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+  if ((x < width) && (y < width)) {
+    outputData[y * width + x] = tex2D(ftex, x, y);
+  }
+}
+
+extern "C" __global__ void tex2dKernelInt(int* outputData,
+                                          int width, int height) {
+  int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+  if ((x < width) && (y < width)) {
+    outputData[y * width + x] = tex2D(itex, x, y);
+  }
+}
+
+extern "C" __global__ void tex2dKernelInt16(uint16_t* outputData,
+                                          int width, int height) {
+  int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+  if ((x < width) && (y < width)) {
+    outputData[y * width + x] = tex2D(stex, x, y);
+  }
+}
+
+extern "C" __global__ void tex2dKernelInt8(char* outputData,
+                                          int width, int height) {
+  int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+  if ((x < width) && (y < width)) {
+    outputData[y * width + x] = tex2D(ctex, x, y);
+  }
+}
+
+extern "C" __global__ void matmulK(int clockrate, int* A, int* B, int* C,
+                                   int N) {
+  int ROW = blockIdx.y*blockDim.y+threadIdx.y;
+  int COL = blockIdx.x*blockDim.x+threadIdx.x;
+  int tmpSum = 0;
+  if ((ROW < N) && (COL < N)) {
+    // each thread computes one element of the block sub-matrix
+    for (int i = 0; i < N; i++) {
+      tmpSum += A[ROW * N + i] * B[i * N + COL];
+    }
+    C[ROW * N + COL] = tmpSum;
+  }
+}
+
+extern "C" __global__ void KernelandExtraParams(int* A, int* B, int* C,
+  int *D, int N) {
+  int ROW = blockIdx.y*blockDim.y+threadIdx.y;
+  int COL = blockIdx.x*blockDim.x+threadIdx.x;
+  int tmpSum = 0;
+  if (ROW < N && COL < N) {
+    // each thread computes one element of the block sub-matrix
+    for (int i = 0; i < N; i++) {
+      tmpSum += A[ROW * N + i] * B[i * N + COL];
+    }
+  }
+  C[ROW * N + COL] = tmpSum;
+  D[ROW * N + COL] = tmpSum;
+}
+
+extern "C" __global__ void SixteenSecKernel(int clockrate) {
+  HipTest::waitKernel(16, clockrate);
+}
+
+extern "C" __global__ void TwoSecKernel(int clockrate) {
+  if (deviceGlobal == 0x2222) {
+    deviceGlobal = 0x3333;
+  }
+
+  HipTest::waitKernel(2, clockrate);
+
+  if (deviceGlobal != 0x3333) {
+    deviceGlobal = 0x5555;
+  }
+}
+
+extern "C" __global__ void FourSecKernel(int clockrate) {
+  if (deviceGlobal == 1) {
+    deviceGlobal = 0x2222;
+  }
+
+  HipTest::waitKernel(4, clockrate);
+
+  if (deviceGlobal == 0x2222) {
+    deviceGlobal = 0x4444;
+  }
+}
+
+extern "C" __global__ void GPU_func() {
+  x++;
+}
+
+
+__device__ int getSquareOfGlobalFloat() {
+  return static_cast<int>(deviceGlobalFloat*deviceGlobalFloat);
+}
+
+extern "C" __global__ void testWeightedCopy(int* a, int* b) {
+  int tx = hipThreadIdx_x;
+  b[tx] = deviceGlobalInt1*a[tx] + deviceGlobalInt2 +
+  static_cast<int>(deviceGlobalShort) + static_cast<int>(deviceGlobalChar)
+  + getSquareOfGlobalFloat();
+}
+
+
+extern "C" __global__ void hello_world(const float* a, float* b) {
+    int tx = hipThreadIdx_x;
+    b[tx] = a[tx];
+}
+
+extern "C" __global__ void test_globals(const float* a, float* b) {
+    int tx = hipThreadIdx_x;
+    b[tx] = a[tx] + myDeviceGlobal + myDeviceGlobalArray[tx % ARRAY_SIZE];
+}
+
+extern "C" __global__ void EmptyKernel() {
+}
diff --git a/catch/unit/module/opencl_add.cc b/catch/unit/module/opencl_add.cc
new file mode 100644
index 0000000000..267ffc7f3b
--- /dev/null
+++ b/catch/unit/module/opencl_add.cc
@@ -0,0 +1,37 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+kernel void add(global float* output, global float* input) {
+  __local float lds[100];
+  int id = get_global_id(0);
+
+  if (id == 0) {
+      for (int i = 0; i < 100; i++) {
+      lds[i] = input[i];
+    }
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  if (id < 100) {
+    output[id] = lds[id];
+  } else {
+    output[id] = input[id];
+  }
+}