Catch2 Test Fixes (#2961)

* test fixes * address PR comment * PR comment fixing Nvidia pass * add additional kernel launch checks * pr comments
2022-10-01 01:07:25 +03:00
@@ -29,6 +29,7 @@ TEMPLATE_TEST_CASE("ABM_AddKernel_MultiTypeMultiSize", "", int, long, float, lon
  REQUIRE(res == hipSuccess);

  hipLaunchKernelGGL(add<TestType>, 1, size, 0, 0, d_a, d_b, d_c, size);
+  HIP_CHECK(hipGetLastError());

  res = hipMemcpy(a.data(), d_c, sizeof(TestType) * size, hipMemcpyDeviceToHost);
  REQUIRE(res == hipSuccess);
@@ -53,6 +53,7 @@ TEST_CASE("Unit_hipManagedKeyword_SingleGpu") {
  hipLaunchKernelGGL(add, dimGrid, dimBlock, 0, 0, static_cast<const float*>(A),
                     static_cast<float*>(B));

+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipDeviceSynchronize());

  float maxError = 0.0f;
@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include <fstream>
 #include <regex>
 #include <type_traits>
-
+#define TOL 0.001
 #define guarantee(cond, str)                                                                       \
  {                                                                                                \
    if (!(cond)) {                                                                                 \
@@ -45,7 +45,7 @@ size_t checkVectors(T* A, T* B, T* Out, size_t N, T (*F)(T a, T b), bool expectM
  size_t mismatchesToPrint = 10;
  for (size_t i = 0; i < N; i++) {
    T expected = F(A[i], B[i]);
-    if (Out[i] != expected) {
+    if (std::fabs(Out[i] - expected) > TOL) {
      if (mismatchCount == 0) {
        firstMismatch = i;
      }
@@ -270,6 +270,7 @@ void launchKernel(K kernel, Dim numBlocks, Dim numThreads, std::uint32_t memPerB
  launchRTCKernel<Typenames...>(kernel, numBlocks, numThreads, memPerBlock, stream,
                                std::forward<Args>(packedArgs)...);
 #endif
+HIP_CHECK(hipGetLastError());
 }

 //---
@@ -221,6 +221,7 @@ void runMultiProcKernel(ipcEventInfo_t *shmEventInfo, int index) {
    const dim3 blocks(BUF_SIZE / threads.x, 1);
    hipLaunchKernelGGL(computeKernel, dim3(blocks), dim3(threads), 0, 0,
                                    d_ptr + index *BUF_SIZE, d_ptr, index + 1);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(event));

    // Barrier 2 : Signals that event is recorded
@@ -120,7 +120,7 @@ static bool validateMemoryOnGPU(int gpu, bool concurOnOneGPU = false) {
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, 0, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
-
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

  if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
@@ -276,12 +276,15 @@ bool test_printf_multistream(uint32_t num_blocks,
      hipLaunchKernelGGL(kernel_complex_opX, dim3(num_blocks, 1, 1),
                         dim3(threads_per_block, 1, 1),
                         0, stream[i], Ad, Bd, iterCount);
+      HIP_CHECK(hipGetLastError());
      hipLaunchKernelGGL(kernel_complex_opY, dim3(1, num_blocks, 1),
                         dim3(1, threads_per_block, 1),
                         0, stream[i], Ad, Bd, iterCount);
+      HIP_CHECK(hipGetLastError());
      hipLaunchKernelGGL(kernel_complex_opZ, dim3(1, 1, num_blocks),
                         dim3(1, 1, threads_per_block),
                         0, stream[i], Ad, Bd, iterCount);
+      HIP_CHECK(hipGetLastError());
    }
    HIP_CHECK(hipDeviceSynchronize());
    for (int i = 0; i < NUM_STREAM; i++) {
@@ -368,12 +371,15 @@ bool test_printf_multigpu(int gpu,
    hipLaunchKernelGGL(kernel_complex_opX, dim3(num_blocks, 1, 1),
                       dim3(threads_per_block, 1, 1),
                       0, 0, Ad, Bd, iterCount);
+    HIP_CHECK(hipGetLastError());
    hipLaunchKernelGGL(kernel_complex_opY, dim3(1, num_blocks, 1),
                       dim3(1, threads_per_block, 1),
                       0, 0, Ad, Bd, iterCount);
+    HIP_CHECK(hipGetLastError());
    hipLaunchKernelGGL(kernel_complex_opZ, dim3(1, 1, num_blocks),
                       dim3(1, 1, threads_per_block),
                       0, 0, Ad, Bd, iterCount);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipDeviceSynchronize());
    std::ifstream CapturedData = captured.getCapturedData();
    char *buffer = new char[CHUNK_SIZE];
@@ -246,6 +246,7 @@ bool test_printf_conststr(uint32_t num_blocks, uint32_t threads_per_block,
    hipLaunchKernelGGL(kernel_printf_conststr, dim3(num_blocks, 1, 1),
                       dim3(threads_per_block, 1, 1),
                       0, 0, iterCount);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipStreamSynchronize(0));
    std::ifstream CapturedData = captured.getCapturedData();
    char *buffer = new char[CHUNK_SIZE];
@@ -308,6 +309,7 @@ bool test_printf_two_conditionalstr(uint32_t num_blocks,
                       dim3(num_blocks, 1, 1),
                       dim3(threads_per_block, 1, 1),
                       0, 0, iterCount);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipStreamSynchronize(0));
    std::ifstream CapturedData = captured.getCapturedData();
    char *buffer = new char[CHUNK_SIZE];
@@ -370,6 +372,7 @@ bool test_printf_single_conditionalstr(uint32_t num_blocks,
                       dim3(num_blocks, 1, 1),
                       dim3(threads_per_block, 1, 1),
                       0, 0, iterCount);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipStreamSynchronize(0));
    std::ifstream CapturedData = captured.getCapturedData();
    char *buffer = new char[CHUNK_SIZE];
@@ -427,6 +430,7 @@ bool test_variable_str(uint32_t print_limit,
    hipLaunchKernelGGL(func, dim3(num_blocks, 1, 1),
                       dim3(threads_per_block, 1, 1),
                       0, 0, iterCount, Ad);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipStreamSynchronize(0));
    HIP_CHECK(hipMemcpy(Ah, Ad, buffsize*sizeof(int32_t),
             hipMemcpyDeviceToHost));
@@ -483,6 +487,7 @@ bool test_decimal_str(uint32_t num_blocks, uint32_t threads_per_block,
    hipLaunchKernelGGL(kernel_decimal_calculation, dim3(num_blocks, 1, 1),
                       dim3(threads_per_block, 1, 1),
                       0, 0, iterCount, maxPrecision);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipStreamSynchronize(0));
    std::ifstream CapturedData = captured.getCapturedData();
    char *buffer = new char[CHUNK_SIZE];
@@ -157,6 +157,7 @@ static void test_group_partition(unsigned int tileSz) {
    // Launch Kernel
    hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dResult, tileSz, i);
+    HIP_CHECK(hipGetLastError()); 
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
@@ -217,6 +218,7 @@ static void test_shfl_down() {
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl_down, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
+    HIP_CHECK(hipGetLastError()); 
    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
@@ -138,6 +138,7 @@ static void test_group_partition(unsigned tileSz) {
    // Launch Kernel
    hipLaunchKernelGGL(kernel_cg_group_partition, blockSize, threadsPerBlock,
                     threadsPerBlock * sizeof(int), 0, dPtr, tileSz, i);
+    HIP_CHECK(hipGetLastError()); 
    hipMemcpy(hPtr, dPtr, arrSize, hipMemcpyDeviceToHost);
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
@@ -207,6 +208,7 @@ static void test_shfl_up() {
    hipLaunchKernelGGL(kernel_shfl_up, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dPtr, dResults, lane_delta, i);
    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
+    HIP_CHECK(hipGetLastError()); 
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
      fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
@@ -244,6 +244,7 @@ static void test_active_threads_grouping() {

  // Launch Kernel
    hipLaunchKernelGGL(kernel_coalesced_active_groups, blockSize, threadsPerBlock, 0, 0);
+    HIP_CHECK(hipGetLastError()); 

    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
@@ -308,6 +309,7 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
    if (useGlobalMem) {
      hipLaunchKernelGGL(kernel_cg_coalesced_group_partition, blockSize, threadsPerBlock, 0, 0, tileSz,
                         dResult, useGlobalMem, globalMem, i);
+      HIP_CHECK(hipGetLastError()); 

      err = hipDeviceSynchronize();
      if (err != hipSuccess) {
@@ -316,6 +318,7 @@ static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
    } else {
      hipLaunchKernelGGL(kernel_cg_coalesced_group_partition, blockSize, threadsPerBlock,
                         threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem, i);
+      HIP_CHECK(hipGetLastError()); 

      err = hipDeviceSynchronize();
      if (err != hipSuccess) {
@@ -390,6 +393,7 @@ static void test_shfl_any_to_any() {
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl_any_to_any, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0 , dPtr, dsrcArr, dResults, i);
+    HIP_CHECK(hipGetLastError()); 
    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
@@ -459,6 +463,7 @@ static void test_shfl_broadcast() {
    // Launch Kernel
    hipLaunchKernelGGL(kernel_shfl, blockSize, threadsPerBlock,
                       threadsPerBlock * sizeof(int), 0, dPtr, dResults, srcLane, i);
+    HIP_CHECK(hipGetLastError()); 
    hipMemcpy(hPtr, dResults, group_size_in_bytes, hipMemcpyDeviceToHost);
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
@@ -27,6 +27,7 @@ THE SOFTWARE.

 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2
+#define NUM_ITERS 1 << 30

 static __global__ void Iter(int* Ad, int num) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -57,7 +58,8 @@ TEST_CASE("Unit_hipDeviceSynchronize_Functional") {
  }
  for (int i = 0; i < NUM_STREAMS; i++) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0,
-                                                stream[i], Ad[i], 1 << 30);
+                                                stream[i], Ad[i], NUM_ITERS);
+      HIP_CHECK(hipGetLastError());
  }
  for (int i = 0; i < NUM_STREAMS; i++) {
      HIP_CHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost,
@@ -71,7 +73,7 @@ TEST_CASE("Unit_hipDeviceSynchronize_Functional") {
  // Conservative implementations which synchronize the hipMemcpyAsync will
  // fail, ie if HIP_LAUNCH_BLOCKING=true.

-  CHECK(1 << 30 != A[NUM_STREAMS - 1][0] - 1);
+  CHECK(NUM_ITERS != A[NUM_STREAMS - 1][0] - 1);
  HIP_CHECK(hipDeviceSynchronize());
-  CHECK(1 << 30 == A[NUM_STREAMS - 1][0] - 1);
+  CHECK(NUM_ITERS == A[NUM_STREAMS - 1][0] - 1);
 }
@@ -163,6 +163,7 @@ TEST_CASE("Unit_hipGetDeviceProperties_ArchPropertiesTst") {
            hipMemcpyHostToDevice));
    hipLaunchKernelGGL(mykernel, dim3(1), dim3(1),
                       0, 0, archProp_d);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpy(archProp_h, archProp_d,
            NUM_OF_ARCHPROP*sizeof(int), hipMemcpyDeviceToHost));
    // Validate the host architecture property with device
@@ -73,6 +73,7 @@ TEMPLATE_TEST_CASE("Unit_AtomicAdd_Coherentwithnounsafeflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      testResult = HipTest::assemblyFile_Verification<TestType>(
@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_AtomicAdd_Coherentwithoutflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      bool testResult;
      testResult = HipTest::assemblyFile_Verification<TestType>(
@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_AtomicAdd_CoherentwithUnsafeflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;

@@ -73,6 +73,7 @@ TEMPLATE_TEST_CASE("Unit_AtomicAdd_NonCoherentwithnounsafeflag", "",
                         dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -73,6 +73,7 @@ TEMPLATE_TEST_CASE("Unit_AtomicAdd_NonCoherentwithoutflag", "",
                         dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_AtomicAdd_NonCoherentwithUnsafeflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -76,6 +76,7 @@ TEST_CASE("Unit_BuiltInAtomicAdd_CoherentGlobalMem") {
      hipLaunchKernelGGL(AtomicAdd_GlobalMem, dim3(1), dim3(1),
                         0, 0, A_d,
                         result);
+      HIP_CHECK(hipGetLastError()); 
       std::cout << "test 1" << std::endl;
      HIP_CHECK(hipDeviceSynchronize());
      REQUIRE(A_h[0] == INITIAL_VAL);
@@ -118,6 +119,7 @@ TEST_CASE("Unit_BuiltInAtomicAdd_NonCoherentGlobalMem") {
      hipLaunchKernelGGL(AtomicAdd_GlobalMem, dim3(1), dim3(1),
                         0, 0, static_cast<double* >(A_d),
                         static_cast<double* >(result));
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      HIP_CHECK(hipMemcpy(B_h, result, sizeof(double), hipMemcpyDeviceToHost));
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -112,6 +112,7 @@ TEST_CASE("Unit_BuiltinAtomics_fmaxCoherentGlobalMem") {
      HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&result), sizeof(double)));
      hipLaunchKernelGGL(unsafeAtomicMax_GlobalMem, dim3(1), dim3(1),
          0, 0, static_cast<double* >(A_d), result);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      HIP_CHECK(hipMemcpy(B_h, result, sizeof(double), hipMemcpyDeviceToHost));
      REQUIRE(*B_h == 0);
@@ -159,9 +160,11 @@ TEST_CASE("Unit_BuiltinAtomics_fmaxNonCoherentGlobalFlatMem") {
      if (mem_type) {
        hipLaunchKernelGGL(unsafeAtomicMax_GlobalMem, dim3(1), dim3(1),
            0, 0, static_cast<double* >(A_d), result);
+        HIP_CHECK(hipGetLastError()); 
      } else {
        hipLaunchKernelGGL(unsafeAtomicMax_FlatMem, dim3(1), dim3(1),
            0, 0, static_cast<double* >(A_d), result);
+        HIP_CHECK(hipGetLastError()); 
      }
      HIP_CHECK(hipDeviceSynchronize());
      HIP_CHECK(hipMemcpy(B_h, result, sizeof(double), hipMemcpyDeviceToHost));
@@ -112,6 +112,7 @@ TEST_CASE("Unit_BuiltinAtomics_fminCoherentGlobalMem") {
      HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&result), sizeof(double)));
      hipLaunchKernelGGL(unsafeAtomicMin_GlobalMem, dim3(1), dim3(1),
                         0, 0, static_cast<double* >(A_d), result);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      HIP_CHECK(hipMemcpy(B_h, result, sizeof(double), hipMemcpyDeviceToHost));
      REQUIRE(*B_h == 0);
@@ -159,9 +160,11 @@ TEST_CASE("Unit_BuiltinAtomics_fminNonCoherentGlobalFlatMem") {
      if (mem_type) {
        hipLaunchKernelGGL(unsafeAtomicMin_GlobalMem, dim3(1), dim3(1),
                           0, 0, static_cast<double* >(A_d), result);
+        HIP_CHECK(hipGetLastError()); 
      } else {
        hipLaunchKernelGGL(unsafeAtomicMin_FlatMem, dim3(1), dim3(1),
                           0, 0, static_cast<double* >(A_d), result);
+        HIP_CHECK(hipGetLastError()); 
      }
      HIP_CHECK(hipDeviceSynchronize());
      HIP_CHECK(hipMemcpy(B_h, result, sizeof(double), hipMemcpyDeviceToHost));
@@ -65,6 +65,7 @@ TEST_CASE("Unit_AnyAll_CompileTest") {
  hipLaunchKernelGGL(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                     device_any, device_all, pshift);

+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(
      hipMemcpy(host_any, device_any, Num_Warps_per_Grid * sizeof(int), hipMemcpyDeviceToHost));
  HIP_CHECK(
@@ -66,6 +66,7 @@ TEST_CASE("Unit_ballot") {
  hipLaunchKernelGGL(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                     device_ballot, Num_Warps_per_Block, pshift);

+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid * sizeof(unsigned int),
                       hipMemcpyDeviceToHost));

@@ -146,6 +146,7 @@ TEST_CASE("Unit_bitExtract") {
  hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block), 0, 0, deviceOut32,
                     deviceSrc032, deviceSrc132, deviceSrc232, deviceOut64, deviceSrc064,
                     deviceSrc164, deviceSrc264);
+  HIP_CHECK(hipGetLastError());


  HIP_CHECK(hipMemcpy(hostOut32, deviceOut32, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
@@ -158,6 +158,7 @@ TEST_CASE("Unit_bitInsert") {
  hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block), 0, 0, deviceOut32,
                     deviceSrc032, deviceSrc132, deviceSrc232, deviceSrc332, deviceOut64,
                     deviceSrc064, deviceSrc164, deviceSrc264, deviceSrc364);
+  HIP_CHECK(hipGetLastError());


  HIP_CHECK(hipMemcpy(hostOut32, deviceOut32, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
@@ -115,6 +115,7 @@ TEST_CASE("Unit_brev") {
  hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
                     deviceC, deviceD, WIDTH, HEIGHT);
+  HIP_CHECK(hipGetLastError());


  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
@@ -136,6 +136,7 @@ TEST_CASE("Unit_clz") {
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
                     deviceC, deviceD, WIDTH, HEIGHT);
+  HIP_CHECK(hipGetLastError());


  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
@@ -110,6 +110,7 @@ TEST_CASE("Unit_ffs") {
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
                     deviceC, deviceD, WIDTH, HEIGHT);
+  HIP_CHECK(hipGetLastError());


  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
@@ -56,6 +56,7 @@ TEST_CASE("Unit_deviceFunctions_CompileTest") {
  res = hipMalloc((void**)&Outd, SIZE);
  REQUIRE(res == hipSuccess);
  hipLaunchKernelGGL(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
+  HIP_CHECK(hipGetLastError());
  res = hipDeviceSynchronize();
  REQUIRE(res == hipSuccess);
  res = hipGetLastError();
@@ -129,6 +129,7 @@ template <typename T, typename D> void testType(int msize) {

  auto kernel = testOperationsGPU<T, D>;
  hipLaunchKernelGGL(kernel, 1, msize, 0, 0, d_fa, d_fb, msize);
+  HIP_CHECK(hipGetLastError());

  HIP_CHECK(hipMemcpy(fc, d_fa, sizeof(T) * msize, hipMemcpyDeviceToHost));

@@ -145,7 +145,7 @@ TEST_CASE("Unit_funnelshift") {

  hipLaunchKernelGGL(funnelshift_kernel, dim3(1), dim3(1), 0, 0, device_l_output, device_lc_output,
                     device_r_output, device_rc_output);
-
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpy(host_l_output, device_l_output, NUM_TESTS * sizeof(unsigned int),
                       hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(host_lc_output, device_lc_output, NUM_TESTS * sizeof(unsigned int),
@@ -69,6 +69,7 @@ TEST_CASE("Unit_hipMemcpyToSymbolAsync_ToNFrom") {
        hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream));
    HIP_CHECK(hipStreamSynchronize(stream));
    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost,
                                       stream));
@@ -83,6 +84,7 @@ TEST_CASE("Unit_hipMemcpyToSymbolAsync_ToNFrom") {
  SECTION("Calling hipMemcpyTo/FromSymbol - validate value in host memory") {
    HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice));
    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost));

@@ -99,6 +101,7 @@ TEST_CASE("Unit_hipMemcpyToSymbolAsync_ToNFrom") {
        hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream));
    HIP_CHECK(hipStreamSynchronize(stream));
    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
    HIP_CHECK(
        hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream));
@@ -116,6 +119,7 @@ TEST_CASE("Unit_hipMemcpyToSymbolAsync_ToNFrom") {
                                     hipStreamPerThread));
    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost,
                                       hipStreamPerThread));
@@ -139,6 +143,7 @@ TEST_CASE("Unit_hipMemcpyToSymbolAsync_ToNFrom") {
    HIP_CHECK(hipGetSymbolAddress((void**)&symbolAddress, HIP_SYMBOL(globalConst)));
    HIP_CHECK(hipMalloc((void**)&checkOkD, sizeof(bool)));
    hipLaunchKernelGGL(checkAddress, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, symbolAddress, checkOkD);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(&checkOk, checkOkD, sizeof(bool), hipMemcpyDeviceToHost));
    HIP_CHECK(hipFree(checkOkD));
    HIP_ASSERT(checkOk);
@@ -171,6 +176,7 @@ TEST_CASE("Unit_hipGetSymbolAddressAndSize_Validation") {
    HIP_CHECK(hipMalloc(&checkOkD, sizeof(bool)));
    hipLaunchKernelGGL(checkGlobalConstAddress, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0,
                       symbolArrAddress, checkOkD);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(&checkOk, checkOkD, sizeof(bool), hipMemcpyDeviceToHost));
    HIP_CHECK(hipFree(checkOkD));
    HIP_ASSERT(checkOk);
@@ -184,6 +190,7 @@ TEST_CASE("Unit_hipGetSymbolAddressAndSize_Validation") {
    HIP_CHECK(hipMalloc(&checkOkD, sizeof(bool)));
    hipLaunchKernelGGL(checkStaticConstVarAddress, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0,
                       symbolVarAddress, checkOkD);
+    HIP_CHECK(hipGetLastError()); 
    HIP_CHECK(hipMemcpy(&checkOk, checkOkD, sizeof(bool), hipMemcpyDeviceToHost));
    HIP_CHECK(hipFree(checkOkD));
    HIP_ASSERT(checkOk);
@@ -100,6 +100,7 @@ template <typename T, typename U> int dataTypesRun() {
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
+  HIP_CHECK(hipGetLastError());


  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
@@ -148,7 +149,7 @@ template <typename T, typename U> int dataTypesRun2() {
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
-
+  HIP_CHECK(hipGetLastError());

  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));

@@ -198,7 +199,7 @@ template <typename T, typename U> int dataTypesRun4() {
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
-
+  HIP_CHECK(hipGetLastError());

  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));

@@ -63,7 +63,7 @@ TEST_CASE("Unit_mbcnt") {

  hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block), 0, 0,
                     device_mbcnt_lo, device_mbcnt_hi, device_lane_id);
-
+  HIP_CHECK(hipGetLastError());
  unsigned int* host_mbcnt_lo = (unsigned int*)malloc(buffer_size);
  unsigned int* host_mbcnt_hi = (unsigned int*)malloc(buffer_size);
  unsigned int* host_lane_id = (unsigned int*)malloc(buffer_size);
@@ -100,7 +100,7 @@ TEST_CASE("Unit_popc") {
  hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
                     deviceC, deviceD, WIDTH, HEIGHT);
-
+  HIP_CHECK(hipGetLastError());

  HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(hostC, deviceC, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
@@ -88,7 +88,7 @@ static void test_syncthreads_and(int blockSize) {
  // Launch Kernel
  hipLaunchKernelGGL(kernel_syncthreads_and, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
                     allThreadsOneD, oneThreadZeroD, allThreadsMinusOneD);
-
+  HIP_CHECK(hipGetLastError());
  // Copy result from device to host
  HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
@@ -95,7 +95,7 @@ void test_syncthreads_count(int blockSize) {
  // Launch Kernel
  hipLaunchKernelGGL(kernel_syncthreads_count, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
                     allThreadsOneD, oddThreadsOneD, allThreadsMinusOneD, allThreadsIdD);
-
+  HIP_CHECK(hipGetLastError());
  // Copy result from device to host
  HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
@@ -88,7 +88,7 @@ static void test_syncthreads_or(int blockSize) {
  // Launch Kernel
  hipLaunchKernelGGL(kernel_syncthreads_or, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
                     allThreadsOneD, oneThreadOneD, allThreadsMinusOneD);
-
+  HIP_CHECK(hipGetLastError());
  // Copy result from device to host
  HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
@@ -92,6 +92,7 @@ TEST_CASE("Unit_threadfence_system") {
      HIP_CHECK(hipSetDevice(next_id - 1));
      hipLaunchKernelGGL(gpu_round_robin, dim_grid, dim_block, 0, 0x0, next_id, num_dev, num_iter,
                         data, flag);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipDeviceSynchronize());
    }));
  }
@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_CoherentwithnoUnsafeflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      bool testResult;

@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_Coherentwithoutflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      bool testResult;

@@ -73,6 +73,7 @@ TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_CoherentwithUnsafeflag", "",
      hipLaunchKernelGGL(AtomicCheck<TestType>, dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
      bool testResult;

@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_NonCoherentnounsafeatomicsflag", "",
                         dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_NonCoherentwithoutflag", "",
                         dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -72,6 +72,7 @@ TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_NonCoherentwithunsafeatomicsflag", "",
                         dim3(1), dim3(1),
                         0, 0, A_d,
                         result_d);
+      HIP_CHECK(hipGetLastError()); 
      HIP_CHECK(hipDeviceSynchronize());
        bool testResult;
      REQUIRE(A_h[0] == INITIAL_VAL + INC_VAL);
@@ -224,6 +224,7 @@ template <typename V> bool run_CheckSharedVectorType() {
  if (hipMalloc(&ptr, sizeof(bool)) != HIP_SUCCESS) return false;
  unique_ptr<bool, decltype(hipFree)*> correct{ptr, hipFree};
  hipLaunchKernelGGL((CheckSharedVectorType<V>), dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, correct.get());
+  HIP_CHECK(hipGetLastError());
  bool passed = true;
  if (hipMemcpyDtoH(&passed, correct.get(), sizeof(bool)) != HIP_SUCCESS) {
    return false;
@@ -252,7 +253,7 @@ TEST_CASE("Unit_vectorTypes_CompileTest") {

  unique_ptr<bool, decltype(hipFree)*> correct{ptr, hipFree};
  hipLaunchKernelGGL(CheckVectorTypes, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, correct.get());
-
+  HIP_CHECK(hipGetLastError());
  bool passed = true;
  res = hipMemcpyDtoH(&passed, correct.get(), sizeof(bool));
  REQUIRE(res == hipSuccess);
@@ -80,6 +80,7 @@ void test(unsigned testMask, int* C_d, int* C_h, int64_t numElements, hipStream_
    HIP_CHECK(hipEventRecord(start, stream));
    hipLaunchKernelGGL(HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, stream,
                       static_cast<const int*>(C_d), C_h, numElements, count);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop, stream));

    if (waitStart) {
@@ -111,7 +112,7 @@ void test(unsigned testMask, int* C_d, int* C_h, int64_t numElements, hipStream_
        REQUIRE(false);
    }

-    if (e == hipSuccess) assert(t == 0.0f);
+    if (e == hipSuccess) HIP_ASSERT(t == 0.0f);

    // stop usually ready unless we skipped the synchronization (syncNone)
    e = hipEventElapsedTime(&t, stop, stop);
@@ -137,7 +138,7 @@ void test(unsigned testMask, int* C_d, int* C_h, int64_t numElements, hipStream_
        HIP_ASSERT(hipEventElapsedTime(&t, start, neverCreated) == hipErrorInvalidHandle);

        HIP_ASSERT(hipEventElapsedTime(&t, neverRecorded, stop) == hipErrorInvalidHandle);
-        HIP_ASSERT(hipEventElapsedTime(&t, start, neverRecorded) == hipErrorInvalidHandle);
+        HIP_ASSERT(hipGetLastError() == hipErrorInvalidHandle);
    }

    HIP_CHECK(hipEventDestroy(neverRecorded));
@@ -65,7 +65,7 @@ TEST_CASE("Unit_hipEventIpc") {

        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const float*>(A_d), static_cast<const float*>(B_d), C_d, N);
-
+        HIP_CHECK(hipGetLastError());

        HIP_CHECK(hipEventRecord(stop, NULL));
        HIP_CHECK(hipEventSynchronize(stop));
@@ -88,7 +88,7 @@ TEST_CASE("Unit_hipEventRecord") {
    HipTest::launchKernel<float>(HipTest::vectorADD<float>, blocks, 1, 0, 0,
                                 static_cast<const float*>(A_d), static_cast<const float*>(B_d),
                                 C_d, N);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop, NULL));
    HIP_CHECK(hipEventSynchronize(stop));
    long long hostStop = HipTest::get_time();
@@ -101,7 +101,8 @@ TEMPLATE_TEST_CASE("Unit_hipHostRegister_ReferenceFromKernelandhipMemset", "", i
  // Reference the registered device pointer Ad from inside the kernel:
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
-    hipLaunchKernelGGL(Inc, dim3(LEN / 512), dim3(512), 0, 0, Ad[i]);
+    hipLaunchKernelGGL(Inc, dim3(LEN / 32), dim3(32), 0, 0, Ad[i]);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipDeviceSynchronize());
  }
  REQUIRE(A[10] == 1 + static_cast<TestType>(num_devices));
@@ -111,7 +111,7 @@ static bool validateMemoryOnGPU(int gpu, bool concurOnOneGPU = false) {

  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                     static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
-
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

  if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
@@ -219,7 +219,7 @@ static bool validateMemoryOnGpuMThread(int gpu, bool concurOnOneGPU = false) {

  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                     static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
-
+  HIP_CHECK(hipGetLastError());
  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

  if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
@@ -114,6 +114,7 @@ TEST_CASE("Unit_hipMallocManaged_Advanced") {
  HIP_CHECK(hipEventRecord(event0, 0));
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                     static_cast<const float*>(A), static_cast<const float*>(B), C, numElements);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipEventRecord(event1, 0));
  HIP_CHECK(hipDeviceSynchronize());
  float time = 0.0f;
@@ -536,6 +536,7 @@ TEMPLATE_TEST_CASE("Unit_hipMallocPitch_KernelLaunch", ""
  hipLaunchKernelGGL(copy_var<TestType>, dim3(1), dim3(1),
          0, 0, static_cast<TestType*>(A_d),
          static_cast<TestType*>(B_d), ROWS, pitch_A);
+  HIP_CHECK(hipGetLastError()); 


  // hipMemcpy2D Device to Host
@@ -241,6 +241,7 @@ void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem,
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
      static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
      dmem->C_d(), numElements);
+  HIP_CHECK(hipGetLastError());

  if (useDeviceToDevice) {
    // Do an extra device-to-device copy here to mix things up:
@@ -402,7 +403,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpy_KernelLaunch", "", int, float,
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
                     static_cast<const TestType*>(A_d),
                     static_cast<const TestType*>(B_d), C_d, NUM_ELM);
-
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

  HIP_CHECK(hipDeviceSynchronize());
@@ -567,7 +568,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpy_PinnedRegMemWithKernelLaunch",
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, 0, static_cast<const TestType*>(A_d),
                       static_cast<const TestType*>(B_d), C_d, NUM_ELM);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
    HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);

@@ -595,7 +596,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpy_PinnedRegMemWithKernelLaunch",
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, 0, static_cast<const TestType*>(X_d),
                       static_cast<const TestType*>(Y_d), Z_d, NUM_ELM);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));

    HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);
@@ -319,6 +319,7 @@ TEST_CASE("Unit_hipMemcpy_HalfMemCopy") {
    HIP_CHECK(hipMemcpyAsync(B_h, A_d,
                             (NUM_ELM/2)*sizeof(float),
                             hipMemcpyDeviceToHost, stream));
+    HIP_CHECK(hipDeviceSynchronize());
    HipTest::checkTest(A_h, B_h, NUM_ELM/2);
  }
  HipTest::freeArrays<float>(A_d, B_d, C_d, A_h, B_h, C_h, false);
@@ -50,6 +50,7 @@ void Thread_func(T *A_d, T *B_d, T* C_d, T* C_h, size_t Nbytes,
  hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                     dim3(threadsPerBlock), 0,
                     mystream, A_d, C_d, N_ELMTS);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
  // The following two MemcpyAsync calls are for sole
  // purpose of loading stream with multiple async calls
@@ -76,6 +77,7 @@ void Thread_func_MultiStream() {
  hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks),
                     dim3(threadsPerBlock), 0,
                     mystream, A_d, C_d, N_ELMTS);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
  // The following hipMemcpyAsync() is called only to
  // load stream with multiple Async calls
@@ -121,7 +123,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_KernelLaunch", "", int, float,
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1), 0, 0,
                     static_cast<const TestType*>(A_d),
                     static_cast<const TestType*>(B_d), C_d, NUM_ELM);
-
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipStreamSynchronize(stream));
  HIP_CHECK(hipStreamDestroy(stream));
@@ -348,7 +350,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_PinnedRegMemWithKernelLaunch",
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, 0, static_cast<const TestType*>(A_d),
                       static_cast<const TestType*>(B_d), C_d, NUM_ELM);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
    HipTest::checkVectorADD(A_h, B_h, C_h, NUM_ELM);

@@ -379,7 +381,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyAsync_PinnedRegMemWithKernelLaunch",
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, 0, static_cast<const TestType*>(X_d),
                       static_cast<const TestType*>(Y_d), Z_d, NUM_ELM);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpyAsync(C_h, Z_d, Nbytes,
                             hipMemcpyDeviceToHost, gpu1Stream));
    HIP_CHECK(hipStreamSynchronize(gpu1Stream));
@@ -66,6 +66,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyDtoD_Basic", "",
                         dim3(1), 0, 0,
                         static_cast<const TestType *>(A_d),
                         static_cast<const TestType *>(B_d), C_d, NUM_ELM);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
      HIP_CHECK(hipDeviceSynchronize());
      HipTest::checkVectorADD<TestType>(A_h, B_h, C_h, NUM_ELM);
@@ -80,6 +81,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyDtoD_Basic", "",
                         dim3(1), 0, 0,
                         static_cast<const TestType*>(X_d),
                         static_cast<const TestType*>(Y_d), Z_d, NUM_ELM);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
      HIP_CHECK(hipDeviceSynchronize());
      HipTest::checkVectorADD<TestType>(A_h, B_h, C_h, NUM_ELM);
@@ -68,6 +68,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyDtoDAsync_Basic", "",
                         dim3(1), 0, 0,
                         static_cast<const TestType *>(A_d),
                         static_cast<const TestType *>(B_d), C_d, NUM_ELM);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
      HIP_CHECK(hipDeviceSynchronize());
      HipTest::checkVectorADD<TestType>(A_h, B_h, C_h, NUM_ELM);
@@ -84,6 +85,7 @@ TEMPLATE_TEST_CASE("Unit_hipMemcpyDtoDAsync_Basic", "",
                         dim3(1), 0, 0,
                         static_cast<const TestType*>(X_d),
                         static_cast<const TestType*>(Y_d), Z_d, NUM_ELM);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, stream));
      HIP_CHECK(hipStreamSynchronize(stream));
      HIP_CHECK(hipDeviceSynchronize());
@@ -127,6 +127,7 @@ TEST_CASE("Unit_hipMemcpyPeer_Basic") {
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1),
          0, 0, static_cast<const int*>(A_d),
          static_cast<const int*>(B_d), C_d, numElements*sizeof(int));
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, C_d, numElements*sizeof(int),
               hipMemcpyDeviceToHost));
      HipTest::checkVectorADD<int>(A_h, B_h, C_h, numElements);
@@ -138,6 +139,7 @@ TEST_CASE("Unit_hipMemcpyPeer_Basic") {
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1),
          0, 0, static_cast<const int*>(X_d),
          static_cast<const int*>(Y_d), Z_d, numElements*sizeof(int));
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, Z_d, numElements*sizeof(int),
            hipMemcpyDeviceToHost));
      HipTest::checkVectorADD<int>(A_h, B_h, C_h, numElements);
@@ -143,6 +143,7 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Basic") {
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1),
          0, 0, static_cast<const int*>(A_d),
          static_cast<const int*>(B_d), C_d, numElements*sizeof(int));
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, C_d, numElements*sizeof(int),
               hipMemcpyDeviceToHost));
      HipTest::checkVectorADD<int>(A_h, B_h, C_h, numElements);
@@ -166,6 +167,7 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Basic") {
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1),
          0, 0, static_cast<const int*>(X_d),
          static_cast<const int*>(Y_d), Z_d, numElements*sizeof(int));
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, Z_d, numElements*sizeof(int),
               hipMemcpyDeviceToHost));
      HipTest::checkVectorADD<int>(A_h, B_h, C_h, numElements);
@@ -227,6 +229,7 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_StreamOnDiffDevice") {
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1),
          0, 0, static_cast<const int*>(A_d),
          static_cast<const int*>(B_d), C_d, numElements*sizeof(int));
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, C_d, numElements*sizeof(int),
               hipMemcpyDeviceToHost));
      HipTest::checkVectorADD<int>(A_h, B_h, C_h, numElements);
@@ -240,6 +243,7 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_StreamOnDiffDevice") {
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(1), dim3(1),
          0, 0, static_cast<const int*>(X_d),
          static_cast<const int*>(Y_d), Z_d, numElements*sizeof(int));
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipMemcpy(C_h, Z_d, numElements*sizeof(int),
               hipMemcpyDeviceToHost));

@@ -99,6 +99,7 @@ void TestwithOnestream(void) {
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, stream, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipStreamSynchronize(stream));
  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
  HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -136,6 +137,7 @@ void TestwithTwoStream(void) {
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < NUM_STREAMS; ++i) {
@@ -191,6 +193,7 @@ void TestDtoDonSameDevice(void) {
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < NUM_STREAMS; ++i) {
@@ -262,6 +265,7 @@ void TestOnMultiGPUwithOneStream(void) {
                         dim3(threadsPerBlock), 0, stream[i],
                         static_cast<const int*>(A_d[i]),
                         static_cast<const int*>(B_d[i]), C_d[i], N);
+      HIP_CHECK(hipGetLastError());
    }

    for (int i=0; i < NumDevices; ++i) {
@@ -298,6 +302,7 @@ void TestkindDtoH(void) {
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, stream, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipStreamSynchronize(stream));
  HIP_CHECK(hipMemcpyWithStream(C_h, C_d, Nbytes,
                               hipMemcpyDeviceToHost, stream));
@@ -365,6 +370,7 @@ void TestkindDtoD(void) {
                         dim3(threadsPerBlock),
                         0, stream[i], static_cast<const int*>(A_d[i]),
                         static_cast<const int*>(B_d[i]), C_d[i], N);
+      HIP_CHECK(hipGetLastError());
    }

    for (int i=0; i < NumDevices; ++i) {
@@ -411,6 +417,7 @@ void TestkindDefault(void) {
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, stream, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipStreamSynchronize(stream));
  HIP_CHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDefault, stream));
  HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -472,6 +479,7 @@ void TestkindDefaultForDtoD(void) {
                         dim3(threadsPerBlock),
                         0, stream[i], static_cast<const int*>(A_d[i]),
                         static_cast<const int*>(B_d[i]), C_d[i], N);
+      HIP_CHECK(hipGetLastError());
    }

    for (int i=0; i < NumDevices; ++i) {
@@ -167,6 +167,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestwithOnestream(bool &val_res) {
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, stream, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIPCHECK(hipStreamSynchronize(stream));
  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
  val_res = ValidateResult(A_h, B_h, C_h);
@@ -203,6 +204,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestwithTwoStream(bool &val_res) {
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < NoofStreams; ++i) {
@@ -258,6 +260,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestDtoDonSameDevice(bool &val_res) {
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < NoofStreams; ++i) {
@@ -329,6 +332,7 @@ void HipMemcpyWithStreamMultiThreadtests::
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < numDevices; ++i) {
@@ -363,6 +367,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDtoH(bool &val_res) {
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, stream, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIPCHECK(hipStreamSynchronize(stream));
  HIPCHECK(hipMemcpyWithStream(C_h, C_d, Nbytes,
                               hipMemcpyDeviceToHost, stream));
@@ -434,6 +439,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDtoD(bool &val_res) {
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < numDevices; ++i) {
@@ -481,6 +487,7 @@ void HipMemcpyWithStreamMultiThreadtests::
  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                     0, stream, static_cast<const int*>(A_d),
                     static_cast<const int*>(B_d), C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIPCHECK(hipStreamSynchronize(stream));
  HIPCHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDefault, stream));
  val_res = ValidateResult(A_h, B_h, C_h);
@@ -544,6 +551,7 @@ void HipMemcpyWithStreamMultiThreadtests::
    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
                       0, stream[i], static_cast<const int*>(A_d[i]),
                       static_cast<const int*>(B_d[i]), C_d[i], N);
+    HIP_CHECK(hipGetLastError());
  }

  for (int i=0; i < numDevices; ++i) {
@@ -53,6 +53,7 @@ TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
  std::cout << clkRate << std::endl;
  hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks),
                     0, 0, hostRes, clkRate);
+  HIP_CHECK(hipGetLastError());
  int eleCounter = 0;
  while (eleCounter < blocks) {
    // blocks until the value changes
@@ -31,7 +31,7 @@

 /* Defines */
 #define NUM_THREADS 1000
-#define ITER 100
+#define ITER 10
 #define NUM_H 256
 #define NUM_W 256

@@ -105,6 +105,7 @@ TEST_CASE("Unit_hipMemset2DAsync_WithKernel") {
    for (size_t k = 0; k < ITER; k++) {
      hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                   dim3(threadsPerBlock), 0, hipStreamPerThread, B_d, C_d, elements);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
      HIP_CHECK(hipMemset2DAsync(C_d, pitch_C, memsetval, NUM_W, NUM_H,
                hipStreamPerThread));
@@ -369,7 +369,7 @@ static void seekAndSet3DArrayPortion(bool bAsync) {
  myparms.srcPos = make_hipPos(0, 0, 0);
  myparms.dstPos = make_hipPos(0, 0, 0);
  myparms.dstPtr = make_hipPitchedPtr(array3D, sizeof(char) * arr_dimensions.x,
-                                      arr_dimensions.x, arr_dimensions.y);
+                                      arr_dimensions.y, arr_dimensions.z);
  myparms.srcPtr = devicePitchedPointer;
  myparms.extent = extent;
 #if HT_NVIDIA
@@ -84,6 +84,7 @@ static void threadFunc(hipStream_t stream, hipPitchedPtr devpPtr,

  hipLaunchKernelGGL(func_set_value, dim3(blocks), dim3(threadsPerBlock), 0,
                     stream, devpPtr, extent, memsetval);
+  HIP_CHECK(hipGetLastError());
  HIPCHECK(hipMemset3DAsync(devpPtr, testval, extent, stream));
  HIPCHECK(hipMemcpy3DAsync(&myparms, stream));
 }
@@ -92,6 +92,7 @@ static bool testhipMemsetAsyncWithKernel(bool UseStrmPerThrd) {
      hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                         dim3(threadsPerBlock), 0, hipStreamPerThread, obj.B_d,
                         obj.C_d, N);
+      HIP_CHECK(hipGetLastError());
      HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
      HIP_CHECK(hipMemsetAsync(obj.C_d , obj.memSetVal, N, hipStreamPerThread));
      HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
@@ -117,6 +118,7 @@ static bool testhipMemsetD32AsyncWithKernel() {
  for (int k = 0 ; k < ITER ; k++) {
    hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                    dim3(threadsPerBlock), 0, obj.stream, obj.B_d, obj.C_d, N);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)obj.C_d , obj.memSetVal,
                                N, obj.stream));
    HIP_CHECK(hipStreamSynchronize(obj.stream));
@@ -135,6 +137,7 @@ static bool testhipMemsetD16AsyncWithKernel() {
  for (int k = 0 ; k < ITER ; k++) {
    hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                    dim3(threadsPerBlock), 0, obj.stream, obj.B_d, obj.C_d, N);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)obj.C_d , obj.memSetVal,
                                                              N, obj.stream));
    HIP_CHECK(hipStreamSynchronize(obj.stream));
@@ -153,6 +156,7 @@ static bool testhipMemsetD8AsyncWithKernel() {
  for (int k = 0; k < ITER; k++) {
    hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                    dim3(threadsPerBlock), 0, obj.stream, obj.B_d, obj.C_d, N);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemsetD8Async((hipDeviceptr_t)obj.C_d, obj.memSetVal,
                                                              N, obj.stream));
    HIP_CHECK(hipStreamSynchronize(obj.stream));
@@ -138,6 +138,7 @@ TEST_CASE("Unit_hipPointerGetAttribute_KernelUpdation") {
                                   reinterpret_cast<hipDeviceptr_t>(A_d)));
  hipLaunchKernelGGL(var_update, dim3(1), dim3(1), 0, 0,
                     reinterpret_cast<int *>(data));
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());
  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
  for (unsigned int i = 0; i < N; i++) {
@@ -74,6 +74,7 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) {

        hipLaunchKernelGGL(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const T*>(A_d), static_cast<const T*>(B_d), C_d, numElements);
+        HIP_CHECK(hipGetLastError()); 

        MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);

@@ -56,6 +56,7 @@ void run1(size_t size, hipStream_t stream) {
    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
+    HIP_CHECK(hipGetLastError()); 
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipDeviceSynchronize());
@@ -97,7 +98,9 @@ void run(size_t size, hipStream_t stream1, hipStream_t stream2) {
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
+    HIP_CHECK(hipGetLastError()); 
    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
+    HIP_CHECK(hipGetLastError()); 
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
@@ -59,8 +59,10 @@ TEST_CASE("Unit_hipStreamCreate_MultistreamBasicFunctionalities") {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(hipAPIStreamDisableTest::kernel),
                       dim3(1), dim3(1), 0, streams[i], data[i], xd,
                       hipAPIStreamDisableTest::NN);
+    HIP_CHECK(hipGetLastError());
    hipLaunchKernelGGL(HIP_KERNEL_NAME(hipAPIStreamDisableTest::nKernel),
                       dim3(1), dim3(1), 0, 0, yd);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipStreamDestroy(streams[i]));
  }
  HIP_CHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
@@ -49,7 +49,9 @@ TEST_CASE("Unit_hipMultiStream_sameDevice") {
    HIP_CHECK(hipStreamCreate(&streams[i]));
    HIP_CHECK(hipMalloc(&data[i], NN * sizeof(float)));
    hipLaunchKernelGGL(kernel, dim3(1), dim3(1), 0, streams[i], data[i], xd, NN);
+    HIP_CHECK(hipGetLastError());
    hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipFree(data[i]));
    HIP_CHECK(hipStreamDestroy(streams[i]));
  }
@@ -80,9 +82,11 @@ TEST_CASE("Unit_hipMultiStream_multimeDevice") {
    HIP_CHECK(hipDeviceSynchronize());
    // Launch kernel with default stream
    hipLaunchKernelGGL(kernel_do_nothing, dim3(1), dim3(1), 0, 0, 1);
+    HIP_CHECK(hipGetLastError());
    // Launch kernel on all streams
    for (int i = 0; i < nStreams; i++) {
      hipLaunchKernelGGL(kernel_do_nothing, dim3(1), dim3(1), 0, streams[i], 1);
+      HIP_CHECK(hipGetLastError());
    }
    // Sync stream 1
    HIP_CHECK(hipStreamSynchronize(streams[0]));
@@ -131,7 +131,7 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
  hipLaunchKernelGGL((device_function), dim3(blocks),
                     dim3(threadsPerBlock), 0,
                     mystream, C_d, A_d, N);
-
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(
  hipMemcpyAsync(C1_h, C_d, Nbytes,
                 hipMemcpyDeviceToHost, mystream));
@@ -86,7 +86,7 @@ bool testStreamCallbackFunctionality(bool isDefault) {
    const unsigned threadsPerBlock = 256;
    hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks),
                        dim3(threadsPerBlock), 0, 0, A_d, C_d, NSize);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
                            0));
    HIP_CHECK(hipStreamAddCallback(0, Callback, nullptr, 0));
@@ -102,7 +102,7 @@ bool testStreamCallbackFunctionality(bool isDefault) {
    const unsigned threadsPerBlock = 256;
    hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks),
                      dim3(threadsPerBlock), 0, mystream, A_d, C_d, NSize);
-
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
                            mystream));
    HIP_CHECK(hipStreamAddCallback(mystream, Callback, nullptr, 0));
@@ -146,6 +146,7 @@ void funcTestsForAllPriorityLevelsWrtNullStrm(unsigned int flags,
    hipLaunchKernelGGL((HipTest::vector_square), dim3(GRIDSIZE),
                        dim3(BLOCKSIZE), 0, stream[idx], A_d[idx],
                        C_d[idx], MEMCPYSIZE2);
+    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpyAsync(C_h[idx], C_d[idx], size,
            hipMemcpyDeviceToHost, stream[idx]));
  }
@@ -227,6 +228,7 @@ void queueTasksInStreams(std::vector<hipStream_t> *stream,
    hipLaunchKernelGGL((HipTest::vector_square), dim3(GRIDSIZE),
                        dim3(BLOCKSIZE), 0, (*stream)[idx], A_d[idx],
                        C_d[idx], MEMCPYSIZE2);
+    HIP_CHECK(hipGetLastError());
    HIPCHECK(hipMemcpyAsync(C_h[idx], C_d[idx], size,
             hipMemcpyDeviceToHost, (*stream)[idx]));
  }
@@ -426,6 +428,7 @@ bool validateStreamPrioritiesWithEvents() {
        hipLaunchKernelGGL((memcpy_kernel<T>), dim3(GRIDSIZE), \
        dim3(BLOCKSIZE), 0, stream_##x, dst_d_##x + j, src_d_##x + j, \
        (MEMCPYSIZE / sizeof(T))); \
+        HIP_CHECK(hipGetLastError()); \
      }
    OP(low)
    OP(normal)
@@ -157,6 +157,7 @@ TEST_CASE("Unit_hipExtStreamCreateWithCUMask_ValidateCallbackFunc") {
  const unsigned threadsPerBlock = BLOCKSIZE;
  hipLaunchKernelGGL((HipTest::vector_square), dim3(blocks),
                     dim3(threadsPerBlock), 0, mystream, A_d, C_d, N);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
          mystream));
  HIP_CHECK(hipStreamAddCallback(mystream, Callback, nullptr, 0));
@@ -250,6 +251,7 @@ TEST_CASE("Unit_hipExtStreamCreateWithCUMask_Functionality") {

  hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
                     dim3(threadsPerBlock), 0, streams[0], dA[0], dC[0], N);
+  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipDeviceSynchronize());

  auto single_end = std::chrono::steady_clock::now();
@@ -269,6 +271,7 @@ TEST_CASE("Unit_hipExtStreamCreateWithCUMask_Functionality") {
    << streams[np] << " with CU mask: 0x" << ss[np].str().c_str());
    hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks),
    dim3(threadsPerBlock), 0, streams[np], dA[np], dC[np], N);
+    HIP_CHECK(hipGetLastError());
  }
  HIP_CHECK(hipDeviceSynchronize());

@@ -66,6 +66,7 @@ TEST_CASE("Unit_hipBindTexture2D_Pitch") {

  hipLaunchKernelGGL(texture2dCopyKernel, dim3(4, 4, 1), dim3(32, 32, 1),
                                                               0, 0, devPtrB);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());
  HIP_CHECK(hipMemcpy2D(B, SIZE_W*sizeof(TYPE_t), devPtrB,
                                 SIZE_W*sizeof(TYPE_t), SIZE_W*sizeof(TYPE_t),
@@ -63,6 +63,7 @@ TEST_CASE("Unit_hipBindTexture_tex1DfetchVerification") {
  dim3 dimGrid(N / dimBlock.x, 1, 1);

  hipLaunchKernelGGL(kernel, dim3(dimGrid), dim3(dimBlock), 0, 0, devBuf);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());
  HIP_CHECK(hipMemcpy(output, devBuf, N * sizeof(float),
                                                  hipMemcpyDeviceToHost));
@@ -118,6 +118,7 @@ static void textureTest(texture<T, hipTextureType1D,

  hipLaunchKernelGGL(normalizedValTextureTest<T>, dim3(1, 1, 1),
                     dim3(SIZE, 1, 1), 0, 0, SIZE, dOutputData);
+  HIP_CHECK(hipGetLastError()); 

  float *hOutputData = new float[SIZE];
  REQUIRE(hOutputData != nullptr);
@@ -86,9 +86,11 @@ TEST_CASE("Unit_hipSimpleTexture2DLayered_Check") {

  dim3 dimBlock(8, 8, 1);
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
-  for (unsigned int layer = 0; layer < num_layers; layer++)
+  for (unsigned int layer = 0; layer < num_layers; layer++) {
    hipLaunchKernelGGL(simpleKernelLayeredArray, dimGrid, dimBlock, 0, 0,
                       dData, width, height, layer);
+    HIP_CHECK(hipGetLastError()); 
+  }
  HIP_CHECK(hipDeviceSynchronize());

  // Allocate mem for the result on host side
@@ -92,6 +92,7 @@ static void runSimpleTexture3D_Check(int width, int height, int depth,

  hipLaunchKernelGGL(simpleKernel3DArray, dim3(1, 1, 1), dim3(1, 1, 1),
                     0, 0, dData, width, height, depth);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());

  // Allocate mem for the result on host side
@@ -79,6 +79,7 @@ static void runTest(hipTextureAddressMode addressMode,

  hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0,
                     texBufOut, texObj);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());

  HIP_CHECK(hipMemcpy(output, texBufOut, N * sizeof(float),
@@ -85,6 +85,7 @@ TEMPLATE_TEST_CASE("Unit_hipTexObjPitch_texture2D", "", float, int,

  hipLaunchKernelGGL(texture2dCopyKernel, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0,
          texObj, devPtrB);
+  HIP_CHECK(hipGetLastError()); 

  HIP_CHECK(hipMemcpy2D(B, SIZE_W*sizeof(TestType), devPtrB,
                        SIZE_W*sizeof(TestType), SIZE_W*sizeof(TestType),
@@ -97,6 +97,7 @@ static void runMipMapTest(unsigned int width, unsigned int height, unsigned int

  hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, textureObject, width,
                     (2 * mipmap_level));
+  HIP_CHECK(hipGetLastError()); 
  hipDeviceSynchronize();

  float* hOutputData = reinterpret_cast<float*>(malloc(size));
@@ -73,6 +73,7 @@ static void runTest(const int width, const float offsetX) {

  hipLaunchKernelGGL(tex1DKernel<normalizedCoords>, dimGrid, dimBlock, 0, 0, dData,
                     textureObject, width, offsetX);
+  HIP_CHECK(hipGetLastError()); 

  HIP_CHECK(hipDeviceSynchronize());

@@ -113,9 +113,11 @@ static void runTest(const int width, const float offsetX = 0) {
  if (resType == hipResourceTypeArray) {
    hipLaunchKernelGGL(tex1DRGBAKernel<normalizedCoords>, dimGrid, dimBlock,
                       0, 0, dData, textureObject, width, offsetX);
+    HIP_CHECK(hipGetLastError()); 
  } else {
    hipLaunchKernelGGL(tex1DRGBAKernelFetch, dimGrid, dimBlock,
                       0, 0, dData, textureObject, offsetX);
+    HIP_CHECK(hipGetLastError()); 
  }

  HIP_CHECK(hipDeviceSynchronize());
@@ -69,6 +69,7 @@ TEST_CASE("Unit_hipCreateTextureObject_tex1DfetchVerification") {

  hipLaunchKernelGGL(tex1dKernel, dim3(dimGrid), dim3(dimBlock), 0, 0,
                     texBufOut, texObj);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());

  HIP_CHECK(hipMemcpy(output, texBufOut, N * sizeof(float),
@@ -85,6 +85,7 @@ TEST_CASE("Unit_hipTextureObj2D_Check") {

  hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock),
                     0, 0, dData, textureObject, width);
+  HIP_CHECK(hipGetLastError()); 

  HIP_CHECK(hipDeviceSynchronize());
  HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost));
@@ -81,6 +81,7 @@ static void runTest(const int width, const int height, const float offsetX, cons

  hipLaunchKernelGGL(tex2DKernel<normalizedCoords>, dimGrid, dimBlock, 0, 0, dData,
                     textureObject, width, height, offsetX, offsetY);
+  HIP_CHECK(hipGetLastError()); 

  HIP_CHECK(hipDeviceSynchronize());

@@ -86,6 +86,7 @@ static void runTest(const int width, const int height, const float offsetX, cons

  hipLaunchKernelGGL(tex2DRGBAKernel<normalizedCoords>, dimGrid, dimBlock, 0, 0, dData,
                     textureObject, width, height, offsetX, offsetY);
+  HIP_CHECK(hipGetLastError()); 

  HIP_CHECK(hipDeviceSynchronize());

@@ -113,6 +113,7 @@ static void runTest(const int width, const int height, const int depth, const fl

  hipLaunchKernelGGL(tex3DKernel<normalizedCoords>, dimGrid, dimBlock, 0, 0, dData,
                     textureObject, width, height, depth, offsetX, offsetY, offsetZ);
+  HIP_CHECK(hipGetLastError()); 

  HIP_CHECK(hipDeviceSynchronize());

@@ -178,6 +178,7 @@ bool runTest() {

  hipLaunchKernelGGL(tex1dKernelFetch<T>, dimGrid, dimBlock, 0, 0, texBufOut,
                     texObj, N);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());

  HIP_CHECK(hipMemcpy(output, texBufOut, N * sizeof(T), hipMemcpyDeviceToHost));
@@ -69,6 +69,7 @@ TEST_CASE("Unit_hipTextureRef2D_Check") {
  dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
  hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0,
                     dData, width);
+  HIP_CHECK(hipGetLastError()); 
  HIP_CHECK(hipDeviceSynchronize());

  float* hOutputData = reinterpret_cast<float*>(malloc(size));