diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
index 96f1ffda6c..cb10682ef9 100644
--- a/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
+++ b/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
@@ -75,11 +75,17 @@ bool MultiChunkMultiDevice(int NumDevices) {
   const unsigned threadsPerBlock = 256;
   const unsigned blocks = (NUM_ELMS + 255)/256;
   for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
+
+    // If without setting device, Hmm value will be read as 0 in kernel on
+    // GPU where Hmm isn't allocated by hipMallocManaged(). This looks like
+    // a bug of cuda. The following line is to fix the bug on cuda only.
+    HIPCHECK(hipSetDevice(Klaunch));
+
     vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
                       (&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
   }
-  HIPCHECK(hipDeviceSynchronize());
   for (int m = 0; m < NumDevices; ++m) {
+    HIPCHECK(hipStreamSynchronize(stream[m]));
     HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
                        hipMemcpyDeviceToHost));
     for (int n = 0; n < NUM_ELMS; ++n) {
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp
index 0091633f0e..0e5ba8e453 100644
--- a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp
+++ b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp
@@ -125,8 +125,20 @@ bool MemcpyAtoH<T>::hipMemcpyAtoH_PeerDeviceContext() {
       printf("Skipped the test as there is no peer access\n");
     } else {
       HIPCHECK(hipSetDevice(0));
+
+      unsigned int flags = 0;
+      HIPCHECK(hipGetDeviceFlags(&flags));
+
       AllocateMemory();
       HIPCHECK(hipSetDevice(1));
+
+      // hipMemcpyAtoH will invoke cuda driver api cuMemcpyAtoH() which need
+      // the primary context for device 1. The primary context can be
+      // initialized at the first call of a runtime api through hipSetDeviceFlags().
+      // Because of no runtime api called before cuMemcpyAtoH(), we have to
+      // explicitly call hipSetDeviceFlags().
+      HIPCHECK(hipSetDeviceFlags(flags));  // Only cuda driver api need this
+
       HIPCHECK(hipMemcpyAtoH(B_h, A_d, 0, BYTE_COUNT*sizeof(T)));
       TestPassed = ValidateResult(B_h, hData[0]);
       DeAllocateMemory();
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp
index 41559330f6..245ac2f13c 100644
--- a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp
+++ b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp
@@ -133,8 +133,20 @@ bool MemcpyHtoA<T>::hipMemcpyHtoA_PeerDeviceContext() {
       printf("Skipped the test as there is no peer access\n");
     } else {
       HIPCHECK(hipSetDevice(0));
+
+      unsigned int flags = 0;
+      HIPCHECK(hipGetDeviceFlags(&flags));
+
       AllocateMemory();
       HIPCHECK(hipSetDevice(1));
+
+      // hipMemcpyAtoH will invoke cuda driver api hipMemcpyHtoA() which need
+      // the primary context for device 1. The primary context can be
+      // initialized at the first call of a runtime api through hipSetDeviceFlags().
+      // Because of no runtime api called before hipMemcpyHtoA(), we have to
+      // explicitly call hipSetDeviceFlags().
+      HIPCHECK(hipSetDeviceFlags(flags));  // Only cuda driver api need this
+
       HIPCHECK(hipMemcpyHtoA(A_d, 0, B_h, BYTECOUNT*sizeof(T)));
       HIPCHECK(hipMemcpy2DFromArray(A_h, sizeof(T)*NUM_W, A_d,
       0, 0, sizeof(T)*NUM_W, 1, hipMemcpyDeviceToHost));