diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp index 96f1ffda6c..cb10682ef9 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp @@ -75,11 +75,17 @@ bool MultiChunkMultiDevice(int NumDevices) { const unsigned threadsPerBlock = 256; const unsigned blocks = (NUM_ELMS + 255)/256; for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) { + + // If without setting device, Hmm value will be read as 0 in kernel on + // GPU where Hmm isn't allocated by hipMallocManaged(). This looks like + // a bug of cuda. The following line is to fix the bug on cuda only. + HIPCHECK(hipSetDevice(Klaunch)); + vector_sum <<>> (&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS); } - HIPCHECK(hipDeviceSynchronize()); for (int m = 0; m < NumDevices; ++m) { + HIPCHECK(hipStreamSynchronize(stream[m])); HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost)); for (int n = 0; n < NUM_ELMS; ++n) { diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp index 0091633f0e..0e5ba8e453 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyAtoH.cpp @@ -125,8 +125,20 @@ bool MemcpyAtoH::hipMemcpyAtoH_PeerDeviceContext() { printf("Skipped the test as there is no peer access\n"); } else { HIPCHECK(hipSetDevice(0)); + + unsigned int flags = 0; + HIPCHECK(hipGetDeviceFlags(&flags)); + AllocateMemory(); HIPCHECK(hipSetDevice(1)); + + // hipMemcpyAtoH will invoke cuda driver api cuMemcpyAtoH() which need + // the primary context for device 1. The primary context can be + // initialized at the first call of a runtime api through hipSetDeviceFlags(). + // Because of no runtime api called before cuMemcpyAtoH(), we have to + // explicitly call hipSetDeviceFlags(). + HIPCHECK(hipSetDeviceFlags(flags)); // Only cuda driver api need this + HIPCHECK(hipMemcpyAtoH(B_h, A_d, 0, BYTE_COUNT*sizeof(T))); TestPassed = ValidateResult(B_h, hData[0]); DeAllocateMemory(); diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp index 41559330f6..245ac2f13c 100644 --- a/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp +++ b/projects/hip/tests/src/runtimeApi/memory/hipMemcpyHtoA.cpp @@ -133,8 +133,20 @@ bool MemcpyHtoA::hipMemcpyHtoA_PeerDeviceContext() { printf("Skipped the test as there is no peer access\n"); } else { HIPCHECK(hipSetDevice(0)); + + unsigned int flags = 0; + HIPCHECK(hipGetDeviceFlags(&flags)); + AllocateMemory(); HIPCHECK(hipSetDevice(1)); + + // hipMemcpyAtoH will invoke cuda driver api hipMemcpyHtoA() which need + // the primary context for device 1. The primary context can be + // initialized at the first call of a runtime api through hipSetDeviceFlags(). + // Because of no runtime api called before hipMemcpyHtoA(), we have to + // explicitly call hipSetDeviceFlags(). + HIPCHECK(hipSetDeviceFlags(flags)); // Only cuda driver api need this + HIPCHECK(hipMemcpyHtoA(A_d, 0, B_h, BYTECOUNT*sizeof(T))); HIPCHECK(hipMemcpy2DFromArray(A_h, sizeof(T)*NUM_W, A_d, 0, 0, sizeof(T)*NUM_W, 1, hipMemcpyDeviceToHost));