SWDEV-298338 - Fix memory test failure on NV MGPUs (#2339)

1. Fix hipMallocManagedMultiChunkMultiDevice
Add device setting before kernel launching
Add stream sync

2. Fix hipMemcpyAtoH and hipMemcpyHtoA
Fix primary context initialization issue when device
is changed on cuda

Change-Id: I3fe6dbc35b7b24abb21ada297b7885df83d28152

[ROCm/hip commit: ae73179d40]
Cette révision appartient à :
TomSang
2021-09-14 04:07:46 -04:00
révisé par GitHub
Parent 588a6c250e
révision 40812160de
3 fichiers modifiés avec 31 ajouts et 1 suppressions
+7 -1
Voir le fichier
@@ -75,11 +75,17 @@ bool MultiChunkMultiDevice(int NumDevices) {
const unsigned threadsPerBlock = 256;
const unsigned blocks = (NUM_ELMS + 255)/256;
for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
// If without setting device, Hmm value will be read as 0 in kernel on
// GPU where Hmm isn't allocated by hipMallocManaged(). This looks like
// a bug of cuda. The following line is to fix the bug on cuda only.
HIPCHECK(hipSetDevice(Klaunch));
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
(&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
}
HIPCHECK(hipDeviceSynchronize());
for (int m = 0; m < NumDevices; ++m) {
HIPCHECK(hipStreamSynchronize(stream[m]));
HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
hipMemcpyDeviceToHost));
for (int n = 0; n < NUM_ELMS; ++n) {
+12
Voir le fichier
@@ -125,8 +125,20 @@ bool MemcpyAtoH<T>::hipMemcpyAtoH_PeerDeviceContext() {
printf("Skipped the test as there is no peer access\n");
} else {
HIPCHECK(hipSetDevice(0));
unsigned int flags = 0;
HIPCHECK(hipGetDeviceFlags(&flags));
AllocateMemory();
HIPCHECK(hipSetDevice(1));
// hipMemcpyAtoH will invoke cuda driver api cuMemcpyAtoH() which need
// the primary context for device 1. The primary context can be
// initialized at the first call of a runtime api through hipSetDeviceFlags().
// Because of no runtime api called before cuMemcpyAtoH(), we have to
// explicitly call hipSetDeviceFlags().
HIPCHECK(hipSetDeviceFlags(flags)); // Only cuda driver api need this
HIPCHECK(hipMemcpyAtoH(B_h, A_d, 0, BYTE_COUNT*sizeof(T)));
TestPassed = ValidateResult(B_h, hData[0]);
DeAllocateMemory();
+12
Voir le fichier
@@ -133,8 +133,20 @@ bool MemcpyHtoA<T>::hipMemcpyHtoA_PeerDeviceContext() {
printf("Skipped the test as there is no peer access\n");
} else {
HIPCHECK(hipSetDevice(0));
unsigned int flags = 0;
HIPCHECK(hipGetDeviceFlags(&flags));
AllocateMemory();
HIPCHECK(hipSetDevice(1));
// hipMemcpyAtoH will invoke cuda driver api hipMemcpyHtoA() which need
// the primary context for device 1. The primary context can be
// initialized at the first call of a runtime api through hipSetDeviceFlags().
// Because of no runtime api called before hipMemcpyHtoA(), we have to
// explicitly call hipSetDeviceFlags().
HIPCHECK(hipSetDeviceFlags(flags)); // Only cuda driver api need this
HIPCHECK(hipMemcpyHtoA(A_d, 0, B_h, BYTECOUNT*sizeof(T)));
HIPCHECK(hipMemcpy2DFromArray(A_h, sizeof(T)*NUM_W, A_d,
0, 0, sizeof(T)*NUM_W, 1, hipMemcpyDeviceToHost));