SWDEV-298338 - Fix memory test failure on NV MGPUs (#2339)
1. Fix hipMallocManagedMultiChunkMultiDevice
Add device setting before kernel launching
Add stream sync
2. Fix hipMemcpyAtoH and hipMemcpyHtoA
Fix primary context initialization issue when device
is changed on cuda
Change-Id: I3fe6dbc35b7b24abb21ada297b7885df83d28152
[ROCm/hip commit: ae73179d40]
Cette révision appartient à :
@@ -75,11 +75,17 @@ bool MultiChunkMultiDevice(int NumDevices) {
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (NUM_ELMS + 255)/256;
|
||||
for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
|
||||
|
||||
// If without setting device, Hmm value will be read as 0 in kernel on
|
||||
// GPU where Hmm isn't allocated by hipMallocManaged(). This looks like
|
||||
// a bug of cuda. The following line is to fix the bug on cuda only.
|
||||
HIPCHECK(hipSetDevice(Klaunch));
|
||||
|
||||
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
|
||||
(&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (int m = 0; m < NumDevices; ++m) {
|
||||
HIPCHECK(hipStreamSynchronize(stream[m]));
|
||||
HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
for (int n = 0; n < NUM_ELMS; ++n) {
|
||||
|
||||
@@ -125,8 +125,20 @@ bool MemcpyAtoH<T>::hipMemcpyAtoH_PeerDeviceContext() {
|
||||
printf("Skipped the test as there is no peer access\n");
|
||||
} else {
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
|
||||
unsigned int flags = 0;
|
||||
HIPCHECK(hipGetDeviceFlags(&flags));
|
||||
|
||||
AllocateMemory();
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
|
||||
// hipMemcpyAtoH will invoke cuda driver api cuMemcpyAtoH() which need
|
||||
// the primary context for device 1. The primary context can be
|
||||
// initialized at the first call of a runtime api through hipSetDeviceFlags().
|
||||
// Because of no runtime api called before cuMemcpyAtoH(), we have to
|
||||
// explicitly call hipSetDeviceFlags().
|
||||
HIPCHECK(hipSetDeviceFlags(flags)); // Only cuda driver api need this
|
||||
|
||||
HIPCHECK(hipMemcpyAtoH(B_h, A_d, 0, BYTE_COUNT*sizeof(T)));
|
||||
TestPassed = ValidateResult(B_h, hData[0]);
|
||||
DeAllocateMemory();
|
||||
|
||||
@@ -133,8 +133,20 @@ bool MemcpyHtoA<T>::hipMemcpyHtoA_PeerDeviceContext() {
|
||||
printf("Skipped the test as there is no peer access\n");
|
||||
} else {
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
|
||||
unsigned int flags = 0;
|
||||
HIPCHECK(hipGetDeviceFlags(&flags));
|
||||
|
||||
AllocateMemory();
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
|
||||
// hipMemcpyAtoH will invoke cuda driver api hipMemcpyHtoA() which need
|
||||
// the primary context for device 1. The primary context can be
|
||||
// initialized at the first call of a runtime api through hipSetDeviceFlags().
|
||||
// Because of no runtime api called before hipMemcpyHtoA(), we have to
|
||||
// explicitly call hipSetDeviceFlags().
|
||||
HIPCHECK(hipSetDeviceFlags(flags)); // Only cuda driver api need this
|
||||
|
||||
HIPCHECK(hipMemcpyHtoA(A_d, 0, B_h, BYTECOUNT*sizeof(T)));
|
||||
HIPCHECK(hipMemcpy2DFromArray(A_h, sizeof(T)*NUM_W, A_d,
|
||||
0, 0, sizeof(T)*NUM_W, 1, hipMemcpyDeviceToHost));
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur