SWDEV-261391 - Add Cache Coherency Tests for gfx90a
Add a test for fine-grained device to device coherency. Add a test for fine-grained host to device coherency. Change-Id: I62482cae917fa19feaa17adb53f3084527ad8fda
This commit is contained in:
committed by
Aaron En Ye Shi
parent
ca50ac83c7
commit
eed3fe6da9
@@ -0,0 +1,264 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Simple test for Fine Grained CPU-GPU coherency.
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp HIPCC_OPTIONS -std=c++11 -lpthread EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
#include <stdio.h>
|
||||
#include <thread>
|
||||
|
||||
typedef _Atomic(unsigned int) atomic_uint;
|
||||
|
||||
// Helper function to spin on address until address equals value.
|
||||
// If the address holds the value of -1, abort because the other thread failed.
|
||||
__device__ void
|
||||
gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
|
||||
unsigned int value) {
|
||||
unsigned int compare;
|
||||
bool check = false;
|
||||
do {
|
||||
compare = value;
|
||||
check = __opencl_atomic_compare_exchange_strong(
|
||||
(atomic_uint*)address, /*expected=*/ &compare, /*desired=*/ value,
|
||||
__ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
|
||||
/*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
if (compare == -1)
|
||||
abort();
|
||||
} while(!check);
|
||||
}
|
||||
|
||||
// This kernel requires a single block, single thread dispatch.
|
||||
__global__ void
|
||||
gpu_kernel(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
// Store data into A, system fence, and atomically mark flag.
|
||||
// This guarantees this global write is visible by device 1.
|
||||
A[i] = X[i];
|
||||
__opencl_atomic_fetch_add((atomic_uint*)AA1, 1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on device 1's global write to B.
|
||||
gpu_spin_loop_or_abort_on_negative_one(BA1, i+1);
|
||||
|
||||
// Check device 1 properly stored Y into B.
|
||||
bool stored_data_matches = (B[i] == Y[i]);
|
||||
if(!stored_data_matches) {
|
||||
// If the data does not match, alert other thread and abort.
|
||||
printf("FAIL: at i=%lu, B[i]=%d, which does not match Y[i]=%d.\n",
|
||||
i, B[i], Y[i]);
|
||||
__opencl_atomic_exchange((atomic_uint*)AA2, -1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
abort();
|
||||
}
|
||||
// Otherwise tell the other thread to continue.
|
||||
__opencl_atomic_fetch_add((atomic_uint*)AA2, 1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on kernel gpu_cache1 to finish checking X is stored in A.
|
||||
gpu_spin_loop_or_abort_on_negative_one(BA2, i+1);
|
||||
}
|
||||
}
|
||||
|
||||
// Host-side busy wait: returns once *address is observed to hold `value`.
// Calls abort() whenever the value read back is -1, the marker the peer
// stores into its flag when its own data check fails.
__host__ void
cpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
                                       unsigned int value) {
  for (;;) {
    // Exchanging `value` for `value` never alters the flag; on a mismatch
    // `observed` receives the flag's current contents instead.
    unsigned int observed = value;
    const bool matched = __atomic_compare_exchange_n(
        address, /*expected=*/ &observed, /*desired=*/ value,
        /*weak=*/ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);

    // The failure marker is checked before the success test, every iteration.
    if (observed == (unsigned int)-1)
      abort();
    if (matched)
      break;
  }
}
|
||||
|
||||
// This host thread runs only on a single CPU thread.
|
||||
__host__ void
|
||||
cpu_thread(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
B[i] = Y[i];
|
||||
__atomic_fetch_add(BA1, 1, __ATOMIC_RELEASE);
|
||||
cpu_spin_loop_or_abort_on_negative_one(AA1, i+1);
|
||||
|
||||
bool stored_data_matches = (A[i] == X[i]);
|
||||
if(!stored_data_matches) {
|
||||
printf("FAIL: at i=%lu, A[i]=%d, which does not match X[i]=%d.\n",
|
||||
i, A[i], X[i]);
|
||||
__atomic_exchange_n(BA2, -1, __ATOMIC_RELEASE);
|
||||
abort();
|
||||
}
|
||||
__atomic_fetch_add(BA2, 1, __ATOMIC_RELEASE);
|
||||
cpu_spin_loop_or_abort_on_negative_one(AA2, i+1);
|
||||
}
|
||||
}
|
||||
|
||||
// This test runs on gfx90a where XGMI enables fine-grained communication
|
||||
// between GPUs. This performs a message passing test.
|
||||
// Array A is allocated on Device 0, and remotely on host.
|
||||
// Device 0 also increments atomic ints AA1 and AA2.
|
||||
// Array B is allocated on host, and remotely on Device 0.
|
||||
// Host also increments atomic ints BA1 and BA2.
|
||||
// Kernel will launch on Device 0, and store array X into array A.
|
||||
// Host Thread will store array Y into array B.
|
||||
// Kernel will validate that the correct values of array Y are stored in B.
|
||||
// Host Thread will validate that the correct values of array X are stored in A.
|
||||
|
||||
bool cpu_to_gpu_coherency() {
|
||||
int *A_d, *B_d, *X_d, *Y_d;
|
||||
int *A_res, *A_h, *B_h, *X_h, *Y_h;
|
||||
size_t N = 1024;
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int numDevices = 0;
|
||||
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
if (numDevices < 1) {
|
||||
printf("info: no GPU detected.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Skip this test if device is not gfx90a.
|
||||
static int device0 = 0;
|
||||
hipDeviceProp_t props;
|
||||
HIPCHECK(hipGetDeviceProperties(&props, device0));
|
||||
if (strncmp(props.gcnArchName, "gfx90a", 6) != 0) {
|
||||
printf("info: skipping test on non-gfx90a devices.\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allocate Host Side Memory. Coherent Fine-grained Memory for array B.
|
||||
printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
HIPCHECK(hipHostMalloc(&B_h, Nbytes, (hipHostMallocCoherent | hipHostMallocMapped)));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&B_d, B_h, 0));
|
||||
X_h = (int*)malloc(Nbytes); HIPCHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
Y_h = (int*)malloc(Nbytes); HIPCHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
|
||||
// Initialize the arrays and atomic variables.
|
||||
for (size_t i = 0; i < N; i++)
|
||||
{
|
||||
X_h[i] = 100000000 + i;
|
||||
Y_h[i] = 300000000 + i;
|
||||
}
|
||||
|
||||
// Initialize shared atomic flags between CPU and GPU.
|
||||
unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
|
||||
unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
|
||||
HIPCHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&AA1_d, AA1_h, 0)); *AA1_h = 0;
|
||||
HIPCHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&AA2_d, AA2_h, 0)); *AA2_h = 0;
|
||||
HIPCHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&BA1_d, BA1_h, 0)); *BA1_h = 0;
|
||||
HIPCHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&BA2_d, BA2_h, 0)); *BA2_h = 0;
|
||||
|
||||
// Skip the first stream, ensure stream is non-blocking.
|
||||
hipStream_t stream[2];
|
||||
HIPCHECK(hipStreamCreate(&stream[0]));
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
|
||||
|
||||
// Allocate Device Side Memory. Coherent Fine-grained Memory for array A.
|
||||
printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
hipError_t status = hipExtMallocWithFlags((void**)&A_d, Nbytes, hipDeviceMallocFinegrained);
|
||||
if (status == hipErrorOutOfMemory || A_d == 0 ){
|
||||
printf("info: device fine-grained memory not supported on this config\n");
|
||||
printf("info: skipping this CPU-GPU coherency test\n");
|
||||
return true;
|
||||
} else if (status != hipSuccess) {
|
||||
printf("error: failed to allocate device fine-grain memory\n");
|
||||
return false;
|
||||
}
|
||||
// SVM memory - host pointer is the same as device pointer to array A.
|
||||
A_h = A_d;
|
||||
HIPCHECK(hipMalloc(&X_d, Nbytes));
|
||||
HIPCHECK(hipMalloc(&Y_d, Nbytes));
|
||||
|
||||
printf("info: copy Host2Device\n");
|
||||
HIPCHECK(hipMemcpy(X_d, X_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(Y_d, Y_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
// Launch the GPU kernel.
|
||||
printf("info: prepare args and launch params for device 0\n");
|
||||
const unsigned blocks = 1;
|
||||
const unsigned threadsPerBlock = 1;
|
||||
printf("info: launch gpu_kernel\n");
|
||||
hipLaunchKernelGGL(gpu_kernel, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[1],
|
||||
A_d, B_d, X_d, Y_d, N,
|
||||
AA1_d, AA2_d, BA1_d, BA2_d);
|
||||
// Check if launch failed.
|
||||
HIPCHECK(hipGetLastError());
|
||||
|
||||
// Do not sync the launched stream, instead run the cpu_thread.
|
||||
printf("info: start cpu_thread\n");
|
||||
std::thread host_thread(cpu_thread,
|
||||
A_h, B_h, X_h, Y_h, N,
|
||||
AA1_h, AA2_h, BA1_h, BA2_h);
|
||||
host_thread.detach();
|
||||
|
||||
// Wait for Device side to finish.
|
||||
HIPCHECK(hipStreamSynchronize(stream[1]));
|
||||
|
||||
// Evaluate the resultant arrays A and B.
|
||||
printf("info: copy Device2Host\n");
|
||||
A_res = (int*)malloc(Nbytes); HIPCHECK(A_res == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
HIPCHECK(hipMemcpy(A_res, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
printf("info: check result\n");
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
assert(A_res[i] == (100000000 + i));
|
||||
assert(B_h[i] == (300000000 + i));
|
||||
}
|
||||
|
||||
// Free all the device and host memory allocated.
|
||||
HIPCHECK(hipFree(A_d));
|
||||
HIPCHECK(hipFree(X_d));
|
||||
HIPCHECK(hipFree(Y_d));
|
||||
HIPCHECK(hipHostFree(AA1_h));
|
||||
HIPCHECK(hipHostFree(AA2_h));
|
||||
HIPCHECK(hipHostFree(BA1_h));
|
||||
HIPCHECK(hipHostFree(BA2_h));
|
||||
HIPCHECK(hipHostFree(B_h));
|
||||
free(X_h);
|
||||
free(Y_h);
|
||||
free(A_res);
|
||||
|
||||
printf("info: finished CPU-GPU test!\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Entry point: runs the CPU-GPU coherency test and reports the result.
// Exits with status 0 on success (or skip) and 1 on failure.
int main(int argc, char *argv[]) {
  // Coherency between CPU and GPU sharing host and device memory.
  // The result is kept in `ok` so the local does not share a name with
  // the passed() helper from test_common.h.
  const bool ok = cpu_to_gpu_coherency();

  if (ok) {
    // NOTE(review): passed() comes from test_common.h (not visible here);
    // the braces keep all of it conditional if it expands to several
    // statements.
    passed();
  }
  // Returning the bool directly would yield 0 (success) on failure.
  return ok ? 0 : 1;
}
|
||||
@@ -0,0 +1,282 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Simple test for Fine Grained GPU-GPU coherency.
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp HIPCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvidia
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
#include <stdio.h>
|
||||
|
||||
typedef _Atomic(unsigned int) atomic_uint;
|
||||
|
||||
// Helper function to spin on address until address equals value.
|
||||
// If the address holds the value of -1, abort because the other thread failed.
|
||||
__device__ void
|
||||
gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
|
||||
unsigned int value) {
|
||||
unsigned int compare;
|
||||
bool check = false;
|
||||
do {
|
||||
compare = value;
|
||||
check = __opencl_atomic_compare_exchange_strong(
|
||||
(atomic_uint*)address, /*expected=*/ &compare, /*desired=*/ value,
|
||||
__ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
|
||||
/*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
if (compare == -1)
|
||||
abort();
|
||||
} while(!check);
|
||||
}
|
||||
|
||||
// This kernel requires a single block, single thread dispatch.
|
||||
__global__ void
|
||||
gpu_cache0(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
// Store data into A, system fence, and atomically mark flag.
|
||||
// This guarantees this global write is visible by device 1.
|
||||
A[i] = X[i];
|
||||
__opencl_atomic_fetch_add((atomic_uint*)AA1, 1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on device 1's global write to B.
|
||||
gpu_spin_loop_or_abort_on_negative_one(BA1, i+1);
|
||||
|
||||
// Check device 1 properly stored Y into B.
|
||||
bool stored_data_matches = (B[i] == Y[i]);
|
||||
if(!stored_data_matches) {
|
||||
// If the data does not match, alert other thread and abort.
|
||||
printf("FAIL: at i=%lu, B[i]=%d, which does not match Y[i]=%d.\n",
|
||||
i, B[i], Y[i]);
|
||||
__opencl_atomic_exchange((atomic_uint*)AA2, -1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
abort();
|
||||
}
|
||||
// Otherwise tell the other thread to continue.
|
||||
__opencl_atomic_fetch_add((atomic_uint*)AA2, 1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on kernel gpu_cache1 to finish checking X is stored in A.
|
||||
gpu_spin_loop_or_abort_on_negative_one(BA2, i+1);
|
||||
}
|
||||
}
|
||||
|
||||
// This kernel requires a single block, single thread dispatch.
|
||||
__global__ void
|
||||
gpu_cache1(int *A,int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
B[i] = Y[i];
|
||||
__opencl_atomic_fetch_add((atomic_uint*)BA1, 1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
gpu_spin_loop_or_abort_on_negative_one(AA1, i+1);
|
||||
|
||||
bool stored_data_matches = (A[i] == X[i]);
|
||||
if(!stored_data_matches) {
|
||||
printf("FAIL: at i=%lu, A[i]=%d, which does not match X[i]=%d.\n",
|
||||
i, A[i], X[i]);
|
||||
__opencl_atomic_exchange((atomic_uint*)BA2, -1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
abort();
|
||||
}
|
||||
__opencl_atomic_fetch_add((atomic_uint*)BA2, 1, __ATOMIC_RELEASE,
|
||||
__OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
gpu_spin_loop_or_abort_on_negative_one(AA2, i+1);
|
||||
}
|
||||
}
|
||||
|
||||
// This test runs on gfx90a where XGMI enables fine-grained communication
|
||||
// between GPUs. This performs a message passing test.
|
||||
// Array A is allocated on Device 0, and remotely on Device 1.
|
||||
// Device 0 also increments atomic ints AA1 and AA2.
|
||||
// Array B is allocated on Device 1, and remotely on Device 0.
|
||||
// Device 1 also increments atomic ints BA1 and BA2.
|
||||
// Kernel 0 will launch on Device 0, and store array X into array A.
|
||||
// Kernel 1 will launch on Device 1, and store array Y into array B.
|
||||
// Kernel 0 will validate that the correct values of array Y are stored in B.
|
||||
// Kernel 1 will validate that the correct values of array X are stored in A.
|
||||
|
||||
bool gpu_to_gpu_coherency() {
|
||||
int *A_d, *B_d, *X_d0, *X_d1, *Y_d0, *Y_d1;
|
||||
int *A_h, *B_h, *X_h, *Y_h;
|
||||
size_t N = 1024;
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int numDevices = 0;
|
||||
int numTestDevices = 2;
|
||||
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
if (numDevices < numTestDevices) {
|
||||
printf("info: less than 2 GPUs. skipping multi-GPU test!\n");
|
||||
return true;
|
||||
}
|
||||
printf("info: performing this test only on first two GPUs.\n");
|
||||
|
||||
// Skip this test if both devices are not gfx90a.
|
||||
hipDeviceProp_t props0, props1;
|
||||
HIPCHECK(hipGetDeviceProperties(&props0, 0));
|
||||
HIPCHECK(hipGetDeviceProperties(&props1, 1));
|
||||
if (strncmp(props0.gcnArchName, "gfx90a", 6) != 0 ||
|
||||
strncmp(props1.gcnArchName, "gfx90a", 6) != 0) {
|
||||
printf("info: skipping test on non-gfx90a devices.\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allocate Host Side Memory.
|
||||
printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
A_h = (int*)malloc(Nbytes); HIPCHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
B_h = (int*)malloc(Nbytes); HIPCHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
X_h = (int*)malloc(Nbytes); HIPCHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
Y_h = (int*)malloc(Nbytes); HIPCHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess );
|
||||
|
||||
// Initialize the arrays and atomic variables.
|
||||
for (size_t i = 0; i < N; i++)
|
||||
{
|
||||
X_h[i] = 100000000 + i;
|
||||
Y_h[i] = 300000000 + i;
|
||||
}
|
||||
|
||||
// Initialize shared atomic flags on host coherent memory.
|
||||
unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
|
||||
unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
|
||||
HIPCHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&AA1_d, AA1_h, 0)); *AA1_h = 0;
|
||||
HIPCHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&AA2_d, AA2_h, 0)); *AA2_h = 0;
|
||||
HIPCHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&BA1_d, BA1_h, 0)); *BA1_h = 0;
|
||||
HIPCHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&BA2_d, BA2_h, 0)); *BA2_h = 0;
|
||||
|
||||
// Skip the first stream.
|
||||
hipStream_t stream[numTestDevices + 1];
|
||||
HIPCHECK(hipStreamCreate(&stream[0]));
|
||||
|
||||
// Set-up Device 0.
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
// Enable P2P access to Device 1.
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(1,0));
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
|
||||
// Allocating Coherent Memory for Array A_d on Device 0.
|
||||
printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
hipError_t status = hipExtMallocWithFlags((void**)&A_d, Nbytes, hipDeviceMallocFinegrained);
|
||||
if (status == hipErrorOutOfMemory || A_d == 0 ) {
|
||||
printf("info: device fine-grained memory not supported on this config\n");
|
||||
printf("info: skipping this GPU-GPU coherency test\n");
|
||||
return true;
|
||||
} else if (status != hipSuccess) {
|
||||
printf("error: failed to allocate device 0 fine-grain memory\n");
|
||||
return false;
|
||||
}
|
||||
HIPCHECK(hipMalloc(&X_d0, Nbytes));
|
||||
HIPCHECK(hipMalloc(&Y_d0, Nbytes));
|
||||
|
||||
// Set-up Device 1.
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
// Enable P2P access to Device 0.
|
||||
HIPCHECK(hipDeviceEnablePeerAccess(0,0));
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream[2], hipStreamNonBlocking));
|
||||
// Allocating Coherent Memory for Array B_d on Device 1.
|
||||
printf("info: allocate device 1 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
status = hipExtMallocWithFlags((void**)&B_d, Nbytes, hipDeviceMallocFinegrained);
|
||||
if (status == hipErrorOutOfMemory || B_d == 0 ) {
|
||||
printf("info: device fine-grained memory not supported on this config\n");
|
||||
printf("info: skipping this GPU-GPU coherency test\n");
|
||||
return true;
|
||||
} else if (status != hipSuccess) {
|
||||
printf("error: failed to allocate device 1 fine-grain memory\n");
|
||||
return false;
|
||||
}
|
||||
HIPCHECK(hipMalloc(&X_d1, Nbytes));
|
||||
HIPCHECK(hipMalloc(&Y_d1, Nbytes));
|
||||
|
||||
// Transfer initialized data onto the device arrays.
|
||||
printf("info: copy Host2Device\n");
|
||||
HIPCHECK(hipMemcpy(X_d0, X_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(X_d1, X_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(Y_d0, Y_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(Y_d1, Y_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
// Prepare and launch the device kernels.
|
||||
const unsigned blocks = 1;
|
||||
const unsigned threadsPerBlock = 1;
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
printf("info: launch gpu kernel 0\n");
|
||||
hipLaunchKernelGGL(gpu_cache0, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[1],
|
||||
A_d, B_d, X_d0, Y_d0, N,
|
||||
AA1_d, AA2_d, BA1_d, BA2_d);
|
||||
// Check if launch failed.
|
||||
HIPCHECK(hipGetLastError());
|
||||
|
||||
HIPCHECK(hipSetDevice(1));
|
||||
printf("info: launch gpu kernel 1\n");
|
||||
hipLaunchKernelGGL(gpu_cache1, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[2],
|
||||
A_d, B_d, X_d1, Y_d1, N,
|
||||
AA1_d, AA2_d, BA1_d, BA2_d);
|
||||
HIPCHECK(hipGetLastError());
|
||||
|
||||
// Wait for kernels on both devices.
|
||||
HIPCHECK(hipStreamSynchronize(stream[1]));
|
||||
HIPCHECK(hipStreamSynchronize(stream[2]));
|
||||
|
||||
// Evaluate the resultant arrays A and B.
|
||||
printf("info: copy Device2Host\n");
|
||||
HIPCHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
printf("info: check result\n");
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
assert(A_h[i] == (100000000 + i));
|
||||
assert(B_h[i] == (300000000 + i));
|
||||
}
|
||||
|
||||
// Free all the device and host memory allocated.
|
||||
HIPCHECK(hipFree(A_d));
|
||||
HIPCHECK(hipFree(B_d));
|
||||
HIPCHECK(hipFree(X_d0));
|
||||
HIPCHECK(hipFree(Y_d0));
|
||||
HIPCHECK(hipFree(X_d1));
|
||||
HIPCHECK(hipFree(Y_d1));
|
||||
HIPCHECK(hipHostFree(AA1_h));
|
||||
HIPCHECK(hipHostFree(AA2_h));
|
||||
HIPCHECK(hipHostFree(BA1_h));
|
||||
HIPCHECK(hipHostFree(BA2_h));
|
||||
free(A_h);
|
||||
free(B_h);
|
||||
free(X_h);
|
||||
free(Y_h);
|
||||
|
||||
printf("info: finished GPU-GPU coherency test!\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Entry point: runs the GPU-GPU coherency test and reports the result.
// Exits with status 0 on success (or skip) and 1 on failure.
int main(int argc, char *argv[]) {
  // Coherency between GPUs accessing local or remote FB.
  // The result is kept in `ok` so the local does not share a name with
  // the passed() helper from test_common.h.
  const bool ok = gpu_to_gpu_coherency();

  if (ok) {
    // NOTE(review): passed() comes from test_common.h (not visible here);
    // the braces keep all of it conditional if it expands to several
    // statements.
    passed();
  }
  // Returning the bool directly would yield 0 (success) on failure.
  return ok ? 0 : 1;
}
|
||||
Reference in New Issue
Block a user