SWDEV-388833 - [catch2][dtest] Sync tests migrated from dtests to catch2
Change-Id: I73664579ab9475e51db436b57ca6538aa7a8d5d7
[ROCm/hip-tests commit: 902c993615]
이 커밋은 다음에 포함됨:
커밋한 사람
Srinivasarao Gollamandala
부모
3e5aa93c49
커밋
d7fec53720
@@ -54,3 +54,4 @@ add_subdirectory(clock)
|
||||
add_subdirectory(vulkan_interop)
|
||||
add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246
|
||||
endif()
|
||||
add_subdirectory(synchronization)
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
# Common Tests - Test independent of all platforms
|
||||
set(TEST_SRC
|
||||
copy_coherency.cc
|
||||
)
|
||||
add_custom_target(memcpyInt.hsaco COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/memcpyIntDevice.cpp -o
|
||||
${CMAKE_CURRENT_BINARY_DIR}/../synchronization/memcpyInt.hsaco -I
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../include -L
|
||||
${HIP_PATH}/${CMAKE_INSTALL_LIBDIR}/../../include --rocm-path=${ROCM_PATH})
|
||||
# only for AMD
|
||||
if(HIP_PLATFORM MATCHES "amd")
|
||||
set(AMD_SRC
|
||||
cache_coherency_cpu_gpu.cc
|
||||
cache_coherency_gpu_gpu.cc
|
||||
)
|
||||
set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
|
||||
endif()
|
||||
|
||||
hip_add_exe_to_target(NAME synchronizationTests
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests
|
||||
COMPILE_OPTIONS -std=c++14)
|
||||
add_dependencies(synchronizationTests memcpyInt.hsaco)
|
||||
|
||||
@@ -0,0 +1,282 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Simple test for Fine Grained CPU-GPU coherency.
|
||||
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
typedef _Atomic(unsigned int) atomic_uint;
|
||||
|
||||
// Helper function to spin on address until address equals value.
|
||||
// If the address holds the value of -1, abort because the other thread failed.
|
||||
__device__ int
|
||||
gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
|
||||
unsigned int value) {
|
||||
unsigned int compare;
|
||||
bool check = false;
|
||||
do {
|
||||
compare = value;
|
||||
check = __opencl_atomic_compare_exchange_strong(
|
||||
reinterpret_cast<atomic_uint*>(address), /*expected=*/ &compare,
|
||||
/*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
|
||||
/*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
if (compare == -1)
|
||||
return -1;
|
||||
} while (!check);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// This kernel requires a single block, single thread dispatch.
|
||||
__global__ void
|
||||
gpu_kernel(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2, unsigned int *dresult) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
// Store data into A, system fence, and atomically mark flag.
|
||||
// This guarantees this global write is visible by device 1.
|
||||
A[i] = X[i];
|
||||
__opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on device 1's global write to B.
|
||||
if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
|
||||
*dresult = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
// Check device 1 properly stored Y into B.
|
||||
bool stored_data_matches = (B[i] == Y[i]);
|
||||
if (!stored_data_matches) {
|
||||
// If the data does not match, alert other thread and abort.
|
||||
printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
|
||||
i, B[i], Y[i]);
|
||||
__opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
*dresult = -1;
|
||||
}
|
||||
// Otherwise tell the other thread to continue.
|
||||
__opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on kernel gpu_cache1 to finish checking X is stored in A.
|
||||
if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
|
||||
*dresult = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*dresult = 0;
|
||||
}
|
||||
|
||||
__host__ int
|
||||
cpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
|
||||
unsigned int value) {
|
||||
unsigned int compare;
|
||||
bool check = false;
|
||||
do {
|
||||
compare = value;
|
||||
check = __atomic_compare_exchange_n(
|
||||
address, /*expected=*/ &compare, /*desired=*/ value,
|
||||
/*weak=*/ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
|
||||
if (compare == -1)
|
||||
return -1;
|
||||
} while (!check);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// This host thread runs only on a single CPU thread.
|
||||
__host__ void
|
||||
cpu_thread(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2, unsigned int *hresult) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
B[i] = Y[i];
|
||||
__atomic_fetch_add(BA1, 1, __ATOMIC_RELEASE);
|
||||
if (cpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
|
||||
*hresult = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
bool stored_data_matches = (A[i] == X[i]);
|
||||
if (!stored_data_matches) {
|
||||
printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
|
||||
i, A[i], X[i]);
|
||||
__atomic_exchange_n(BA2, -1, __ATOMIC_RELEASE);
|
||||
*hresult = -1;
|
||||
break;
|
||||
}
|
||||
__atomic_fetch_add(BA2, 1, __ATOMIC_RELEASE);
|
||||
if (cpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
|
||||
*hresult = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*hresult = 0;
|
||||
}
|
||||
|
||||
static bool cpu_to_gpu_coherency() {
|
||||
int *A_d, *B_d, *X_d, *Y_d;
|
||||
int *A_res, *A_h, *B_h, *X_h, *Y_h;
|
||||
unsigned int hresult, dresult;
|
||||
size_t N = 1024;
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int numDevices = 0;
|
||||
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
if (numDevices < 1) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping because devices < 1");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Skip this test if feature is not supported.
|
||||
static int device0 = 0;
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, device0));
|
||||
if (strncmp(props.gcnArchName, "gfx90a", 6) != 0 &&
|
||||
strncmp(props.gcnArchName, "gfx940", 6) != 0) {
|
||||
printf("info: skipping test on devices other than gfx90a and gfx940.\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allocate Host Side Memory. Coherent Fine-grained Memory for array B.
|
||||
printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
HIP_CHECK(hipHostMalloc(&B_h, Nbytes,
|
||||
(hipHostMallocCoherent | hipHostMallocMapped)));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&B_d), B_h, 0));
|
||||
X_h = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
Y_h = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
|
||||
// Initialize the arrays and atomic variables.
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
X_h[i] = 100000000 + i;
|
||||
Y_h[i] = 300000000 + i;
|
||||
}
|
||||
|
||||
// Initialize shared atomic flags between CPU and GPU.
|
||||
unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
|
||||
unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
|
||||
HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
|
||||
AA1_h, 0));
|
||||
*AA1_h = 0;
|
||||
HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
|
||||
AA2_h, 0));
|
||||
*AA2_h = 0;
|
||||
HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
|
||||
BA1_h, 0));
|
||||
*BA1_h = 0;
|
||||
HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
|
||||
BA2_h, 0));
|
||||
*BA2_h = 0;
|
||||
|
||||
// Skip the first stream, ensure stream is non-blocking.
|
||||
hipStream_t stream[2];
|
||||
HIP_CHECK(hipStreamCreate(&stream[0]));
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
|
||||
|
||||
// Allocate Device Side Memory. Coherent Fine-grained Memory for array A.
|
||||
printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
|
||||
Nbytes, hipDeviceMallocFinegrained);
|
||||
REQUIRE(status == hipSuccess);
|
||||
// SVM memory - host pointer is the same as device pointer to array A.
|
||||
A_h = A_d;
|
||||
HIP_CHECK(hipMalloc(&X_d, Nbytes));
|
||||
HIP_CHECK(hipMalloc(&Y_d, Nbytes));
|
||||
|
||||
HIP_CHECK(hipMemcpy(X_d, X_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(Y_d, Y_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
// Launch the GPU kernel.
|
||||
const unsigned blocks = 1;
|
||||
const unsigned threadsPerBlock = 1;
|
||||
hipLaunchKernelGGL(gpu_kernel, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[1],
|
||||
A_d, B_d, X_d, Y_d, N,
|
||||
AA1_d, AA2_d, BA1_d, BA2_d, &dresult);
|
||||
// Check if launch failed.
|
||||
HIP_CHECK(hipGetLastError());
|
||||
REQUIRE(dresult == 0);
|
||||
|
||||
// Do not sync the launched stream, instead run the cpu_thread.
|
||||
std::thread host_thread(cpu_thread,
|
||||
A_h, B_h, X_h, Y_h, N,
|
||||
AA1_h, AA2_h, BA1_h, BA2_h, &hresult);
|
||||
host_thread.detach();
|
||||
REQUIRE(hresult == 0);
|
||||
// Wait for Device side to finish.
|
||||
HIP_CHECK(hipStreamSynchronize(stream[1]));
|
||||
|
||||
// Evaluate the resultant arrays A and B.
|
||||
A_res = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(A_res == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
HIP_CHECK(hipMemcpy(A_res, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
REQUIRE(A_res[i] == (100000000 + i));
|
||||
REQUIRE(B_h[i] == (300000000 + i));
|
||||
}
|
||||
|
||||
// Free all the device and host memory allocated.
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
HIP_CHECK(hipFree(X_d));
|
||||
HIP_CHECK(hipFree(Y_d));
|
||||
HIP_CHECK(hipHostFree(AA1_h));
|
||||
HIP_CHECK(hipHostFree(AA2_h));
|
||||
HIP_CHECK(hipHostFree(BA1_h));
|
||||
HIP_CHECK(hipHostFree(BA2_h));
|
||||
HIP_CHECK(hipHostFree(B_h));
|
||||
free(X_h);
|
||||
free(Y_h);
|
||||
free(A_res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - This test runs on devices where XGMI enables fine-grained communication
|
||||
* between GPUs. This performs a message passing test.
|
||||
* Array A is allocated on Device 0, and remotely on host.
|
||||
* Device 0 also increments atomic ints AA1 and AA2.
|
||||
* Array B is allocated on host, and remotely on Device 0.
|
||||
* Host also increments atomic ints BA1 and BA2.
|
||||
* Kernel will launch on Device 0, and store array X into array A.
|
||||
* Host Thread will store array Y into array B.
|
||||
* Kernel will validate that the correct values of array Y are stored in B.
|
||||
* Host Thread will validate that the correct values of array X are stored in A.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/synchronization/cache_coherency_cpu_gpu.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.5
|
||||
* - Test to be run only on AMD.
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_cache_coherency_cpu_gpu") {
|
||||
bool passed = true;
|
||||
// Coherency between CPU and GPU sharing host and device memory.
|
||||
REQUIRE(passed == cpu_to_gpu_coherency());
|
||||
}
|
||||
@@ -0,0 +1,294 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Simple test for Fine Grained GPU-GPU coherency.
|
||||
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
typedef _Atomic(unsigned int) atomic_uint;
|
||||
|
||||
// Helper function to spin on address until address equals value.
|
||||
// If the address holds the value of -1, abort because the other thread failed.
|
||||
__device__ int
|
||||
gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
|
||||
unsigned int value) {
|
||||
unsigned int compare;
|
||||
bool check = false;
|
||||
do {
|
||||
compare = value;
|
||||
check = __opencl_atomic_compare_exchange_strong(
|
||||
reinterpret_cast<atomic_uint*>(address), /*expected=*/ &compare,
|
||||
/*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
|
||||
/*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
if (compare == -1)
|
||||
return -1;
|
||||
} while (!check);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// This kernel requires a single block, single thread dispatch.
|
||||
__global__ void
|
||||
gpu_cache0(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2, unsigned int *cache0_result) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
// Store data into A, system fence, and atomically mark flag.
|
||||
// This guarantees this global write is visible by device 1.
|
||||
A[i] = X[i];
|
||||
__opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on device 1's global write to B.
|
||||
if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
|
||||
*cache0_result = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
// Check device 1 properly stored Y into B.
|
||||
bool stored_data_matches = (B[i] == Y[i]);
|
||||
if (!stored_data_matches) {
|
||||
// If the data does not match, alert other thread and abort.
|
||||
printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
|
||||
i, B[i], Y[i]);
|
||||
__opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
*cache0_result = -1;
|
||||
}
|
||||
// Otherwise tell the other thread to continue.
|
||||
__opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
// Wait on kernel gpu_cache1 to finish checking X is stored in A.
|
||||
if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
|
||||
*cache0_result = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*cache0_result = 0;
|
||||
}
|
||||
|
||||
// This kernel requires a single block, single thread dispatch.
|
||||
__global__ void
|
||||
gpu_cache1(int *A, int *B, int *X, int *Y, size_t N,
|
||||
unsigned int *AA1, unsigned int *AA2,
|
||||
unsigned int *BA1, unsigned int *BA2, unsigned int *cache1_result) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
B[i] = Y[i];
|
||||
__opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA1), 1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
if (gpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
|
||||
*cache1_result = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
bool stored_data_matches = (A[i] == X[i]);
|
||||
if (!stored_data_matches) {
|
||||
printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
|
||||
i, A[i], X[i]);
|
||||
__opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(BA2), -1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
*cache1_result = -1;
|
||||
}
|
||||
__opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA2), 1,
|
||||
__ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
|
||||
if (gpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
|
||||
*cache1_result = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*cache1_result = 0;
|
||||
}
|
||||
|
||||
static bool gpu_to_gpu_coherency() {
|
||||
int *A_d, *B_d, *X_d0, *X_d1, *Y_d0, *Y_d1;
|
||||
int *A_h, *B_h, *X_h, *Y_h;
|
||||
unsigned int cache0_result, cache1_result;
|
||||
size_t N = 1024;
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
int numDevices = 0;
|
||||
int numTestDevices = 2;
|
||||
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
if (numDevices < numTestDevices) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Skip this test if either device does not support this feature.
|
||||
hipDeviceProp_t props0, props1;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props0, 0));
|
||||
HIP_CHECK(hipGetDeviceProperties(&props1, 1));
|
||||
if ((strncmp(props0.gcnArchName, "gfx90a", 6) != 0 ||
|
||||
strncmp(props1.gcnArchName, "gfx90a", 6) != 0) &&
|
||||
(strncmp(props0.gcnArchName, "gfx940", 6) != 0 ||
|
||||
strncmp(props1.gcnArchName, "gfx940", 6) != 0)) {
|
||||
printf("info: skipping test on devices other than gfx90a and gfx940.\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allocate Host Side Memory.
|
||||
printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
A_h = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
B_h = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
X_h = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
Y_h = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
|
||||
// Initialize the arrays and atomic variables.
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
X_h[i] = 100000000 + i;
|
||||
Y_h[i] = 300000000 + i;
|
||||
}
|
||||
|
||||
// Initialize shared atomic flags on host coherent memory.
|
||||
unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
|
||||
unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
|
||||
HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
|
||||
AA1_h, 0));
|
||||
*AA1_h = 0;
|
||||
HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
|
||||
AA2_h, 0));
|
||||
*AA2_h = 0;
|
||||
HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
|
||||
BA1_h, 0));
|
||||
*BA1_h = 0;
|
||||
HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
|
||||
BA2_h, 0));
|
||||
*BA2_h = 0;
|
||||
|
||||
// Skip the first stream.
|
||||
hipStream_t stream[3];
|
||||
HIP_CHECK(hipStreamCreate(&stream[0]));
|
||||
|
||||
// Set-up Device 0.
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
// Enable P2P access to Device 1.
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(1, 0));
|
||||
HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
|
||||
// Allocating Coherent Memory for Array A_d on Device 0.
|
||||
printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
|
||||
Nbytes, hipDeviceMallocFinegrained);
|
||||
REQUIRE(status == hipSuccess);
|
||||
HIP_CHECK(hipMalloc(&X_d0, Nbytes));
|
||||
HIP_CHECK(hipMalloc(&Y_d0, Nbytes));
|
||||
|
||||
// Set-up Device 1.
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
// Enable P2P access to Device 0.
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(0, 0));
|
||||
HIP_CHECK(hipStreamCreateWithFlags(&stream[2], hipStreamNonBlocking));
|
||||
// Allocating Coherent Memory for Array B_d on Device 1.
|
||||
printf("info: allocate device 1 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
status = hipExtMallocWithFlags(reinterpret_cast<void**>(&B_d),
|
||||
Nbytes, hipDeviceMallocFinegrained);
|
||||
REQUIRE(status == hipSuccess);
|
||||
HIP_CHECK(hipMalloc(&X_d1, Nbytes));
|
||||
HIP_CHECK(hipMalloc(&Y_d1, Nbytes));
|
||||
|
||||
// Transfer initialized data onto the device arrays.
|
||||
HIP_CHECK(hipMemcpy(X_d0, X_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(X_d1, X_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(Y_d0, Y_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(Y_d1, Y_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
// Prepare and launch the device kernels.
|
||||
const unsigned blocks = 1;
|
||||
const unsigned threadsPerBlock = 1;
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
hipLaunchKernelGGL(gpu_cache0, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[1],
|
||||
A_d, B_d, X_d0, Y_d0, N,
|
||||
AA1_d, AA2_d, BA1_d, BA2_d, &cache0_result);
|
||||
// Check if launch failed.
|
||||
HIP_CHECK(hipGetLastError());
|
||||
REQUIRE(cache0_result == 0);
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
hipLaunchKernelGGL(gpu_cache1, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, stream[2],
|
||||
A_d, B_d, X_d1, Y_d1, N,
|
||||
AA1_d, AA2_d, BA1_d, BA2_d, &cache1_result);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
REQUIRE(cache1_result == 0);
|
||||
|
||||
// Wait for kernels on both devices.
|
||||
HIP_CHECK(hipStreamSynchronize(stream[1]));
|
||||
HIP_CHECK(hipStreamSynchronize(stream[2]));
|
||||
|
||||
// Evaluate the resultant arrays A and B.
|
||||
HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
REQUIRE(A_h[i] == (100000000 + i));
|
||||
REQUIRE(B_h[i] == (300000000 + i));
|
||||
}
|
||||
|
||||
// Free all the device and host memory allocated.
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
HIP_CHECK(hipFree(B_d));
|
||||
HIP_CHECK(hipFree(X_d0));
|
||||
HIP_CHECK(hipFree(Y_d0));
|
||||
HIP_CHECK(hipFree(X_d1));
|
||||
HIP_CHECK(hipFree(Y_d1));
|
||||
HIP_CHECK(hipHostFree(AA1_h));
|
||||
HIP_CHECK(hipHostFree(AA2_h));
|
||||
HIP_CHECK(hipHostFree(BA1_h));
|
||||
HIP_CHECK(hipHostFree(BA2_h));
|
||||
free(A_h);
|
||||
free(B_h);
|
||||
free(X_h);
|
||||
free(Y_h);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - This test runs on devices where XGMI enables fine-grained communication
|
||||
* between GPUs. This performs a message passing test.
|
||||
* Array A is allocated on Device 0, and remotely on Device 1.
|
||||
* Device 0 also increments atomic ints AA1 and AA2.
|
||||
* Array B is allocated on Device 1, and remotely on Device 0.
|
||||
* Device 1 also increments atomic ints BA1 and BA2.
|
||||
* Kernel 0 will launch on Device 0, and store array X into array A.
|
||||
* Kernel 1 will launch on Device 1, and store array Y into array B.
|
||||
* Kernel 0 will validate that the correct values of array Y are stored in B.
|
||||
* Kernel 1 will validate that the correct values of array X are stored in A.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/synchronization/cache_coherency_gpu_gpu.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.5
|
||||
* - Test to be run only on AMD.
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_cache_coherency_gpu_gpu") {
|
||||
bool passed = true;
|
||||
// Coherency between GPUs accessing local or remote FB.
|
||||
REQUIRE(passed == gpu_to_gpu_coherency());
|
||||
}
|
||||
@@ -0,0 +1,340 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
unsigned threadsPerBlock = 256;
|
||||
unsigned blocksPerCU = 6;
|
||||
|
||||
class MemcpyFunction {
|
||||
public:
|
||||
MemcpyFunction(const char* fileName, const char* functionName) {
|
||||
load(fileName, functionName);
|
||||
}
|
||||
void load(const char* fileName, const char* functionName);
|
||||
void launch(int* dst, const int* src, size_t numElements, hipStream_t s);
|
||||
|
||||
private:
|
||||
hipFunction_t _function;
|
||||
hipModule_t _module;
|
||||
};
|
||||
|
||||
|
||||
void MemcpyFunction::load(const char* fileName, const char* functionName) {
|
||||
HIP_CHECK(hipModuleLoad(&_module, fileName));
|
||||
HIP_CHECK(hipModuleGetFunction(&_function, _module, functionName));
|
||||
}
|
||||
|
||||
void MemcpyFunction::launch(int* dst, const int* src, size_t numElements, hipStream_t s) { // NOLINT
|
||||
struct {
|
||||
int* _dst;
|
||||
const int* _src;
|
||||
size_t _numElements;
|
||||
} args;
|
||||
|
||||
args._dst = dst;
|
||||
args._src = src;
|
||||
args._numElements = numElements;
|
||||
|
||||
size_t size = sizeof(args);
|
||||
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock,
|
||||
numElements);
|
||||
HIP_CHECK(hipModuleLaunchKernel(_function, blocks, 1, 1, threadsPerBlock,
|
||||
1, 1, 0, s, NULL,
|
||||
reinterpret_cast<void**>(&config)));
|
||||
}
|
||||
|
||||
bool g_warnOnFail = true;
|
||||
int g_elementSizes[] = {128 * 1000, 256 * 1000, 16 * 1000 * 1000};
|
||||
|
||||
// Set value of array to specified 32-bit integer:
|
||||
__global__ void memsetIntKernel(int* ptr, const int val, size_t numElements) {
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = gid; i < numElements; i += stride) {
|
||||
ptr[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void memcpyIntKernel(int* dst, const int* src, size_t numElements) {
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = gid; i < numElements; i += stride) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Check arrays in reverse order, to more easily detect cases where
|
||||
// the copy is "partially" done.
|
||||
void checkReverse(const int* ptr, int numElements, int expected) {
|
||||
int mismatchCnt = 0;
|
||||
for (int i = numElements - 1; i >= 0; i--) {
|
||||
if (!g_warnOnFail) {
|
||||
REQUIRE(ptr[i] == expected);
|
||||
}
|
||||
if (++mismatchCnt >= 10) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define ENUM_CASE_STR(x) \
|
||||
case x: \
|
||||
return #x
|
||||
|
||||
enum CmdType { COPY, KERNEL, MODULE_KERNEL, MAX_CmdType };
|
||||
|
||||
const char* CmdTypeStr(CmdType c) {
|
||||
switch (c) {
|
||||
ENUM_CASE_STR(COPY);
|
||||
ENUM_CASE_STR(KERNEL);
|
||||
ENUM_CASE_STR(MODULE_KERNEL);
|
||||
default:
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
enum SyncType {
|
||||
NONE,
|
||||
EVENT_QUERY,
|
||||
EVENT_SYNC,
|
||||
STREAM_WAIT_EVENT,
|
||||
STREAM_QUERY,
|
||||
STREAM_SYNC,
|
||||
DEVICE_SYNC,
|
||||
MAX_SyncType
|
||||
};
|
||||
|
||||
const char* SyncTypeStr(SyncType s) {
|
||||
switch (s) {
|
||||
ENUM_CASE_STR(NONE);
|
||||
ENUM_CASE_STR(EVENT_QUERY);
|
||||
ENUM_CASE_STR(EVENT_SYNC);
|
||||
ENUM_CASE_STR(STREAM_WAIT_EVENT);
|
||||
ENUM_CASE_STR(STREAM_QUERY);
|
||||
ENUM_CASE_STR(STREAM_SYNC);
|
||||
ENUM_CASE_STR(DEVICE_SYNC);
|
||||
default:
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
void runCmd(CmdType cmd, int* dst, const int* src, hipStream_t s,
|
||||
size_t numElements) {
|
||||
switch (cmd) {
|
||||
case COPY:
|
||||
HIP_CHECK(
|
||||
hipMemcpyAsync(dst, src, numElements * sizeof(int),
|
||||
hipMemcpyDeviceToDevice, s));
|
||||
break;
|
||||
case KERNEL: {
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
|
||||
threadsPerBlock, numElements);
|
||||
hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, s, dst, src, numElements);
|
||||
} break;
|
||||
case MODULE_KERNEL: {
|
||||
MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
|
||||
g_moduleMemcpy.launch(dst, src, numElements, s);
|
||||
} break;
|
||||
default:
|
||||
printf("Info:unknown cmd=%d type", cmd);
|
||||
}
|
||||
}
|
||||
|
||||
void resetInputs(int* Ad, int* Bd, int* Ch,
|
||||
size_t numElements, int expected) {
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
|
||||
threadsPerBlock, numElements);
|
||||
hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, hipStream_t(0), Ad, expected, numElements);
|
||||
// poison with bad value to ensure is overwritten correctly
|
||||
hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, hipStream_t(0), Bd, 0xDEADBEEF, numElements);
|
||||
hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, hipStream_t(0), Bd, 0xF000BA55, numElements);
|
||||
memset(Ch, 13, numElements * sizeof(int));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
// Intended to test proper synchronization and cache flushing
|
||||
// between CMDA and CMDB. CMD are of type CmdType. All command copy memory,
|
||||
// using either hipMemcpyAsync or kernel implementations.
|
||||
// Some form of synchronization is applied. Then cmdB copies from Bd to Cd.
|
||||
// CmdA copies from Ad to Bd, Cd is then copied to host Ch using a memory copy.
|
||||
// Correct result at the end is that Ch contains the
|
||||
// contents originally in Ad (integer 0x42)
|
||||
|
||||
void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
|
||||
hipStream_t stream1, hipStream_t stream2, int numElements,
|
||||
int* Ad, int* Bd, int* Cd, int* Ch, int expected) {
|
||||
hipEvent_t e;
|
||||
HIP_CHECK(hipEventCreateWithFlags(&e, 0));
|
||||
|
||||
resetInputs(Ad, Bd, Ch, numElements, expected);
|
||||
|
||||
const size_t sizeElements = numElements * sizeof(int);
|
||||
fprintf(stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", // NOLINT
|
||||
sizeElements, static_cast<double>(sizeElements / 1024.0),
|
||||
CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));
|
||||
|
||||
/*if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { // NOLINT
|
||||
fprintf(stderr, "warn: skipping since test infra does not yet support modules\n"); // NOLINT
|
||||
return;
|
||||
}*/
|
||||
|
||||
// Step A:
|
||||
runCmd(cmdAType, Bd, Ad, stream1, numElements);
|
||||
|
||||
// Sync in-between?
|
||||
switch (syncType) {
|
||||
case NONE:
|
||||
break;
|
||||
case EVENT_QUERY: {
|
||||
hipError_t st = hipErrorNotReady;
|
||||
HIP_CHECK(hipEventRecord(e, stream1));
|
||||
do {
|
||||
st = hipEventQuery(e);
|
||||
} while (st == hipErrorNotReady);
|
||||
HIP_CHECK(st);
|
||||
} break;
|
||||
case EVENT_SYNC:
|
||||
HIP_CHECK(hipEventRecord(e, stream1));
|
||||
HIP_CHECK(hipEventSynchronize(e));
|
||||
break;
|
||||
case STREAM_WAIT_EVENT:
|
||||
HIP_CHECK(hipEventRecord(e, stream1));
|
||||
HIP_CHECK(hipStreamWaitEvent(stream2, e, 0));
|
||||
break;
|
||||
case STREAM_QUERY: {
|
||||
hipError_t st = hipErrorNotReady;
|
||||
do {
|
||||
st = hipStreamQuery(stream1);
|
||||
} while (st == hipErrorNotReady);
|
||||
HIP_CHECK(st);
|
||||
} break;
|
||||
case STREAM_SYNC:
|
||||
HIP_CHECK(hipStreamSynchronize(stream1));
|
||||
break;
|
||||
case DEVICE_SYNC:
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "warning: unknown sync type=%s", SyncTypeStr(syncType));
|
||||
return;
|
||||
}
|
||||
runCmd(cmdBType, Cd, Bd, stream2, numElements);
|
||||
|
||||
// Copy back to host, use async copy to avoid any extra synchronization
|
||||
// that might mask issues.
|
||||
HIP_CHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost,
|
||||
stream2));
|
||||
HIP_CHECK(hipStreamSynchronize(stream2));
|
||||
|
||||
checkReverse(Ch, numElements, expected);
|
||||
|
||||
HIP_CHECK(hipEventDestroy(e));
|
||||
}
|
||||
|
||||
void testWrapper(size_t numElements) {
|
||||
const size_t sizeElements = numElements * sizeof(int);
|
||||
const int expected = 0x42;
|
||||
int *Ad, *Bd, *Cd, *Ch;
|
||||
|
||||
HIP_CHECK(hipMalloc(&Ad, sizeElements));
|
||||
HIP_CHECK(hipMalloc(&Bd, sizeElements));
|
||||
HIP_CHECK(hipMalloc(&Cd, sizeElements));
|
||||
HIP_CHECK(hipHostMalloc(&Ch, sizeElements));
|
||||
|
||||
hipStream_t stream1, stream2;
|
||||
|
||||
HIP_CHECK(hipStreamCreate(&stream1));
|
||||
HIP_CHECK(hipStreamCreate(&stream2));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements,
|
||||
Ad, Bd, Cd, Ch, expected);
|
||||
|
||||
for (int cmdA = 0; cmdA < MAX_CmdType; cmdA++) {
|
||||
for (int cmdB = 0; cmdB < MAX_CmdType; cmdB++) {
|
||||
for (int syncMode = 0; syncMode < MAX_SyncType; syncMode++) {
|
||||
switch (syncMode) {
|
||||
// case NONE::
|
||||
case EVENT_QUERY:
|
||||
case EVENT_SYNC:
|
||||
case STREAM_WAIT_EVENT:
|
||||
// case STREAM_QUERY:
|
||||
case STREAM_SYNC:
|
||||
case DEVICE_SYNC:
|
||||
runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB),
|
||||
stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2,
|
||||
numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements,
|
||||
Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2,
|
||||
numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements,
|
||||
Ad, Bd, Cd, Ch, expected);
|
||||
#endif
|
||||
|
||||
HIP_CHECK(hipFree(Ad));
|
||||
HIP_CHECK(hipFree(Bd));
|
||||
HIP_CHECK(hipFree(Cd));
|
||||
HIP_CHECK(hipHostFree(Ch));
|
||||
|
||||
HIP_CHECK(hipStreamDestroy(stream1));
|
||||
HIP_CHECK(hipStreamDestroy(stream2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test cache management (fences) and synchronization between
|
||||
* kernel and copy commands. Exhaustively tests 3 command types
|
||||
* (copy, kernel, module kernel), many sync types (see SyncType), followed by
|
||||
* another command, across a sweep of data sizes designed to stress
|
||||
* various levels of the memory hierarchy.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/synchronization/copy_coherency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.5
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_Copy_Coherency") {
|
||||
for (int index = 0; index < sizeof(g_elementSizes) / sizeof(int); index++) {
|
||||
size_t numElements = g_elementSizes[index];
|
||||
testWrapper(numElements);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
extern "C" __global__ void memcpyIntKernel(int* dst, const int* src,
|
||||
size_t numElements) {
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = gid; i < numElements; i += stride) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
};
|
||||
|
||||
새 이슈에서 참조
사용자 차단