diff --git a/projects/hip-tests/catch/unit/CMakeLists.txt b/projects/hip-tests/catch/unit/CMakeLists.txt index 5543643f96..7d44709d1d 100644 --- a/projects/hip-tests/catch/unit/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/CMakeLists.txt @@ -54,3 +54,4 @@ add_subdirectory(clock) add_subdirectory(vulkan_interop) add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246 endif() +add_subdirectory(synchronization) diff --git a/projects/hip-tests/catch/unit/synchronization/CMakeLists.txt b/projects/hip-tests/catch/unit/synchronization/CMakeLists.txt new file mode 100644 index 0000000000..5e0454806f --- /dev/null +++ b/projects/hip-tests/catch/unit/synchronization/CMakeLists.txt @@ -0,0 +1,25 @@ +# Common Tests - Test independent of all platforms +set(TEST_SRC + copy_coherency.cc +) +add_custom_target(memcpyInt.hsaco COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR} + ${CMAKE_CURRENT_SOURCE_DIR}/memcpyIntDevice.cpp -o + ${CMAKE_CURRENT_BINARY_DIR}/../synchronization/memcpyInt.hsaco -I + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/ -I + ${CMAKE_CURRENT_SOURCE_DIR}/../../include -L + ${HIP_PATH}/${CMAKE_INSTALL_LIBDIR}/../../include --rocm-path=${ROCM_PATH}) +# only for AMD +if(HIP_PLATFORM MATCHES "amd") + set(AMD_SRC + cache_coherency_cpu_gpu.cc + cache_coherency_gpu_gpu.cc + ) + set(TEST_SRC ${TEST_SRC} ${AMD_SRC}) +endif() + +hip_add_exe_to_target(NAME synchronizationTests + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + COMPILE_OPTIONS -std=c++14) +add_dependencies(synchronizationTests memcpyInt.hsaco) + diff --git a/projects/hip-tests/catch/unit/synchronization/cache_coherency_cpu_gpu.cc b/projects/hip-tests/catch/unit/synchronization/cache_coherency_cpu_gpu.cc new file mode 100644 index 0000000000..c33eff82da --- /dev/null +++ b/projects/hip-tests/catch/unit/synchronization/cache_coherency_cpu_gpu.cc @@ -0,0 +1,282 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Simple test for Fine Grained CPU-GPU coherency. + +#include +#include + +typedef _Atomic(unsigned int) atomic_uint; + +// Helper function to spin on address until address equals value. +// If the address holds the value of -1, abort because the other thread failed. +__device__ int +gpu_spin_loop_or_abort_on_negative_one(unsigned int* address, + unsigned int value) { + unsigned int compare; + bool check = false; + do { + compare = value; + check = __opencl_atomic_compare_exchange_strong( + reinterpret_cast(address), /*expected=*/ &compare, + /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, + /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + if (compare == -1) + return -1; + } while (!check); + return 0; +} + +// This kernel requires a single block, single thread dispatch. +__global__ void +gpu_kernel(int *A, int *B, int *X, int *Y, size_t N, + unsigned int *AA1, unsigned int *AA2, + unsigned int *BA1, unsigned int *BA2, unsigned int *dresult) { + for (size_t i = 0; i < N; i++) { + // Store data into A, system fence, and atomically mark flag. + // This guarantees this global write is visible by device 1. + A[i] = X[i]; + __opencl_atomic_fetch_add(reinterpret_cast(AA1), 1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + // Wait on device 1's global write to B. + if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) { + *dresult = -1; + break; + } + + // Check device 1 properly stored Y into B. + bool stored_data_matches = (B[i] == Y[i]); + if (!stored_data_matches) { + // If the data does not match, alert other thread and abort. + printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n", + i, B[i], Y[i]); + __opencl_atomic_exchange(reinterpret_cast(AA2), -1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + *dresult = -1; + } + // Otherwise tell the other thread to continue. + __opencl_atomic_fetch_add(reinterpret_cast(AA2), 1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + // Wait on kernel gpu_cache1 to finish checking X is stored in A. + if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) { + *dresult = -1; + break; + } + } + *dresult = 0; +} + +__host__ int +cpu_spin_loop_or_abort_on_negative_one(unsigned int* address, + unsigned int value) { + unsigned int compare; + bool check = false; + do { + compare = value; + check = __atomic_compare_exchange_n( + address, /*expected=*/ &compare, /*desired=*/ value, + /*weak=*/ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE); + if (compare == -1) + return -1; + } while (!check); + return 0; +} + +// This host thread runs only on a single CPU thread. +__host__ void +cpu_thread(int *A, int *B, int *X, int *Y, size_t N, + unsigned int *AA1, unsigned int *AA2, + unsigned int *BA1, unsigned int *BA2, unsigned int *hresult) { + for (size_t i = 0; i < N; i++) { + B[i] = Y[i]; + __atomic_fetch_add(BA1, 1, __ATOMIC_RELEASE); + if (cpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) { + *hresult = -1; + break; + } + + bool stored_data_matches = (A[i] == X[i]); + if (!stored_data_matches) { + printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n", + i, A[i], X[i]); + __atomic_exchange_n(BA2, -1, __ATOMIC_RELEASE); + *hresult = -1; + break; + } + __atomic_fetch_add(BA2, 1, __ATOMIC_RELEASE); + if (cpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) { + *hresult = -1; + break; + } + } + *hresult = 0; +} + +static bool cpu_to_gpu_coherency() { + int *A_d, *B_d, *X_d, *Y_d; + int *A_res, *A_h, *B_h, *X_h, *Y_h; + unsigned int hresult, dresult; + size_t N = 1024; + size_t Nbytes = N * sizeof(int); + int numDevices = 0; + + HIP_CHECK(hipGetDeviceCount(&numDevices)); + if (numDevices < 1) { + HipTest::HIP_SKIP_TEST("Skipping because devices < 1"); + return 0; + } + + // Skip this test if feature is not supported. + static int device0 = 0; + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, device0)); + if (strncmp(props.gcnArchName, "gfx90a", 6) != 0 && + strncmp(props.gcnArchName, "gfx940", 6) != 0) { + printf("info: skipping test on devices other than gfx90a and gfx940.\n"); + return true; + } + + // Allocate Host Side Memory. Coherent Fine-grained Memory for array B. + printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + HIP_CHECK(hipHostMalloc(&B_h, Nbytes, + (hipHostMallocCoherent | hipHostMallocMapped))); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&B_d), B_h, 0)); + X_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess); + Y_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess); + + // Initialize the arrays and atomic variables. + for (size_t i = 0; i < N; i++) { + X_h[i] = 100000000 + i; + Y_h[i] = 300000000 + i; + } + + // Initialize shared atomic flags between CPU and GPU. + unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h; + unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d; + HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&AA1_d), + AA1_h, 0)); + *AA1_h = 0; + HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&AA2_d), + AA2_h, 0)); + *AA2_h = 0; + HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&BA1_d), + BA1_h, 0)); + *BA1_h = 0; + HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&BA2_d), + BA2_h, 0)); + *BA2_h = 0; + + // Skip the first stream, ensure stream is non-blocking. + hipStream_t stream[2]; + HIP_CHECK(hipStreamCreate(&stream[0])); + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking)); + + // Allocate Device Side Memory. Coherent Fine-grained Memory for array A. + printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + hipError_t status = hipExtMallocWithFlags(reinterpret_cast(&A_d), + Nbytes, hipDeviceMallocFinegrained); + REQUIRE(status == hipSuccess); + // SVM memory - host pointer is the same as device pointer to array A. + A_h = A_d; + HIP_CHECK(hipMalloc(&X_d, Nbytes)); + HIP_CHECK(hipMalloc(&Y_d, Nbytes)); + + HIP_CHECK(hipMemcpy(X_d, X_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(Y_d, Y_h, Nbytes, hipMemcpyHostToDevice)); + + // Launch the GPU kernel. + const unsigned blocks = 1; + const unsigned threadsPerBlock = 1; + hipLaunchKernelGGL(gpu_kernel, dim3(blocks), dim3(threadsPerBlock), + 0, stream[1], + A_d, B_d, X_d, Y_d, N, + AA1_d, AA2_d, BA1_d, BA2_d, &dresult); + // Check if launch failed. + HIP_CHECK(hipGetLastError()); + REQUIRE(dresult == 0); + + // Do not sync the launched stream, instead run the cpu_thread. + std::thread host_thread(cpu_thread, + A_h, B_h, X_h, Y_h, N, + AA1_h, AA2_h, BA1_h, BA2_h, &hresult); + host_thread.detach(); + REQUIRE(hresult == 0); + // Wait for Device side to finish. + HIP_CHECK(hipStreamSynchronize(stream[1])); + + // Evaluate the resultant arrays A and B. + A_res = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(A_res == 0 ? hipErrorOutOfMemory : hipSuccess); + HIP_CHECK(hipMemcpy(A_res, A_d, Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < N; i++) { + REQUIRE(A_res[i] == (100000000 + i)); + REQUIRE(B_h[i] == (300000000 + i)); + } + + // Free all the device and host memory allocated. + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(X_d)); + HIP_CHECK(hipFree(Y_d)); + HIP_CHECK(hipHostFree(AA1_h)); + HIP_CHECK(hipHostFree(AA2_h)); + HIP_CHECK(hipHostFree(BA1_h)); + HIP_CHECK(hipHostFree(BA2_h)); + HIP_CHECK(hipHostFree(B_h)); + free(X_h); + free(Y_h); + free(A_res); + + return true; +} + +/** + * Test Description + * ------------------------ + * - This test runs on devices where XGMI enables fine-grained communication + * between GPUs. This performs a message passing test. + * Array A is allocated on Device 0, and remotely on host. + * Device 0 also increments atomic ints AA1 and AA2. + * Array B is allocated on host, and remotely on Device 0. + * Host also increments atomic ints BA1 and BA2. + * Kernel will launch on Device 0, and store array X into array A. + * Host Thread will store array Y into array B. + * Kernel will validate that the correct values of array Y are stored in B. + * Host Thread will validate that the correct values of array X are stored in A. + + * Test source + * ------------------------ + * - catch/unit/synchronization/cache_coherency_cpu_gpu.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.5 + * - Test to be run only on AMD. + */ + +TEST_CASE("Unit_cache_coherency_cpu_gpu") { + bool passed = true; + // Coherency between CPU and GPU sharing host and device memory. + REQUIRE(passed == cpu_to_gpu_coherency()); +} diff --git a/projects/hip-tests/catch/unit/synchronization/cache_coherency_gpu_gpu.cc b/projects/hip-tests/catch/unit/synchronization/cache_coherency_gpu_gpu.cc new file mode 100644 index 0000000000..3a645c2c39 --- /dev/null +++ b/projects/hip-tests/catch/unit/synchronization/cache_coherency_gpu_gpu.cc @@ -0,0 +1,294 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Simple test for Fine Grained GPU-GPU coherency. + +#include +#include + +typedef _Atomic(unsigned int) atomic_uint; + +// Helper function to spin on address until address equals value. +// If the address holds the value of -1, abort because the other thread failed. +__device__ int +gpu_spin_loop_or_abort_on_negative_one(unsigned int* address, + unsigned int value) { + unsigned int compare; + bool check = false; + do { + compare = value; + check = __opencl_atomic_compare_exchange_strong( + reinterpret_cast(address), /*expected=*/ &compare, + /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, + /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + if (compare == -1) + return -1; + } while (!check); + return 0; +} + +// This kernel requires a single block, single thread dispatch. +__global__ void +gpu_cache0(int *A, int *B, int *X, int *Y, size_t N, + unsigned int *AA1, unsigned int *AA2, + unsigned int *BA1, unsigned int *BA2, unsigned int *cache0_result) { + for (size_t i = 0; i < N; i++) { + // Store data into A, system fence, and atomically mark flag. + // This guarantees this global write is visible by device 1. + A[i] = X[i]; + __opencl_atomic_fetch_add(reinterpret_cast(AA1), 1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + // Wait on device 1's global write to B. + if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) { + *cache0_result = -1; + break; + } + + // Check device 1 properly stored Y into B. + bool stored_data_matches = (B[i] == Y[i]); + if (!stored_data_matches) { + // If the data does not match, alert other thread and abort. + printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n", + i, B[i], Y[i]); + __opencl_atomic_exchange(reinterpret_cast(AA2), -1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + *cache0_result = -1; + } + // Otherwise tell the other thread to continue. + __opencl_atomic_fetch_add(reinterpret_cast(AA2), 1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + // Wait on kernel gpu_cache1 to finish checking X is stored in A. + if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) { + *cache0_result = -1; + break; + } + } + *cache0_result = 0; +} + +// This kernel requires a single block, single thread dispatch. +__global__ void +gpu_cache1(int *A, int *B, int *X, int *Y, size_t N, + unsigned int *AA1, unsigned int *AA2, + unsigned int *BA1, unsigned int *BA2, unsigned int *cache1_result) { + for (size_t i = 0; i < N; i++) { + B[i] = Y[i]; + __opencl_atomic_fetch_add(reinterpret_cast(BA1), 1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + if (gpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) { + *cache1_result = -1; + break; + } + + bool stored_data_matches = (A[i] == X[i]); + if (!stored_data_matches) { + printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n", + i, A[i], X[i]); + __opencl_atomic_exchange(reinterpret_cast(BA2), -1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + *cache1_result = -1; + } + __opencl_atomic_fetch_add(reinterpret_cast(BA2), 1, + __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES); + if (gpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) { + *cache1_result = -1; + break; + } + } + *cache1_result = 0; +} + +static bool gpu_to_gpu_coherency() { + int *A_d, *B_d, *X_d0, *X_d1, *Y_d0, *Y_d1; + int *A_h, *B_h, *X_h, *Y_h; + unsigned int cache0_result, cache1_result; + size_t N = 1024; + size_t Nbytes = N * sizeof(int); + int numDevices = 0; + int numTestDevices = 2; + + HIP_CHECK(hipGetDeviceCount(&numDevices)); + if (numDevices < numTestDevices) { + HipTest::HIP_SKIP_TEST("Skipping because devices < 2"); + return 0; + } + + // Skip this test if either device does not support this feature. + hipDeviceProp_t props0, props1; + HIP_CHECK(hipGetDeviceProperties(&props0, 0)); + HIP_CHECK(hipGetDeviceProperties(&props1, 1)); + if ((strncmp(props0.gcnArchName, "gfx90a", 6) != 0 || + strncmp(props1.gcnArchName, "gfx90a", 6) != 0) && + (strncmp(props0.gcnArchName, "gfx940", 6) != 0 || + strncmp(props1.gcnArchName, "gfx940", 6) != 0)) { + printf("info: skipping test on devices other than gfx90a and gfx940.\n"); + return true; + } + + // Allocate Host Side Memory. + printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + A_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess); + B_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess); + X_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess); + Y_h = reinterpret_cast(malloc(Nbytes)); + HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess); + + // Initialize the arrays and atomic variables. + for (size_t i = 0; i < N; i++) { + X_h[i] = 100000000 + i; + Y_h[i] = 300000000 + i; + } + + // Initialize shared atomic flags on host coherent memory. + unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h; + unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d; + HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&AA1_d), + AA1_h, 0)); + *AA1_h = 0; + HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&AA2_d), + AA2_h, 0)); + *AA2_h = 0; + HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&BA1_d), + BA1_h, 0)); + *BA1_h = 0; + HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&BA2_d), + BA2_h, 0)); + *BA2_h = 0; + + // Skip the first stream. + hipStream_t stream[3]; + HIP_CHECK(hipStreamCreate(&stream[0])); + + // Set-up Device 0. + HIP_CHECK(hipSetDevice(0)); + // Enable P2P access to Device 1. + HIP_CHECK(hipDeviceEnablePeerAccess(1, 0)); + HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking)); + // Allocating Coherent Memory for Array A_d on Device 0. + printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + hipError_t status = hipExtMallocWithFlags(reinterpret_cast(&A_d), + Nbytes, hipDeviceMallocFinegrained); + REQUIRE(status == hipSuccess); + HIP_CHECK(hipMalloc(&X_d0, Nbytes)); + HIP_CHECK(hipMalloc(&Y_d0, Nbytes)); + + // Set-up Device 1. + HIP_CHECK(hipSetDevice(1)); + // Enable P2P access to Device 0. + HIP_CHECK(hipDeviceEnablePeerAccess(0, 0)); + HIP_CHECK(hipStreamCreateWithFlags(&stream[2], hipStreamNonBlocking)); + // Allocating Coherent Memory for Array B_d on Device 1. + printf("info: allocate device 1 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + status = hipExtMallocWithFlags(reinterpret_cast(&B_d), + Nbytes, hipDeviceMallocFinegrained); + REQUIRE(status == hipSuccess); + HIP_CHECK(hipMalloc(&X_d1, Nbytes)); + HIP_CHECK(hipMalloc(&Y_d1, Nbytes)); + + // Transfer initialized data onto the device arrays. + HIP_CHECK(hipMemcpy(X_d0, X_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(X_d1, X_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(Y_d0, Y_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(Y_d1, Y_h, Nbytes, hipMemcpyHostToDevice)); + + // Prepare and launch the device kernels. + const unsigned blocks = 1; + const unsigned threadsPerBlock = 1; + HIP_CHECK(hipSetDevice(0)); + hipLaunchKernelGGL(gpu_cache0, dim3(blocks), dim3(threadsPerBlock), + 0, stream[1], + A_d, B_d, X_d0, Y_d0, N, + AA1_d, AA2_d, BA1_d, BA2_d, &cache0_result); + // Check if launch failed. + HIP_CHECK(hipGetLastError()); + REQUIRE(cache0_result == 0); + HIP_CHECK(hipSetDevice(1)); + hipLaunchKernelGGL(gpu_cache1, dim3(blocks), dim3(threadsPerBlock), + 0, stream[2], + A_d, B_d, X_d1, Y_d1, N, + AA1_d, AA2_d, BA1_d, BA2_d, &cache1_result); + HIP_CHECK(hipGetLastError()); + REQUIRE(cache1_result == 0); + + // Wait for kernels on both devices. + HIP_CHECK(hipStreamSynchronize(stream[1])); + HIP_CHECK(hipStreamSynchronize(stream[2])); + + // Evaluate the resultant arrays A and B. + HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost)); + + for (size_t i = 0; i < N; i++) { + REQUIRE(A_h[i] == (100000000 + i)); + REQUIRE(B_h[i] == (300000000 + i)); + } + + // Free all the device and host memory allocated. + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(B_d)); + HIP_CHECK(hipFree(X_d0)); + HIP_CHECK(hipFree(Y_d0)); + HIP_CHECK(hipFree(X_d1)); + HIP_CHECK(hipFree(Y_d1)); + HIP_CHECK(hipHostFree(AA1_h)); + HIP_CHECK(hipHostFree(AA2_h)); + HIP_CHECK(hipHostFree(BA1_h)); + HIP_CHECK(hipHostFree(BA2_h)); + free(A_h); + free(B_h); + free(X_h); + free(Y_h); + + return true; +} + +/** + * Test Description + * ------------------------ + * - This test runs on devices where XGMI enables fine-grained communication + * between GPUs. This performs a message passing test. + * Array A is allocated on Device 0, and remotely on Device 1. + * Device 0 also increments atomic ints AA1 and AA2. + * Array B is allocated on Device 1, and remotely on Device 0. + * Device 1 also increments atomic ints BA1 and BA2. + * Kernel 0 will launch on Device 0, and store array X into array A. + * Kernel 1 will launch on Device 1, and store array Y into array B. + * Kernel 0 will validate that the correct values of array Y are stored in B. + * Kernel 1 will validate that the correct values of array X are stored in A. + + * Test source + * ------------------------ + * - catch/unit/synchronization/cache_coherency_gpu_gpu.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.5 + * - Test to be run only on AMD. + */ + +TEST_CASE("Unit_cache_coherency_gpu_gpu") { + bool passed = true; + // Coherency between GPUs accessing local or remote FB. + REQUIRE(passed == gpu_to_gpu_coherency()); +} diff --git a/projects/hip-tests/catch/unit/synchronization/copy_coherency.cc b/projects/hip-tests/catch/unit/synchronization/copy_coherency.cc new file mode 100644 index 0000000000..1e57fa6815 --- /dev/null +++ b/projects/hip-tests/catch/unit/synchronization/copy_coherency.cc @@ -0,0 +1,340 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +unsigned threadsPerBlock = 256; +unsigned blocksPerCU = 6; + +class MemcpyFunction { + public: + MemcpyFunction(const char* fileName, const char* functionName) { + load(fileName, functionName); + } + void load(const char* fileName, const char* functionName); + void launch(int* dst, const int* src, size_t numElements, hipStream_t s); + + private: + hipFunction_t _function; + hipModule_t _module; +}; + + +void MemcpyFunction::load(const char* fileName, const char* functionName) { + HIP_CHECK(hipModuleLoad(&_module, fileName)); + HIP_CHECK(hipModuleGetFunction(&_function, _module, functionName)); +} + +void MemcpyFunction::launch(int* dst, const int* src, size_t numElements, hipStream_t s) { // NOLINT + struct { + int* _dst; + const int* _src; + size_t _numElements; + } args; + + args._dst = dst; + args._src = src; + args._numElements = numElements; + + size_t size = sizeof(args); + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END}; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, + numElements); + HIP_CHECK(hipModuleLaunchKernel(_function, blocks, 1, 1, threadsPerBlock, + 1, 1, 0, s, NULL, + reinterpret_cast(&config))); +} + +bool g_warnOnFail = true; +int g_elementSizes[] = {128 * 1000, 256 * 1000, 16 * 1000 * 1000}; + +// Set value of array to specified 32-bit integer: +__global__ void memsetIntKernel(int* ptr, const int val, size_t numElements) { + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x; + for (size_t i = gid; i < numElements; i += stride) { + ptr[i] = val; + } +} + +__global__ void memcpyIntKernel(int* dst, const int* src, size_t numElements) { + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x; + for (size_t i = gid; i < numElements; i += stride) { + dst[i] = src[i]; + } +} + +// Check arrays in reverse order, to more easily detect cases where +// the copy is "partially" done. +void checkReverse(const int* ptr, int numElements, int expected) { + int mismatchCnt = 0; + for (int i = numElements - 1; i >= 0; i--) { + if (!g_warnOnFail) { + REQUIRE(ptr[i] == expected); + } + if (++mismatchCnt >= 10) { + break; + } + } +} + +#define ENUM_CASE_STR(x) \ + case x: \ + return #x + +enum CmdType { COPY, KERNEL, MODULE_KERNEL, MAX_CmdType }; + +const char* CmdTypeStr(CmdType c) { + switch (c) { + ENUM_CASE_STR(COPY); + ENUM_CASE_STR(KERNEL); + ENUM_CASE_STR(MODULE_KERNEL); + default: + return "UNKNOWN"; + } +} + +enum SyncType { + NONE, + EVENT_QUERY, + EVENT_SYNC, + STREAM_WAIT_EVENT, + STREAM_QUERY, + STREAM_SYNC, + DEVICE_SYNC, + MAX_SyncType +}; + +const char* SyncTypeStr(SyncType s) { + switch (s) { + ENUM_CASE_STR(NONE); + ENUM_CASE_STR(EVENT_QUERY); + ENUM_CASE_STR(EVENT_SYNC); + ENUM_CASE_STR(STREAM_WAIT_EVENT); + ENUM_CASE_STR(STREAM_QUERY); + ENUM_CASE_STR(STREAM_SYNC); + ENUM_CASE_STR(DEVICE_SYNC); + default: + return "UNKNOWN"; + } +} + +void runCmd(CmdType cmd, int* dst, const int* src, hipStream_t s, + size_t numElements) { + switch (cmd) { + case COPY: + HIP_CHECK( + hipMemcpyAsync(dst, src, numElements * sizeof(int), + hipMemcpyDeviceToDevice, s)); + break; + case KERNEL: { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, numElements); + hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), + 0, s, dst, src, numElements); + } break; + case MODULE_KERNEL: { + MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); + g_moduleMemcpy.launch(dst, src, numElements, s); + } break; + default: + printf("Info:unknown cmd=%d type", cmd); + } +} + +void resetInputs(int* Ad, int* Bd, int* Ch, + size_t numElements, int expected) { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, numElements); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), + 0, hipStream_t(0), Ad, expected, numElements); + // poison with bad value to ensure is overwritten correctly + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), + 0, hipStream_t(0), Bd, 0xDEADBEEF, numElements); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), + 0, hipStream_t(0), Bd, 0xF000BA55, numElements); + memset(Ch, 13, numElements * sizeof(int)); + HIP_CHECK(hipDeviceSynchronize()); +} + +// Intended to test proper synchronization and cache flushing +// between CMDA and CMDB. CMD are of type CmdType. All command copy memory, +// using either hipMemcpyAsync or kernel implementations. +// Some form of synchronization is applied. Then cmdB copies from Bd to Cd. +// CmdA copies from Ad to Bd, Cd is then copied to host Ch using a memory copy. +// Correct result at the end is that Ch contains the +// contents originally in Ad (integer 0x42) + +void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, + hipStream_t stream1, hipStream_t stream2, int numElements, + int* Ad, int* Bd, int* Cd, int* Ch, int expected) { + hipEvent_t e; + HIP_CHECK(hipEventCreateWithFlags(&e, 0)); + + resetInputs(Ad, Bd, Ch, numElements, expected); + + const size_t sizeElements = numElements * sizeof(int); + fprintf(stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", // NOLINT + sizeElements, static_cast(sizeElements / 1024.0), + CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType)); + + /*if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { // NOLINT + fprintf(stderr, "warn: skipping since test infra does not yet support modules\n"); // NOLINT + return; + }*/ + + // Step A: + runCmd(cmdAType, Bd, Ad, stream1, numElements); + + // Sync in-between? + switch (syncType) { + case NONE: + break; + case EVENT_QUERY: { + hipError_t st = hipErrorNotReady; + HIP_CHECK(hipEventRecord(e, stream1)); + do { + st = hipEventQuery(e); + } while (st == hipErrorNotReady); + HIP_CHECK(st); + } break; + case EVENT_SYNC: + HIP_CHECK(hipEventRecord(e, stream1)); + HIP_CHECK(hipEventSynchronize(e)); + break; + case STREAM_WAIT_EVENT: + HIP_CHECK(hipEventRecord(e, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e, 0)); + break; + case STREAM_QUERY: { + hipError_t st = hipErrorNotReady; + do { + st = hipStreamQuery(stream1); + } while (st == hipErrorNotReady); + HIP_CHECK(st); + } break; + case STREAM_SYNC: + HIP_CHECK(hipStreamSynchronize(stream1)); + break; + case DEVICE_SYNC: + HIP_CHECK(hipDeviceSynchronize()); + break; + default: + fprintf(stderr, "warning: unknown sync type=%s", SyncTypeStr(syncType)); + return; + } + runCmd(cmdBType, Cd, Bd, stream2, numElements); + + // Copy back to host, use async copy to avoid any extra synchronization + // that might mask issues. + HIP_CHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost, + stream2)); + HIP_CHECK(hipStreamSynchronize(stream2)); + + checkReverse(Ch, numElements, expected); + + HIP_CHECK(hipEventDestroy(e)); +} + +void testWrapper(size_t numElements) { + const size_t sizeElements = numElements * sizeof(int); + const int expected = 0x42; + int *Ad, *Bd, *Cd, *Ch; + + HIP_CHECK(hipMalloc(&Ad, sizeElements)); + HIP_CHECK(hipMalloc(&Bd, sizeElements)); + HIP_CHECK(hipMalloc(&Cd, sizeElements)); + HIP_CHECK(hipHostMalloc(&Ch, sizeElements)); + + hipStream_t stream1, stream2; + + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipDeviceSynchronize()); + + runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, + Ad, Bd, Cd, Ch, expected); + + for (int cmdA = 0; cmdA < MAX_CmdType; cmdA++) { + for (int cmdB = 0; cmdB < MAX_CmdType; cmdB++) { + for (int syncMode = 0; syncMode < MAX_SyncType; syncMode++) { + switch (syncMode) { + // case NONE:: + case EVENT_QUERY: + case EVENT_SYNC: + case STREAM_WAIT_EVENT: + // case STREAM_QUERY: + case STREAM_SYNC: + case DEVICE_SYNC: + runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), + stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); + break; + default: + break; + } + } + } + } + +#if 0 + runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, + numElements, Ad, Bd, Cd, Ch, expected); + runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, + Ad, Bd, Cd, Ch, expected); + runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, + numElements, Ad, Bd, Cd, Ch, expected); + runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, + Ad, Bd, Cd, Ch, expected); +#endif + + HIP_CHECK(hipFree(Ad)); + HIP_CHECK(hipFree(Bd)); + HIP_CHECK(hipFree(Cd)); + HIP_CHECK(hipHostFree(Ch)); + + HIP_CHECK(hipStreamDestroy(stream1)); + HIP_CHECK(hipStreamDestroy(stream2)); +} + +/** + * Test Description + * ------------------------ + * - Test cache management (fences) and synchronization between + * kernel and copy commands. Exhaustively tests 3 command types + * (copy, kernel, module kernel), many sync types (see SyncType), followed by + * another command, across a sweep of data sizes designed to stress + * various levels of the memory hierarchy. + + * Test source + * ------------------------ + * - catch/unit/synchronization/copy_coherency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.5 + */ + +TEST_CASE("Unit_Copy_Coherency") { + for (int index = 0; index < sizeof(g_elementSizes) / sizeof(int); index++) { + size_t numElements = g_elementSizes[index]; + testWrapper(numElements); + } +} diff --git a/projects/hip-tests/catch/unit/synchronization/memcpyIntDevice.cpp b/projects/hip-tests/catch/unit/synchronization/memcpyIntDevice.cpp new file mode 100644 index 0000000000..a211c8b8a7 --- /dev/null +++ b/projects/hip-tests/catch/unit/synchronization/memcpyIntDevice.cpp @@ -0,0 +1,10 @@ +#include +extern "C" __global__ void memcpyIntKernel(int* dst, const int* src, + size_t numElements) { + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x; + for (size_t i = gid; i < numElements; i += stride) { + dst[i] = src[i]; + } +}; +