diff --git a/projects/hip-tests/catch/unit/memory/CMakeLists.txt b/projects/hip-tests/catch/unit/memory/CMakeLists.txt index e10be81cc5..173b78bea1 100644 --- a/projects/hip-tests/catch/unit/memory/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/memory/CMakeLists.txt @@ -178,5 +178,16 @@ hip_add_exe_to_target(NAME MemoryTest2 TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC}) if(HIP_PLATFORM MATCHES "amd") + set(TEST_SRC + hipSVMTestByteGranularity.cpp + hipSVMTestFineGrainMemoryConsistency.cpp + hipSVMTestFineGrainSyncBuffers.cpp + hipSVMTestSharedAddressSpaceFineGrain.cpp + ) + + hip_add_exe_to_target(NAME SVMAtomicTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC}) + add_dependencies(build_tests hipHostRegisterPerf) endif() diff --git a/projects/hip-tests/catch/unit/memory/hipSVMCommon.h b/projects/hip-tests/catch/unit/memory/hipSVMCommon.h new file mode 100644 index 0000000000..5b7433fc12 --- /dev/null +++ b/projects/hip-tests/catch/unit/memory/hipSVMCommon.h @@ -0,0 +1,141 @@ +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/* + * Modifications Copyright (C)2023 Advanced + * Micro Devices, Inc. All rights reserved. + */ +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include +#include + +#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER) + #include +#endif + +// SVM Atomic wrappers. +// Platforms that support SVM atomics (atomics that work across the host and devices) need to +// implement these host side functions correctly. Platforms that do not support SVM atomics can +// simpy implement these functions as empty stubs since the functions will not be called. For now +// only Windows x86 is implemented, add support for other platforms as needed. +unsigned int inline AtomicLoad32(unsigned int* pValue) { +#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER) + return (unsigned int)InterlockedExchangeAdd((LONG*)pValue, 0l); +#elif defined(__GNUC__) + return __sync_add_and_fetch(pValue, 0); +#else + return -1; +#endif +} + +// all the x86 atomics are seq_cst, so don't need to do anything with the memory order parameter. +unsigned int inline AtomicFetchAdd32(unsigned int* object, int operand) { +#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER) + return InterlockedExchangeAdd((LONG*)object, operand); +#elif defined(__GNUC__) + return __sync_fetch_and_add(object, operand); +#else + return -1; +#endif +} + +template +T inline AtomicFetchAdd64(T* object, T operand) { +#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER) + return (T)InterlockedExchangeAdd64((LONG64*)object, (LONG64)operand); +#elif defined(__GNUC__) + return (T)__sync_fetch_and_add((intptr_t*)object, (intptr_t)operand); +#else + return -1; +#endif +} + +unsigned int inline AtomicExchange32(unsigned int* object, unsigned int desired) { +#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER) + return (unsigned int)InterlockedExchange((LONG*)object, (LONG)desired); +#elif defined(__GNUC__) + return __sync_lock_test_and_set(object, desired); +#else + return -1; +#endif +} + +template +T inline AtomicExchange64(T* a, T expected) { +#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) + return (T)InterlockedExchangePointer((PVOID volatile*)a, (PVOID)expected); +#elif defined(__GNUC__) + return (T)__sync_lock_test_and_set((long long*)a, (long long)expected); +#else + tmp = 0; +#endif +} + +template +bool AtomicCompareExchange64(T* a, T* expected, T desired) +{ +#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32)) + T tmp = (T)InterlockedCompareExchange64((LONG64 *)a, (LONG64)desired, + *(LONG64 *)expected); +#elif defined(__GNUC__) + T tmp = (T)__sync_val_compare_and_swap((intptr_t*)a, (intptr_t)(*expected), + (intptr_t)desired); +#else + tmp = 0; +#endif + if(tmp == *expected) + return true; + *expected = tmp; + return false; +} + +inline void* align_malloc(size_t size, size_t alignment) { +#if defined(_WIN32) && defined(_MSC_VER) + return _aligned_malloc(size, alignment); +#elif defined(__linux__) || defined(linux) || defined(__APPLE__) + void* ptr = NULL; +#if defined(__ANDROID__) + ptr = memalign(alignment, size); + if (ptr) return ptr; +#else + if (alignment < sizeof(void*)) { + alignment = sizeof(void*); + } + if (0 == posix_memalign(&ptr, alignment, size)) return ptr; +#endif + return NULL; +#elif defined(__MINGW32__) + return __mingw_aligned_malloc(size, alignment); +#else +#error "Please add support OS for aligned malloc" +#endif +} + +inline void align_free(void* ptr) { +#if defined(_WIN32) && defined(_MSC_VER) + _aligned_free(ptr); +#elif defined(__linux__) || defined(linux) || defined(__APPLE__) + return free(ptr); +#elif defined(__MINGW32__) + return __mingw_aligned_free(ptr); +#else +#error "Please add support OS for aligned free" +#endif +} + +#endif // #ifndef __COMMON_H__ + diff --git a/projects/hip-tests/catch/unit/memory/hipSVMTestByteGranularity.cpp b/projects/hip-tests/catch/unit/memory/hipSVMTestByteGranularity.cpp new file mode 100644 index 0000000000..8b682bb10d --- /dev/null +++ b/projects/hip-tests/catch/unit/memory/hipSVMTestByteGranularity.cpp @@ -0,0 +1,154 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/* + * Modifications Copyright (C)2023 Advanced + * Micro Devices, Inc. All rights reserved. + */ + +#include +#include +#include +#include + +// Each device will write it's id into the bytes that it "owns", ownership is based on round robin +// (global_id % num_id) num_id is equal to number of SVM devices in the system plus one (for the +// host code). id is the index (id) of the device that this kernel is executing on. For example, if +// there are 2 SVM devices and the host; the buffer should look like this after each device and the +// host write their id's: 0, 1, 2, 0, 1, 2, 0, 1, 2... +__global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + int owner = i % num_id; + if (id == owner) a[i] = id; // modify location if it belongs to this device, write id +} + +// Verify that a device can see the byte sized updates from the other devices, sum up the device +// id's and see if they match expected value. Note: this must be called with a reduced NDRange so +// that neighbor acesses don't go past end of buffer. For example if there are two SVM devices and +// the host (3 total devices) the buffer should look like this: 0,1,2,0,1,2... and the expected sum +// at each point is 0+1+2 = 3. +__global__ void sum_neighbor_locations(char* a, unsigned int num_devices, + unsigned int* error_count) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2; + unsigned int sum = 0; + for (unsigned int j = 0; j < num_devices; j++) { + sum += a[i + j]; // add my neighbors to the right + } + if (sum != expected_sum) + atomicAdd_system(error_count, 1u); // like opencl atomic_inc() +} + +/** +* Test Description +* ------------------------ +* - The suite will test the following functions, + hipHostMalloc() with following flags, + hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS) + hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER) + atomicAdd_system()(in kernel) + hipStreamCreate() + hipStreamSynchronize() +* It will demonstrate use of SVM's atomics to do fine grain synchronization among +* devices with each stream on each device. The result will be verified on the host. +* Test source +* ------------------------ +* - catch/unit/memory/hipSVMTestByteGranularity.cpp +* Test requirements +* ------------------------ +* - Host specific (WINDOWS and LINUX) +* - Fine grain access and atomics supported on devices +* - HIP_VERSION >= 5.7 +*/ +TEST_CASE("test_svm_byte_granularity") { + const int num_elements = 2048; + int num_devices = 0; + HIP_CHECK(hipGetDeviceCount(&num_devices)); + int num_devices_plus_host = num_devices + 1; + std::vector streams(num_devices); + + for (int d = 0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); + HIP_CHECK(hipStreamCreate(&streams[d])); + } + HIP_CHECK(hipSetDevice(0)); + char* pA = nullptr; + // hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent)); + unsigned int** error_counts = (unsigned int**)malloc(sizeof(void*) * num_devices); + + for(unsigned int i=0; i < num_devices; i++) { + // hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS + // We need atomic inc among different GPUs + HIP_CHECK(hipHostMalloc(&error_counts[i], sizeof(unsigned int) * num_elements, + hipHostMallocCoherent)); + *error_counts[i] = 0; + } + for(int i = 0; i < num_elements; i++) pA[i] = -1; + + // get all the devices going simultaneously + for(unsigned int d = 0; d < num_devices; d++) // device ids starting at 1. + { + write_owned_locations<<>>(pA, num_devices_plus_host, d); + HIP_CHECK(hipGetLastError()); + } + unsigned int host_id = num_devices; // host code will take the id above the devices. + for(unsigned int i = num_devices; i < num_elements; i+= num_devices_plus_host) + pA[i] = host_id; + + for (unsigned int d = 0; d < num_devices; d++) { + HIP_CHECK(hipStreamSynchronize(streams[d])); + } + + // now check that each device can see the byte writes made by the other devices. + // adjusted so sum_neighbor_locations doesn't read past end of buffer + size_t adjusted_num_elements = num_elements - num_devices; + for(unsigned int d = 0; d < num_devices; d++) + { + sum_neighbor_locations<<>>(pA, num_devices_plus_host, + error_counts[d]); + HIP_CHECK(hipGetLastError()); + } + + for (unsigned int d = 0; d < num_devices; d++) { + HIP_CHECK(hipStreamSynchronize(streams[d])); + } + // see if any of the devices found errors + for(unsigned int d = 0; d < num_devices; d++) { + if (*error_counts[d] > 0) { + fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]); + REQUIRE(false); + } + } + unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1))/2; + // check that host can see the byte writes made by the devices. + for(unsigned int i = 0; i < num_elements - num_devices_plus_host; i++) + { + unsigned int sum = 0; + for(unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i+j]; + if (sum != expected) { + fprintf(stderr, "[%u]: sum %u != expected %u", i, sum, expected); + REQUIRE(false); + } + } + for (unsigned int i = 0; i < num_devices; i++) { + HIP_CHECK(hipStreamDestroy(streams[i])); + HIP_CHECK(hipHostFree(error_counts[i])); + } + free(error_counts); + HIP_CHECK(hipHostFree(pA)); + REQUIRE(true); +} diff --git a/projects/hip-tests/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp b/projects/hip-tests/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp new file mode 100644 index 0000000000..29d4cbc19a --- /dev/null +++ b/projects/hip-tests/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp @@ -0,0 +1,261 @@ +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/* + * Modifications Copyright (C)2023 Advanced + * Micro Devices, Inc. All rights reserved. + */ +#include +#include +#include +#include +#include +#include "hipSVMCommon.h" +//#define DEBUG_ATOMIC // To provide additional data for debugging +#ifdef DEBUG_ATOMIC +//#define DEBUG_ATOMIC_PRINT_THREAD +#endif + +typedef struct BinNode { +#ifdef DEBUG_ATOMIC + unsigned int n; + unsigned int d; + unsigned int i; +#endif + unsigned int value; + struct BinNode* pNext; +} BinNode; + +__global__ void build_hash_table_on_device(unsigned int* input, size_t inputSize, + BinNode* pNodes, + unsigned int* pNumNodes, unsigned int numBins, + unsigned int dev) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= inputSize) return; + + unsigned int n = atomicAdd_system((unsigned int*)pNumNodes, 1u); + BinNode* pNew = &pNodes[n]; + unsigned int b = input[i] % numBins; + + pNew->value = input[i]; +#ifdef DEBUG_ATOMIC + pNew->d = dev; + pNew->i = i; + pNew->n = n; +#endif + unsigned long long next = 0; + unsigned long long old = atomicAdd_system((unsigned long long*)&(pNodes[b].pNext), + 0ull); // Because of no atomicLoad() + do { + next = old; + // Use CAS to ensure atomic operation + //pNew->pNext = (BinNode*)next; + atomicExch((unsigned long long*)&(pNew->pNext), next); + old = atomicCAS_system((unsigned long long *)&(pNodes[b].pNext), next, + (unsigned long long )pNew); + } while (old != next); +#ifdef DEBUG_ATOMIC_PRINT_THREAD + printf("k%u: i=%zu, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p," + " old=%p, input[%zu]=%u\n", dev, i, n, + pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b], (void*)old, + i, input[i]); +#else + (void)dev; +#endif +} + +void build_hash_table_on_host(unsigned int* input, size_t inputSize, BinNode* pNodes, + unsigned int* pNumNodes, unsigned int numBins, + unsigned int dev) { + // wait until we see some activity from a device (try to run host side simultaneously). + while (numBins == AtomicLoad32(pNumNodes)); + for(unsigned int i = 0; i < inputSize; i++) + { + unsigned int n = AtomicFetchAdd32(pNumNodes, 1u); + BinNode* pNew = &pNodes[n]; + unsigned int b = input[i] % numBins; +#ifdef DEBUG_ATOMIC + pNew->d = dev; + pNew->i = i; + pNew->n = n; +#endif + pNew->value = input[i]; + BinNode* next = AtomicFetchAdd64(&pNodes[b].pNext, (BinNode*)0ll); + do { + AtomicExchange64(&(pNew->pNext), next); + // always inserting at head of list + } while (!AtomicCompareExchange64(&(pNodes[b].pNext), &next, + (BinNode*)pNew)); +#ifdef DEBUG_ATOMIC_PRINT_THREAD + fprintf(stderr, + "k%u: i=%u, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p, " + "input[%u]=%u\n", + dev, i, n, pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b], + i, input[i]); +#else + (void)dev; +#endif + } +} + +void launch_kernels_and_verify(std::vector &streams, unsigned int num_devices, + unsigned int numBins, size_t num_pixels) { + unsigned int* pInputImage = nullptr; + BinNode* pNodes = nullptr; + unsigned int* pNumNodes = nullptr; + unsigned int total_items = num_pixels * (num_devices + 1); + HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocCoherent)); + HIP_CHECK( + hipHostMalloc(&pNodes, sizeof(BinNode) * (total_items + numBins), hipHostMallocCoherent)); + HIP_CHECK(hipHostMalloc(&pNumNodes, sizeof(unsigned int), hipHostMallocCoherent)); + + *pNumNodes = numBins; // using the first numBins nodes to hold the list heads. + for(unsigned int i = 0; i < numBins; i++) pNodes[i].pNext = nullptr; + for(unsigned int i = 0; i < num_pixels; i++) pInputImage[i] = i; + + // Get all the devices going simultaneously, each device (and the host) will insert + // all the pixels. + for(unsigned int d=0; d < num_devices; d++) + { + build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>( + pInputImage, num_pixels, pNodes, pNumNodes, numBins, d); + HIP_CHECK(hipGetLastError()); + } + + std::vector threads; + threads.push_back(std::thread(build_hash_table_on_host, pInputImage, num_pixels, pNodes, + pNumNodes, numBins, num_devices)); + for (unsigned int d = 0; d < num_devices; d++) { + threads.push_back(std::thread( + [](hipStream_t s) { + HIP_CHECK(hipStreamSynchronize(s)); // To workarround batch dispatching on Windows + }, streams[d])); + } + std::for_each(threads.begin(), threads.end(), [](std::thread& t) { t.join(); }); + + for (unsigned int d = 0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); + HIP_CHECK(hipDeviceSynchronize()); + } + HIP_CHECK(hipSetDevice(0)); + unsigned int num_items = 0; + // check correctness of each bin in the hash table. + for(unsigned int i = 0; i < numBins; i++) + { + BinNode *pNode = pNodes[i].pNext; + unsigned int num_items_bin = 0; + unsigned int total_num_items_bin = + (num_pixels % numBins <= i) ? (num_pixels / numBins) : (num_pixels / numBins + 1); + total_num_items_bin *= (num_devices + 1); // The item number of the list in i-th bin + while(pNode) + { +#ifdef DEBUG_ATOMIC_PRINT_THREAD + fprintf(stderr, "v%u/%u: %u, pNode=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p)\n", i, numBins, + num_items_bin, pNode, pNode->n, pNode->d, pNode->i, pNode->value, pNode->pNext); +#endif + if((pNode->value % numBins) != i) + { + fprintf(stderr, + "Something went wrong at i=%u, item is in wrong hash bucket:" \ + "pNode->value=%u, numBins=%u\n", i, pNode->value, numBins); + REQUIRE(false); + } + num_items++; + num_items_bin++; + if (num_items_bin > total_num_items_bin) { + fprintf(stderr, + "Something went wrong at i=%u/%u, num_items_bin(%u)>total_num_items_bin(%u)\n", + i, numBins, num_items_bin, total_num_items_bin); + REQUIRE(false); + } + pNode = pNode->pNext; + } + if (num_items_bin != total_num_items_bin) { + fprintf(stderr, + "Something went wrong at i=%u/%u, num_items_bin(%u)!=total_num_items_bin(%u)\n", + i, numBins, num_items_bin, total_num_items_bin); + } + } + HIP_CHECK(hipHostFree(pInputImage)); + HIP_CHECK(hipHostFree(pNodes)); + HIP_CHECK(hipHostFree(pNumNodes)); + + // each device and the host inserted all of the pixels, check that none are missing. + if (num_items != total_items) + { + fprintf(stderr, "The hash table is not correct, num items %u != expected num items: %u\n", + num_items, total_items); + REQUIRE(false); // test did not pass + } + REQUIRE(true); +} + +/** +* Test Description +* ------------------------ +* - The suite will test the following functions, + hipHostMalloc() with following flags, + hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS) + atomicAdd_system()(in kernel) + atomicCAS_system()(in kernel) + atomicExch()(in kernel) + InterlockedExchangeAdd()(in WINDOWS host) + __sync_add_and_fetch()(in LINUX host) + InterlockedExchangeAdd64()(in WINDOWS host) + InterlockedExchangePointer()(in WINDOWS host) + __sync_lock_test_and_set()(in LINUX host) + InterlockedCompareExchange64()(in WINDOWS host) + __sync_val_compare_and_swap()(in LINUX host) + hipDeviceSynchronize() +* It will demonstrate use of SVM's atomics to do fine grain synchronization among +* devices and the host. +* Concept: Each device and the host simultaneously insert values into a single hash table. +* Each bin in the hash table is a linked list. Each bin is protected against simultaneous +* update using a lock free technique. The correctness of the list is verified on the host. +* Test source +* ------------------------ +* - catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp +* Test requirements +* ------------------------ +* - Host specific (WINDOWS and LINUX) +* - Fine grain access and atomics supported on devices and host +* - HIP_VERSION >= 5.7 +*/ +TEST_CASE("test_svm_fine_grain_memory_consistency") { + const int num_elements = 2167; + int num_devices = 0; + HIP_CHECK(hipGetDeviceCount(&num_devices)); + std::vector streams(num_devices); + + for (int d = 0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); + HIP_CHECK(hipStreamCreate(&streams[d])); + } + HIP_CHECK(hipSetDevice(0)); + + // all work groups in all devices and the host code will hammer on this one lock. + unsigned int numBins = 1; + launch_kernels_and_verify(streams, num_devices, numBins, num_elements); + + numBins = 2; // 2 locks within in same cache line will get hit from different devices and host. + launch_kernels_and_verify(streams, num_devices, numBins, num_elements); + + numBins = 29; // locks span a few cache lines. + launch_kernels_and_verify(streams, num_devices, numBins, num_elements); + + for (unsigned int i = 0; i < num_devices; i++) { + HIP_CHECK(hipStreamDestroy(streams[i])); + } +} diff --git a/projects/hip-tests/catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp b/projects/hip-tests/catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp new file mode 100644 index 0000000000..1e47818b0c --- /dev/null +++ b/projects/hip-tests/catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp @@ -0,0 +1,129 @@ +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/* + * Modifications Copyright (C)2023 Advanced + * Micro Devices, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include "hipSVMCommon.h" +#define MAX_TARGETS 1024 + +__global__ void find_targets(unsigned int* image, unsigned int target, + unsigned int* numTargetsFound, + unsigned int* targetLocations) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int index = 0; + if (image[i] == target) { + index = atomicAdd((unsigned int*)numTargetsFound, 1u); + if (index < MAX_TARGETS) { + atomicExch_system((unsigned int *)&targetLocations[index], (unsigned int)i); + } + } +} + +void spawnAnalysisTask(int location) +{ + printf("found target at location %d\n", location); +} + +/** +* Test Description +* ------------------------ +* - The suite will test the following functions, + hipHostMalloc() with following flags, + hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS) + hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER) + atomicAdd()(in kernel) + atomicExch_system()(in kernel) + InterlockedExchangeAdd()(in WINDOWS host) + __sync_add_and_fetch()(in LINUX host) + hipStreamCreate() + hipEventCreate() + hipEventRecord() + hipEventQuery() +* It will demonstrate use of SVM's atomics to do fine grain synchronization between +* a device and the host. The result will be verified on the host. +* Concept: a device kernel is used to search an input image for regions that match a +* target pattern. The device immediately notifies the host when it finds a target +* (via an atomic operation that works across host and devices). The host is then able +* to spawn a task that further analyzes the target while the device continues searching +* for more targets. +* Test source +* ------------------------ +* - catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp +* Test requirements +* ------------------------ +* - Host specific (WINDOWS and LINUX) +* - Fine grain access and atomics supported on device and host +* - HIP_VERSION >= 5.7 +*/ +TEST_CASE("test_svm_fine_grain_sync_buffers") { + size_t num_pixels = 1024 * 1024 * 2; + hipStream_t stream; + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipStreamCreate(&stream)); + hipEvent_t event; + HIP_CHECK(hipEventCreate(&event)); + unsigned int *pInputImage, *pNumTargetsFound, *pTargetLocations; + HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocNonCoherent)); + HIP_CHECK(hipHostMalloc(&pNumTargetsFound, sizeof(unsigned int), hipHostMallocCoherent)); + HIP_CHECK(hipHostMalloc(&pTargetLocations, sizeof(int) * MAX_TARGETS, hipHostMallocCoherent)); + unsigned int targetDescriptor = 777; + *pNumTargetsFound = 0; + + unsigned int i; + for(i = 0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1; + for(i = 0; i < num_pixels; i++) pInputImage[i] = 0; + pInputImage[0] = targetDescriptor; + pInputImage[3] = targetDescriptor; + pInputImage[num_pixels - 1] = targetDescriptor; + + find_targets<<<(num_pixels + 255) / 256, 256, 0, stream>>>(pInputImage, targetDescriptor, + pNumTargetsFound, pTargetLocations); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipEventRecord(event, stream)); + + i=0; + hipError_t status = hipSuccess; + unsigned int loc = 0; + // check for new targets, if found spawn a task to analyze target. + do { + status = hipEventQuery(event); + if (status != hipErrorNotReady && status != hipSuccess) { + fprintf(stderr, "Unexpected status = %d\n", status); + REQUIRE(false); + } + loc = AtomicLoad32(&pTargetLocations[i]); + if (loc != -1) // -1 indicates slot not used yet. + { + spawnAnalysisTask(loc); // Do something... + i++; + } + } while (status == hipErrorNotReady || + AtomicLoad32(&pTargetLocations[i]) != -1); + + HIP_CHECK(hipHostFree(pInputImage)); + HIP_CHECK(hipHostFree(pNumTargetsFound)); + HIP_CHECK(hipHostFree(pTargetLocations)); + HIP_CHECK(hipEventDestroy(event)); + HIP_CHECK(hipStreamDestroy(stream)); + REQUIRE(i == 3); +} diff --git a/projects/hip-tests/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp b/projects/hip-tests/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp new file mode 100644 index 0000000000..0cb35e912e --- /dev/null +++ b/projects/hip-tests/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp @@ -0,0 +1,297 @@ +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/* + * Modifications Copyright (C)2023 Advanced + * Micro Devices, Inc. All rights reserved. + */ + +#include +#include +#include +#include "hipSVMCommon.h" + +// const char *linked_list_create_and_verify_kernels[] = { +typedef struct Node { + unsigned int global_id; + unsigned int position_in_list; + struct Node* pNext; +} Node; + +// The allocation_index parameter must be initialized on the host to N work-items +// The first N nodes in pNodes will be the heads of the lists. +__global__ void create_linked_lists_on_device(Node* pNodes, + unsigned int* allocation_index, + unsigned int list_length) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + Node* pNode = &pNodes[i]; + + pNode->global_id = i; + pNode->position_in_list = 0; + + Node* pNew; + for (unsigned int j = 1; j < list_length; j++) { + pNew = &pNodes[atomicAdd(allocation_index, 1u)]; // allocate a new node + pNew->global_id = i; + pNew->position_in_list = j; + pNode->pNext = pNew; // link new node onto end of list + pNode = pNew; // move to end of list + } +} + +__global__ void verify_linked_lists_on_device(Node* pNodes, unsigned int* num_correct, + unsigned int list_length) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + Node* pNode = &pNodes[i]; + + for (unsigned int j = 0; j < list_length; j++) { + if (pNode->global_id == i && pNode->position_in_list == j) { + atomicAdd(num_correct, 1u); + } else { + break; + } + pNode = pNode->pNext; + } +} + +// The first N nodes in pNodes will be the heads of the lists. +void create_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) { + unsigned int allocation_index = num_lists; // heads of lists are in first num_lists nodes. + for (unsigned int i = 0; i < num_lists; i++) { + Node* pNode = &pNodes[i]; + pNode->global_id = i; + pNode->position_in_list = 0; + Node* pNew; + for (unsigned int j = 1; j < list_length; j++) { + pNew = &pNodes[allocation_index++]; // allocate a new node + pNew->global_id = i; + pNew->position_in_list = j; + pNode->pNext = pNew; // link new node onto end of list + pNode = pNew; // move to end of list + } + } +} + +void verify_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) { + unsigned int numCorrect = 0; + for (unsigned int i = 0; i < num_lists; i++) { + Node* pNode = &pNodes[i]; + for (int j = 0; j < list_length; j++) { + if (pNode->global_id == i && pNode->position_in_list == j) { + numCorrect++; + } else { + break; + } + pNode = pNode->pNext; + } + } + if (numCorrect != list_length * num_lists) { + fprintf(stderr, "Failed\n"); + REQUIRE(false); + } +} + +void create_linked_lists_on_device(hipStream_t stream, Node* pNodes, + unsigned int* pAllocator, unsigned int numLists, + unsigned int ListLength) { + // reset allocator index + *pAllocator = numLists; // the first numLists elements of the nodes array are already + // allocated (they hold the head of each list). + create_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pAllocator, + ListLength); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipStreamSynchronize(stream)); +} + +void verify_linked_lists_on_device(hipStream_t stream, Node* pNodes, + unsigned int* pNumCorrect, unsigned int numLists, + unsigned int ListLength) { + *pNumCorrect = 0; // reset numCorrect to zero + + verify_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pNumCorrect, + ListLength); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipStreamSynchronize(stream)); + + int correct_count = *pNumCorrect; + if(correct_count != ListLength * numLists) + { + fprintf(stderr,"Failed\n"); + REQUIRE(false); + } +} + +/** +* Test Description +* ------------------------ +* - The suite will test the following functions, + hipHostMalloc() with following flags, + hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER) + atomicAdd()(in kernel) + hipStreamCreate() + hipStreamSynchronize() +* It will test that all devices and the host share a common address space using fine-grain +* host buffers. +* Concept: This is done by creating a linked list on a device and then verifying the +* correctness of the list on another device or the host. This basic test is performed for all +* combinations of devices and the host that exist within the platform. The test passes only if +* every combination passes. +* Test source +* ------------------------ +* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp +* Test requirements +* ------------------------ +* - Host specific (WINDOWS and LINUX) +* - Fine grain access supported on devices and host +* - HIP_VERSION >= 5.7 +*/ +TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") { + const unsigned int num_elements = 1024; + int num_devices = 0; + HIP_CHECK(hipGetDeviceCount(&num_devices)); + int num_devices_plus_host = num_devices + 1; + std::vector streams(num_devices); + + for (int d = 0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); + HIP_CHECK(hipStreamCreate(&streams[d])); + } + HIP_CHECK(hipSetDevice(0)); + + unsigned int numLists = num_elements; + unsigned int ListLength = 32; + Node* pNodes = nullptr; + unsigned int* pAllocator = nullptr; + unsigned int* pNumCorrect = nullptr; + HIP_CHECK(hipHostMalloc(&pNodes, sizeof(Node) * ListLength * numLists, hipHostMallocNonCoherent)); + HIP_CHECK(hipHostMalloc(&pAllocator, sizeof(unsigned int), hipHostMallocNonCoherent)); + HIP_CHECK(hipHostMalloc(&pNumCorrect, sizeof(unsigned int), hipHostMallocNonCoherent)); + + // Create linked list on one device and verify on another device (or the host). + // Do this for all possible combinations of devices and host within the platform. + // ci is CreationIndex, index of device/q to create linked list on + for (int ci=0; ci= 5.7 +*/ +TEST_CASE("test_svm_shared_address_space_fine_grain_system") { + fprintf(stderr, "test_svm_shared_address_space_fine_grain_system ignored\n"); + return;// blocked by SWDEV-422544 add HIP flag for APU device + const unsigned int num_elements = 1024; + int num_devices = 0; + HIP_CHECK(hipGetDeviceCount(&num_devices)); + int num_devices_plus_host = num_devices + 1; + std::vector streams(num_devices); + + for (int d = 0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); + HIP_CHECK(hipStreamCreate(&streams[d])); + } + HIP_CHECK(hipSetDevice(0)); + + unsigned int numLists = num_elements; + unsigned int ListLength = 32; + + // this allocation holds the linked list nodes. + Node* pNodes = (Node*)align_malloc(numLists * ListLength * sizeof(Node), 128); + // this allocation holds an index into the nodes buffer, it is used for node allocation + unsigned int* pAllocator = (unsigned int*)align_malloc(sizeof(unsigned int), 128); + // this allocation holds the count of correct nodes, which is computed by the verify kernel. + unsigned int* pNumCorrect = (unsigned int*)align_malloc(sizeof(unsigned int), 128); + + // ci is CreationIndex, index of device/q to create linked list on + for (int ci = 0; ci < num_devices_plus_host; ci++) { + // vi is VerificationIndex, index of device/q to verify linked list on + for (int vi = 0; vi < num_devices_plus_host; vi++) { + if (ci == num_devices) // last device index represents the host, note the num_device+1 above. + { + create_linked_lists_on_host(pNodes, numLists, ListLength); + } else { + create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists, + ListLength); + } + + if (vi == num_devices) { + verify_linked_lists_on_host(pNodes, numLists, ListLength); + } else { + verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists, + ListLength); + } + } + } + align_free(pNodes); + align_free(pAllocator); + align_free(pNumCorrect); + for (int d = 0; d < num_devices; d++) { + HIP_CHECK(hipStreamDestroy(streams[d])); + } + REQUIRE(true); +}