SWDEV-313387 - Port CTS SVM atomic tests to Hip

Change-Id: I32c9bed860ddf4fe1d7bba21dce9bd728168c398


[ROCm/hip-tests commit: f425bee1dc]
This commit is contained in:
taosang2
2023-09-07 18:27:56 -04:00
committato da Tao Sang
parent 9ead75d6a1
commit 3bcee40ac1
6 ha cambiato i file con 993 aggiunte e 0 eliminazioni
@@ -178,5 +178,16 @@ hip_add_exe_to_target(NAME MemoryTest2
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
if(HIP_PLATFORM MATCHES "amd")
set(TEST_SRC
hipSVMTestByteGranularity.cpp
hipSVMTestFineGrainMemoryConsistency.cpp
hipSVMTestFineGrainSyncBuffers.cpp
hipSVMTestSharedAddressSpaceFineGrain.cpp
)
hip_add_exe_to_target(NAME SVMAtomicTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
add_dependencies(build_tests hipHostRegisterPerf)
endif()
@@ -0,0 +1,141 @@
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
* Modifications Copyright (C)2023 Advanced
* Micro Devices, Inc. All rights reserved.
*/
#ifndef __COMMON_H__
#define __COMMON_H__
#include <vector>
#include <string>
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
#include <windows.h>
#endif
// SVM Atomic wrappers.
// Platforms that support SVM atomics (atomics that work across the host and devices) need to
// implement these host side functions correctly. Platforms that do not support SVM atomics can
// simpy implement these functions as empty stubs since the functions will not be called. For now
// only Windows x86 is implemented, add support for other platforms as needed.
unsigned int inline AtomicLoad32(unsigned int* pValue) {
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
return (unsigned int)InterlockedExchangeAdd((LONG*)pValue, 0l);
#elif defined(__GNUC__)
return __sync_add_and_fetch(pValue, 0);
#else
return -1;
#endif
}
// all the x86 atomics are seq_cst, so don't need to do anything with the memory order parameter.
unsigned int inline AtomicFetchAdd32(unsigned int* object, int operand) {
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
return InterlockedExchangeAdd((LONG*)object, operand);
#elif defined(__GNUC__)
return __sync_fetch_and_add(object, operand);
#else
return -1;
#endif
}
template <typename T>
T inline AtomicFetchAdd64(T* object, T operand) {
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
return (T)InterlockedExchangeAdd64((LONG64*)object, (LONG64)operand);
#elif defined(__GNUC__)
return (T)__sync_fetch_and_add((intptr_t*)object, (intptr_t)operand);
#else
return -1;
#endif
}
unsigned int inline AtomicExchange32(unsigned int* object, unsigned int desired) {
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
return (unsigned int)InterlockedExchange((LONG*)object, (LONG)desired);
#elif defined(__GNUC__)
return __sync_lock_test_and_set(object, desired);
#else
return -1;
#endif
}
template <typename T>
T inline AtomicExchange64(T* a, T expected) {
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
return (T)InterlockedExchangePointer((PVOID volatile*)a, (PVOID)expected);
#elif defined(__GNUC__)
return (T)__sync_lock_test_and_set((long long*)a, (long long)expected);
#else
tmp = 0;
#endif
}
template <typename T>
bool AtomicCompareExchange64(T* a, T* expected, T desired)
{
#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
T tmp = (T)InterlockedCompareExchange64((LONG64 *)a, (LONG64)desired,
*(LONG64 *)expected);
#elif defined(__GNUC__)
T tmp = (T)__sync_val_compare_and_swap((intptr_t*)a, (intptr_t)(*expected),
(intptr_t)desired);
#else
tmp = 0;
#endif
if(tmp == *expected)
return true;
*expected = tmp;
return false;
}
inline void* align_malloc(size_t size, size_t alignment) {
#if defined(_WIN32) && defined(_MSC_VER)
return _aligned_malloc(size, alignment);
#elif defined(__linux__) || defined(linux) || defined(__APPLE__)
void* ptr = NULL;
#if defined(__ANDROID__)
ptr = memalign(alignment, size);
if (ptr) return ptr;
#else
if (alignment < sizeof(void*)) {
alignment = sizeof(void*);
}
if (0 == posix_memalign(&ptr, alignment, size)) return ptr;
#endif
return NULL;
#elif defined(__MINGW32__)
return __mingw_aligned_malloc(size, alignment);
#else
#error "Please add support OS for aligned malloc"
#endif
}
inline void align_free(void* ptr) {
#if defined(_WIN32) && defined(_MSC_VER)
_aligned_free(ptr);
#elif defined(__linux__) || defined(linux) || defined(__APPLE__)
return free(ptr);
#elif defined(__MINGW32__)
return __mingw_aligned_free(ptr);
#else
#error "Please add support OS for aligned free"
#endif
}
#endif // #ifndef __COMMON_H__
@@ -0,0 +1,154 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
* Modifications Copyright (C)2023 Advanced
* Micro Devices, Inc. All rights reserved.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <resource_guards.hh>
#include <utils.hh>
// Each device will write it's id into the bytes that it "owns", ownership is based on round robin
// (global_id % num_id) num_id is equal to number of SVM devices in the system plus one (for the
// host code). id is the index (id) of the device that this kernel is executing on. For example, if
// there are 2 SVM devices and the host; the buffer should look like this after each device and the
// host write their id's: 0, 1, 2, 0, 1, 2, 0, 1, 2...
__global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
int owner = i % num_id;
if (id == owner) a[i] = id; // modify location if it belongs to this device, write id
}
// Verify that a device can see the byte sized updates from the other devices, sum up the device
// id's and see if they match expected value. Note: this must be called with a reduced NDRange so
// that neighbor acesses don't go past end of buffer. For example if there are two SVM devices and
// the host (3 total devices) the buffer should look like this: 0,1,2,0,1,2... and the expected sum
// at each point is 0+1+2 = 3.
__global__ void sum_neighbor_locations(char* a, unsigned int num_devices,
unsigned int* error_count) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2;
unsigned int sum = 0;
for (unsigned int j = 0; j < num_devices; j++) {
sum += a[i + j]; // add my neighbors to the right
}
if (sum != expected_sum)
atomicAdd_system(error_count, 1u); // like opencl atomic_inc()
}
/**
* Test Description
* ------------------------
* - The suite will test the following functions,
hipHostMalloc() with following flags,
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
atomicAdd_system()(in kernel)
hipStreamCreate()
hipStreamSynchronize()
* It will demonstrate use of SVM's atomics to do fine grain synchronization among
* devices with each stream on each device. The result will be verified on the host.
* Test source
* ------------------------
* - catch/unit/memory/hipSVMTestByteGranularity.cpp
* Test requirements
* ------------------------
* - Host specific (WINDOWS and LINUX)
* - Fine grain access and atomics supported on devices
* - HIP_VERSION >= 5.7
*/
TEST_CASE("test_svm_byte_granularity") {
const int num_elements = 2048;
int num_devices = 0;
HIP_CHECK(hipGetDeviceCount(&num_devices));
int num_devices_plus_host = num_devices + 1;
std::vector<hipStream_t> streams(num_devices);
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
HIP_CHECK(hipStreamCreate(&streams[d]));
}
HIP_CHECK(hipSetDevice(0));
char* pA = nullptr;
// hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER
HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent));
unsigned int** error_counts = (unsigned int**)malloc(sizeof(void*) * num_devices);
for(unsigned int i=0; i < num_devices; i++) {
// hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS
// We need atomic inc among different GPUs
HIP_CHECK(hipHostMalloc(&error_counts[i], sizeof(unsigned int) * num_elements,
hipHostMallocCoherent));
*error_counts[i] = 0;
}
for(int i = 0; i < num_elements; i++) pA[i] = -1;
// get all the devices going simultaneously
for(unsigned int d = 0; d < num_devices; d++) // device ids starting at 1.
{
write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
HIP_CHECK(hipGetLastError());
}
unsigned int host_id = num_devices; // host code will take the id above the devices.
for(unsigned int i = num_devices; i < num_elements; i+= num_devices_plus_host)
pA[i] = host_id;
for (unsigned int d = 0; d < num_devices; d++) {
HIP_CHECK(hipStreamSynchronize(streams[d]));
}
// now check that each device can see the byte writes made by the other devices.
// adjusted so sum_neighbor_locations doesn't read past end of buffer
size_t adjusted_num_elements = num_elements - num_devices;
for(unsigned int d = 0; d < num_devices; d++)
{
sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
error_counts[d]);
HIP_CHECK(hipGetLastError());
}
for (unsigned int d = 0; d < num_devices; d++) {
HIP_CHECK(hipStreamSynchronize(streams[d]));
}
// see if any of the devices found errors
for(unsigned int d = 0; d < num_devices; d++) {
if (*error_counts[d] > 0) {
fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]);
REQUIRE(false);
}
}
unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1))/2;
// check that host can see the byte writes made by the devices.
for(unsigned int i = 0; i < num_elements - num_devices_plus_host; i++)
{
unsigned int sum = 0;
for(unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i+j];
if (sum != expected) {
fprintf(stderr, "[%u]: sum %u != expected %u", i, sum, expected);
REQUIRE(false);
}
}
for (unsigned int i = 0; i < num_devices; i++) {
HIP_CHECK(hipStreamDestroy(streams[i]));
HIP_CHECK(hipHostFree(error_counts[i]));
}
free(error_counts);
HIP_CHECK(hipHostFree(pA));
REQUIRE(true);
}
@@ -0,0 +1,261 @@
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
* Modifications Copyright (C)2023 Advanced
* Micro Devices, Inc. All rights reserved.
*/
#include <chrono>
#include <thread>
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <utils.hh>
#include "hipSVMCommon.h"
//#define DEBUG_ATOMIC // To provide additional data for debugging
#ifdef DEBUG_ATOMIC
//#define DEBUG_ATOMIC_PRINT_THREAD
#endif
typedef struct BinNode {
#ifdef DEBUG_ATOMIC
unsigned int n;
unsigned int d;
unsigned int i;
#endif
unsigned int value;
struct BinNode* pNext;
} BinNode;
__global__ void build_hash_table_on_device(unsigned int* input, size_t inputSize,
BinNode* pNodes,
unsigned int* pNumNodes, unsigned int numBins,
unsigned int dev) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= inputSize) return;
unsigned int n = atomicAdd_system((unsigned int*)pNumNodes, 1u);
BinNode* pNew = &pNodes[n];
unsigned int b = input[i] % numBins;
pNew->value = input[i];
#ifdef DEBUG_ATOMIC
pNew->d = dev;
pNew->i = i;
pNew->n = n;
#endif
unsigned long long next = 0;
unsigned long long old = atomicAdd_system((unsigned long long*)&(pNodes[b].pNext),
0ull); // Because of no atomicLoad()
do {
next = old;
// Use CAS to ensure atomic operation
//pNew->pNext = (BinNode*)next;
atomicExch((unsigned long long*)&(pNew->pNext), next);
old = atomicCAS_system((unsigned long long *)&(pNodes[b].pNext), next,
(unsigned long long )pNew);
} while (old != next);
#ifdef DEBUG_ATOMIC_PRINT_THREAD
printf("k%u: i=%zu, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p,"
" old=%p, input[%zu]=%u\n", dev, i, n,
pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b], (void*)old,
i, input[i]);
#else
(void)dev;
#endif
}
void build_hash_table_on_host(unsigned int* input, size_t inputSize, BinNode* pNodes,
unsigned int* pNumNodes, unsigned int numBins,
unsigned int dev) {
// wait until we see some activity from a device (try to run host side simultaneously).
while (numBins == AtomicLoad32(pNumNodes));
for(unsigned int i = 0; i < inputSize; i++)
{
unsigned int n = AtomicFetchAdd32(pNumNodes, 1u);
BinNode* pNew = &pNodes[n];
unsigned int b = input[i] % numBins;
#ifdef DEBUG_ATOMIC
pNew->d = dev;
pNew->i = i;
pNew->n = n;
#endif
pNew->value = input[i];
BinNode* next = AtomicFetchAdd64(&pNodes[b].pNext, (BinNode*)0ll);
do {
AtomicExchange64(&(pNew->pNext), next);
// always inserting at head of list
} while (!AtomicCompareExchange64(&(pNodes[b].pNext), &next,
(BinNode*)pNew));
#ifdef DEBUG_ATOMIC_PRINT_THREAD
fprintf(stderr,
"k%u: i=%u, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p, "
"input[%u]=%u\n",
dev, i, n, pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b],
i, input[i]);
#else
(void)dev;
#endif
}
}
void launch_kernels_and_verify(std::vector<hipStream_t> &streams, unsigned int num_devices,
unsigned int numBins, size_t num_pixels) {
unsigned int* pInputImage = nullptr;
BinNode* pNodes = nullptr;
unsigned int* pNumNodes = nullptr;
unsigned int total_items = num_pixels * (num_devices + 1);
HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocCoherent));
HIP_CHECK(
hipHostMalloc(&pNodes, sizeof(BinNode) * (total_items + numBins), hipHostMallocCoherent));
HIP_CHECK(hipHostMalloc(&pNumNodes, sizeof(unsigned int), hipHostMallocCoherent));
*pNumNodes = numBins; // using the first numBins nodes to hold the list heads.
for(unsigned int i = 0; i < numBins; i++) pNodes[i].pNext = nullptr;
for(unsigned int i = 0; i < num_pixels; i++) pInputImage[i] = i;
// Get all the devices going simultaneously, each device (and the host) will insert
// all the pixels.
for(unsigned int d=0; d < num_devices; d++)
{
build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>(
pInputImage, num_pixels, pNodes, pNumNodes, numBins, d);
HIP_CHECK(hipGetLastError());
}
std::vector<std::thread> threads;
threads.push_back(std::thread(build_hash_table_on_host, pInputImage, num_pixels, pNodes,
pNumNodes, numBins, num_devices));
for (unsigned int d = 0; d < num_devices; d++) {
threads.push_back(std::thread(
[](hipStream_t s) {
HIP_CHECK(hipStreamSynchronize(s)); // To workarround batch dispatching on Windows
}, streams[d]));
}
std::for_each(threads.begin(), threads.end(), [](std::thread& t) { t.join(); });
for (unsigned int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
HIP_CHECK(hipDeviceSynchronize());
}
HIP_CHECK(hipSetDevice(0));
unsigned int num_items = 0;
// check correctness of each bin in the hash table.
for(unsigned int i = 0; i < numBins; i++)
{
BinNode *pNode = pNodes[i].pNext;
unsigned int num_items_bin = 0;
unsigned int total_num_items_bin =
(num_pixels % numBins <= i) ? (num_pixels / numBins) : (num_pixels / numBins + 1);
total_num_items_bin *= (num_devices + 1); // The item number of the list in i-th bin
while(pNode)
{
#ifdef DEBUG_ATOMIC_PRINT_THREAD
fprintf(stderr, "v%u/%u: %u, pNode=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p)\n", i, numBins,
num_items_bin, pNode, pNode->n, pNode->d, pNode->i, pNode->value, pNode->pNext);
#endif
if((pNode->value % numBins) != i)
{
fprintf(stderr,
"Something went wrong at i=%u, item is in wrong hash bucket:" \
"pNode->value=%u, numBins=%u\n", i, pNode->value, numBins);
REQUIRE(false);
}
num_items++;
num_items_bin++;
if (num_items_bin > total_num_items_bin) {
fprintf(stderr,
"Something went wrong at i=%u/%u, num_items_bin(%u)>total_num_items_bin(%u)\n",
i, numBins, num_items_bin, total_num_items_bin);
REQUIRE(false);
}
pNode = pNode->pNext;
}
if (num_items_bin != total_num_items_bin) {
fprintf(stderr,
"Something went wrong at i=%u/%u, num_items_bin(%u)!=total_num_items_bin(%u)\n",
i, numBins, num_items_bin, total_num_items_bin);
}
}
HIP_CHECK(hipHostFree(pInputImage));
HIP_CHECK(hipHostFree(pNodes));
HIP_CHECK(hipHostFree(pNumNodes));
// each device and the host inserted all of the pixels, check that none are missing.
if (num_items != total_items)
{
fprintf(stderr, "The hash table is not correct, num items %u != expected num items: %u\n",
num_items, total_items);
REQUIRE(false); // test did not pass
}
REQUIRE(true);
}
/**
* Test Description
* ------------------------
* - The suite will test the following functions,
hipHostMalloc() with following flags,
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
atomicAdd_system()(in kernel)
atomicCAS_system()(in kernel)
atomicExch()(in kernel)
InterlockedExchangeAdd()(in WINDOWS host)
__sync_add_and_fetch()(in LINUX host)
InterlockedExchangeAdd64()(in WINDOWS host)
InterlockedExchangePointer()(in WINDOWS host)
__sync_lock_test_and_set()(in LINUX host)
InterlockedCompareExchange64()(in WINDOWS host)
__sync_val_compare_and_swap()(in LINUX host)
hipDeviceSynchronize()
* It will demonstrate use of SVM's atomics to do fine grain synchronization among
* devices and the host.
* Concept: Each device and the host simultaneously insert values into a single hash table.
* Each bin in the hash table is a linked list. Each bin is protected against simultaneous
* update using a lock free technique. The correctness of the list is verified on the host.
* Test source
* ------------------------
* - catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp
* Test requirements
* ------------------------
* - Host specific (WINDOWS and LINUX)
* - Fine grain access and atomics supported on devices and host
* - HIP_VERSION >= 5.7
*/
TEST_CASE("test_svm_fine_grain_memory_consistency") {
const int num_elements = 2167;
int num_devices = 0;
HIP_CHECK(hipGetDeviceCount(&num_devices));
std::vector<hipStream_t> streams(num_devices);
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
HIP_CHECK(hipStreamCreate(&streams[d]));
}
HIP_CHECK(hipSetDevice(0));
// all work groups in all devices and the host code will hammer on this one lock.
unsigned int numBins = 1;
launch_kernels_and_verify(streams, num_devices, numBins, num_elements);
numBins = 2; // 2 locks within in same cache line will get hit from different devices and host.
launch_kernels_and_verify(streams, num_devices, numBins, num_elements);
numBins = 29; // locks span a few cache lines.
launch_kernels_and_verify(streams, num_devices, numBins, num_elements);
for (unsigned int i = 0; i < num_devices; i++) {
HIP_CHECK(hipStreamDestroy(streams[i]));
}
}
@@ -0,0 +1,129 @@
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
* Modifications Copyright (C)2023 Advanced
* Micro Devices, Inc. All rights reserved.
*/
#include <chrono>
#include <thread>
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <utils.hh>
#include "hipSVMCommon.h"
#define MAX_TARGETS 1024
__global__ void find_targets(unsigned int* image, unsigned int target,
unsigned int* numTargetsFound,
unsigned int* targetLocations) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int index = 0;
if (image[i] == target) {
index = atomicAdd((unsigned int*)numTargetsFound, 1u);
if (index < MAX_TARGETS) {
atomicExch_system((unsigned int *)&targetLocations[index], (unsigned int)i);
}
}
}
void spawnAnalysisTask(int location)
{
printf("found target at location %d\n", location);
}
/**
* Test Description
* ------------------------
* - The suite will test the following functions,
hipHostMalloc() with following flags,
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
atomicAdd()(in kernel)
atomicExch_system()(in kernel)
InterlockedExchangeAdd()(in WINDOWS host)
__sync_add_and_fetch()(in LINUX host)
hipStreamCreate()
hipEventCreate()
hipEventRecord()
hipEventQuery()
* It will demonstrate use of SVM's atomics to do fine grain synchronization between
* a device and the host. The result will be verified on the host.
* Concept: a device kernel is used to search an input image for regions that match a
* target pattern. The device immediately notifies the host when it finds a target
* (via an atomic operation that works across host and devices). The host is then able
* to spawn a task that further analyzes the target while the device continues searching
* for more targets.
* Test source
* ------------------------
* - catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp
* Test requirements
* ------------------------
* - Host specific (WINDOWS and LINUX)
* - Fine grain access and atomics supported on device and host
* - HIP_VERSION >= 5.7
*/
TEST_CASE("test_svm_fine_grain_sync_buffers") {
size_t num_pixels = 1024 * 1024 * 2;
hipStream_t stream;
HIP_CHECK(hipSetDevice(0));
HIP_CHECK(hipStreamCreate(&stream));
hipEvent_t event;
HIP_CHECK(hipEventCreate(&event));
unsigned int *pInputImage, *pNumTargetsFound, *pTargetLocations;
HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocNonCoherent));
HIP_CHECK(hipHostMalloc(&pNumTargetsFound, sizeof(unsigned int), hipHostMallocCoherent));
HIP_CHECK(hipHostMalloc(&pTargetLocations, sizeof(int) * MAX_TARGETS, hipHostMallocCoherent));
unsigned int targetDescriptor = 777;
*pNumTargetsFound = 0;
unsigned int i;
for(i = 0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1;
for(i = 0; i < num_pixels; i++) pInputImage[i] = 0;
pInputImage[0] = targetDescriptor;
pInputImage[3] = targetDescriptor;
pInputImage[num_pixels - 1] = targetDescriptor;
find_targets<<<(num_pixels + 255) / 256, 256, 0, stream>>>(pInputImage, targetDescriptor,
pNumTargetsFound, pTargetLocations);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipEventRecord(event, stream));
i=0;
hipError_t status = hipSuccess;
unsigned int loc = 0;
// check for new targets, if found spawn a task to analyze target.
do {
status = hipEventQuery(event);
if (status != hipErrorNotReady && status != hipSuccess) {
fprintf(stderr, "Unexpected status = %d\n", status);
REQUIRE(false);
}
loc = AtomicLoad32(&pTargetLocations[i]);
if (loc != -1) // -1 indicates slot not used yet.
{
spawnAnalysisTask(loc); // Do something...
i++;
}
} while (status == hipErrorNotReady ||
AtomicLoad32(&pTargetLocations[i]) != -1);
HIP_CHECK(hipHostFree(pInputImage));
HIP_CHECK(hipHostFree(pNumTargetsFound));
HIP_CHECK(hipHostFree(pTargetLocations));
HIP_CHECK(hipEventDestroy(event));
HIP_CHECK(hipStreamDestroy(stream));
REQUIRE(i == 3);
}
@@ -0,0 +1,297 @@
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
* Modifications Copyright (C)2023 Advanced
* Micro Devices, Inc. All rights reserved.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <utils.hh>
#include "hipSVMCommon.h"
// const char *linked_list_create_and_verify_kernels[] = {
typedef struct Node {
unsigned int global_id;
unsigned int position_in_list;
struct Node* pNext;
} Node;
// The allocation_index parameter must be initialized on the host to N work-items
// The first N nodes in pNodes will be the heads of the lists.
__global__ void create_linked_lists_on_device(Node* pNodes,
unsigned int* allocation_index,
unsigned int list_length) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
Node* pNode = &pNodes[i];
pNode->global_id = i;
pNode->position_in_list = 0;
Node* pNew;
for (unsigned int j = 1; j < list_length; j++) {
pNew = &pNodes[atomicAdd(allocation_index, 1u)]; // allocate a new node
pNew->global_id = i;
pNew->position_in_list = j;
pNode->pNext = pNew; // link new node onto end of list
pNode = pNew; // move to end of list
}
}
__global__ void verify_linked_lists_on_device(Node* pNodes, unsigned int* num_correct,
unsigned int list_length) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
Node* pNode = &pNodes[i];
for (unsigned int j = 0; j < list_length; j++) {
if (pNode->global_id == i && pNode->position_in_list == j) {
atomicAdd(num_correct, 1u);
} else {
break;
}
pNode = pNode->pNext;
}
}
// The first N nodes in pNodes will be the heads of the lists.
void create_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) {
unsigned int allocation_index = num_lists; // heads of lists are in first num_lists nodes.
for (unsigned int i = 0; i < num_lists; i++) {
Node* pNode = &pNodes[i];
pNode->global_id = i;
pNode->position_in_list = 0;
Node* pNew;
for (unsigned int j = 1; j < list_length; j++) {
pNew = &pNodes[allocation_index++]; // allocate a new node
pNew->global_id = i;
pNew->position_in_list = j;
pNode->pNext = pNew; // link new node onto end of list
pNode = pNew; // move to end of list
}
}
}
void verify_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) {
unsigned int numCorrect = 0;
for (unsigned int i = 0; i < num_lists; i++) {
Node* pNode = &pNodes[i];
for (int j = 0; j < list_length; j++) {
if (pNode->global_id == i && pNode->position_in_list == j) {
numCorrect++;
} else {
break;
}
pNode = pNode->pNext;
}
}
if (numCorrect != list_length * num_lists) {
fprintf(stderr, "Failed\n");
REQUIRE(false);
}
}
void create_linked_lists_on_device(hipStream_t stream, Node* pNodes,
unsigned int* pAllocator, unsigned int numLists,
unsigned int ListLength) {
// reset allocator index
*pAllocator = numLists; // the first numLists elements of the nodes array are already
// allocated (they hold the head of each list).
create_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pAllocator,
ListLength);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipStreamSynchronize(stream));
}
void verify_linked_lists_on_device(hipStream_t stream, Node* pNodes,
unsigned int* pNumCorrect, unsigned int numLists,
unsigned int ListLength) {
*pNumCorrect = 0; // reset numCorrect to zero
verify_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pNumCorrect,
ListLength);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipStreamSynchronize(stream));
int correct_count = *pNumCorrect;
if(correct_count != ListLength * numLists)
{
fprintf(stderr,"Failed\n");
REQUIRE(false);
}
}
/**
* Test Description
* ------------------------
* - The suite will test the following functions,
hipHostMalloc() with following flags,
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
atomicAdd()(in kernel)
hipStreamCreate()
hipStreamSynchronize()
* It will test that all devices and the host share a common address space using fine-grain
* host buffers.
* Concept: This is done by creating a linked list on a device and then verifying the
* correctness of the list on another device or the host. This basic test is performed for all
* combinations of devices and the host that exist within the platform. The test passes only if
* every combination passes.
* Test source
* ------------------------
* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
* Test requirements
* ------------------------
* - Host specific (WINDOWS and LINUX)
* - Fine grain access supported on devices and host
* - HIP_VERSION >= 5.7
*/
TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
const unsigned int num_elements = 1024;
int num_devices = 0;
HIP_CHECK(hipGetDeviceCount(&num_devices));
int num_devices_plus_host = num_devices + 1;
std::vector<hipStream_t> streams(num_devices);
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
HIP_CHECK(hipStreamCreate(&streams[d]));
}
HIP_CHECK(hipSetDevice(0));
unsigned int numLists = num_elements;
unsigned int ListLength = 32;
Node* pNodes = nullptr;
unsigned int* pAllocator = nullptr;
unsigned int* pNumCorrect = nullptr;
HIP_CHECK(hipHostMalloc(&pNodes, sizeof(Node) * ListLength * numLists, hipHostMallocNonCoherent));
HIP_CHECK(hipHostMalloc(&pAllocator, sizeof(unsigned int), hipHostMallocNonCoherent));
HIP_CHECK(hipHostMalloc(&pNumCorrect, sizeof(unsigned int), hipHostMallocNonCoherent));
// Create linked list on one device and verify on another device (or the host).
// Do this for all possible combinations of devices and host within the platform.
// ci is CreationIndex, index of device/q to create linked list on
for (int ci=0; ci<num_devices_plus_host; ci++)
{
// vi is VerificationIndex, index of device/q to verify linked list on
for (int vi = 0; vi < num_devices_plus_host; vi++)
{
if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
{
create_linked_lists_on_host(pNodes, numLists, ListLength);
}
else
{
create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
ListLength);
}
if(vi == num_devices)
{
verify_linked_lists_on_host(pNodes, numLists, ListLength);
}
else
{
verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
ListLength);
}
}
}
HIP_CHECK(hipHostFree(pNodes));
HIP_CHECK(hipHostFree(pAllocator));
HIP_CHECK(hipHostFree(pNumCorrect));
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipStreamDestroy(streams[d]));
}
REQUIRE(true);
}
/**
* Test Description
* ------------------------
* - The suite will test the following functions,
align_malloc()
atomicAdd()(in kernel)
hipStreamCreate()
hipStreamSynchronize()
* It will test that all devices and the host share a common address space using fine-grain mode
* with regular host buffers.
* Concept: This is done by creating a linked list on a device and then verifying the
* correctness of the list on another device or the host. This basic test is performed for all
* combinations of devices and the host that exist within the platform. The test passes only if
* every combination passes.
* Test source
* ------------------------
* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
* Test requirements
* ------------------------
* - Host specific (WINDOWS and LINUX)
* - System fine grain access supported on devices
* - HIP_VERSION >= 5.7
*/
TEST_CASE("test_svm_shared_address_space_fine_grain_system") {
fprintf(stderr, "test_svm_shared_address_space_fine_grain_system ignored\n");
return;// blocked by SWDEV-422544 add HIP flag for APU device
const unsigned int num_elements = 1024;
int num_devices = 0;
HIP_CHECK(hipGetDeviceCount(&num_devices));
int num_devices_plus_host = num_devices + 1;
std::vector<hipStream_t> streams(num_devices);
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
HIP_CHECK(hipStreamCreate(&streams[d]));
}
HIP_CHECK(hipSetDevice(0));
unsigned int numLists = num_elements;
unsigned int ListLength = 32;
// this allocation holds the linked list nodes.
Node* pNodes = (Node*)align_malloc(numLists * ListLength * sizeof(Node), 128);
// this allocation holds an index into the nodes buffer, it is used for node allocation
unsigned int* pAllocator = (unsigned int*)align_malloc(sizeof(unsigned int), 128);
// this allocation holds the count of correct nodes, which is computed by the verify kernel.
unsigned int* pNumCorrect = (unsigned int*)align_malloc(sizeof(unsigned int), 128);
// ci is CreationIndex, index of device/q to create linked list on
for (int ci = 0; ci < num_devices_plus_host; ci++) {
// vi is VerificationIndex, index of device/q to verify linked list on
for (int vi = 0; vi < num_devices_plus_host; vi++) {
if (ci == num_devices) // last device index represents the host, note the num_device+1 above.
{
create_linked_lists_on_host(pNodes, numLists, ListLength);
} else {
create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
ListLength);
}
if (vi == num_devices) {
verify_linked_lists_on_host(pNodes, numLists, ListLength);
} else {
verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
ListLength);
}
}
}
align_free(pNodes);
align_free(pAllocator);
align_free(pNumCorrect);
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipStreamDestroy(streams[d]));
}
REQUIRE(true);
}