SWDEV-313387 - Port CTS SVM atomic tests to Hip
Change-Id: I32c9bed860ddf4fe1d7bba21dce9bd728168c398
[ROCm/hip-tests commit: f425bee1dc]
This commit is contained in:
@@ -178,5 +178,16 @@ hip_add_exe_to_target(NAME MemoryTest2
|
||||
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
|
||||
|
||||
if(HIP_PLATFORM MATCHES "amd")
|
||||
set(TEST_SRC
|
||||
hipSVMTestByteGranularity.cpp
|
||||
hipSVMTestFineGrainMemoryConsistency.cpp
|
||||
hipSVMTestFineGrainSyncBuffers.cpp
|
||||
hipSVMTestSharedAddressSpaceFineGrain.cpp
|
||||
)
|
||||
|
||||
hip_add_exe_to_target(NAME SVMAtomicTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
|
||||
|
||||
add_dependencies(build_tests hipHostRegisterPerf)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
/*
|
||||
* Modifications Copyright (C)2023 Advanced
|
||||
* Micro Devices, Inc. All rights reserved.
|
||||
*/
|
||||
#ifndef __COMMON_H__
|
||||
#define __COMMON_H__
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// SVM Atomic wrappers.
|
||||
// Platforms that support SVM atomics (atomics that work across the host and devices) need to
|
||||
// implement these host side functions correctly. Platforms that do not support SVM atomics can
|
||||
// simpy implement these functions as empty stubs since the functions will not be called. For now
|
||||
// only Windows x86 is implemented, add support for other platforms as needed.
|
||||
unsigned int inline AtomicLoad32(unsigned int* pValue) {
|
||||
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
|
||||
return (unsigned int)InterlockedExchangeAdd((LONG*)pValue, 0l);
|
||||
#elif defined(__GNUC__)
|
||||
return __sync_add_and_fetch(pValue, 0);
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
// all the x86 atomics are seq_cst, so don't need to do anything with the memory order parameter.
|
||||
unsigned int inline AtomicFetchAdd32(unsigned int* object, int operand) {
|
||||
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
|
||||
return InterlockedExchangeAdd((LONG*)object, operand);
|
||||
#elif defined(__GNUC__)
|
||||
return __sync_fetch_and_add(object, operand);
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T inline AtomicFetchAdd64(T* object, T operand) {
|
||||
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
|
||||
return (T)InterlockedExchangeAdd64((LONG64*)object, (LONG64)operand);
|
||||
#elif defined(__GNUC__)
|
||||
return (T)__sync_fetch_and_add((intptr_t*)object, (intptr_t)operand);
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
unsigned int inline AtomicExchange32(unsigned int* object, unsigned int desired) {
|
||||
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
|
||||
return (unsigned int)InterlockedExchange((LONG*)object, (LONG)desired);
|
||||
#elif defined(__GNUC__)
|
||||
return __sync_lock_test_and_set(object, desired);
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T inline AtomicExchange64(T* a, T expected) {
|
||||
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
|
||||
return (T)InterlockedExchangePointer((PVOID volatile*)a, (PVOID)expected);
|
||||
#elif defined(__GNUC__)
|
||||
return (T)__sync_lock_test_and_set((long long*)a, (long long)expected);
|
||||
#else
|
||||
tmp = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool AtomicCompareExchange64(T* a, T* expected, T desired)
|
||||
{
|
||||
#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
|
||||
T tmp = (T)InterlockedCompareExchange64((LONG64 *)a, (LONG64)desired,
|
||||
*(LONG64 *)expected);
|
||||
#elif defined(__GNUC__)
|
||||
T tmp = (T)__sync_val_compare_and_swap((intptr_t*)a, (intptr_t)(*expected),
|
||||
(intptr_t)desired);
|
||||
#else
|
||||
tmp = 0;
|
||||
#endif
|
||||
if(tmp == *expected)
|
||||
return true;
|
||||
*expected = tmp;
|
||||
return false;
|
||||
}
|
||||
|
||||
inline void* align_malloc(size_t size, size_t alignment) {
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
return _aligned_malloc(size, alignment);
|
||||
#elif defined(__linux__) || defined(linux) || defined(__APPLE__)
|
||||
void* ptr = NULL;
|
||||
#if defined(__ANDROID__)
|
||||
ptr = memalign(alignment, size);
|
||||
if (ptr) return ptr;
|
||||
#else
|
||||
if (alignment < sizeof(void*)) {
|
||||
alignment = sizeof(void*);
|
||||
}
|
||||
if (0 == posix_memalign(&ptr, alignment, size)) return ptr;
|
||||
#endif
|
||||
return NULL;
|
||||
#elif defined(__MINGW32__)
|
||||
return __mingw_aligned_malloc(size, alignment);
|
||||
#else
|
||||
#error "Please add support OS for aligned malloc"
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void align_free(void* ptr) {
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
_aligned_free(ptr);
|
||||
#elif defined(__linux__) || defined(linux) || defined(__APPLE__)
|
||||
return free(ptr);
|
||||
#elif defined(__MINGW32__)
|
||||
return __mingw_aligned_free(ptr);
|
||||
#else
|
||||
#error "Please add support OS for aligned free"
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // #ifndef __COMMON_H__
|
||||
|
||||
@@ -0,0 +1,154 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
/*
|
||||
* Modifications Copyright (C)2023 Advanced
|
||||
* Micro Devices, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <resource_guards.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
// Each device will write it's id into the bytes that it "owns", ownership is based on round robin
|
||||
// (global_id % num_id) num_id is equal to number of SVM devices in the system plus one (for the
|
||||
// host code). id is the index (id) of the device that this kernel is executing on. For example, if
|
||||
// there are 2 SVM devices and the host; the buffer should look like this after each device and the
|
||||
// host write their id's: 0, 1, 2, 0, 1, 2, 0, 1, 2...
|
||||
__global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int owner = i % num_id;
|
||||
if (id == owner) a[i] = id; // modify location if it belongs to this device, write id
|
||||
}
|
||||
|
||||
// Verify that a device can see the byte sized updates from the other devices, sum up the device
|
||||
// id's and see if they match expected value. Note: this must be called with a reduced NDRange so
|
||||
// that neighbor acesses don't go past end of buffer. For example if there are two SVM devices and
|
||||
// the host (3 total devices) the buffer should look like this: 0,1,2,0,1,2... and the expected sum
|
||||
// at each point is 0+1+2 = 3.
|
||||
__global__ void sum_neighbor_locations(char* a, unsigned int num_devices,
|
||||
unsigned int* error_count) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2;
|
||||
unsigned int sum = 0;
|
||||
for (unsigned int j = 0; j < num_devices; j++) {
|
||||
sum += a[i + j]; // add my neighbors to the right
|
||||
}
|
||||
if (sum != expected_sum)
|
||||
atomicAdd_system(error_count, 1u); // like opencl atomic_inc()
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - The suite will test the following functions,
|
||||
hipHostMalloc() with following flags,
|
||||
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
|
||||
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
|
||||
atomicAdd_system()(in kernel)
|
||||
hipStreamCreate()
|
||||
hipStreamSynchronize()
|
||||
* It will demonstrate use of SVM's atomics to do fine grain synchronization among
|
||||
* devices with each stream on each device. The result will be verified on the host.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/memory/hipSVMTestByteGranularity.cpp
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Host specific (WINDOWS and LINUX)
|
||||
* - Fine grain access and atomics supported on devices
|
||||
* - HIP_VERSION >= 5.7
|
||||
*/
|
||||
TEST_CASE("test_svm_byte_granularity") {
|
||||
const int num_elements = 2048;
|
||||
int num_devices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&num_devices));
|
||||
int num_devices_plus_host = num_devices + 1;
|
||||
std::vector<hipStream_t> streams(num_devices);
|
||||
|
||||
for (int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipSetDevice(d));
|
||||
HIP_CHECK(hipStreamCreate(&streams[d]));
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
char* pA = nullptr;
|
||||
// hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER
|
||||
HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent));
|
||||
unsigned int** error_counts = (unsigned int**)malloc(sizeof(void*) * num_devices);
|
||||
|
||||
for(unsigned int i=0; i < num_devices; i++) {
|
||||
// hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS
|
||||
// We need atomic inc among different GPUs
|
||||
HIP_CHECK(hipHostMalloc(&error_counts[i], sizeof(unsigned int) * num_elements,
|
||||
hipHostMallocCoherent));
|
||||
*error_counts[i] = 0;
|
||||
}
|
||||
for(int i = 0; i < num_elements; i++) pA[i] = -1;
|
||||
|
||||
// get all the devices going simultaneously
|
||||
for(unsigned int d = 0; d < num_devices; d++) // device ids starting at 1.
|
||||
{
|
||||
write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
unsigned int host_id = num_devices; // host code will take the id above the devices.
|
||||
for(unsigned int i = num_devices; i < num_elements; i+= num_devices_plus_host)
|
||||
pA[i] = host_id;
|
||||
|
||||
for (unsigned int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipStreamSynchronize(streams[d]));
|
||||
}
|
||||
|
||||
// now check that each device can see the byte writes made by the other devices.
|
||||
// adjusted so sum_neighbor_locations doesn't read past end of buffer
|
||||
size_t adjusted_num_elements = num_elements - num_devices;
|
||||
for(unsigned int d = 0; d < num_devices; d++)
|
||||
{
|
||||
sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
|
||||
error_counts[d]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
for (unsigned int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipStreamSynchronize(streams[d]));
|
||||
}
|
||||
// see if any of the devices found errors
|
||||
for(unsigned int d = 0; d < num_devices; d++) {
|
||||
if (*error_counts[d] > 0) {
|
||||
fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]);
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1))/2;
|
||||
// check that host can see the byte writes made by the devices.
|
||||
for(unsigned int i = 0; i < num_elements - num_devices_plus_host; i++)
|
||||
{
|
||||
unsigned int sum = 0;
|
||||
for(unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i+j];
|
||||
if (sum != expected) {
|
||||
fprintf(stderr, "[%u]: sum %u != expected %u", i, sum, expected);
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
for (unsigned int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipStreamDestroy(streams[i]));
|
||||
HIP_CHECK(hipHostFree(error_counts[i]));
|
||||
}
|
||||
free(error_counts);
|
||||
HIP_CHECK(hipHostFree(pA));
|
||||
REQUIRE(true);
|
||||
}
|
||||
@@ -0,0 +1,261 @@
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
/*
|
||||
* Modifications Copyright (C)2023 Advanced
|
||||
* Micro Devices, Inc. All rights reserved.
|
||||
*/
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <utils.hh>
|
||||
#include "hipSVMCommon.h"
|
||||
//#define DEBUG_ATOMIC // To provide additional data for debugging
|
||||
#ifdef DEBUG_ATOMIC
|
||||
//#define DEBUG_ATOMIC_PRINT_THREAD
|
||||
#endif
|
||||
|
||||
typedef struct BinNode {
|
||||
#ifdef DEBUG_ATOMIC
|
||||
unsigned int n;
|
||||
unsigned int d;
|
||||
unsigned int i;
|
||||
#endif
|
||||
unsigned int value;
|
||||
struct BinNode* pNext;
|
||||
} BinNode;
|
||||
|
||||
__global__ void build_hash_table_on_device(unsigned int* input, size_t inputSize,
|
||||
BinNode* pNodes,
|
||||
unsigned int* pNumNodes, unsigned int numBins,
|
||||
unsigned int dev) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i >= inputSize) return;
|
||||
|
||||
unsigned int n = atomicAdd_system((unsigned int*)pNumNodes, 1u);
|
||||
BinNode* pNew = &pNodes[n];
|
||||
unsigned int b = input[i] % numBins;
|
||||
|
||||
pNew->value = input[i];
|
||||
#ifdef DEBUG_ATOMIC
|
||||
pNew->d = dev;
|
||||
pNew->i = i;
|
||||
pNew->n = n;
|
||||
#endif
|
||||
unsigned long long next = 0;
|
||||
unsigned long long old = atomicAdd_system((unsigned long long*)&(pNodes[b].pNext),
|
||||
0ull); // Because of no atomicLoad()
|
||||
do {
|
||||
next = old;
|
||||
// Use CAS to ensure atomic operation
|
||||
//pNew->pNext = (BinNode*)next;
|
||||
atomicExch((unsigned long long*)&(pNew->pNext), next);
|
||||
old = atomicCAS_system((unsigned long long *)&(pNodes[b].pNext), next,
|
||||
(unsigned long long )pNew);
|
||||
} while (old != next);
|
||||
#ifdef DEBUG_ATOMIC_PRINT_THREAD
|
||||
printf("k%u: i=%zu, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p,"
|
||||
" old=%p, input[%zu]=%u\n", dev, i, n,
|
||||
pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b], (void*)old,
|
||||
i, input[i]);
|
||||
#else
|
||||
(void)dev;
|
||||
#endif
|
||||
}
|
||||
|
||||
void build_hash_table_on_host(unsigned int* input, size_t inputSize, BinNode* pNodes,
|
||||
unsigned int* pNumNodes, unsigned int numBins,
|
||||
unsigned int dev) {
|
||||
// wait until we see some activity from a device (try to run host side simultaneously).
|
||||
while (numBins == AtomicLoad32(pNumNodes));
|
||||
for(unsigned int i = 0; i < inputSize; i++)
|
||||
{
|
||||
unsigned int n = AtomicFetchAdd32(pNumNodes, 1u);
|
||||
BinNode* pNew = &pNodes[n];
|
||||
unsigned int b = input[i] % numBins;
|
||||
#ifdef DEBUG_ATOMIC
|
||||
pNew->d = dev;
|
||||
pNew->i = i;
|
||||
pNew->n = n;
|
||||
#endif
|
||||
pNew->value = input[i];
|
||||
BinNode* next = AtomicFetchAdd64(&pNodes[b].pNext, (BinNode*)0ll);
|
||||
do {
|
||||
AtomicExchange64(&(pNew->pNext), next);
|
||||
// always inserting at head of list
|
||||
} while (!AtomicCompareExchange64(&(pNodes[b].pNext), &next,
|
||||
(BinNode*)pNew));
|
||||
#ifdef DEBUG_ATOMIC_PRINT_THREAD
|
||||
fprintf(stderr,
|
||||
"k%u: i=%u, n=%u, pNew=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p), pNodes[%u]=%p, "
|
||||
"input[%u]=%u\n",
|
||||
dev, i, n, pNew, pNew->n, pNew->d, pNew->i, pNew->value, pNew->pNext, b, &pNodes[b],
|
||||
i, input[i]);
|
||||
#else
|
||||
(void)dev;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void launch_kernels_and_verify(std::vector<hipStream_t> &streams, unsigned int num_devices,
|
||||
unsigned int numBins, size_t num_pixels) {
|
||||
unsigned int* pInputImage = nullptr;
|
||||
BinNode* pNodes = nullptr;
|
||||
unsigned int* pNumNodes = nullptr;
|
||||
unsigned int total_items = num_pixels * (num_devices + 1);
|
||||
HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocCoherent));
|
||||
HIP_CHECK(
|
||||
hipHostMalloc(&pNodes, sizeof(BinNode) * (total_items + numBins), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostMalloc(&pNumNodes, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
|
||||
*pNumNodes = numBins; // using the first numBins nodes to hold the list heads.
|
||||
for(unsigned int i = 0; i < numBins; i++) pNodes[i].pNext = nullptr;
|
||||
for(unsigned int i = 0; i < num_pixels; i++) pInputImage[i] = i;
|
||||
|
||||
// Get all the devices going simultaneously, each device (and the host) will insert
|
||||
// all the pixels.
|
||||
for(unsigned int d=0; d < num_devices; d++)
|
||||
{
|
||||
build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>(
|
||||
pInputImage, num_pixels, pNodes, pNumNodes, numBins, d);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
threads.push_back(std::thread(build_hash_table_on_host, pInputImage, num_pixels, pNodes,
|
||||
pNumNodes, numBins, num_devices));
|
||||
for (unsigned int d = 0; d < num_devices; d++) {
|
||||
threads.push_back(std::thread(
|
||||
[](hipStream_t s) {
|
||||
HIP_CHECK(hipStreamSynchronize(s)); // To workarround batch dispatching on Windows
|
||||
}, streams[d]));
|
||||
}
|
||||
std::for_each(threads.begin(), threads.end(), [](std::thread& t) { t.join(); });
|
||||
|
||||
for (unsigned int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipSetDevice(d));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
unsigned int num_items = 0;
|
||||
// check correctness of each bin in the hash table.
|
||||
for(unsigned int i = 0; i < numBins; i++)
|
||||
{
|
||||
BinNode *pNode = pNodes[i].pNext;
|
||||
unsigned int num_items_bin = 0;
|
||||
unsigned int total_num_items_bin =
|
||||
(num_pixels % numBins <= i) ? (num_pixels / numBins) : (num_pixels / numBins + 1);
|
||||
total_num_items_bin *= (num_devices + 1); // The item number of the list in i-th bin
|
||||
while(pNode)
|
||||
{
|
||||
#ifdef DEBUG_ATOMIC_PRINT_THREAD
|
||||
fprintf(stderr, "v%u/%u: %u, pNode=%p(n=%2u, d=%u, i=%4u, value=%4u, next=%p)\n", i, numBins,
|
||||
num_items_bin, pNode, pNode->n, pNode->d, pNode->i, pNode->value, pNode->pNext);
|
||||
#endif
|
||||
if((pNode->value % numBins) != i)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"Something went wrong at i=%u, item is in wrong hash bucket:" \
|
||||
"pNode->value=%u, numBins=%u\n", i, pNode->value, numBins);
|
||||
REQUIRE(false);
|
||||
}
|
||||
num_items++;
|
||||
num_items_bin++;
|
||||
if (num_items_bin > total_num_items_bin) {
|
||||
fprintf(stderr,
|
||||
"Something went wrong at i=%u/%u, num_items_bin(%u)>total_num_items_bin(%u)\n",
|
||||
i, numBins, num_items_bin, total_num_items_bin);
|
||||
REQUIRE(false);
|
||||
}
|
||||
pNode = pNode->pNext;
|
||||
}
|
||||
if (num_items_bin != total_num_items_bin) {
|
||||
fprintf(stderr,
|
||||
"Something went wrong at i=%u/%u, num_items_bin(%u)!=total_num_items_bin(%u)\n",
|
||||
i, numBins, num_items_bin, total_num_items_bin);
|
||||
}
|
||||
}
|
||||
HIP_CHECK(hipHostFree(pInputImage));
|
||||
HIP_CHECK(hipHostFree(pNodes));
|
||||
HIP_CHECK(hipHostFree(pNumNodes));
|
||||
|
||||
// each device and the host inserted all of the pixels, check that none are missing.
|
||||
if (num_items != total_items)
|
||||
{
|
||||
fprintf(stderr, "The hash table is not correct, num items %u != expected num items: %u\n",
|
||||
num_items, total_items);
|
||||
REQUIRE(false); // test did not pass
|
||||
}
|
||||
REQUIRE(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - The suite will test the following functions,
|
||||
hipHostMalloc() with following flags,
|
||||
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
|
||||
atomicAdd_system()(in kernel)
|
||||
atomicCAS_system()(in kernel)
|
||||
atomicExch()(in kernel)
|
||||
InterlockedExchangeAdd()(in WINDOWS host)
|
||||
__sync_add_and_fetch()(in LINUX host)
|
||||
InterlockedExchangeAdd64()(in WINDOWS host)
|
||||
InterlockedExchangePointer()(in WINDOWS host)
|
||||
__sync_lock_test_and_set()(in LINUX host)
|
||||
InterlockedCompareExchange64()(in WINDOWS host)
|
||||
__sync_val_compare_and_swap()(in LINUX host)
|
||||
hipDeviceSynchronize()
|
||||
* It will demonstrate use of SVM's atomics to do fine grain synchronization among
|
||||
* devices and the host.
|
||||
* Concept: Each device and the host simultaneously insert values into a single hash table.
|
||||
* Each bin in the hash table is a linked list. Each bin is protected against simultaneous
|
||||
* update using a lock free technique. The correctness of the list is verified on the host.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Host specific (WINDOWS and LINUX)
|
||||
* - Fine grain access and atomics supported on devices and host
|
||||
* - HIP_VERSION >= 5.7
|
||||
*/
|
||||
TEST_CASE("test_svm_fine_grain_memory_consistency") {
|
||||
const int num_elements = 2167;
|
||||
int num_devices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&num_devices));
|
||||
std::vector<hipStream_t> streams(num_devices);
|
||||
|
||||
for (int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipSetDevice(d));
|
||||
HIP_CHECK(hipStreamCreate(&streams[d]));
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
|
||||
// all work groups in all devices and the host code will hammer on this one lock.
|
||||
unsigned int numBins = 1;
|
||||
launch_kernels_and_verify(streams, num_devices, numBins, num_elements);
|
||||
|
||||
numBins = 2; // 2 locks within in same cache line will get hit from different devices and host.
|
||||
launch_kernels_and_verify(streams, num_devices, numBins, num_elements);
|
||||
|
||||
numBins = 29; // locks span a few cache lines.
|
||||
launch_kernels_and_verify(streams, num_devices, numBins, num_elements);
|
||||
|
||||
for (unsigned int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
/*
|
||||
* Modifications Copyright (C)2023 Advanced
|
||||
* Micro Devices, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <utils.hh>
|
||||
#include "hipSVMCommon.h"
|
||||
#define MAX_TARGETS 1024
|
||||
|
||||
__global__ void find_targets(unsigned int* image, unsigned int target,
|
||||
unsigned int* numTargetsFound,
|
||||
unsigned int* targetLocations) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
unsigned int index = 0;
|
||||
if (image[i] == target) {
|
||||
index = atomicAdd((unsigned int*)numTargetsFound, 1u);
|
||||
if (index < MAX_TARGETS) {
|
||||
atomicExch_system((unsigned int *)&targetLocations[index], (unsigned int)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void spawnAnalysisTask(int location)
|
||||
{
|
||||
printf("found target at location %d\n", location);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - The suite will test the following functions,
|
||||
hipHostMalloc() with following flags,
|
||||
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
|
||||
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
|
||||
atomicAdd()(in kernel)
|
||||
atomicExch_system()(in kernel)
|
||||
InterlockedExchangeAdd()(in WINDOWS host)
|
||||
__sync_add_and_fetch()(in LINUX host)
|
||||
hipStreamCreate()
|
||||
hipEventCreate()
|
||||
hipEventRecord()
|
||||
hipEventQuery()
|
||||
* It will demonstrate use of SVM's atomics to do fine grain synchronization between
|
||||
* a device and the host. The result will be verified on the host.
|
||||
* Concept: a device kernel is used to search an input image for regions that match a
|
||||
* target pattern. The device immediately notifies the host when it finds a target
|
||||
* (via an atomic operation that works across host and devices). The host is then able
|
||||
* to spawn a task that further analyzes the target while the device continues searching
|
||||
* for more targets.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/memory/hipSVMTestFineGrainSyncBuffers.cpp
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Host specific (WINDOWS and LINUX)
|
||||
* - Fine grain access and atomics supported on device and host
|
||||
* - HIP_VERSION >= 5.7
|
||||
*/
|
||||
TEST_CASE("test_svm_fine_grain_sync_buffers") {
|
||||
size_t num_pixels = 1024 * 1024 * 2;
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
hipEvent_t event;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
unsigned int *pInputImage, *pNumTargetsFound, *pTargetLocations;
|
||||
HIP_CHECK(hipHostMalloc(&pInputImage, sizeof(unsigned int) * num_pixels, hipHostMallocNonCoherent));
|
||||
HIP_CHECK(hipHostMalloc(&pNumTargetsFound, sizeof(unsigned int), hipHostMallocCoherent));
|
||||
HIP_CHECK(hipHostMalloc(&pTargetLocations, sizeof(int) * MAX_TARGETS, hipHostMallocCoherent));
|
||||
unsigned int targetDescriptor = 777;
|
||||
*pNumTargetsFound = 0;
|
||||
|
||||
unsigned int i;
|
||||
for(i = 0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1;
|
||||
for(i = 0; i < num_pixels; i++) pInputImage[i] = 0;
|
||||
pInputImage[0] = targetDescriptor;
|
||||
pInputImage[3] = targetDescriptor;
|
||||
pInputImage[num_pixels - 1] = targetDescriptor;
|
||||
|
||||
find_targets<<<(num_pixels + 255) / 256, 256, 0, stream>>>(pInputImage, targetDescriptor,
|
||||
pNumTargetsFound, pTargetLocations);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipEventRecord(event, stream));
|
||||
|
||||
i=0;
|
||||
hipError_t status = hipSuccess;
|
||||
unsigned int loc = 0;
|
||||
// check for new targets, if found spawn a task to analyze target.
|
||||
do {
|
||||
status = hipEventQuery(event);
|
||||
if (status != hipErrorNotReady && status != hipSuccess) {
|
||||
fprintf(stderr, "Unexpected status = %d\n", status);
|
||||
REQUIRE(false);
|
||||
}
|
||||
loc = AtomicLoad32(&pTargetLocations[i]);
|
||||
if (loc != -1) // -1 indicates slot not used yet.
|
||||
{
|
||||
spawnAnalysisTask(loc); // Do something...
|
||||
i++;
|
||||
}
|
||||
} while (status == hipErrorNotReady ||
|
||||
AtomicLoad32(&pTargetLocations[i]) != -1);
|
||||
|
||||
HIP_CHECK(hipHostFree(pInputImage));
|
||||
HIP_CHECK(hipHostFree(pNumTargetsFound));
|
||||
HIP_CHECK(hipHostFree(pTargetLocations));
|
||||
HIP_CHECK(hipEventDestroy(event));
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
REQUIRE(i == 3);
|
||||
}
|
||||
@@ -0,0 +1,297 @@
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
/*
|
||||
* Modifications Copyright (C)2023 Advanced
|
||||
* Micro Devices, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <utils.hh>
|
||||
#include "hipSVMCommon.h"
|
||||
|
||||
// const char *linked_list_create_and_verify_kernels[] = {
|
||||
typedef struct Node {
|
||||
unsigned int global_id;
|
||||
unsigned int position_in_list;
|
||||
struct Node* pNext;
|
||||
} Node;
|
||||
|
||||
// The allocation_index parameter must be initialized on the host to N work-items
|
||||
// The first N nodes in pNodes will be the heads of the lists.
|
||||
__global__ void create_linked_lists_on_device(Node* pNodes,
|
||||
unsigned int* allocation_index,
|
||||
unsigned int list_length) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Node* pNode = &pNodes[i];
|
||||
|
||||
pNode->global_id = i;
|
||||
pNode->position_in_list = 0;
|
||||
|
||||
Node* pNew;
|
||||
for (unsigned int j = 1; j < list_length; j++) {
|
||||
pNew = &pNodes[atomicAdd(allocation_index, 1u)]; // allocate a new node
|
||||
pNew->global_id = i;
|
||||
pNew->position_in_list = j;
|
||||
pNode->pNext = pNew; // link new node onto end of list
|
||||
pNode = pNew; // move to end of list
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void verify_linked_lists_on_device(Node* pNodes, unsigned int* num_correct,
|
||||
unsigned int list_length) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Node* pNode = &pNodes[i];
|
||||
|
||||
for (unsigned int j = 0; j < list_length; j++) {
|
||||
if (pNode->global_id == i && pNode->position_in_list == j) {
|
||||
atomicAdd(num_correct, 1u);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
pNode = pNode->pNext;
|
||||
}
|
||||
}
|
||||
|
||||
// The first N nodes in pNodes will be the heads of the lists.
|
||||
void create_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) {
|
||||
unsigned int allocation_index = num_lists; // heads of lists are in first num_lists nodes.
|
||||
for (unsigned int i = 0; i < num_lists; i++) {
|
||||
Node* pNode = &pNodes[i];
|
||||
pNode->global_id = i;
|
||||
pNode->position_in_list = 0;
|
||||
Node* pNew;
|
||||
for (unsigned int j = 1; j < list_length; j++) {
|
||||
pNew = &pNodes[allocation_index++]; // allocate a new node
|
||||
pNew->global_id = i;
|
||||
pNew->position_in_list = j;
|
||||
pNode->pNext = pNew; // link new node onto end of list
|
||||
pNode = pNew; // move to end of list
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void verify_linked_lists_on_host(Node* pNodes, unsigned int num_lists, unsigned int list_length) {
|
||||
unsigned int numCorrect = 0;
|
||||
for (unsigned int i = 0; i < num_lists; i++) {
|
||||
Node* pNode = &pNodes[i];
|
||||
for (int j = 0; j < list_length; j++) {
|
||||
if (pNode->global_id == i && pNode->position_in_list == j) {
|
||||
numCorrect++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
pNode = pNode->pNext;
|
||||
}
|
||||
}
|
||||
if (numCorrect != list_length * num_lists) {
|
||||
fprintf(stderr, "Failed\n");
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
|
||||
void create_linked_lists_on_device(hipStream_t stream, Node* pNodes,
|
||||
unsigned int* pAllocator, unsigned int numLists,
|
||||
unsigned int ListLength) {
|
||||
// reset allocator index
|
||||
*pAllocator = numLists; // the first numLists elements of the nodes array are already
|
||||
// allocated (they hold the head of each list).
|
||||
create_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pAllocator,
|
||||
ListLength);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void verify_linked_lists_on_device(hipStream_t stream, Node* pNodes,
|
||||
unsigned int* pNumCorrect, unsigned int numLists,
|
||||
unsigned int ListLength) {
|
||||
*pNumCorrect = 0; // reset numCorrect to zero
|
||||
|
||||
verify_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pNumCorrect,
|
||||
ListLength);
|
||||
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
int correct_count = *pNumCorrect;
|
||||
if(correct_count != ListLength * numLists)
|
||||
{
|
||||
fprintf(stderr,"Failed\n");
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - The suite will test the following functions,
|
||||
hipHostMalloc() with following flags,
|
||||
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
|
||||
atomicAdd()(in kernel)
|
||||
hipStreamCreate()
|
||||
hipStreamSynchronize()
|
||||
* It will test that all devices and the host share a common address space using fine-grain
|
||||
* host buffers.
|
||||
* Concept: This is done by creating a linked list on a device and then verifying the
|
||||
* correctness of the list on another device or the host. This basic test is performed for all
|
||||
* combinations of devices and the host that exist within the platform. The test passes only if
|
||||
* every combination passes.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Host specific (WINDOWS and LINUX)
|
||||
* - Fine grain access supported on devices and host
|
||||
* - HIP_VERSION >= 5.7
|
||||
*/
|
||||
TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
|
||||
const unsigned int num_elements = 1024;
|
||||
int num_devices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&num_devices));
|
||||
int num_devices_plus_host = num_devices + 1;
|
||||
std::vector<hipStream_t> streams(num_devices);
|
||||
|
||||
for (int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipSetDevice(d));
|
||||
HIP_CHECK(hipStreamCreate(&streams[d]));
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
|
||||
unsigned int numLists = num_elements;
|
||||
unsigned int ListLength = 32;
|
||||
Node* pNodes = nullptr;
|
||||
unsigned int* pAllocator = nullptr;
|
||||
unsigned int* pNumCorrect = nullptr;
|
||||
HIP_CHECK(hipHostMalloc(&pNodes, sizeof(Node) * ListLength * numLists, hipHostMallocNonCoherent));
|
||||
HIP_CHECK(hipHostMalloc(&pAllocator, sizeof(unsigned int), hipHostMallocNonCoherent));
|
||||
HIP_CHECK(hipHostMalloc(&pNumCorrect, sizeof(unsigned int), hipHostMallocNonCoherent));
|
||||
|
||||
// Create linked list on one device and verify on another device (or the host).
|
||||
// Do this for all possible combinations of devices and host within the platform.
|
||||
// ci is CreationIndex, index of device/q to create linked list on
|
||||
for (int ci=0; ci<num_devices_plus_host; ci++)
|
||||
{
|
||||
// vi is VerificationIndex, index of device/q to verify linked list on
|
||||
for (int vi = 0; vi < num_devices_plus_host; vi++)
|
||||
{
|
||||
if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
|
||||
{
|
||||
create_linked_lists_on_host(pNodes, numLists, ListLength);
|
||||
}
|
||||
else
|
||||
{
|
||||
create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
|
||||
ListLength);
|
||||
}
|
||||
|
||||
if(vi == num_devices)
|
||||
{
|
||||
verify_linked_lists_on_host(pNodes, numLists, ListLength);
|
||||
}
|
||||
else
|
||||
{
|
||||
verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
|
||||
ListLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipHostFree(pNodes));
|
||||
HIP_CHECK(hipHostFree(pAllocator));
|
||||
HIP_CHECK(hipHostFree(pNumCorrect));
|
||||
for (int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipStreamDestroy(streams[d]));
|
||||
}
|
||||
REQUIRE(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - The suite will test the following functions,
|
||||
align_malloc()
|
||||
atomicAdd()(in kernel)
|
||||
hipStreamCreate()
|
||||
hipStreamSynchronize()
|
||||
* It will test that all devices and the host share a common address space using fine-grain mode
|
||||
* with regular host buffers.
|
||||
* Concept: This is done by creating a linked list on a device and then verifying the
|
||||
* correctness of the list on another device or the host. This basic test is performed for all
|
||||
* combinations of devices and the host that exist within the platform. The test passes only if
|
||||
* every combination passes.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Host specific (WINDOWS and LINUX)
|
||||
* - System fine grain access supported on devices
|
||||
* - HIP_VERSION >= 5.7
|
||||
*/
|
||||
TEST_CASE("test_svm_shared_address_space_fine_grain_system") {
|
||||
fprintf(stderr, "test_svm_shared_address_space_fine_grain_system ignored\n");
|
||||
return;// blocked by SWDEV-422544 add HIP flag for APU device
|
||||
const unsigned int num_elements = 1024;
|
||||
int num_devices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&num_devices));
|
||||
int num_devices_plus_host = num_devices + 1;
|
||||
std::vector<hipStream_t> streams(num_devices);
|
||||
|
||||
for (int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipSetDevice(d));
|
||||
HIP_CHECK(hipStreamCreate(&streams[d]));
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
|
||||
unsigned int numLists = num_elements;
|
||||
unsigned int ListLength = 32;
|
||||
|
||||
// this allocation holds the linked list nodes.
|
||||
Node* pNodes = (Node*)align_malloc(numLists * ListLength * sizeof(Node), 128);
|
||||
// this allocation holds an index into the nodes buffer, it is used for node allocation
|
||||
unsigned int* pAllocator = (unsigned int*)align_malloc(sizeof(unsigned int), 128);
|
||||
// this allocation holds the count of correct nodes, which is computed by the verify kernel.
|
||||
unsigned int* pNumCorrect = (unsigned int*)align_malloc(sizeof(unsigned int), 128);
|
||||
|
||||
// ci is CreationIndex, index of device/q to create linked list on
|
||||
for (int ci = 0; ci < num_devices_plus_host; ci++) {
|
||||
// vi is VerificationIndex, index of device/q to verify linked list on
|
||||
for (int vi = 0; vi < num_devices_plus_host; vi++) {
|
||||
if (ci == num_devices) // last device index represents the host, note the num_device+1 above.
|
||||
{
|
||||
create_linked_lists_on_host(pNodes, numLists, ListLength);
|
||||
} else {
|
||||
create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
|
||||
ListLength);
|
||||
}
|
||||
|
||||
if (vi == num_devices) {
|
||||
verify_linked_lists_on_host(pNodes, numLists, ListLength);
|
||||
} else {
|
||||
verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
|
||||
ListLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
align_free(pNodes);
|
||||
align_free(pAllocator);
|
||||
align_free(pNumCorrect);
|
||||
for (int d = 0; d < num_devices; d++) {
|
||||
HIP_CHECK(hipStreamDestroy(streams[d]));
|
||||
}
|
||||
REQUIRE(true);
|
||||
}
|
||||
Fai riferimento in un nuovo problema
Block a user