160 строки
6.5 KiB
C++
160 строки
6.5 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
/*
|
|
* Modifications Copyright (C)2023 Advanced
|
|
* Micro Devices, Inc. All rights reserved.
|
|
*/
|
|
|
|
#include <hip_test_common.hh>
|
|
#include <hip/hip_runtime_api.h>
|
|
#include <resource_guards.hh>
|
|
#include <utils.hh>
|
|
|
|
// Each device will write it's id into the bytes that it "owns", ownership is based on round robin
|
|
// (global_id % num_id) num_id is equal to number of SVM devices in the system plus one (for the
|
|
// host code). id is the index (id) of the device that this kernel is executing on. For example, if
|
|
// there are 2 SVM devices and the host; the buffer should look like this after each device and the
|
|
// host write their id's: 0, 1, 2, 0, 1, 2, 0, 1, 2...
|
|
__global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) {
|
|
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int owner = i % num_id;
|
|
if (id == owner) a[i] = id; // modify location if it belongs to this device, write id
|
|
}
|
|
|
|
// Verify that a device can see the byte sized updates from the other devices, sum up the device
|
|
// id's and see if they match expected value. Note: this must be called with a reduced NDRange so
|
|
// that neighbor acesses don't go past end of buffer. For example if there are two SVM devices and
|
|
// the host (3 total devices) the buffer should look like this: 0,1,2,0,1,2... and the expected sum
|
|
// at each point is 0+1+2 = 3.
|
|
__global__ void sum_neighbor_locations(char* a, unsigned int num_devices,
|
|
unsigned int* error_count) {
|
|
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2;
|
|
unsigned int sum = 0;
|
|
for (unsigned int j = 0; j < num_devices; j++) {
|
|
sum += a[i + j]; // add my neighbors to the right
|
|
}
|
|
if (sum != expected_sum) atomicAdd_system(error_count, 1u); // like opencl atomic_inc()
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - The suite will test the following functions,
|
|
hipHostMalloc() with following flags,
|
|
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
|
|
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
|
|
atomicAdd_system()(in kernel)
|
|
hipStreamCreate()
|
|
hipStreamSynchronize()
|
|
* It will demonstrate use of SVM's atomics to do fine grain synchronization among
|
|
* devices with each stream on each device. The result will be verified on the host.
|
|
* Test source
|
|
* ------------------------
|
|
* - catch/unit/memory/hipSVMTestByteGranularity.cpp
|
|
* Test requirements
|
|
* ------------------------
|
|
* - Host specific (WINDOWS and LINUX)
|
|
* - Fine grain access and atomics supported on devices
|
|
* - HIP_VERSION >= 5.7
|
|
*/
|
|
TEST_CASE("test_svm_byte_granularity", "[multigpu]") {
|
|
int pcieAtomic = 0;
|
|
HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
|
|
if (!pcieAtomic) {
|
|
fprintf(stderr, "Device doesn't support pcie atomic, Skipped\n");
|
|
REQUIRE(true);
|
|
return;
|
|
}
|
|
const int num_elements = 2048;
|
|
int num_devices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&num_devices));
|
|
int num_devices_plus_host = num_devices + 1;
|
|
std::vector<hipStream_t> streams(num_devices);
|
|
|
|
for (int d = 0; d < num_devices; d++) {
|
|
HIP_CHECK(hipSetDevice(d));
|
|
HIP_CHECK(hipStreamCreate(&streams[d]));
|
|
}
|
|
HIP_CHECK(hipSetDevice(0));
|
|
char* pA = nullptr;
|
|
// hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER
|
|
HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent));
|
|
unsigned int** error_counts = (unsigned int**)malloc(sizeof(void*) * num_devices);
|
|
|
|
for (unsigned int i = 0; i < num_devices; i++) {
|
|
// hipHostMallocCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS
|
|
// We need atomic inc among different GPUs
|
|
HIP_CHECK(hipHostMalloc(&error_counts[i], sizeof(unsigned int) * num_elements,
|
|
hipHostMallocCoherent));
|
|
*error_counts[i] = 0;
|
|
}
|
|
for (int i = 0; i < num_elements; i++) pA[i] = -1;
|
|
|
|
// get all the devices going simultaneously
|
|
for (unsigned int d = 0; d < num_devices; d++) // device ids starting at 1.
|
|
{
|
|
HIP_CHECK(hipSetDevice(d));
|
|
write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
|
|
HIP_CHECK(hipGetLastError());
|
|
}
|
|
unsigned int host_id = num_devices; // host code will take the id above the devices.
|
|
for (unsigned int i = num_devices; i < num_elements; i += num_devices_plus_host) pA[i] = host_id;
|
|
|
|
for (unsigned int d = 0; d < num_devices; d++) {
|
|
HIP_CHECK(hipStreamSynchronize(streams[d]));
|
|
}
|
|
|
|
// now check that each device can see the byte writes made by the other devices.
|
|
// adjusted so sum_neighbor_locations doesn't read past end of buffer
|
|
size_t adjusted_num_elements = num_elements - num_devices;
|
|
for (unsigned int d = 0; d < num_devices; d++) {
|
|
HIP_CHECK(hipSetDevice(d));
|
|
sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
|
|
error_counts[d]);
|
|
HIP_CHECK(hipGetLastError());
|
|
}
|
|
|
|
for (unsigned int d = 0; d < num_devices; d++) {
|
|
HIP_CHECK(hipStreamSynchronize(streams[d]));
|
|
}
|
|
// see if any of the devices found errors
|
|
for (unsigned int d = 0; d < num_devices; d++) {
|
|
if (*error_counts[d] > 0) {
|
|
fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1)) / 2;
|
|
// check that host can see the byte writes made by the devices.
|
|
for (unsigned int i = 0; i < num_elements - num_devices_plus_host; i++) {
|
|
unsigned int sum = 0;
|
|
for (unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i + j];
|
|
if (sum != expected) {
|
|
fprintf(stderr, "[%u]: sum %u != expected %u", i, sum, expected);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
for (unsigned int i = 0; i < num_devices; i++) {
|
|
HIP_CHECK(hipStreamDestroy(streams[i]));
|
|
HIP_CHECK(hipHostFree(error_counts[i]));
|
|
}
|
|
free(error_counts);
|
|
HIP_CHECK(hipHostFree(pA));
|
|
REQUIRE(true);
|
|
}
|