Files

160 строки
6.5 KiB
C++

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
* Modifications Copyright (C)2023 Advanced
* Micro Devices, Inc. All rights reserved.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <resource_guards.hh>
#include <utils.hh>
// Each device will write it's id into the bytes that it "owns", ownership is based on round robin
// (global_id % num_id) num_id is equal to number of SVM devices in the system plus one (for the
// host code). id is the index (id) of the device that this kernel is executing on. For example, if
// there are 2 SVM devices and the host; the buffer should look like this after each device and the
// host write their id's: 0, 1, 2, 0, 1, 2, 0, 1, 2...
__global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
int owner = i % num_id;
if (id == owner) a[i] = id; // modify location if it belongs to this device, write id
}
// Verify that a device can see the byte sized updates from the other devices, sum up the device
// id's and see if they match expected value. Note: this must be called with a reduced NDRange so
// that neighbor acesses don't go past end of buffer. For example if there are two SVM devices and
// the host (3 total devices) the buffer should look like this: 0,1,2,0,1,2... and the expected sum
// at each point is 0+1+2 = 3.
__global__ void sum_neighbor_locations(char* a, unsigned int num_devices,
unsigned int* error_count) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2;
unsigned int sum = 0;
for (unsigned int j = 0; j < num_devices; j++) {
sum += a[i + j]; // add my neighbors to the right
}
if (sum != expected_sum) atomicAdd_system(error_count, 1u); // like opencl atomic_inc()
}
/**
* Test Description
* ------------------------
* - The suite will test the following functions,
hipHostMalloc() with following flags,
hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS)
hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER)
atomicAdd_system()(in kernel)
hipStreamCreate()
hipStreamSynchronize()
* It will demonstrate use of SVM's atomics to do fine grain synchronization among
* devices with each stream on each device. The result will be verified on the host.
* Test source
* ------------------------
* - catch/unit/memory/hipSVMTestByteGranularity.cpp
* Test requirements
* ------------------------
* - Host specific (WINDOWS and LINUX)
* - Fine grain access and atomics supported on devices
* - HIP_VERSION >= 5.7
*/
TEST_CASE("test_svm_byte_granularity", "[multigpu]") {
int pcieAtomic = 0;
HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0));
if (!pcieAtomic) {
fprintf(stderr, "Device doesn't support pcie atomic, Skipped\n");
REQUIRE(true);
return;
}
const int num_elements = 2048;
int num_devices = 0;
HIP_CHECK(hipGetDeviceCount(&num_devices));
int num_devices_plus_host = num_devices + 1;
std::vector<hipStream_t> streams(num_devices);
for (int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
HIP_CHECK(hipStreamCreate(&streams[d]));
}
HIP_CHECK(hipSetDevice(0));
char* pA = nullptr;
// hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER
HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent));
unsigned int** error_counts = (unsigned int**)malloc(sizeof(void*) * num_devices);
for (unsigned int i = 0; i < num_devices; i++) {
// hipHostMallocCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS
// We need atomic inc among different GPUs
HIP_CHECK(hipHostMalloc(&error_counts[i], sizeof(unsigned int) * num_elements,
hipHostMallocCoherent));
*error_counts[i] = 0;
}
for (int i = 0; i < num_elements; i++) pA[i] = -1;
// get all the devices going simultaneously
for (unsigned int d = 0; d < num_devices; d++) // device ids starting at 1.
{
HIP_CHECK(hipSetDevice(d));
write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
HIP_CHECK(hipGetLastError());
}
unsigned int host_id = num_devices; // host code will take the id above the devices.
for (unsigned int i = num_devices; i < num_elements; i += num_devices_plus_host) pA[i] = host_id;
for (unsigned int d = 0; d < num_devices; d++) {
HIP_CHECK(hipStreamSynchronize(streams[d]));
}
// now check that each device can see the byte writes made by the other devices.
// adjusted so sum_neighbor_locations doesn't read past end of buffer
size_t adjusted_num_elements = num_elements - num_devices;
for (unsigned int d = 0; d < num_devices; d++) {
HIP_CHECK(hipSetDevice(d));
sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
error_counts[d]);
HIP_CHECK(hipGetLastError());
}
for (unsigned int d = 0; d < num_devices; d++) {
HIP_CHECK(hipStreamSynchronize(streams[d]));
}
// see if any of the devices found errors
for (unsigned int d = 0; d < num_devices; d++) {
if (*error_counts[d] > 0) {
fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]);
REQUIRE(false);
}
}
unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1)) / 2;
// check that host can see the byte writes made by the devices.
for (unsigned int i = 0; i < num_elements - num_devices_plus_host; i++) {
unsigned int sum = 0;
for (unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i + j];
if (sum != expected) {
fprintf(stderr, "[%u]: sum %u != expected %u", i, sum, expected);
REQUIRE(false);
}
}
for (unsigned int i = 0; i < num_devices; i++) {
HIP_CHECK(hipStreamDestroy(streams[i]));
HIP_CHECK(hipHostFree(error_counts[i]));
}
free(error_counts);
HIP_CHECK(hipHostFree(pA));
REQUIRE(true);
}