// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /* * Modifications Copyright (C)2023 Advanced * Micro Devices, Inc. All rights reserved. */ #include #include #include #include // Each device will write it's id into the bytes that it "owns", ownership is based on round robin // (global_id % num_id) num_id is equal to number of SVM devices in the system plus one (for the // host code). id is the index (id) of the device that this kernel is executing on. For example, if // there are 2 SVM devices and the host; the buffer should look like this after each device and the // host write their id's: 0, 1, 2, 0, 1, 2, 0, 1, 2... __global__ void write_owned_locations(char* a, unsigned int num_id, unsigned int id) { size_t i = blockIdx.x * blockDim.x + threadIdx.x; int owner = i % num_id; if (id == owner) a[i] = id; // modify location if it belongs to this device, write id } // Verify that a device can see the byte sized updates from the other devices, sum up the device // id's and see if they match expected value. Note: this must be called with a reduced NDRange so // that neighbor acesses don't go past end of buffer. For example if there are two SVM devices and // the host (3 total devices) the buffer should look like this: 0,1,2,0,1,2... and the expected sum // at each point is 0+1+2 = 3. __global__ void sum_neighbor_locations(char* a, unsigned int num_devices, unsigned int* error_count) { size_t i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int expected_sum = (num_devices * (num_devices - 1)) / 2; unsigned int sum = 0; for (unsigned int j = 0; j < num_devices; j++) { sum += a[i + j]; // add my neighbors to the right } if (sum != expected_sum) atomicAdd_system(error_count, 1u); // like opencl atomic_inc() } /** * Test Description * ------------------------ * - The suite will test the following functions, hipHostMalloc() with following flags, hipHostMallocCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS) hipHostMallocNonCoherent(CL_MEM_SVM_FINE_GRAIN_BUFFER) atomicAdd_system()(in kernel) hipStreamCreate() hipStreamSynchronize() * It will demonstrate use of SVM's atomics to do fine grain synchronization among * devices with each stream on each device. The result will be verified on the host. * Test source * ------------------------ * - catch/unit/memory/hipSVMTestByteGranularity.cpp * Test requirements * ------------------------ * - Host specific (WINDOWS and LINUX) * - Fine grain access and atomics supported on devices * - HIP_VERSION >= 5.7 */ TEST_CASE("test_svm_byte_granularity", "[multigpu]") { int pcieAtomic = 0; HIP_CHECK(hipDeviceGetAttribute(&pcieAtomic, hipDeviceAttributeHostNativeAtomicSupported, 0)); if (!pcieAtomic) { fprintf(stderr, "Device doesn't support pcie atomic, Skipped\n"); REQUIRE(true); return; } const int num_elements = 2048; int num_devices = 0; HIP_CHECK(hipGetDeviceCount(&num_devices)); int num_devices_plus_host = num_devices + 1; std::vector streams(num_devices); for (int d = 0; d < num_devices; d++) { HIP_CHECK(hipSetDevice(d)); HIP_CHECK(hipStreamCreate(&streams[d])); } HIP_CHECK(hipSetDevice(0)); char* pA = nullptr; // hipHostMallocNonCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER HIP_CHECK(hipHostMalloc(&pA, sizeof(char) * num_elements, hipHostMallocNonCoherent)); unsigned int** error_counts = (unsigned int**)malloc(sizeof(void*) * num_devices); for (unsigned int i = 0; i < num_devices; i++) { // hipHostMallocCoherent means CL_MEM_SVM_FINE_GRAIN_BUFFER + CL_MEM_SVM_ATOMICS // We need atomic inc among different GPUs HIP_CHECK(hipHostMalloc(&error_counts[i], sizeof(unsigned int) * num_elements, hipHostMallocCoherent)); *error_counts[i] = 0; } for (int i = 0; i < num_elements; i++) pA[i] = -1; // get all the devices going simultaneously for (unsigned int d = 0; d < num_devices; d++) // device ids starting at 1. { HIP_CHECK(hipSetDevice(d)); write_owned_locations<<>>(pA, num_devices_plus_host, d); HIP_CHECK(hipGetLastError()); } unsigned int host_id = num_devices; // host code will take the id above the devices. for (unsigned int i = num_devices; i < num_elements; i += num_devices_plus_host) pA[i] = host_id; for (unsigned int d = 0; d < num_devices; d++) { HIP_CHECK(hipStreamSynchronize(streams[d])); } // now check that each device can see the byte writes made by the other devices. // adjusted so sum_neighbor_locations doesn't read past end of buffer size_t adjusted_num_elements = num_elements - num_devices; for (unsigned int d = 0; d < num_devices; d++) { HIP_CHECK(hipSetDevice(d)); sum_neighbor_locations<<>>(pA, num_devices_plus_host, error_counts[d]); HIP_CHECK(hipGetLastError()); } for (unsigned int d = 0; d < num_devices; d++) { HIP_CHECK(hipStreamSynchronize(streams[d])); } // see if any of the devices found errors for (unsigned int d = 0; d < num_devices; d++) { if (*error_counts[d] > 0) { fprintf(stderr, "*error_counts[%u] = %u\n", d, *error_counts[d]); REQUIRE(false); } } unsigned int expected = (num_devices_plus_host * (num_devices_plus_host - 1)) / 2; // check that host can see the byte writes made by the devices. for (unsigned int i = 0; i < num_elements - num_devices_plus_host; i++) { unsigned int sum = 0; for (unsigned int j = 0; j < num_devices_plus_host; j++) sum += pA[i + j]; if (sum != expected) { fprintf(stderr, "[%u]: sum %u != expected %u", i, sum, expected); REQUIRE(false); } } for (unsigned int i = 0; i < num_devices; i++) { HIP_CHECK(hipStreamDestroy(streams[i])); HIP_CHECK(hipHostFree(error_counts[i])); } free(error_counts); HIP_CHECK(hipHostFree(pA)); REQUIRE(true); }