SWDEV-430760 Add hipPerfBufferCopySpeedP2P.cc
Change-Id: I606226705cd441c1742e0eac4841f7b189d69149
Этот коммит содержится в:
коммит произвёл
Rakesh Roy
родитель
d92abdd926
Коммит
fbb6002829
@@ -29,6 +29,7 @@ set(TEST_SRC
|
||||
hipPerfMemset.cc
|
||||
hipPerfSampleRate.cc
|
||||
hipPerfSharedMemReadSpeed.cc
|
||||
hipPerfBufferCopySpeedP2P.cc
|
||||
)
|
||||
|
||||
if(HIP_PLATFORM MATCHES "amd")
|
||||
|
||||
@@ -0,0 +1,547 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipMemcpyAsync
|
||||
* @{
|
||||
* @ingroup perfMemoryTest
|
||||
* `hipMemcpyAsync(void* dst, const void* src, size_t count,
|
||||
* hipMemcpyKind kind, hipStream_t stream = 0)` -
|
||||
* Copies data between devices.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_array_common.hh>
|
||||
#include <cmd_options.hh>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
using namespace std;
|
||||
typedef long long T; // You may change to any type
|
||||
|
||||
//#define VERIFY_DATA
|
||||
enum TIMING_MODE { TIMING_MODE_CPU, TIMING_MODE_GPU };
|
||||
|
||||
// -sizes are in bytes, +sizes are in kb, last size must be largest
|
||||
#if 1
|
||||
static constexpr int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024,
|
||||
2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144};
|
||||
#else
|
||||
static constexpr int sizes[] = { 262144 };
|
||||
#endif
|
||||
static constexpr int nSizes = sizeof(sizes) / sizeof(sizes[0]);
|
||||
static constexpr double megaSize = 1000000.;
|
||||
static constexpr int defaultIterations = 200;
|
||||
static constexpr unsigned threadsPerBlock = 1024;
|
||||
|
||||
template <typename T>
|
||||
static __global__ void copy_kernel(T* dst, T* src, size_t N) {
|
||||
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(idx < N) dst[idx] = src[idx]; // We make sure idx < N
|
||||
}
|
||||
|
||||
static size_t sizeToBytes(int size) { return (size < 0) ? -size : size * 1024; }
|
||||
|
||||
static string sizeToString(int size) {
|
||||
stringstream ss;
|
||||
if (size < 0) {
|
||||
ss << setfill('0') << setw(3) << -size / 1024.;
|
||||
} else {
|
||||
ss << size;
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
static void checkP2PSupport(){
|
||||
int deviceCnt = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&deviceCnt));
|
||||
cout << "Total no. of available gpu #" << deviceCnt << "\n" << endl;
|
||||
|
||||
for (int deviceId = 0; deviceId < deviceCnt; deviceId++) {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
cout << "for gpu#" << deviceId << " " << props.name << endl;
|
||||
cout << " peer2peer supported : ";
|
||||
int PeerCnt = 0;
|
||||
for (int i = 0; i < deviceCnt; i++) {
|
||||
int isPeer;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&isPeer, i, deviceId));
|
||||
if (isPeer) {
|
||||
cout << "gpu#" << i << " ";
|
||||
++PeerCnt;
|
||||
}
|
||||
}
|
||||
if (PeerCnt == 0)
|
||||
cout << "NONE" << " ";
|
||||
|
||||
cout << std::endl;
|
||||
cout << " peer2peer not supported : ";
|
||||
int nonPeerCnt = 0;
|
||||
for (int i = 0; i < deviceCnt; i++) {
|
||||
int isPeer;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&isPeer, i, deviceId));
|
||||
if (!isPeer && (i != deviceId)) {
|
||||
cout << "gpu#" << i << " ";
|
||||
++nonPeerCnt;
|
||||
}
|
||||
}
|
||||
if (nonPeerCnt == 0)
|
||||
cout << "NONE" << " ";
|
||||
|
||||
cout << "\n" << endl;
|
||||
}
|
||||
|
||||
cout << "\nNote: For non-supported peer2peer devices, memcopy will use/follow the normal "
|
||||
"behaviour (GPU1-->host then host-->GPU2)\n\n"
|
||||
<< endl;
|
||||
}
|
||||
|
||||
static void outputMatrix(const string &title, const int numGPUs, const vector<double> &data,
|
||||
const TIMING_MODE mode, const size_t dataSize, const int iterations) {
|
||||
fprintf(stderr, "%s, Timing %s, Data %zu KB, Iterations %d\n ",
|
||||
title.c_str(), mode == TIMING_MODE_GPU ? "GPU" : "CPU", dataSize, iterations);
|
||||
for (int j = 0; j < numGPUs; j++) {
|
||||
fprintf(stderr, "%9d ", j);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
for (int i = 0; i < numGPUs; i++) {
|
||||
fprintf(stderr, "%6d ", i);
|
||||
for (int j = 0; j < numGPUs; j++) {
|
||||
fprintf(stderr, "%8.06f ", data[i * numGPUs + j]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void testP2PUniDirMemPerf(const int iterations, const TIMING_MODE timingMode,
|
||||
const bool useHipMemcpyAsync) {
|
||||
const char* method = useHipMemcpyAsync ? "hipMemcpyAsync()" : "copy kernel";
|
||||
int gpuCount = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&gpuCount));
|
||||
if (gpuCount < 1) {
|
||||
fprintf(stderr, "Need at least 1 GPU, skipped!\n");
|
||||
return;
|
||||
}
|
||||
vector<double> timeMs(gpuCount * gpuCount, 0.);
|
||||
vector<double> bandWidth(gpuCount * gpuCount, 0.);
|
||||
|
||||
size_t numMax = sizeToBytes(sizes[nSizes - 1]) * 2;
|
||||
#ifdef VERIFY_DATA
|
||||
std::vector<char> hostMem0(numMax), hostMem1(numMax, 0);
|
||||
for (size_t n = 0; n < numMax; n++) initVal(hostMem0[n]);
|
||||
#endif
|
||||
for (int currentGpu = 0; currentGpu < gpuCount; currentGpu++) {
|
||||
for (int peerGpu = 0; peerGpu < gpuCount; peerGpu++) {
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
|
||||
fprintf(stderr, "Uni: Gpu%d -> Gpu%d by %s, Timing %s, Iterations %d\n",
|
||||
currentGpu, peerGpu, method, timingMode == TIMING_MODE_GPU ? "GPU" : "CPU", iterations);
|
||||
|
||||
if (currentGpu != peerGpu) {
|
||||
int canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu));
|
||||
if (!canAccessPeer) {
|
||||
fprintf(stderr, "Gpu%d cannot access Gpu%d, Skipped\n", currentGpu, peerGpu);
|
||||
continue;
|
||||
}
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(peerGpu, 0));
|
||||
}
|
||||
fprintf(stderr, "Size(KB) Time(ms) Bandwidth(GB/s)\n");
|
||||
|
||||
unsigned char *currentGpuMem = nullptr, *peerGpuMem = nullptr;
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)¤tGpuMem, numMax));
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
HIP_CHECK(hipMalloc((void**)&peerGpuMem, numMax));
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
#ifdef VERIFY_DATA
|
||||
HIP_CHECK(hipMemcpy(currentGpuMem, hostMem0.data(), numMax, hipMemcpyHostToDevice));
|
||||
#endif
|
||||
unsigned N = numMax / sizeof(T); // Number of T in buffer of numMax bytes.
|
||||
REQUIRE(N * sizeof(T) == numMax); // To prevent verification failure
|
||||
unsigned blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
// Warmup
|
||||
if (useHipMemcpyAsync) {
|
||||
HIP_CHECK(hipMemcpyAsync(peerGpuMem, currentGpuMem, numMax,
|
||||
hipMemcpyDeviceToDevice, 0));
|
||||
}
|
||||
else {
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0, 0,
|
||||
reinterpret_cast<T*>(peerGpuMem), reinterpret_cast<T*>(currentGpuMem),
|
||||
static_cast<size_t>(N));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
#ifdef VERIFY_DATA
|
||||
// Verify
|
||||
HIP_CHECK(hipMemcpy(hostMem1.data(), peerGpuMem, numMax, hipMemcpyDeviceToHost));
|
||||
REQUIRE(hostMem1 == hostMem0);
|
||||
#endif
|
||||
float t = 0; // in ms
|
||||
auto cpuStart = std::chrono::steady_clock::now();
|
||||
hipEvent_t eventStart, eventStop;
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
if (timingMode == TIMING_MODE_GPU) {
|
||||
HIP_CHECK(hipEventCreate(&eventStart));
|
||||
HIP_CHECK(hipEventCreate(&eventStop));
|
||||
}
|
||||
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
const int thisSize = sizes[i];
|
||||
const size_t nbytes = sizeToBytes(sizes[i]);
|
||||
N = nbytes / sizeof(T); // Number of T in buffer of nbytes bytes
|
||||
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
if (timingMode == TIMING_MODE_CPU) {
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
cpuStart = std::chrono::steady_clock::now();
|
||||
} else if (timingMode == TIMING_MODE_GPU) {
|
||||
HIP_CHECK(hipEventRecord(eventStart, stream));
|
||||
}
|
||||
|
||||
for (size_t offsetEnd = numMax - nbytes, offset = 0, j = 0;
|
||||
j < iterations; j++, offset += nbytes) {
|
||||
if (offset > offsetEnd) offset = 0;
|
||||
if (useHipMemcpyAsync) {
|
||||
HIP_CHECK(hipMemcpyAsync(peerGpuMem + offset,
|
||||
currentGpuMem + offset, nbytes, hipMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
else {
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream,
|
||||
reinterpret_cast<T*>(peerGpuMem + offset),
|
||||
reinterpret_cast<T*>(currentGpuMem + offset), static_cast<size_t>(N));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
}
|
||||
if (timingMode == TIMING_MODE_GPU) {
|
||||
HIP_CHECK(hipEventRecord(eventStop, stream));
|
||||
HIP_CHECK(hipEventSynchronize(eventStop));
|
||||
HIP_CHECK(hipEventElapsedTime(&t, eventStart, eventStop));
|
||||
} else if (timingMode == TIMING_MODE_CPU) {
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
std::chrono::duration<double, std::milli> cpuMs =
|
||||
std::chrono::steady_clock::now() - cpuStart;
|
||||
t = cpuMs.count();
|
||||
}
|
||||
t /= iterations;
|
||||
double bandwidth = nbytes / megaSize / t; // GByte/s
|
||||
|
||||
fprintf(stderr, "%8s, %-9.06lf, %-4.08lf\n",
|
||||
sizeToString(thisSize).c_str(), t, bandwidth);
|
||||
if (i == (nSizes - 1)) {
|
||||
timeMs[currentGpu * gpuCount + peerGpu] = t;
|
||||
bandWidth[currentGpu * gpuCount + peerGpu] = bandwidth;
|
||||
}
|
||||
}
|
||||
if (currentGpu != peerGpu) {
|
||||
HIP_CHECK(hipDeviceDisablePeerAccess(peerGpu));
|
||||
}
|
||||
if (timingMode == TIMING_MODE_GPU) {
|
||||
HIP_CHECK(hipEventDestroy(eventStart));
|
||||
HIP_CHECK(hipEventDestroy(eventStop));
|
||||
}
|
||||
#ifdef VERIFY_DATA
|
||||
// Verify again
|
||||
for (size_t n = 0; n < numMax; n++) hostMem1[n] = 0;
|
||||
HIP_CHECK(hipMemcpy(hostMem1.data(), peerGpuMem, numMax, hipMemcpyDeviceToHost));
|
||||
REQUIRE(hostMem1 == hostMem0);
|
||||
#endif
|
||||
// Cleanup
|
||||
HIP_CHECK(hipFree((void*)currentGpuMem));
|
||||
HIP_CHECK(hipFree((void*)peerGpuMem));
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
}
|
||||
outputMatrix(string("Unidirectional ") + method + " Time Table(ms)", gpuCount, timeMs,
|
||||
timingMode, sizes[nSizes - 1], iterations);
|
||||
outputMatrix(string("Unidirectional ") + method + " Bandwith Table(GB/s)", gpuCount,
|
||||
bandWidth, timingMode, sizes[nSizes - 1], iterations);
|
||||
}
|
||||
|
||||
static void testP2PBiDirMemPerf(const int iterations, const bool useHipMemcpyAsync) {
|
||||
const char* method = useHipMemcpyAsync ? "hipMemcpyAsync()" : "copy kernel";
|
||||
int gpuCount = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&gpuCount));
|
||||
if (gpuCount < 1) {
|
||||
fprintf(stderr, "Need at least 1 GPU, skipped!\n");
|
||||
return;
|
||||
}
|
||||
vector<double> timeMs(gpuCount * gpuCount, 0.);
|
||||
vector<double> bandWidth(gpuCount * gpuCount, 0.);
|
||||
size_t numMax = sizeToBytes(sizes[nSizes - 1]) * 2;
|
||||
for (int currentGpu = 0; currentGpu < gpuCount; currentGpu++) {
|
||||
for (int peerGpu = 0; peerGpu < gpuCount; peerGpu++) {
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
fprintf(stderr, "Bi: Gpu%d <-> Gpu%d by %s, Timing GPU, Iterations %d\n",
|
||||
currentGpu, peerGpu, method, iterations);
|
||||
|
||||
if (currentGpu != peerGpu) {
|
||||
int canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu));
|
||||
if (!canAccessPeer) {
|
||||
fprintf(stderr, "currentGpu %d cannot access peerGpu %d\n", currentGpu, peerGpu);
|
||||
continue;
|
||||
}
|
||||
canAccessPeer = 0;
|
||||
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, peerGpu, currentGpu));
|
||||
if (!canAccessPeer) {
|
||||
fprintf(stderr, "peerGpu %d cannot access currentGpu %d\n", peerGpu, currentGpu);
|
||||
continue;
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(currentGpu, 0));
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(peerGpu, 0));
|
||||
}
|
||||
fprintf(stderr, "Gpu%d -> Gpu%d *"
|
||||
"* Gpu%d -> Gpu%d\n", currentGpu, peerGpu, peerGpu, currentGpu);
|
||||
fprintf(stderr, "Size(KB) Time(ms) Bandwidth(GB/s) "
|
||||
" Time(ms) Bandwidth(GB/s)\n");
|
||||
|
||||
unsigned char *currentGpuMem[2], *peerGpuMem[2];
|
||||
HIP_CHECK(hipMalloc((void**)¤tGpuMem[0], numMax));
|
||||
HIP_CHECK(hipMalloc((void**)¤tGpuMem[1], numMax));
|
||||
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
// peerGpu is the current device
|
||||
HIP_CHECK(hipMalloc((void**)&peerGpuMem[0], numMax));
|
||||
HIP_CHECK(hipMalloc((void**)&peerGpuMem[1], numMax));
|
||||
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
|
||||
unsigned N = numMax / sizeof(T); // Number of T in buffer of numMax bytes.
|
||||
REQUIRE(N * sizeof(T) == numMax);
|
||||
unsigned blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
// Warmup. currentGpu is the current device
|
||||
if (useHipMemcpyAsync) {
|
||||
HIP_CHECK(hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], numMax, hipMemcpyDeviceToDevice, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
HIP_CHECK(hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], numMax, hipMemcpyDeviceToDevice, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
else {
|
||||
// Warmup
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
0, reinterpret_cast<T*>(peerGpuMem[0]),
|
||||
reinterpret_cast<T*>(currentGpuMem[0]), static_cast<size_t>(N));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
0, reinterpret_cast<T*>(currentGpuMem[1]),
|
||||
reinterpret_cast<T*>(peerGpuMem[1]), static_cast<size_t>(N));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
hipStream_t stream[2];
|
||||
hipEvent_t eventStart[2], eventStop[2];
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
HIP_CHECK(hipStreamCreate(&stream[0]));
|
||||
HIP_CHECK(hipEventCreate(&eventStart[0]));
|
||||
HIP_CHECK(hipEventCreate(&eventStop[0]));
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
HIP_CHECK(hipStreamCreate(&stream[1]));
|
||||
HIP_CHECK(hipEventCreate(&eventStart[1]));
|
||||
HIP_CHECK(hipEventCreate(&eventStop[1]));
|
||||
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
const int thisSize = sizes[i];
|
||||
const size_t nbytes = sizeToBytes(thisSize);
|
||||
N = nbytes / sizeof(T); // Number of T in buffer of nbytes bytes
|
||||
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
HIP_CHECK(hipEventRecord(eventStart[0], stream[0]));
|
||||
HIP_CHECK(hipEventRecord(eventStart[1], stream[1]));
|
||||
|
||||
for (size_t offsetEnd = numMax - nbytes, offset = 0, j = 0; j < iterations;
|
||||
j++, offset += nbytes) {
|
||||
if (offset > offsetEnd) offset = 0;
|
||||
if (useHipMemcpyAsync) {
|
||||
HIP_CHECK(hipMemcpyAsync(peerGpuMem[0] + offset, currentGpuMem[0] + offset, nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[0]));
|
||||
HIP_CHECK(hipMemcpyAsync(currentGpuMem[1] + offset, peerGpuMem[1] + offset, nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]));
|
||||
}
|
||||
else {
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
stream[0], reinterpret_cast<T*>(peerGpuMem[0] + offset),
|
||||
reinterpret_cast<T*>(currentGpuMem[0] + offset), static_cast<size_t>(N));
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
stream[1], reinterpret_cast<T*>(currentGpuMem[1] + offset),
|
||||
reinterpret_cast<T*>(peerGpuMem[1] + offset), static_cast<size_t>(N));
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipEventRecord(eventStop[0], stream[0]));
|
||||
HIP_CHECK(hipEventRecord(eventStop[1], stream[1]));
|
||||
|
||||
HIP_CHECK(hipEventSynchronize(eventStop[0]));
|
||||
HIP_CHECK(hipEventSynchronize(eventStop[1]));
|
||||
|
||||
float t[2];
|
||||
double bandwidth[2];
|
||||
for (int n = 0; n < 2; n++) {
|
||||
HIP_CHECK(hipEventElapsedTime(&t[n], eventStart[n], eventStop[n]));
|
||||
t[n] /= iterations;
|
||||
bandwidth[n] = nbytes / megaSize / t[n]; // GByte/s
|
||||
}
|
||||
|
||||
fprintf(stderr, "%8s, %-9.06lf, %-4.08lf, "
|
||||
"%-9.06lf, %-4.08lf\n", sizeToString(thisSize).c_str(), t[0],
|
||||
bandwidth[0], t[1], bandwidth[1]);
|
||||
|
||||
if (i == (nSizes - 1)) {
|
||||
timeMs[currentGpu * gpuCount + peerGpu] = (t[0] + t[1]) / 2;
|
||||
bandWidth[currentGpu * gpuCount + peerGpu] = bandwidth[0] + bandwidth[1];
|
||||
}
|
||||
}
|
||||
if (currentGpu != peerGpu) {
|
||||
HIP_CHECK(hipSetDevice(peerGpu));
|
||||
HIPCHECK(hipDeviceDisablePeerAccess(currentGpu));
|
||||
HIP_CHECK(hipSetDevice(currentGpu));
|
||||
HIPCHECK(hipDeviceDisablePeerAccess(peerGpu));
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipEventDestroy(eventStart[i]));
|
||||
HIP_CHECK(hipEventDestroy(eventStop[i]));
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
HIP_CHECK(hipFree((void*)currentGpuMem[i]));
|
||||
HIP_CHECK(hipFree((void*)peerGpuMem[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
outputMatrix(string("Bidirectional ") + method + " Time Table(ms)", gpuCount, timeMs,
|
||||
TIMING_MODE_GPU, sizes[nSizes - 1], iterations);
|
||||
outputMatrix(string("Bidirectional ") + method + " Bandwith Table(GB/s)", gpuCount,
|
||||
bandWidth, TIMING_MODE_GPU, sizes[nSizes - 1], iterations);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify P2P uni-direction memcpy performance.
|
||||
* To enable rocr kernel copying, export HSA_ENABLE_SDMA=0
|
||||
* To enable SDMA copying, export HSA_ENABLE_SDMA=1 (by default)
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedP2P.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_hipTestP2PUniDirMemcpyAsync_test - Timing CPU") {
|
||||
const int iterations = cmd_options.iterations == 1000 ?
|
||||
defaultIterations : cmd_options.iterations;
|
||||
testP2PUniDirMemPerf(iterations, TIMING_MODE_CPU, true);
|
||||
}
|
||||
|
||||
TEST_CASE("Perf_hipTestP2PUniDirMemcpyAsync_test - Timing GPU") {
|
||||
const int iterations = cmd_options.iterations == 1000 ?
|
||||
defaultIterations : cmd_options.iterations;
|
||||
testP2PUniDirMemPerf(iterations, TIMING_MODE_GPU, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify P2P uni-direction kernel copy performance.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedP2P.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_hipTestP2PUniDirKernelCopy_test - Timing CPU") {
|
||||
const int iterations = cmd_options.iterations == 1000 ?
|
||||
defaultIterations : cmd_options.iterations;
|
||||
testP2PUniDirMemPerf(iterations, TIMING_MODE_CPU, false);
|
||||
}
|
||||
|
||||
TEST_CASE("Perf_hipTestP2PUniDirKernelCopy_test - Timing GPU") {
|
||||
const int iterations = cmd_options.iterations == 1000 ?
|
||||
defaultIterations : cmd_options.iterations;
|
||||
testP2PUniDirMemPerf(iterations, TIMING_MODE_GPU, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify P2P bi-direction memcpy performance.
|
||||
* To enable rocr kernel copying, export HSA_ENABLE_SDMA=0
|
||||
* To enable SDMA copying, export HSA_ENABLE_SDMA=1 (by default)
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedP2P.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_hipTestP2PBiDirMemcpyAsync_test") {
|
||||
const int iterations = cmd_options.iterations == 1000 ?
|
||||
defaultIterations : cmd_options.iterations;
|
||||
testP2PBiDirMemPerf(iterations, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify P2P bi-direction kernel copy performance.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedP2P.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_hipTestP2PBiDirKernelCopy_test") {
|
||||
const int iterations = cmd_options.iterations == 1000 ?
|
||||
defaultIterations : cmd_options.iterations;
|
||||
testP2PBiDirMemPerf(iterations, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Check support of peer to peer
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedP2P.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_hipCheckP2PSupport") {
|
||||
checkP2PSupport();
|
||||
}
|
||||
Ссылка в новой задаче
Block a user