SWDEV-438680 - Add copy perf test cases
Add copy perf test cases for all devices to
all devices.
Change-Id: I6fa9e2c111a9ef48ef63a721e7a64c54e7f2a72f
[ROCm/hip-tests commit: c745949ec1]
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -30,6 +30,7 @@ set(TEST_SRC
|
||||
hipPerfSampleRate.cc
|
||||
hipPerfSharedMemReadSpeed.cc
|
||||
hipPerfBufferCopySpeedP2P.cc
|
||||
hipPerfBufferCopySpeedAll2All.cc
|
||||
)
|
||||
|
||||
if(HIP_PLATFORM MATCHES "amd")
|
||||
|
||||
@@ -0,0 +1,399 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipMemcpyAsync
|
||||
* @{
|
||||
* @ingroup perfMemoryTest
|
||||
* `hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice,
|
||||
size_t sizeBytes, hipStream_t stream) ` -
|
||||
* Copies data between devices.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_array_common.hh>
|
||||
#include <cmd_options.hh>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
//#define VERIFY_DATA
|
||||
using namespace std;
|
||||
enum DEV_MEM_TYPE { COARSE_GRAINED, FINE_GRAINED, EXTENDED_FINE_GRAINED, UNKNOWN_MEM};
|
||||
|
||||
typedef long long T; // You may change to any type
|
||||
|
||||
static constexpr int nWarmup = 1; // warmup iteration number
|
||||
static constexpr int nIters = 10; // interation number for test
|
||||
static constexpr size_t dataBytes = 1024*1024*1024;
|
||||
|
||||
template <typename T>
|
||||
static __global__ void copy_kernel(T* dst, T* src, size_t N) {
|
||||
const size_t off = blockDim.x * gridDim.x;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += off)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
|
||||
static string getMemType(DEV_MEM_TYPE memType) {
|
||||
switch (memType) {
|
||||
case COARSE_GRAINED:
|
||||
return "coarse";
|
||||
case FINE_GRAINED:
|
||||
return "fine";
|
||||
case EXTENDED_FINE_GRAINED:
|
||||
// Extended - Scope Fine Grained Memory: read is cached, write is not
|
||||
return "extended fine";
|
||||
default:
|
||||
return "unknown mem type";
|
||||
}
|
||||
}
|
||||
|
||||
static void mallocDevBuf(void** pp, size_t size, DEV_MEM_TYPE memType) {
|
||||
switch (memType) {
|
||||
case COARSE_GRAINED:
|
||||
HIP_CHECK(hipMalloc(pp, size));
|
||||
break;
|
||||
case FINE_GRAINED:
|
||||
HIP_CHECK(hipExtMallocWithFlags(pp, size, hipDeviceMallocFinegrained));
|
||||
break;
|
||||
case EXTENDED_FINE_GRAINED:
|
||||
// Extended - Scope Fine Grained Memory: read is cached, write is not
|
||||
// Perf gain compared with cacheable write
|
||||
HIP_CHECK(hipExtMallocWithFlags(pp, size, hipDeviceMallocUncached));
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Unknown memType = %d\n", memType);
|
||||
REQUIRE(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void testCopyPerf(bool toRemote, bool kernelCopy, bool onOneGpu,
|
||||
DEV_MEM_TYPE srcType, DEV_MEM_TYPE dstType) {
|
||||
int nGpus = 0;
|
||||
unsigned int threadsPerBlock = 1024;
|
||||
unsigned int blocks = 16; // DEBUG_CLR_LIMIT_BLIT_WG
|
||||
HIP_CHECK(hipGetDeviceCount(&nGpus));
|
||||
if (nGpus < 2) {
|
||||
fprintf(stderr, "Need at least 2 GPUs, skipped!\n");
|
||||
return;
|
||||
}
|
||||
#if 0
|
||||
if (kernelCopy) {
|
||||
int minGridSize = 0;
|
||||
int blockSize = 0;
|
||||
HIP_CHECK(hipOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, copy_kernel<T>));
|
||||
blocks = minGridSize / nGpus;
|
||||
threadsPerBlock = blockSize;
|
||||
fprintf(stderr, "minGridSize %d, threadsPerBlock %u, blocks %u, nGpus %d\n",
|
||||
minGridSize, threadsPerBlock, blocks, nGpus);
|
||||
}
|
||||
#endif
|
||||
#ifdef VERIFY_DATA
|
||||
std::vector<char> hostMem0(dataBytes), hostMem1(dataBytes, 0);
|
||||
for (size_t n = 0; n < dataBytes; n++) initVal(hostMem0[n]);
|
||||
#endif
|
||||
char** srcBuf = reinterpret_cast<char**>(malloc(nGpus * nGpus * sizeof(char*)));
|
||||
char** dstBuf = reinterpret_cast<char**>(malloc(nGpus * nGpus * sizeof(char*)));
|
||||
hipStream_t *streams = (hipStream_t*)malloc(nGpus*nGpus*sizeof(hipStream_t));
|
||||
for (int local = 0; local < nGpus; local++) {
|
||||
HIP_CHECK(hipSetDevice(local));
|
||||
for (int remote = 0; remote < nGpus; remote++) {
|
||||
if (local == remote) continue;
|
||||
mallocDevBuf((void**)(srcBuf + local * nGpus + remote), dataBytes, srcType);
|
||||
mallocDevBuf((void**)(dstBuf + local * nGpus + remote), dataBytes, dstType);
|
||||
HIP_CHECK(hipStreamCreateWithFlags(&streams[local * nGpus + remote], hipStreamNonBlocking));
|
||||
HIP_CHECK(hipDeviceEnablePeerAccess(remote, 0));
|
||||
#ifdef VERIFY_DATA
|
||||
HIP_CHECK(hipMemcpy(srcBuf[local * nGpus + remote], hostMem0.data(), dataBytes, hipMemcpyHostToDevice));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
unsigned N = dataBytes / sizeof(T); // Number of T in buffer of dataBytes bytes.
|
||||
REQUIRE(N * sizeof(T) == dataBytes);
|
||||
|
||||
auto test = [&](int iters) {
|
||||
for (int it = 0; it < iters; it++) {
|
||||
for (int local = 0; local < nGpus; local++) {
|
||||
HIP_CHECK(hipSetDevice(local));
|
||||
for (int i = 0; i < nGpus-1; i++) {
|
||||
int remote = (local + i + 1) % nGpus;
|
||||
if (toRemote) {
|
||||
// local to remotes
|
||||
if (kernelCopy) {
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
streams[local * nGpus + remote],
|
||||
reinterpret_cast<T*>(dstBuf[remote * nGpus + local]),
|
||||
reinterpret_cast<T*>(srcBuf[local * nGpus + remote]),
|
||||
static_cast<size_t>(N));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyPeerAsync(dstBuf[remote * nGpus + local], remote,
|
||||
srcBuf[local * nGpus + remote], local,
|
||||
dataBytes, streams[local * nGpus + remote]));
|
||||
}
|
||||
} else {
|
||||
// remotes to local
|
||||
if (kernelCopy) {
|
||||
hipLaunchKernelGGL(copy_kernel<T>, dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
streams[remote * nGpus + local],
|
||||
reinterpret_cast<T*>(dstBuf[local * nGpus + remote]),
|
||||
reinterpret_cast<T*>(srcBuf[remote * nGpus + local]),
|
||||
static_cast<size_t>(N));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
} else {
|
||||
HIPCHECK(hipMemcpyPeerAsync(dstBuf[local * nGpus + remote], local,
|
||||
srcBuf[remote* nGpus + local], remote,
|
||||
dataBytes, streams[remote * nGpus + local]));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (onOneGpu) break;
|
||||
}
|
||||
|
||||
for (int local = 0; local < nGpus; local++) {
|
||||
for (int remote = 0; remote < nGpus; remote++) {
|
||||
if (local == remote) continue;
|
||||
HIP_CHECK(hipStreamSynchronize(streams[local * nGpus + remote]));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
string title = kernelCopy ? "kernel copy - " : "hipMemcpyPeerAsync - ";
|
||||
title += getMemType(srcType);
|
||||
title += " to ";
|
||||
title += getMemType(dstType);
|
||||
title += toRemote ? " - local to remotes " : " - remotes to local ";
|
||||
title += onOneGpu ? "on 1 GPU " : "";
|
||||
|
||||
// warmup
|
||||
test(nWarmup);
|
||||
auto cpuStart = std::chrono::steady_clock::now();
|
||||
test(nIters);
|
||||
std::chrono::duration<double, std::milli> cpuMS =
|
||||
std::chrono::steady_clock::now() - cpuStart;
|
||||
fprintf(stderr, "%s: Time: %f ms/iter, AvgCopyBW: %f GB/s per GPU\n", title.c_str(),
|
||||
cpuMS.count()/nIters, (nGpus-1)*dataBytes/cpuMS.count()*nIters/1e6);
|
||||
|
||||
// exit
|
||||
for (int local = 0; local < nGpus; local++) {
|
||||
HIP_CHECK(hipSetDevice(local));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
for (int remote = 0; remote < nGpus; remote++) {
|
||||
if (local == remote) continue;
|
||||
#ifdef VERIFY_DATA
|
||||
// Verify
|
||||
if (local == 0 && onOneGpu) {
|
||||
memset(hostMem1.data(), 0, dataBytes);
|
||||
if (toRemote) {
|
||||
HIP_CHECK(hipMemcpy(hostMem1.data(), dstBuf[remote * nGpus + local], dataBytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
}
|
||||
else {
|
||||
HIP_CHECK(hipMemcpy(hostMem1.data(), dstBuf[local * nGpus + remote], dataBytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
}
|
||||
REQUIRE(hostMem1 == hostMem0);
|
||||
} else if (!onOneGpu) {
|
||||
// All dstBuf will be enumed regardless of toRemote
|
||||
memset(hostMem1.data(), 0, dataBytes);
|
||||
HIP_CHECK(hipMemcpy(hostMem1.data(), dstBuf[local * nGpus + remote], dataBytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
REQUIRE(hostMem1 == hostMem0);
|
||||
}
|
||||
#endif
|
||||
HIP_CHECK(hipFree(srcBuf[local * nGpus + remote]));
|
||||
HIP_CHECK(hipFree(dstBuf[local * nGpus + remote]));
|
||||
HIP_CHECK(hipStreamDestroy(streams[local * nGpus + remote]));
|
||||
}
|
||||
}
|
||||
free(streams);
|
||||
free(dstBuf);
|
||||
free(srcBuf);
|
||||
SUCCEED("");
|
||||
}
|
||||
|
||||
static void testCopyPerf(bool toRemote, bool kernelCopy, bool onOneGpu) {
|
||||
fprintf(stderr, "**********************************************************\n");
|
||||
for (int srcType = COARSE_GRAINED; srcType < UNKNOWN_MEM; srcType++) {
|
||||
for (int dstType = COARSE_GRAINED; dstType < UNKNOWN_MEM; dstType++) {
|
||||
testCopyPerf(toRemote, kernelCopy, onOneGpu,
|
||||
static_cast<DEV_MEM_TYPE>(srcType), static_cast<DEV_MEM_TYPE>(dstType));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify all devices to all devices copy performance via hipMemcpyPeerAsync
|
||||
* from remotes to local.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedAll2All_test - hipMemcpyPeerAsync - remotes to local") {
|
||||
testCopyPerf(false, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify all devices to all devices copy performance via hipMemcpyPeerAsync
|
||||
* from local to remotes.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedAll2All_test - hipMemcpyPeerAsync - local to remotes") {
|
||||
testCopyPerf(true, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify all devices to all devices copy performance via kernel copy
|
||||
* from remotes to local.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* If GPU number is bigger than 4, export GPU_MAX_HW_QUEUES="GPU number -1" to
|
||||
* prevent HW queue serialization because GPU_MAX_HW_QUEUES=4 by default.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedAll2All_test - kernel copy - remotes to local") {
|
||||
testCopyPerf(false, true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify all devices to all devices copy performance via kernel copy
|
||||
* from local to remotes.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* If GPU number is bigger than 4, export GPU_MAX_HW_QUEUES="GPU number -1" to
|
||||
* prevent HW queue serialization because GPU_MAX_HW_QUEUES=4 by default.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedAll2All_test - kernel copy - local to remotes") {
|
||||
testCopyPerf(true, true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify all other devices to the first devices copy performance via hipMemcpyPeerAsync
|
||||
* from remotes to local.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedAll2One_test - hipMemcpyPeerAsync - remotes to local") {
|
||||
testCopyPerf(false, false, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the first device to all other devices copy performance via hipMemcpyPeerAsync
|
||||
* from local to remotes.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedOne2All_test - hipMemcpyPeerAsync - local to remotes") {
|
||||
testCopyPerf(true, false, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the all other devices to the first devices copy performance via kernel copy
|
||||
* from remotes to local.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedAll2One_test - kernel copy - remotes to local") {
|
||||
testCopyPerf(false, true, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the first device to all other devices copy performance via kernel copy
|
||||
* from local to remotes.
|
||||
* To specify devices to be tested, export HIP_VISIBLE_DEVICES=gpuid0, gupid1,...
|
||||
* For example, to test first 2 devices, export HIP_VISIBLE_DEVICES=0,1
|
||||
* If GPU number is less than 2, the test will be skipped.
|
||||
* If GPU number is bigger than 4, export GPU_MAX_HW_QUEUES="GPU number -1" to
|
||||
* prevent HW queue serialization because GPU_MAX_HW_QUEUES=4 by default.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/memory/hipPerfBufferCopySpeedAll2All.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 6.0
|
||||
*/
|
||||
TEST_CASE("Perf_PerfBufferCopySpeedOne2All_test - kernel copy - local to remotes") {
|
||||
testCopyPerf(true, true, true);
|
||||
}
|
||||
Αναφορά σε νέο ζήτημα
Block a user