/*
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// This program measures simultaneous copy performance across multiple GPUs
// on the same node
|
2020-12-11 10:21:14 -07:00
|
|
|
#include <numa.h>
|
|
|
|
|
#include <numaif.h>
|
|
|
|
|
#include <stack>
|
|
|
|
|
#include <thread>
|
2020-09-25 12:20:48 -06:00
|
|
|
|
2022-01-05 16:33:25 -07:00
|
|
|
#include "TransferBench.hpp"
|
|
|
|
|
#include "GetClosestNumaNode.hpp"
|
|
|
|
|
#include "Kernels.hpp"
|
|
|
|
|
|
2019-08-07 17:21:41 -06:00
|
|
|
int main(int argc, char **argv)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
// Display usage instructions and detected topology
|
2019-11-05 17:10:16 -07:00
|
|
|
if (argc <= 1)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0);
|
|
|
|
|
if (!outputToCsv) DisplayUsage(argv[0]);
|
|
|
|
|
DisplayTopology(outputToCsv);
|
2020-10-27 09:00:33 -06:00
|
|
|
exit(0);
|
|
|
|
|
}
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
// Collect environment variables / display current run configuration
|
|
|
|
|
EnvVars ev;
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Determine number of bytes to run per Transfer
|
2020-12-11 10:21:14 -07:00
|
|
|
// If a non-zero number of bytes is specified, use it
|
|
|
|
|
// Otherwise generate array of bytes values to execute over
|
|
|
|
|
std::vector<size_t> valuesOfN;
|
2022-04-27 20:43:24 -06:00
|
|
|
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
|
2021-10-08 16:36:19 -06:00
|
|
|
if (argc > 2)
|
|
|
|
|
{
|
|
|
|
|
// Adjust bytes if unit specified
|
|
|
|
|
char units = argv[2][strlen(argv[2])-1];
|
|
|
|
|
switch (units)
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
case 'K': case 'k': numBytesPerTransfer *= 1024; break;
|
|
|
|
|
case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
|
|
|
|
|
case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
|
2021-10-08 16:36:19 -06:00
|
|
|
}
|
|
|
|
|
}
|
2022-04-27 20:43:24 -06:00
|
|
|
PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);
|
2020-12-11 10:21:14 -07:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Find the largest N to be used - memory will only be allocated once per set of simulatenous Transfers
|
2020-12-11 10:21:14 -07:00
|
|
|
size_t maxN = valuesOfN[0];
|
|
|
|
|
for (auto N : valuesOfN)
|
|
|
|
|
maxN = std::max(maxN, N);
|
|
|
|
|
|
2021-10-21 15:28:16 -06:00
|
|
|
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
|
2021-11-24 18:05:37 -07:00
|
|
|
if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
|
|
|
|
|
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
|
2021-10-21 15:28:16 -06:00
|
|
|
{
|
2021-11-08 14:36:04 -07:00
|
|
|
int numBlocksToUse = 0;
|
|
|
|
|
if (argc > 3)
|
|
|
|
|
numBlocksToUse = atoi(argv[3]);
|
|
|
|
|
else
|
|
|
|
|
HIP_CALL(hipDeviceGetAttribute(&numBlocksToUse, hipDeviceAttributeMultiprocessorCount, 0));
|
|
|
|
|
|
2021-11-24 18:05:37 -07:00
|
|
|
// Perform either local read (+remote write) [EXE = SRC] or
|
|
|
|
|
// remote read (+local write) [EXE = DST]
|
|
|
|
|
int readMode = (!strcmp(argv[1], "p2p_rr") || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
|
2022-04-08 15:20:55 -06:00
|
|
|
int skipCpu = (!strcmp(argv[1], "g2g" ) || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
|
2021-11-24 18:05:37 -07:00
|
|
|
|
2021-10-21 15:28:16 -06:00
|
|
|
// Execute peer to peer benchmark mode
|
2022-04-27 20:43:24 -06:00
|
|
|
RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float), numBlocksToUse, readMode, skipCpu);
|
2021-10-21 15:28:16 -06:00
|
|
|
exit(0);
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Check that Transfer configuration file can be opened
|
2021-10-21 15:28:16 -06:00
|
|
|
FILE* fp = fopen(argv[1], "r");
|
|
|
|
|
if (!fp)
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
|
2021-10-21 15:28:16 -06:00
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Check for NUMA library support
|
|
|
|
|
if (numa_available() == -1)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
ev.DisplayEnvVars();
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
int const initOffset = ev.byteOffset / sizeof(float);
|
|
|
|
|
std::stack<std::thread> threads;
|
2020-09-25 12:20:48 -06:00
|
|
|
|
|
|
|
|
// Collect the number of available CPUs/GPUs on this machine
|
|
|
|
|
int numGpuDevices;
|
|
|
|
|
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
2020-12-11 10:21:14 -07:00
|
|
|
int const numCpuDevices = numa_num_configured_nodes();
|
2019-11-05 17:10:16 -07:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Track unique pair of transfers that get used
|
2020-12-11 10:21:14 -07:00
|
|
|
std::set<std::pair<int, int>> peerAccessTracker;
|
2019-11-05 17:10:16 -07:00
|
|
|
|
2020-10-27 09:00:33 -06:00
|
|
|
// Print CSV header
|
2020-12-11 10:21:14 -07:00
|
|
|
if (ev.outputToCsv)
|
2020-10-27 09:00:33 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),"
|
2022-04-27 20:43:24 -06:00
|
|
|
"TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
|
2020-10-27 09:00:33 -06:00
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Loop over each line in the Transfer configuration file
|
2020-10-27 09:00:33 -06:00
|
|
|
int testNum = 0;
|
2019-11-05 17:10:16 -07:00
|
|
|
char line[2048];
|
|
|
|
|
while(fgets(line, 2048, fp))
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
// Check if line is a comment to be echoed to output (starts with ##)
|
|
|
|
|
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
|
2021-10-13 14:56:57 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Parse transfers from configuration file
|
|
|
|
|
TransferMap transferMap;
|
|
|
|
|
ParseTransfers(line, numCpuDevices, numGpuDevices, transferMap);
|
|
|
|
|
if (transferMap.size() == 0) continue;
|
2019-11-05 17:10:16 -07:00
|
|
|
|
2020-10-27 09:00:33 -06:00
|
|
|
testNum++;
|
2019-11-05 17:10:16 -07:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Prepare (maximum) memory for each transfer
|
|
|
|
|
std::vector<Transfer*> transferList;
|
|
|
|
|
for (auto& exeInfoPair : transferMap)
|
2020-08-19 09:47:19 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
ExecutorInfo& exeInfo = exeInfoPair.second;
|
|
|
|
|
exeInfo.totalTime = 0.0;
|
|
|
|
|
exeInfo.totalBlocks = 0;
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
for (Transfer& transfer : exeInfo.transfers)
|
2019-11-05 17:10:16 -07:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
// Get some aliases to transfer variables
|
|
|
|
|
MemType const& exeMemType = transfer.exeMemType;
|
|
|
|
|
MemType const& srcMemType = transfer.srcMemType;
|
|
|
|
|
MemType const& dstMemType = transfer.dstMemType;
|
|
|
|
|
int const& blocksToUse = transfer.numBlocksToUse;
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
// Get potentially remapped device indices
|
2022-04-27 20:43:24 -06:00
|
|
|
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
|
|
|
|
|
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
|
|
|
|
|
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
|
|
|
|
|
if (exeMemType == MEM_GPU)
|
2019-11-05 17:10:16 -07:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
// Ensure executing GPU can access source memory
|
|
|
|
|
if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
|
2020-08-19 09:47:19 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
|
|
|
|
|
if (!peerAccessTracker.count(exeSrcPair))
|
|
|
|
|
{
|
|
|
|
|
EnablePeerAccess(exeIndex, srcIndex);
|
|
|
|
|
peerAccessTracker.insert(exeSrcPair);
|
|
|
|
|
}
|
2020-08-19 09:47:19 -06:00
|
|
|
}
|
2019-08-07 17:21:41 -06:00
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Ensure executing GPU can access destination memory
|
|
|
|
|
if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
|
2020-08-19 09:47:19 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
auto exeDstPair = std::make_pair(exeIndex, dstIndex);
|
|
|
|
|
if (!peerAccessTracker.count(exeDstPair))
|
|
|
|
|
{
|
|
|
|
|
EnablePeerAccess(exeIndex, dstIndex);
|
|
|
|
|
peerAccessTracker.insert(exeDstPair);
|
|
|
|
|
}
|
2020-08-19 09:47:19 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Allocate (maximum) source / destination memory based on type / device index
|
2022-04-27 20:43:24 -06:00
|
|
|
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
|
|
|
|
|
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
|
|
|
|
|
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
|
|
|
|
|
exeInfo.totalBlocks += transfer.blockParam.size();
|
|
|
|
|
transferList.push_back(&transfer);
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
2020-08-19 09:47:19 -06:00
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Prepare GPU resources for GPU executors
|
|
|
|
|
MemType const exeMemType = exeInfoPair.first.first;
|
|
|
|
|
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType);
|
2020-12-11 10:21:14 -07:00
|
|
|
if (exeMemType == MEM_GPU)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam),
|
|
|
|
|
(void**)&exeInfo.blockParamGpu);
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
|
|
|
|
|
exeInfo.streams.resize(numTransfersToRun);
|
|
|
|
|
exeInfo.startEvents.resize(numTransfersToRun);
|
|
|
|
|
exeInfo.stopEvents.resize(numTransfersToRun);
|
|
|
|
|
for (int i = 0; i < numTransfersToRun; ++i)
|
2022-04-08 15:20:55 -06:00
|
|
|
{
|
|
|
|
|
HIP_CALL(hipSetDevice(exeIndex));
|
|
|
|
|
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
|
|
|
|
|
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
|
|
|
|
|
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
int transferOffset = 0;
|
|
|
|
|
for (int i = 0; i < exeInfo.transfers.size(); i++)
|
2022-04-08 15:20:55 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
|
|
|
|
|
transferOffset += exeInfo.transfers[i].blockParam.size();
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
2020-12-11 10:21:14 -07:00
|
|
|
}
|
|
|
|
|
}
|
2020-09-25 12:20:48 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Loop over all the different number of bytes to use per Transfer
|
2020-12-11 10:21:14 -07:00
|
|
|
for (auto N : valuesOfN)
|
|
|
|
|
{
|
|
|
|
|
if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
|
2020-08-19 09:47:19 -06:00
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Prepare input memory and block parameters for current N
|
2022-04-27 20:43:24 -06:00
|
|
|
for (auto& exeInfoPair : transferMap)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
ExecutorInfo& exeInfo = exeInfoPair.second;
|
2020-12-04 14:57:13 -07:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
int transferOffset = 0;
|
2021-10-25 11:23:29 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
for (int i = 0; i < exeInfo.transfers.size(); ++i)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
Transfer& transfer = exeInfo.transfers[i];
|
|
|
|
|
transfer.PrepareBlockParams(ev, N);
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
// Copy block parameters to GPU for GPU executors
|
2022-04-27 20:43:24 -06:00
|
|
|
if (transfer.exeMemType == MEM_GPU)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset],
|
|
|
|
|
transfer.blockParam.data(),
|
|
|
|
|
transfer.blockParam.size() * sizeof(BlockParam),
|
2022-04-08 15:20:55 -06:00
|
|
|
hipMemcpyHostToDevice));
|
2022-04-27 20:43:24 -06:00
|
|
|
transferOffset += transfer.blockParam.size();
|
2020-12-11 10:21:14 -07:00
|
|
|
}
|
2020-08-19 09:47:19 -06:00
|
|
|
}
|
2019-11-05 17:10:16 -07:00
|
|
|
}
|
2019-08-07 17:21:41 -06:00
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
// Launch kernels (warmup iterations are not counted)
|
2022-04-08 15:20:55 -06:00
|
|
|
double totalCpuTime = 0;
|
2022-04-27 20:43:24 -06:00
|
|
|
size_t numTimedIterations = 0;
|
|
|
|
|
for (int iteration = -ev.numWarmups; ; iteration++)
|
2019-11-05 17:10:16 -07:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
|
|
|
|
|
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
|
|
|
|
|
|
2020-09-25 12:20:48 -06:00
|
|
|
// Pause before starting first timed iteration in interactive mode
|
2020-12-11 10:21:14 -07:00
|
|
|
if (ev.useInteractive && iteration == 0)
|
2020-08-19 09:47:19 -06:00
|
|
|
{
|
|
|
|
|
printf("Hit <Enter> to continue: ");
|
|
|
|
|
scanf("%*c");
|
|
|
|
|
printf("\n");
|
|
|
|
|
}
|
2019-08-07 17:21:41 -06:00
|
|
|
|
2020-09-25 12:20:48 -06:00
|
|
|
// Start CPU timing for this iteration
|
2020-08-19 09:47:19 -06:00
|
|
|
auto cpuStart = std::chrono::high_resolution_clock::now();
|
2019-08-07 17:21:41 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Execute all Transfers in parallel
|
|
|
|
|
for (auto& exeInfoPair : transferMap)
|
2022-04-08 15:20:55 -06:00
|
|
|
{
|
|
|
|
|
ExecutorInfo& exeInfo = exeInfoPair.second;
|
2022-04-27 20:43:24 -06:00
|
|
|
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
|
|
|
|
|
for (int i = 0; i < numTransfersToRun; ++i)
|
|
|
|
|
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i));
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
2020-08-19 09:47:19 -06:00
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
// Wait for all threads to finish
|
2022-04-27 20:43:24 -06:00
|
|
|
int const numTransfers = threads.size();
|
|
|
|
|
for (int i = 0; i < numTransfers; i++)
|
2019-08-07 17:21:41 -06:00
|
|
|
{
|
2020-12-11 10:21:14 -07:00
|
|
|
threads.top().join();
|
|
|
|
|
threads.pop();
|
2019-08-07 17:21:41 -06:00
|
|
|
}
|
2019-11-05 17:10:16 -07:00
|
|
|
|
2020-09-25 12:20:48 -06:00
|
|
|
// Stop CPU timing for this iteration
|
2020-08-19 09:47:19 -06:00
|
|
|
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
|
|
|
|
|
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
if (iteration >= 0)
|
|
|
|
|
{
|
|
|
|
|
++numTimedIterations;
|
|
|
|
|
totalCpuTime += deltaSec;
|
|
|
|
|
}
|
2019-11-05 17:10:16 -07:00
|
|
|
}
|
2019-08-07 17:21:41 -06:00
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
// Pause for interactive mode
|
|
|
|
|
if (ev.useInteractive)
|
2019-11-05 17:10:16 -07:00
|
|
|
{
|
2020-08-19 09:47:19 -06:00
|
|
|
printf("Transfers complete. Hit <Enter> to continue: ");
|
|
|
|
|
scanf("%*c");
|
|
|
|
|
printf("\n");
|
2019-11-05 17:10:16 -07:00
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Validate that each transfer has transferred correctly
|
|
|
|
|
int const numTransfers = transferList.size();
|
|
|
|
|
for (auto transfer : transferList)
|
|
|
|
|
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
|
2020-08-19 09:47:19 -06:00
|
|
|
|
|
|
|
|
// Report timings
|
2022-04-27 20:43:24 -06:00
|
|
|
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
|
|
|
|
|
double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime;
|
2021-09-07 15:28:16 -06:00
|
|
|
double maxGpuTime = 0;
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
if (ev.useSingleStream)
|
2019-11-05 17:10:16 -07:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
for (auto& exeInfoPair : transferMap)
|
2020-10-27 09:00:33 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
ExecutorInfo const& exeInfo = exeInfoPair.second;
|
|
|
|
|
MemType const exeMemType = exeInfoPair.first.first;
|
|
|
|
|
int const exeIndex = exeInfoPair.first.second;
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
|
|
|
|
|
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f;
|
2022-04-08 15:20:55 -06:00
|
|
|
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
|
|
|
|
|
|
|
|
|
|
if (!ev.outputToCsv)
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n",
|
|
|
|
|
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec);
|
|
|
|
|
for (auto transfer : exeInfo.transfers)
|
2022-04-08 15:20:55 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
|
|
|
|
|
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
|
|
|
|
|
|
|
|
|
|
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
|
|
|
|
|
transfer.transferIndex,
|
|
|
|
|
transferBandwidthGbs,
|
|
|
|
|
transferDurationMsec,
|
|
|
|
|
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
|
|
|
|
|
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
|
|
|
|
|
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
|
|
|
|
|
MemTypeStr[transfer.dstMemType], transfer.dstIndex);
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
|
2022-04-08 15:20:55 -06:00
|
|
|
testNum, N * sizeof(float),
|
|
|
|
|
MemTypeStr[exeMemType], exeIndex,
|
|
|
|
|
exeBandwidthGbs, exeDurationMsec,
|
|
|
|
|
ev.byteOffset,
|
2022-04-27 20:43:24 -06:00
|
|
|
ev.numWarmups, numTimedIterations);
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
2020-10-27 09:00:33 -06:00
|
|
|
}
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
for (auto transfer : transferList)
|
2020-10-27 09:00:33 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
|
|
|
|
|
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
|
|
|
|
|
maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
|
2022-04-08 15:20:55 -06:00
|
|
|
if (!ev.outputToCsv)
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n",
|
|
|
|
|
transfer->transferIndex,
|
|
|
|
|
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
|
|
|
|
|
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
|
|
|
|
|
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
|
|
|
|
|
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
|
|
|
|
|
transferBandwidthGbs, transferDurationMsec,
|
|
|
|
|
GetTransferDesc(*transfer).c_str());
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
|
2022-04-08 15:20:55 -06:00
|
|
|
testNum, N * sizeof(float),
|
2022-04-27 20:43:24 -06:00
|
|
|
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
|
|
|
|
|
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
|
|
|
|
|
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
|
|
|
|
|
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
|
|
|
|
|
transferBandwidthGbs, transferDurationMsec,
|
|
|
|
|
GetTransferDesc(*transfer).c_str(),
|
|
|
|
|
transfer->srcMem + initOffset, transfer->dstMem + initOffset,
|
2022-04-08 15:20:55 -06:00
|
|
|
ev.byteOffset,
|
2022-04-27 20:43:24 -06:00
|
|
|
ev.numWarmups, numTimedIterations);
|
2022-04-08 15:20:55 -06:00
|
|
|
}
|
2020-10-27 09:00:33 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Display aggregate statistics
|
2020-12-11 10:21:14 -07:00
|
|
|
if (!ev.outputToCsv)
|
2020-10-27 09:00:33 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime,
|
2021-09-07 15:28:16 -06:00
|
|
|
totalCpuTime - maxGpuTime);
|
2020-10-27 09:00:33 -06:00
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
|
2020-12-11 10:21:14 -07:00
|
|
|
testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset,
|
2022-04-27 20:43:24 -06:00
|
|
|
ev.numWarmups, numTimedIterations);
|
2019-11-05 17:10:16 -07:00
|
|
|
}
|
2020-12-11 10:21:14 -07:00
|
|
|
}
|
2019-11-05 17:10:16 -07:00
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
// Release GPU memory
|
2022-04-27 20:43:24 -06:00
|
|
|
for (auto exeInfoPair : transferMap)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
ExecutorInfo& exeInfo = exeInfoPair.second;
|
2022-04-27 20:43:24 -06:00
|
|
|
for (auto& transfer : exeInfo.transfers)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
// Get some aliases to Transfer variables
|
|
|
|
|
MemType const& exeMemType = transfer.exeMemType;
|
|
|
|
|
MemType const& srcMemType = transfer.srcMemType;
|
|
|
|
|
MemType const& dstMemType = transfer.dstMemType;
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
// Allocate (maximum) source / destination memory based on type / device index
|
2022-04-27 20:43:24 -06:00
|
|
|
DeallocateMemory(srcMemType, transfer.srcMem);
|
|
|
|
|
DeallocateMemory(dstMemType, transfer.dstMem);
|
|
|
|
|
transfer.blockParam.clear();
|
2020-12-11 10:21:14 -07:00
|
|
|
}
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
MemType const exeMemType = exeInfoPair.first.first;
|
|
|
|
|
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType);
|
|
|
|
|
if (exeMemType == MEM_GPU)
|
2020-08-19 09:47:19 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
DeallocateMemory(exeMemType, exeInfo.blockParamGpu);
|
2022-04-27 20:43:24 -06:00
|
|
|
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
|
|
|
|
|
for (int i = 0; i < numTransfersToRun; ++i)
|
2022-04-08 15:20:55 -06:00
|
|
|
{
|
|
|
|
|
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
|
|
|
|
|
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
|
|
|
|
|
HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
|
|
|
|
|
}
|
2020-08-19 09:47:19 -06:00
|
|
|
}
|
2019-08-07 17:21:41 -06:00
|
|
|
}
|
2019-11-05 17:10:16 -07:00
|
|
|
}
|
|
|
|
|
fclose(fp);
|
|
|
|
|
|
|
|
|
|
return 0;
|
2019-08-07 17:21:41 -06:00
|
|
|
}
|
2020-09-25 12:20:48 -06:00
|
|
|
|
|
|
|
|
void DisplayUsage(char const* cmdName)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
printf("TransferBench v%s\n", TB_VERSION);
|
|
|
|
|
printf("========================================\n");
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
if (numa_available() == -1)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
int numGpuDevices;
|
|
|
|
|
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
|
|
|
|
int const numCpuDevices = numa_num_configured_nodes();
|
|
|
|
|
|
2021-11-24 18:05:37 -07:00
|
|
|
printf("Usage: %s config <N>\n", cmdName);
|
|
|
|
|
printf(" config: Either:\n");
|
2022-04-27 20:43:24 -06:00
|
|
|
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
|
2021-11-24 18:05:37 -07:00
|
|
|
printf(" - Name of preset benchmark:\n");
|
|
|
|
|
printf(" p2p - All CPU/GPU pairs benchmark\n");
|
|
|
|
|
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n");
|
|
|
|
|
printf(" g2g - All GPU/GPU pairs benchmark\n");
|
|
|
|
|
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n");
|
|
|
|
|
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
|
2022-04-27 20:43:24 -06:00
|
|
|
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
|
2022-04-08 15:20:55 -06:00
|
|
|
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
|
2022-04-27 20:43:24 -06:00
|
|
|
DEFAULT_BYTES_PER_TRANSFER);
|
2021-11-24 18:05:37 -07:00
|
|
|
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
|
|
|
|
|
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
|
2020-09-25 12:20:48 -06:00
|
|
|
printf("\n");
|
2020-12-11 10:21:14 -07:00
|
|
|
|
|
|
|
|
EnvVars::DisplayUsage();
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
|
2022-02-02 08:51:41 -07:00
|
|
|
int RemappedIndex(int const origIdx, MemType const memType)
|
|
|
|
|
{
|
|
|
|
|
static std::vector<int> remapping;
|
|
|
|
|
|
|
|
|
|
// No need to re-map CPU devices
|
|
|
|
|
if (memType == MEM_CPU) return origIdx;
|
|
|
|
|
|
|
|
|
|
// Build remapping on first use
|
|
|
|
|
if (remapping.empty())
|
|
|
|
|
{
|
|
|
|
|
int numGpuDevices;
|
|
|
|
|
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
|
|
|
|
remapping.resize(numGpuDevices);
|
|
|
|
|
|
|
|
|
|
int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
|
|
|
|
|
if (!usePcieIndexing)
|
|
|
|
|
{
|
|
|
|
|
// For HIP-based indexing no remapping is necessary
|
|
|
|
|
for (int i = 0; i < numGpuDevices; ++i)
|
|
|
|
|
remapping[i] = i;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// Collect PCIe address for each GPU
|
|
|
|
|
std::vector<std::pair<std::string, int>> mapping;
|
|
|
|
|
char pciBusId[20];
|
|
|
|
|
for (int i = 0; i < numGpuDevices; ++i)
|
|
|
|
|
{
|
|
|
|
|
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
|
|
|
|
|
mapping.push_back(std::make_pair(pciBusId, i));
|
|
|
|
|
}
|
|
|
|
|
// Sort GPUs by PCIe address then use that as mapping
|
|
|
|
|
std::sort(mapping.begin(), mapping.end());
|
|
|
|
|
for (int i = 0; i < numGpuDevices; ++i)
|
|
|
|
|
remapping[i] = mapping[i].second;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return remapping[origIdx];
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
void DisplayTopology(bool const outputToCsv)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
|
|
|
|
int numGpuDevices;
|
|
|
|
|
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
if (outputToCsv)
|
|
|
|
|
{
|
|
|
|
|
printf("NumCpus,%d\n", numa_num_configured_nodes());
|
|
|
|
|
printf("NumGpus,%d\n", numGpuDevices);
|
|
|
|
|
printf("GPU");
|
|
|
|
|
for (int j = 0; j < numGpuDevices; j++)
|
|
|
|
|
printf(",GPU %02d", j);
|
|
|
|
|
printf(",PCIe Bus ID,ClosestNUMA\n");
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
|
|
|
|
|
printf(" |");
|
|
|
|
|
for (int j = 0; j < numGpuDevices; j++)
|
|
|
|
|
printf(" GPU %02d |", j);
|
|
|
|
|
printf(" PCIe Bus ID | Closest NUMA\n");
|
|
|
|
|
for (int j = 0; j <= numGpuDevices; j++)
|
|
|
|
|
printf("--------+");
|
|
|
|
|
printf("--------------+-------------\n");
|
|
|
|
|
}
|
2020-09-25 12:20:48 -06:00
|
|
|
|
2021-02-01 09:48:09 -07:00
|
|
|
char pciBusId[20];
|
2022-04-08 15:20:55 -06:00
|
|
|
|
2020-09-25 12:20:48 -06:00
|
|
|
for (int i = 0; i < numGpuDevices; i++)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |");
|
2020-09-25 12:20:48 -06:00
|
|
|
for (int j = 0; j < numGpuDevices; j++)
|
|
|
|
|
{
|
|
|
|
|
if (i == j)
|
2022-04-08 15:20:55 -06:00
|
|
|
{
|
|
|
|
|
if (outputToCsv)
|
|
|
|
|
printf("-,");
|
|
|
|
|
else
|
|
|
|
|
printf(" - |");
|
|
|
|
|
}
|
2020-09-25 12:20:48 -06:00
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
uint32_t linkType, hopCount;
|
2022-02-02 08:51:41 -07:00
|
|
|
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, MEM_GPU),
|
|
|
|
|
RemappedIndex(j, MEM_GPU),
|
|
|
|
|
&linkType, &hopCount));
|
2022-04-08 15:20:55 -06:00
|
|
|
printf("%s%s-%d%s",
|
|
|
|
|
outputToCsv ? "" : " ",
|
2020-09-25 12:20:48 -06:00
|
|
|
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
|
|
|
|
|
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
|
|
|
|
|
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
|
|
|
|
|
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
|
|
|
|
|
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
|
2022-04-08 15:20:55 -06:00
|
|
|
hopCount, outputToCsv ? "," : " |");
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
}
|
2022-02-02 08:51:41 -07:00
|
|
|
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, RemappedIndex(i, MEM_GPU)));
|
2022-04-08 15:20:55 -06:00
|
|
|
if (outputToCsv)
|
|
|
|
|
printf("%s,%d\n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU)));
|
|
|
|
|
else
|
|
|
|
|
printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU)));
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Fills valuesOfN with the Transfer sizes (in units of floats) to benchmark.
//
// numBytesPerTransfer: Requested size in bytes.  If non-zero it must be a
//                      multiple of 4 and yields exactly one entry
//                      (numBytesPerTransfer / sizeof(float)).  If zero, a
//                      range of sizes is generated instead.
// samplingFactor:      Number of samples to take between successive powers of
//                      two when generating a range.  Values below 1 are
//                      treated as 1 (previously 0 caused a division by zero).
// valuesOfN:           Output list; any previous contents are discarded.
//
// Exits with status 1 if a non-zero numBytesPerTransfer is not a multiple of 4.
void PopulateTestSizes(size_t const numBytesPerTransfer,
                       int const samplingFactor,
                       std::vector<size_t>& valuesOfN)
{
  valuesOfN.clear();

  // If the number of bytes is specified, use it
  if (numBytesPerTransfer != 0)
  {
    if (numBytesPerTransfer % 4)
    {
      printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
      exit(1);
    }
    valuesOfN.push_back(numBytesPerTransfer / sizeof(float));
    return;
  }

  // Otherwise generate a range of values
  // (Powers of 2, with samplingFactor samples between successive powers of 2)
  int const samplesPerPower = std::max(1, samplingFactor);
  for (int N = 256; N <= (1<<27); N *= 2)
  {
    // Never step by less than 32 elements, even for small N
    int const delta = std::max(32, N / samplesPerPower);
    for (int curr = N; curr < N * 2; curr += delta)
      valuesOfN.push_back(curr);
  }
}
|
|
|
|
|
|
|
|
|
|
void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
|
|
|
|
char typeChar;
|
|
|
|
|
if (sscanf(token.c_str(), " %c %d", &typeChar, memIndex) != 2)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
printf("[ERROR] Unable to parse memory type token %s - expecting either 'B,C,G or F' followed by an index\n",
|
|
|
|
|
token.c_str());
|
2020-09-25 12:20:48 -06:00
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch (typeChar)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
case 'C': case 'c': case 'B': case 'b':
|
|
|
|
|
*memType = (typeChar == 'C' || typeChar == 'c') ? MEM_CPU : MEM_CPU_FINE;
|
2020-12-11 10:21:14 -07:00
|
|
|
if (*memIndex < 0 || *memIndex >= numCpus)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, *memIndex);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
break;
|
2022-04-08 15:20:55 -06:00
|
|
|
case 'G': case 'g': case 'F': case 'f':
|
|
|
|
|
*memType = (typeChar == 'G' || typeChar == 'g') ? MEM_GPU : MEM_GPU_FINE;
|
2020-12-11 10:21:14 -07:00
|
|
|
if (*memIndex < 0 || *memIndex >= numGpus)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
default:
|
2022-04-08 15:20:55 -06:00
|
|
|
printf("[ERROR] Unrecognized memory type %s. Expecting either 'B', 'C' or 'G' or 'F'\n", token.c_str());
|
2020-12-11 10:21:14 -07:00
|
|
|
exit(1);
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Helper function to parse a list of Transfer definitions
|
|
|
|
|
void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferMap)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
2020-12-11 10:21:14 -07:00
|
|
|
// Replace any round brackets or '->' with spaces,
|
2021-10-07 15:57:21 -06:00
|
|
|
for (int i = 1; line[i]; i++)
|
2020-12-11 10:21:14 -07:00
|
|
|
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
|
2020-09-25 12:20:48 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
transferMap.clear();
|
|
|
|
|
int numTransfers = 0;
|
2020-09-25 12:20:48 -06:00
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
std::istringstream iss(line);
|
2022-04-27 20:43:24 -06:00
|
|
|
iss >> numTransfers;
|
2020-09-25 12:20:48 -06:00
|
|
|
if (iss.fail()) return;
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
std::string exeMem;
|
2020-09-25 12:20:48 -06:00
|
|
|
std::string srcMem;
|
|
|
|
|
std::string dstMem;
|
2022-04-27 20:43:24 -06:00
|
|
|
if (numTransfers > 0)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
2020-12-11 10:21:14 -07:00
|
|
|
// Method 1: Take in triples (srcMem, exeMem, dstMem)
|
2020-09-25 12:20:48 -06:00
|
|
|
int numBlocksToUse;
|
|
|
|
|
iss >> numBlocksToUse;
|
2021-09-07 15:28:16 -06:00
|
|
|
if (numBlocksToUse <= 0 || iss.fail())
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
|
|
|
|
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
2022-04-27 20:43:24 -06:00
|
|
|
for (int i = 0; i < numTransfers; i++)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
Transfer transfer;
|
|
|
|
|
transfer.transferIndex = i;
|
2020-12-11 10:21:14 -07:00
|
|
|
iss >> srcMem >> exeMem >> dstMem;
|
2021-09-07 15:28:16 -06:00
|
|
|
if (iss.fail())
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
|
2021-09-07 15:28:16 -06:00
|
|
|
exit(1);
|
|
|
|
|
}
|
2022-04-27 20:43:24 -06:00
|
|
|
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
|
|
|
|
|
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
|
|
|
|
|
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
|
|
|
|
|
transfer.numBlocksToUse = numBlocksToUse;
|
2022-04-08 15:20:55 -06:00
|
|
|
|
|
|
|
|
// Ensure executor is either CPU or GPU
|
2022-04-27 20:43:24 -06:00
|
|
|
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2021-10-07 15:57:21 -06:00
|
|
|
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n",
|
2022-04-27 20:43:24 -06:00
|
|
|
srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
|
2020-12-11 10:21:14 -07:00
|
|
|
exit(1);
|
|
|
|
|
}
|
2022-04-08 15:20:55 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
Executor executor(transfer.exeMemType, transfer.exeIndex);
|
|
|
|
|
ExecutorInfo& executorInfo = transferMap[executor];
|
|
|
|
|
executorInfo.totalBlocks += transfer.numBlocksToUse;
|
|
|
|
|
executorInfo.transfers.push_back(transfer);
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2020-12-11 10:21:14 -07:00
|
|
|
// Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles
|
2022-04-27 20:43:24 -06:00
|
|
|
numTransfers *= -1;
|
2020-09-25 12:20:48 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
for (int i = 0; i < numTransfers; i++)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
Transfer transfer;
|
|
|
|
|
transfer.transferIndex = i;
|
|
|
|
|
iss >> srcMem >> exeMem >> dstMem >> transfer.numBlocksToUse;
|
2021-09-07 15:28:16 -06:00
|
|
|
if (iss.fail())
|
|
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
|
2021-09-07 15:28:16 -06:00
|
|
|
exit(1);
|
|
|
|
|
}
|
2022-04-27 20:43:24 -06:00
|
|
|
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
|
|
|
|
|
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
|
|
|
|
|
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
|
|
|
|
|
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2021-10-07 15:57:21 -06:00
|
|
|
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n"
|
2022-04-27 20:43:24 -06:00
|
|
|
, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
|
2020-12-11 10:21:14 -07:00
|
|
|
exit(1);
|
|
|
|
|
}
|
2022-04-08 15:20:55 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
Executor executor(transfer.exeMemType, transfer.exeIndex);
|
|
|
|
|
ExecutorInfo& executorInfo = transferMap[executor];
|
|
|
|
|
executorInfo.totalBlocks += transfer.numBlocksToUse;
|
|
|
|
|
executorInfo.transfers.push_back(transfer);
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
2020-12-11 10:21:14 -07:00
|
|
|
int canAccess;
|
|
|
|
|
HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
|
|
|
|
|
if (!canAccess)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
HIP_CALL(hipSetDevice(deviceId));
|
|
|
|
|
HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0));
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Allocates numBytes of the requested memory type and stores the resulting pointer in *memPtr
// - memType:  MEM_CPU / MEM_CPU_FINE allocate pinned host memory via hipHostMalloc
//             MEM_GPU allocates via hipMalloc, MEM_GPU_FINE via hipExtMallocWithFlags
// - devIndex: For CPU types: selects the devIndex-th NUMA node this process is allowed to allocate from
//             For GPU types: the HIP device to allocate on
// - numBytes: Number of bytes to allocate (must be non-zero)
// - memPtr:   Output. Receives the allocated pointer
// Prints an error and terminates the program (exit(1)) on any failure detected here
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
{
  if (numBytes == 0)
  {
    printf("[ERROR] Unable to allocate 0 bytes\n");
    exit(1);
  }

  if (memType == MEM_CPU || memType == MEM_CPU_FINE)
  {
    // Set numa policy prior to call to hipHostMalloc
    // NOTE: It may be possible that the actual configured numa nodes do not start at 0
    //       so remapping may be necessary
    // Find the 'devIndex'-th available NUMA node by walking the allowed nodes in increasing order.
    // The walk is bounded by the highest node number so an out-of-range devIndex is reported
    // instead of searching forever
    // NOTE(review): libnuma appears to allocate a new mask on each numa_get_mems_allowed() call;
    //               consider releasing it with numa_bitmask_free() - confirm against libnuma docs
    struct bitmask* const allowedNodes = numa_get_mems_allowed();
    int const maxNode = numa_max_node();
    int numaIdx = -1;
    int numAllowedSeen = 0;
    for (int node = 0; node <= maxNode; node++)
    {
      if (!numa_bitmask_isbitset(allowedNodes, node)) continue;
      if (numAllowedSeen == devIndex)
      {
        numaIdx = node;
        break;
      }
      numAllowedSeen++;
    }
    if (numaIdx < 0)
    {
      printf("[ERROR] Unable to find available NUMA node for CPU index %d\n", devIndex);
      exit(1);
    }

    // The node mask passed to set_mempolicy below is a single unsigned long
    if (numaIdx >= (int)(sizeof(unsigned long) * 8))
    {
      printf("[ERROR] NUMA node %d is too large to bind memory policy to\n", numaIdx);
      exit(1);
    }

    unsigned long nodemask = (1UL << numaIdx);
    long retCode = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
    if (retCode)
    {
      printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", numaIdx);
      exit(1);
    }

    // Allocate host-pinned memory (should respect NUMA mem policy)
    if (memType == MEM_CPU_FINE)
    {
      HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
    }
    else
    {
      HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
    }

    // Check that the allocated pages are actually on the correct NUMA node
    CheckPages((char*)*memPtr, numBytes, numaIdx);

    // Reset to default numa mem policy
    retCode = set_mempolicy(MPOL_DEFAULT, NULL, 8);
    if (retCode)
    {
      printf("[ERROR] Unable reset to default NUMA memory policy\n");
      exit(1);
    }
  }
  else if (memType == MEM_GPU)
  {
    // Allocate GPU memory on appropriate device
    HIP_CALL(hipSetDevice(devIndex));
    HIP_CALL(hipMalloc((void**)memPtr, numBytes));
  }
  else if (memType == MEM_GPU_FINE)
  {
    // Same as above, but requesting fine-grained device memory
    HIP_CALL(hipSetDevice(devIndex));
    HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
  }
  else
  {
    printf("[ERROR] Unsupported memory type %d\n", memType);
    exit(1);
  }
}
|
|
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Releases memory of the given memory type
// - Host types (MEM_CPU / MEM_CPU_FINE) are released with hipHostFree
// - Device types (MEM_GPU / MEM_GPU_FINE) are released with hipFree
// - Any other memory type is ignored
void DeallocateMemory(MemType memType, void* memPtr)
{
  bool const isHostMemory   = (memType == MEM_CPU || memType == MEM_CPU_FINE);
  bool const isDeviceMemory = (memType == MEM_GPU || memType == MEM_GPU_FINE);

  if (isHostMemory)
  {
    HIP_CALL(hipHostFree(memPtr));
    return;
  }

  if (isDeviceMemory)
  {
    HIP_CALL(hipFree(memPtr));
  }
}
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
void CheckPages(char* array, size_t numBytes, int targetId)
|
|
|
|
|
{
|
|
|
|
|
unsigned long const pageSize = getpagesize();
|
|
|
|
|
unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;
|
|
|
|
|
|
|
|
|
|
std::vector<void *> pages(numPages);
|
|
|
|
|
std::vector<int> status(numPages);
|
|
|
|
|
|
|
|
|
|
pages[0] = array;
|
|
|
|
|
for (int i = 1; i < numPages; i++)
|
|
|
|
|
{
|
|
|
|
|
pages[i] = (char*)pages[i-1] + pageSize;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
|
|
|
|
|
if (retCode)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] Unable to collect page info\n");
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t mistakeCount = 0;
|
|
|
|
|
for (int i = 0; i < numPages; i++)
|
|
|
|
|
{
|
|
|
|
|
if (status[i] < 0)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
if (status[i] != targetId) mistakeCount++;
|
|
|
|
|
}
|
|
|
|
|
if (mistakeCount > 0)
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
|
2021-10-21 15:28:16 -06:00
|
|
|
printf("[ERROR] Ensure up-to-date ROCm is installed\n");
|
|
|
|
|
exit(1);
|
2020-12-11 10:21:14 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-25 12:20:48 -06:00
|
|
|
// Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches
|
2021-06-15 08:41:57 -06:00
|
|
|
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float>const& fillPattern, float* ptr)
|
2020-09-25 12:20:48 -06:00
|
|
|
{
|
|
|
|
|
// Prepare reference resultx
|
|
|
|
|
float* refBuffer = (float*)malloc(N * sizeof(float));
|
|
|
|
|
if (isMemset)
|
|
|
|
|
{
|
|
|
|
|
if (isHipCall)
|
|
|
|
|
{
|
|
|
|
|
memset(refBuffer, 42, N * sizeof(float));
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < N; i++)
|
|
|
|
|
refBuffer[i] = 1234.0f;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2021-06-15 08:41:57 -06:00
|
|
|
// Fill with repeated pattern if specified
|
|
|
|
|
size_t patternLen = fillPattern.size();
|
|
|
|
|
if (patternLen > 0)
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < N; i++)
|
|
|
|
|
refBuffer[i] = fillPattern[i % patternLen];
|
|
|
|
|
}
|
|
|
|
|
else // Otherwise fill with pseudo-random values
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < N; i++)
|
2020-09-25 12:20:48 -06:00
|
|
|
refBuffer[i] = (i % 383 + 31);
|
2021-06-15 08:41:57 -06:00
|
|
|
}
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Either fill the memory with the reference buffer, or compare against it
|
|
|
|
|
if (mode == MODE_FILL)
|
|
|
|
|
{
|
|
|
|
|
HIP_CALL(hipMemcpy(ptr, refBuffer, N * sizeof(float), hipMemcpyDefault));
|
|
|
|
|
}
|
|
|
|
|
else if (mode == MODE_CHECK)
|
|
|
|
|
{
|
|
|
|
|
float* hostBuffer = (float*) malloc(N * sizeof(float));
|
|
|
|
|
HIP_CALL(hipMemcpy(hostBuffer, ptr, N * sizeof(float), hipMemcpyDefault));
|
|
|
|
|
for (int i = 0; i < N; i++)
|
|
|
|
|
{
|
|
|
|
|
if (refBuffer[i] != hostBuffer[i])
|
|
|
|
|
{
|
|
|
|
|
printf("[ERROR] Mismatch at element %d Ref: %f Actual: %f\n", i, refBuffer[i], hostBuffer[i]);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
}
|
2021-02-01 09:48:09 -07:00
|
|
|
free(hostBuffer);
|
2020-09-25 12:20:48 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
free(refBuffer);
|
|
|
|
|
}
|
2020-10-27 09:00:33 -06:00
|
|
|
|
|
|
|
|
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
|
|
|
|
|
{
|
|
|
|
|
char result[10];
|
|
|
|
|
|
|
|
|
|
switch (linkType)
|
|
|
|
|
{
|
|
|
|
|
case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT: sprintf(result, " HT-%d", hopCount); break;
|
|
|
|
|
case HSA_AMD_LINK_INFO_TYPE_QPI : sprintf(result, " QPI-%d", hopCount); break;
|
|
|
|
|
case HSA_AMD_LINK_INFO_TYPE_PCIE : sprintf(result, "PCIE-%d", hopCount); break;
|
|
|
|
|
case HSA_AMD_LINK_INFO_TYPE_INFINBAND : sprintf(result, "INFB-%d", hopCount); break;
|
|
|
|
|
case HSA_AMD_LINK_INFO_TYPE_XGMI : sprintf(result, "XGMI-%d", hopCount); break;
|
|
|
|
|
default: sprintf(result, "??????");
|
|
|
|
|
}
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2020-12-11 10:21:14 -07:00
|
|
|
std::string GetDesc(MemType srcMemType, int srcIndex,
|
|
|
|
|
MemType dstMemType, int dstIndex)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
if (srcMemType == MEM_CPU || srcMemType == MEM_CPU_FINE)
|
2020-12-11 10:21:14 -07:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
if (dstMemType == MEM_CPU || dstMemType == MEM_CPU_FINE)
|
2020-12-11 10:21:14 -07:00
|
|
|
return (srcIndex == dstIndex) ? "LOCAL" : "NUMA";
|
|
|
|
|
else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
|
|
|
|
|
return "PCIE";
|
|
|
|
|
else
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
|
|
|
|
else if (srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE)
|
|
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
if (dstMemType == MEM_CPU || dstMemType == MEM_CPU_FINE)
|
2020-12-11 10:21:14 -07:00
|
|
|
return "PCIE";
|
|
|
|
|
else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
|
|
|
|
|
{
|
|
|
|
|
if (srcIndex == dstIndex) return "LOCAL";
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
uint32_t linkType, hopCount;
|
2022-02-02 08:51:41 -07:00
|
|
|
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(srcIndex, MEM_GPU),
|
|
|
|
|
RemappedIndex(dstIndex, MEM_GPU),
|
|
|
|
|
&linkType, &hopCount));
|
2020-12-11 10:21:14 -07:00
|
|
|
return GetLinkTypeDesc(linkType, hopCount);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
goto error;
|
|
|
|
|
}
|
|
|
|
|
error:
|
|
|
|
|
printf("[ERROR] Unrecognized memory type\n");
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
std::string GetTransferDesc(Transfer const& transfer)
|
2020-10-27 09:00:33 -06:00
|
|
|
{
|
2022-04-27 20:43:24 -06:00
|
|
|
return GetDesc(transfer.srcMemType, transfer.srcIndex, transfer.exeMemType, transfer.exeIndex) + "-"
|
|
|
|
|
+ GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
|
2020-12-11 10:21:14 -07:00
|
|
|
}
|
2020-10-27 09:00:33 -06:00
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Executes one iteration of a single Transfer and accumulates its timing
// - ev:          Environment settings (HIP call vs kernel, memset vs copy, single stream, offsets, ...)
// - N:           Number of float elements to transfer
// - iteration:   Iteration number; timing is only recorded when iteration >= 0
// - exeInfo:     Executor owning the Transfer (provides streams / events / aggregate totals)
// - transferIdx: Index of the Transfer within exeInfo (also selects the stream / events to use)
// Transfers whose executor is neither MEM_GPU nor MEM_CPU are ignored
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx)
{
  Transfer& transfer = exeInfo.transfers[transferIdx];
  bool const isTimed = (iteration >= 0);

  // GPU execution agent
  if (transfer.exeMemType == MEM_GPU)
  {
    // Switch to executing GPU
    int const exeIndex = RemappedIndex(transfer.exeIndex, MEM_GPU);
    HIP_CALL(hipSetDevice(exeIndex));

    hipStream_t& stream     = exeInfo.streams[transferIdx];
    hipEvent_t&  startEvent = exeInfo.startEvents[transferIdx];
    hipEvent_t&  stopEvent  = exeInfo.stopEvents[transferIdx];

    int const initOffset = ev.byteOffset / sizeof(float);

    if (!ev.useHipCall)
    {
      // Kernel path: the launch itself records the start / stop events.
      // In single stream mode one launch covers all of the executor's blocks
      int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalBlocks : transfer.numBlocksToUse;
      hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
                            dim3(numBlocksToRun, 1, 1),
                            dim3(BLOCKSIZE, 1, 1),
                            ev.sharedMemBytes, stream,
                            startEvent, stopEvent,
                            0, transfer.blockParamGpuPtr);
    }
    else
    {
      // HIP call path: bracket hipMemset / hipMemcpy with explicitly recorded events
      HIP_CALL(hipEventRecord(startEvent, stream));
      if (ev.useMemset)
      {
        HIP_CALL(hipMemsetAsync(transfer.dstMem + initOffset, 42, N * sizeof(float), stream));
      }
      else
      {
        HIP_CALL(hipMemcpyAsync(transfer.dstMem + initOffset,
                                transfer.srcMem + initOffset,
                                N * sizeof(float), hipMemcpyDefault,
                                stream));
      }
      HIP_CALL(hipEventRecord(stopEvent, stream));
    }

    // Wait for the work submitted above to complete (done on every iteration)
    HIP_CALL(hipStreamSynchronize(stream));

    if (isTimed)
    {
      // Record GPU timing
      float gpuDeltaMsec;
      HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));

      if (!ev.useSingleStream)
      {
        transfer.transferTime += gpuDeltaMsec;
      }
      else
      {
        // Per-Transfer time is derived from the earliest start / latest stop cycle
        // recorded across the blocks of each Transfer of this executor
        for (Transfer& currTransfer : exeInfo.transfers)
        {
          long long minStartCycle = currTransfer.blockParamGpuPtr[0].startCycle;
          long long maxStopCycle  = currTransfer.blockParamGpuPtr[0].stopCycle;
          for (int block = 1; block < currTransfer.numBlocksToUse; block++)
          {
            minStartCycle = std::min(minStartCycle, currTransfer.blockParamGpuPtr[block].startCycle);
            maxStopCycle  = std::max(maxStopCycle,  currTransfer.blockParamGpuPtr[block].stopCycle);
          }
          int const wallClockRate = GetWallClockRate(exeIndex);
          double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
          currTransfer.transferTime += iterationTimeMs;
        }
        // The event-based time covers the whole executor
        exeInfo.totalTime += gpuDeltaMsec;
      }
    }
    return;
  }

  // Only CPU execution agents remain to be handled
  if (transfer.exeMemType != MEM_CPU) return;

  // Force this thread and all child threads onto correct NUMA node
  if (numa_run_on_node(transfer.exeIndex))
  {
    printf("[ERROR] Unable to set CPU to NUMA node %d\n", transfer.exeIndex);
    exit(1);
  }

  std::vector<std::thread> childThreads;

  auto cpuStart = std::chrono::high_resolution_clock::now();

  // Launch child-threads to perform memcopies
  for (int i = 0; i < ev.numCpuPerTransfer; i++)
    childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(transfer.blockParam[i])));

  // Wait for child-threads to finish
  for (int i = 0; i < ev.numCpuPerTransfer; i++)
    childThreads[i].join();

  auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;

  // Record time (converted from seconds to milliseconds) if not a warmup iteration
  if (isTimed)
    transfer.transferTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
}
|
2021-10-21 15:28:16 -06:00
|
|
|
|
2021-11-24 18:05:37 -07:00
|
|
|
// Measures and prints peak copy bandwidth between every pair of devices (NUMA nodes and GPUs),
// first unidirectionally, then bidirectionally
// - ev:             Environment settings (output format, threads per CPU Transfer, ...)
// - N:              Number of float elements per copy
// - numBlocksToUse: Number of CUs to use per Transfer
// - readMode:       0 is reported as "Local read / Remote write", anything else as "Remote read / Local write"
// - skipCpu:        If non-zero, CPU devices are left out as both source and destination
// Output is either a human-readable table per direction, or one CSV row per device pair
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu)
{
  // Collect the number of available CPUs/GPUs on this machine
  int numGpus;
  HIP_CALL(hipGetDeviceCount(&numGpus));
  int const numCpus    = numa_num_configured_nodes();
  int const numDevices = numCpus + numGpus;

  // Enable peer to peer for each GPU
  for (int gpu = 0; gpu < numGpus; gpu++)
  {
    for (int peer = 0; peer < numGpus; peer++)
    {
      if (gpu == peer) continue;
      EnablePeerAccess(gpu, peer);
    }
  }

  bool const asCsv = ev.outputToCsv;

  if (asCsv)
  {
    printf("SRC,DST,Direction,ReadMode,BW(GB/s),Bytes\n");
  }
  else
  {
    printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float));
    printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerTransfer);
    printf("Using %d CUs per transfer\n", numBlocksToUse);
  }

  // Perform unidirectional / bidirectional
  for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
  {
    // Print header
    if (!asCsv)
    {
      printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni",
             readMode == 0 ? "Local" : "Remote",
             readMode == 0 ? "Remote" : "Local");
      printf("%10s", "D/D");
      if (!skipCpu)
      {
        for (int cpu = 0; cpu < numCpus; cpu++)
          printf("%7s %02d", "CPU", cpu);
      }
      for (int gpu = 0; gpu < numGpus; gpu++)
        printf("%7s %02d", "GPU", gpu);
      printf("\n");
    }

    // Loop over all possible src/dst pairs
    // (devices are numbered with all CPUs first, followed by all GPUs)
    for (int src = 0; src < numDevices; src++)
    {
      bool const srcIsCpu = (src < numCpus);
      if (skipCpu && srcIsCpu) continue;

      MemType const srcMemType = srcIsCpu ? MEM_CPU : MEM_GPU;
      int const srcIndex       = srcIsCpu ? src : src - numCpus;
      char const* const srcName = srcIsCpu ? "CPU" : "GPU";

      if (!asCsv)
        printf("%7s %02d", srcName, srcIndex);

      for (int dst = 0; dst < numDevices; dst++)
      {
        bool const dstIsCpu = (dst < numCpus);
        if (skipCpu && dstIsCpu) continue;

        MemType const dstMemType = dstIsCpu ? MEM_CPU : MEM_GPU;
        int const dstIndex       = dstIsCpu ? dst : dst - numCpus;

        double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, readMode, numBlocksToUse,
                                            srcMemType, srcIndex, dstMemType, dstIndex);

        if (asCsv)
        {
          printf("%s %02d,%s %02d,%s,%s,%.2f,%lu\n",
                 srcName,
                 srcIndex,
                 dstIsCpu ? "CPU" : "GPU",
                 dstIndex,
                 isBidirectional ? "bidirectional" : "unidirectional",
                 readMode == 0 ? "Local" : "Remote",
                 bandwidth,
                 N * sizeof(float));
        }
        else if (bandwidth == 0)
        {
          // A bandwidth of exactly 0 marks a pair that was not measured
          printf("%10s", "N/A");
        }
        else
        {
          printf("%10.2f", bandwidth);
        }
        fflush(stdout);
      }
      if (!asCsv) printf("\n");
    }
    if (!asCsv) printf("\n");
  }
}
|
|
|
|
|
|
2022-04-08 15:20:55 -06:00
|
|
|
// Measures the aggregate bandwidth (GB/s) of copying N floats between two devices
// - ev:              Environment settings (warmups, iterations, offsets, fill pattern, ...)
// - N:               Number of float elements per Transfer
// - isBidirectional: 0 runs only src->dst; 1 additionally runs dst->src at the same time
// - readMode:        0: each Transfer is executed by its source device, otherwise by its destination device
// - numBlocksToUse:  Number of blocks used by GPU-executed Transfers
// - srcMemType / srcIndex: Source device
// - dstMemType / dstIndex: Destination device
// Returns the sum of the bandwidths of the Transfers that were run,
// or 0 for a bidirectional request where source and destination are the same device
double GetPeakBandwidth(EnvVars const& ev,
                        size_t const N,
                        int const isBidirectional,
                        int const readMode,
                        int const numBlocksToUse,
                        MemType const srcMemType,
                        int const srcIndex,
                        MemType const dstMemType,
                        int const dstIndex)
{
  // Skip bidirectional on same device
  bool const isSameDevice = (srcMemType == dstMemType && srcIndex == dstIndex);
  if (isBidirectional && isSameDevice) return 0.0f;

  int const initOffset = ev.byteOffset / sizeof(float);

  // Prepare Transfers
  // Each direction gets its own executor holding exactly one Transfer / stream / event pair
  std::vector<Transfer*> transfers;
  ExecutorInfo exeInfo[2];
  for (int i = 0; i < 2; i++)
  {
    ExecutorInfo& info = exeInfo[i];
    info.transfers.resize(1);
    info.streams.resize(1);
    info.startEvents.resize(1);
    info.stopEvents.resize(1);
    transfers.push_back(&info.transfers[0]);
  }

  // Transfer 0 goes src->dst, Transfer 1 is its mirror image (dst->src)
  Transfer& forward = *transfers[0];
  Transfer& reverse = *transfers[1];

  reverse.dstMemType = srcMemType;
  forward.srcMemType = srcMemType;
  reverse.srcMemType = dstMemType;
  forward.dstMemType = dstMemType;

  int const remappedSrc = RemappedIndex(srcIndex, srcMemType);
  reverse.dstIndex = remappedSrc;
  forward.srcIndex = remappedSrc;

  int const remappedDst = RemappedIndex(dstIndex, dstMemType);
  reverse.srcIndex = remappedDst;
  forward.dstIndex = remappedDst;

  // Either perform (local read + remote write), or (remote read + local write)
  bool const executeOnSource = (readMode == 0);
  forward.exeMemType = executeOnSource ? srcMemType : dstMemType;
  reverse.exeMemType = executeOnSource ? dstMemType : srcMemType;
  forward.exeIndex   = RemappedIndex(executeOnSource ? srcIndex : dstIndex, forward.exeMemType);
  reverse.exeIndex   = RemappedIndex(executeOnSource ? dstIndex : srcIndex, reverse.exeMemType);

  for (int i = 0; i <= isBidirectional; i++)
  {
    Transfer& curr = *transfers[i];

    AllocateMemory(curr.srcMemType, curr.srcIndex,
                   N * sizeof(float) + ev.byteOffset, (void**)&curr.srcMem);
    AllocateMemory(curr.dstMemType, curr.dstIndex,
                   N * sizeof(float) + ev.byteOffset, (void**)&curr.dstMem);

    // Prepare block parameters on CPU
    bool const runsOnGpu = (curr.exeMemType == MEM_GPU);
    curr.numBlocksToUse = runsOnGpu ? numBlocksToUse : ev.numCpuPerTransfer;
    curr.blockParam.resize(curr.numBlocksToUse);
    curr.PrepareBlockParams(ev, N);

    if (curr.exeMemType == MEM_GPU)
    {
      // Copy block parameters onto GPU
      AllocateMemory(MEM_GPU, curr.exeIndex, numBlocksToUse * sizeof(BlockParam),
                     (void **)&curr.blockParamGpuPtr);
      HIP_CALL(hipMemcpy(curr.blockParamGpuPtr,
                         curr.blockParam.data(),
                         numBlocksToUse * sizeof(BlockParam),
                         hipMemcpyHostToDevice));

      // Prepare GPU resources
      HIP_CALL(hipSetDevice(curr.exeIndex));
      HIP_CALL(hipStreamCreate(&exeInfo[i].streams[0]));
      HIP_CALL(hipEventCreate(&exeInfo[i].startEvents[0]));
      HIP_CALL(hipEventCreate(&exeInfo[i].stopEvents[0]));
    }
  }

  std::stack<std::thread> threads;

  // Perform iteration (negative iteration numbers are warmups)
  for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
  {
    // Launch one thread per active direction so that both directions run concurrently
    for (int i = 0; i <= isBidirectional; i++)
      threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo[i]), 0));

    // Wait for all threads to finish
    while (!threads.empty())
    {
      threads.top().join();
      threads.pop();
    }
  }

  // Validate that each Transfer has transferred correctly
  for (int i = 0; i <= isBidirectional; i++)
    CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfers[i]->dstMem + initOffset);

  // Collect aggregate bandwidth
  double totalBandwidth = 0;
  for (int i = 0; i <= isBidirectional; i++)
  {
    // Average time per timed iteration, then bytes / time converted to GB/s
    double transferDurationMsec = transfers[i]->transferTime / (1.0 * ev.numIterations);
    double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
    totalBandwidth += transferBandwidthGbs;
  }

  // Release memory and GPU resources
  for (int i = 0; i <= isBidirectional; i++)
  {
    Transfer& curr = *transfers[i];
    DeallocateMemory(curr.srcMemType, curr.srcMem);
    DeallocateMemory(curr.dstMemType, curr.dstMem);

    if (curr.exeMemType == MEM_GPU)
    {
      DeallocateMemory(MEM_GPU, curr.blockParamGpuPtr);
      HIP_CALL(hipStreamDestroy(exeInfo[i].streams[0]));
      HIP_CALL(hipEventDestroy(exeInfo[i].startEvents[0]));
      HIP_CALL(hipEventDestroy(exeInfo[i].stopEvents[0]));
    }
  }

  return totalBandwidth;
}
|
|
|
|
|
|
2022-04-27 20:43:24 -06:00
|
|
|
// Prepares this Transfer for a run over N elements:
//  - fills the source buffer with the reference pattern
//  - gives every entry of blockParam its src/dst pointers and element count
//  - resets the accumulated transfer time
// ev : environment settings (byteOffset, blockBytes, fill options)
// N  : number of elements to be moved by this Transfer
void Transfer::PrepareBlockParams(EnvVars const& ev, size_t const N)
{
  // Requested byte offset, converted into a count of float elements
  int const elemOffset = ev.byteOffset / sizeof(float);

  // Write the reference pattern into the source buffer
  CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, this->srcMem + elemOffset);

  // Work out which sub-array each block handles for this Transfer
  // - N is split as evenly as possible, while keeping each block's share a multiple of
  //   BLOCK_BYTES bytes (except for the very last one) for alignment reasons
  // NOTE(review): assumes ev.blockBytes >= sizeof(float); a smaller value would make
  //               elemsPerChunk zero and the divisions below invalid - TODO confirm it is validated upstream
  int const elemsPerChunk   = ev.blockBytes / sizeof(float);
  int const usableBlocks    = std::min((N + elemsPerChunk - 1) / elemsPerChunk, this->blockParam.size());

  size_t numAssigned = 0;
  for (int blockIdx = 0; blockIdx < this->blockParam.size(); blockIdx++)
  {
    // Elements not yet handed out, and the same amount rounded up to whole chunks
    size_t const remaining       = N - numAssigned;
    size_t const remainingChunks = (remaining + elemsPerChunk - 1) / elemsPerChunk;

    // Blocks (this one included) that may still receive work; 0 once the usable blocks are exhausted
    int const blocksRemaining = std::max(0, usableBlocks - blockIdx);

    // Blocks beyond the usable count get no elements
    size_t share = 0;
    if (blocksRemaining)
    {
      share = std::min(remaining, ((remainingChunks / blocksRemaining) * elemsPerChunk));
    }

    BlockParam& block = this->blockParam[blockIdx];
    block.N          = share;
    block.src        = this->srcMem + numAssigned + elemOffset;
    block.dst        = this->dstMem + numAssigned + elemOffset;
    block.startCycle = 0;
    block.stopCycle  = 0;

    numAssigned += block.N;
  }

  // Timing is accumulated from zero for each run
  this->transferTime = 0.0;
}
|
|
|
|
|
|
|
|
|
|
// NOTE: This is a stop-gap solution until HIP provides wallclock values
|
|
|
|
|
int GetWallClockRate(int deviceId)
|
|
|
|
|
{
|
|
|
|
|
static std::vector<int> wallClockPerDeviceMhz;
|
|
|
|
|
|
|
|
|
|
if (wallClockPerDeviceMhz.size() == 0)
|
|
|
|
|
{
|
|
|
|
|
int numGpuDevices;
|
|
|
|
|
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
|
|
|
|
wallClockPerDeviceMhz.resize(numGpuDevices);
|
|
|
|
|
|
|
|
|
|
hipDeviceProp_t prop;
|
|
|
|
|
for (int i = 0; i < numGpuDevices; i++)
|
|
|
|
|
{
|
|
|
|
|
HIP_CALL(hipGetDeviceProperties(&prop, i));
|
|
|
|
|
int value = 25000;
|
|
|
|
|
switch (prop.gcnArch)
|
2021-10-21 15:28:16 -06:00
|
|
|
{
|
2022-04-08 15:20:55 -06:00
|
|
|
case 906: case 910: value = 25000; break;
|
|
|
|
|
default:
|
|
|
|
|
printf("Unrecognized GCN arch %d\n", prop.gcnArch);
|
2021-10-21 15:28:16 -06:00
|
|
|
}
|
2022-04-08 15:20:55 -06:00
|
|
|
wallClockPerDeviceMhz[i] = value;
|
|
|
|
|
}
|
2021-10-21 15:28:16 -06:00
|
|
|
}
|
2022-04-08 15:20:55 -06:00
|
|
|
return wallClockPerDeviceMhz[deviceId];
|
2021-10-21 15:28:16 -06:00
|
|
|
}
|