Files
rocm-systems/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDPerformanceTest.cpp
T
Mukul Joshi e1ffd97abd kfdtest: Update KFDPerformanceTest.P2PBandWidthTest for CPX mode
Currently, KFDPerformanceTest.P2PBandWidthTest cannot work if there are
more than 16 KFD nodes in the system. This limit was put in to match the
number of SDMA queues supported on a single node.
This patch updates the test to make it run on systems with more than
16 KFD nodes.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Change-Id: I561d0cdef664cae84fb9c13a801052e2001256e5


[ROCm/ROCR-Runtime commit: b81e45f03c]
2024-09-30 11:28:33 -04:00

416 lines
15 KiB
C++

/*
* Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <sys/time.h>
#include <vector>
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAPacket.hpp"
#include "SDMAQueue.hpp"
#include "AqlQueue.hpp"
#include "KFDTestUtilQueue.hpp"
#include <algorithm>
#include <gtest/gtest.h>
#include "KFDBaseComponentTest.hpp"
class KFDPerformanceTest: public KFDBaseComponentTest {
protected:
virtual void SetUp();
virtual void TearDown();
};
void KFDPerformanceTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
ROUTINE_END
}
void KFDPerformanceTest::TearDown() {
ROUTINE_START
KFDBaseComponentTest::TearDown();
ROUTINE_END
}
enum P2PDirection {
IN = 1,
OUT = 2,
IN_OUT = 3,
NONE = 4,
};
/*
* Do the copy of one GPU from & to multiple GPUs.
*/
static void
testNodeToNodes(HSAuint32 n1, const HSAuint32 *const n2Array, int n, P2PDirection n1Direction,
P2PDirection n2Direction, HSAuint64 size, HSAuint64 *speed, HSAuint64 *speed2, std::stringstream *msg,
bool isTestOverhead = false, HSAuint64 *time = 0) {
HSAuint32 n2[n];
void *n1Mem, *n2Mem[n];
HsaMemFlags memFlags = {0};
memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
memFlags.ui32.HostAccess = 0;
memFlags.ui32.NonPaged = 1;
SDMACopyParams array[n * 4];
int array_count = 0;
HSAuint64 alloc_size = ALIGN_UP(size, PAGE_SIZE);
std::vector<SDMACopyParams> copyArray;
int i;
ASSERT_SUCCESS(hsaKmtAllocMemory(n1, alloc_size, memFlags, &n1Mem));
ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(n1Mem, alloc_size, NULL));
for (i = 0; i < n; i++) {
n2[i] = n2Array[i];
ASSERT_SUCCESS(hsaKmtAllocMemory(n2[i], alloc_size, memFlags, &n2Mem[i]));
ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(n2Mem[i], alloc_size, NULL));
}
for (i = 0; i < n; i++) {
if (n1Direction != NONE)
ASSERT_NE(n1, 0);
if (n2Direction != NONE)
ASSERT_NE(n2[i], 0);
do {
if (n1Direction == IN || n1Direction == IN_OUT)
/* n2Mem -> n1Mem*/
array[array_count++] = {n1, n2Mem[i], n1Mem, size, n1/*group id, just a hint*/};
if (n1Direction == OUT || n1Direction == IN_OUT)
/* n1Mem -> n2Mem*/
array[array_count++] = {n1, n1Mem, n2Mem[i], size, n1};
/* Issue two copies to make full use of sdma.*/
} while (n1Direction < IN_OUT && n == 1 && array_count % 2);
/* Do nothing if no IN or OUT specified.*/
do {
if (n2Direction == IN || n2Direction == IN_OUT)
/* n1Mem -> n2Mem*/
array[array_count++] = {n2[i], n1Mem, n2Mem[i], size, n2[i]};
if (n2Direction == OUT || n2Direction == IN_OUT)
/* n2Mem -> n1Mem*/
array[array_count++] = {n2[i], n2Mem[i], n1Mem, size, n2[i]};
} while (n2Direction < IN_OUT && array_count % 2);
}
/* We measure a bunch of packets.*/
if (isTestOverhead) {
for (i = 0; i < 1000; i++)
for (int j = 0; j < array_count; j++)
copyArray.push_back(array[j]);
sdma_multicopy(copyArray, 1, HEAD_TAIL);
*time = CounterToNanoSec(copyArray[0].timeConsumption / (1000 * array_count));
} else
/* It did not respect the group id we set above.*/
sdma_multicopy(array, array_count, speed, speed2, msg);
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(n1Mem));
EXPECT_SUCCESS(hsaKmtFreeMemory(n1Mem, alloc_size));
for (i = 0; i < n; i++) {
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(n2Mem[i]));
EXPECT_SUCCESS(hsaKmtFreeMemory(n2Mem[i], alloc_size));
}
}
TEST_F(KFDPerformanceTest, P2PBandWidthTest) {
TEST_START(TESTPROFILE_RUNALL);
if (!hsakmt_is_dgpu()) {
LOG() << "Skipping test: Can't have 2 APUs on the same system." << std::endl;
return;
}
const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
std::vector<int> nodes;
const bool isSpecified = g_TestDstNodeId != -1 && g_TestNodeId != -1;
int numPeers = 0;
const unsigned int maxSdmaQueues = m_numSdmaEngines * m_numSdmaQueuesPerEngine;
if (isSpecified) {
if (g_TestNodeId != g_TestDstNodeId) {
nodes.push_back(g_TestNodeId);
nodes.push_back(g_TestDstNodeId);
if ((m_NodeInfo.IsPeerAccessibleByNode(g_TestNodeId, g_TestDstNodeId) &&
m_NodeInfo.IsPeerAccessibleByNode(g_TestDstNodeId, g_TestNodeId)))
numPeers = 2;
}
} else {
nodes = m_NodeInfo.GetNodesWithGPU();
numPeers = nodes.size();
}
if (numPeers < 2) {
LOG() << "Skipping test: Need at least two large bar GPU or XGMI connected." << std::endl;
return;
}
g_TestTimeOut *= numPeers;
std::vector<int> sysNodes(nodes); // include sysMem node 0...
sysNodes.insert(sysNodes.begin(),0);
const int total_tests = 7;
const char *test_suits_string[total_tests] = {
"Copy from node to node by [push, NONE]",
"Copy from node to node by [pull, NONE]",
"Full duplex copy from node to node by [push|pull, NONE]",
"Full duplex copy from node to node by [push, push]",
"Full duplex copy from node to node by [pull, pull]",
"Copy from node to multiple nodes by [push, NONE]",
"Copy from multiple nodes to node by [push, NONE]",
};
const P2PDirection test_suits[total_tests][2] = {
/* One node used.*/
{OUT, NONE},
{IN, NONE},
{IN_OUT,NONE},
/* two nodes used.*/
{OUT, OUT},
{IN, IN},
/* Multi nodes used*/
{OUT, NONE},
{NONE, OUT},
};
const int twoNodesIdx = 3;
const int multiNodesIdx = 5;
const HSAuint32 size = 32ULL << 20;
int s = 0; //test index;
std::stringstream msg;
char str[64];
if (isSpecified) {
HSAuint32 n1 = g_TestNodeId;
HSAuint32 n2 = g_TestDstNodeId;
HSAuint64 speed, speed2;
LOG() << "Copy from node to node by [push, pull]" << std::endl;
snprintf(str, sizeof(str), "[%d -> %d] ", n1, n2);
testNodeToNodes(n1, &n2, 1, OUT, IN, size, &speed, &speed2, &msg);
LOG() << std::dec << str << (float)speed / 1024 << " - " <<
(float)speed2 / 1024 << " GB/s" << std::endl;
goto exit;
}
for (; s < twoNodesIdx; s++) {
LOG() << test_suits_string[s] << std::endl;
msg << test_suits_string[s] << std::endl;
for (unsigned i = 0; i < nodes.size(); i++) {
/* Src node is a GPU.*/
HSAuint32 n1 = nodes[i];
HSAuint64 speed, speed2;
/* Pick up dst node which can be sysMem.*/
for (unsigned j = 0; j < sysNodes.size(); j++) {
HSAuint32 n2 = sysNodes[j];
if (n1 == n2)
continue;
if (!m_NodeInfo.IsPeerAccessibleByNode(n2, n1))
continue;
snprintf(str, sizeof(str), "[%d -> %d] ", n1, n2);
msg << str << std::endl;
testNodeToNodes(n1, &n2, 1, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg);
LOG() << std::dec << str << (float)speed / 1024 << " - " <<
(float)speed2 / 1024 << " GB/s" << std::endl;
}
}
}
for (; s < multiNodesIdx; s++) {
LOG() << test_suits_string[s] << std::endl;
msg << test_suits_string[s] << std::endl;
for (unsigned i = 0; i < nodes.size(); i++) {
HSAuint32 n1 = nodes[i];
HSAuint64 speed, speed2;
for (unsigned j = i + 1; j < nodes.size(); j++) {
HSAuint32 n2 = nodes[j];
if (!m_NodeInfo.IsPeerAccessibleByNode(n2, n1) ||
!m_NodeInfo.IsPeerAccessibleByNode(n1, n2))
continue;
snprintf(str, sizeof(str), "[%d <-> %d] ", n1, n2);
msg << str << std::endl;
testNodeToNodes(n1, &n2, 1, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg);
LOG() << std::dec << str << (float)speed / 1024 << " - " <<
(float)speed2 / 1024 << " GB/s" << std::endl;
}
}
}
for (; s < total_tests && !isSpecified; s++) {
LOG() << test_suits_string[s] << std::endl;
msg << test_suits_string[s] << std::endl;
/* Just use GPU nodes to do copy.*/
std::vector<int> &src = test_suits[s][0] != NONE ? nodes : sysNodes;
std::vector<int> &dst = test_suits[s][1] != NONE ? nodes : sysNodes;
for (unsigned i = 0; i < src.size(); i++) {
HSAuint32 n1 = src[i];
HSAuint64 speed, speed2;
HSAuint32 n2[dst.size()];
int n = 0;
char str[64];
for (unsigned j = 0; j < dst.size(); j++) {
if (dst[j] != n1) {
if (test_suits[s][0] != NONE &&
!m_NodeInfo.IsPeerAccessibleByNode(dst[j], n1))
continue;
if (test_suits[s][1] != NONE &&
!m_NodeInfo.IsPeerAccessibleByNode(n1, dst[j]))
continue;
n2[n++] = dst[j];
}
}
/* At least 2 dst GPUs.*/
if (n < 2)
continue;
if (test_suits[s][1] == OUT) {
snprintf(str, sizeof(str), "[[%d...%d] -> %d] ", dst.front(), dst.back(), n1);
msg << str << std::endl;
testNodeToNodes(n1, n2, n, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg);
LOG() << std::dec << str << (float)speed / 1024 << " - " <<
(float)speed2 / 1024 << " GB/s" << std::endl;
} else {
/* If the total number of peers is greater than the number of SDMA queues supported,
* then we test in the following way:
* 1. Test peers in batches where each batch consists of number of peers equal to the
* max number of SDMA queues.
* 2. Keep repeating step 1 if number of peers left is greater than number of SDMA queues
* supported.
* 3. Test the last batch with the remaining peers left which can be less than the number of
* SDMA queues supported.
* For example, if there are 24 peers and max number of SDMA queues supported is 16, then
* the test will test 16 peers/nodes first and then remaining 8 in the next round.
*/
unsigned int j=0;
unsigned int start_index;
unsigned int end_index;
do {
start_index = maxSdmaQueues * j++;
end_index = start_index + maxSdmaQueues - 1;
if (end_index + 1 > n)
end_index = n - 1;
snprintf(str, sizeof(str), "[%d -> [%d...%d]] ", n1, n2[start_index], n2[end_index]);
msg << str << std::endl;
testNodeToNodes(n1, &n2[start_index], end_index - start_index + 1,
test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg);
LOG() << std::dec << str << (float)speed / 1024 << " - " <<
(float)speed2 / 1024 << " GB/s" << std::endl;
} while(end_index < (n - 1));
}
}
}
g_TestTimeOut /= numPeers;
exit:
/* New line.*/
LOG() << std::endl << msg.str() << std::endl;
TEST_END
}
TEST_F(KFDPerformanceTest, P2POverheadTest) {
TEST_START(TESTPROFILE_RUNALL);
if (!hsakmt_is_dgpu()) {
LOG() << "Skipping test: Can't have 2 APUs on the same system." << std::endl;
return;
}
const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
std::vector<int> nodes;
nodes = m_NodeInfo.GetNodesWithGPU();
int numPeers = nodes.size();
if (numPeers < 2) {
LOG() << "Skipping test: Need at least two large bar GPU or XGMI connected." << std::endl;
return;
}
std::vector<int> sysNodes(nodes); // include sysMem node 0...
sysNodes.insert(sysNodes.begin(),0);
/* size should be small.*/
const HSAuint32 sizeArray[] = {4, 8, 16, 64, 256, 1024};
const int total_tests = 3;
const char *test_suits_string[total_tests] = {
"[push] ",
"[pull] ",
"[push|pull]",
};
const P2PDirection test_suits[total_tests] = {OUT, IN, IN_OUT};
std::stringstream msg;
int s; //test index;
msg << "Test (avg. ns) | Size";
for (auto &size : sizeArray)
msg << "\t" << size;
LOG() << msg.str() << std::endl;
LOG() << "-----------------------------------------------------------------------" << std::endl;
for (s = 0; s < total_tests; s++) {
for (unsigned i = 0; i < nodes.size(); i++) {
/* Src node is a GPU.*/
HSAuint32 n1 = nodes[i];
HSAuint64 time;
/* Pick up dst node which can be sysMem.*/
for (unsigned j = 0; j < sysNodes.size(); j++) {
HSAuint32 n2 = sysNodes[j];
std::stringstream msg;
if (n1 != n2 && !m_NodeInfo.IsPeerAccessibleByNode(n2, n1))
continue;
msg << test_suits_string[s] << "[" << n1 << " -> " << n2 << "]";
for (auto &size : sizeArray) {
testNodeToNodes(n1, &n2, 1, test_suits[s], NONE, size, 0, 0, 0, 1, &time);
msg << "\t" << time;
}
LOG() << msg.str() << std::endl;
}
}
}
TEST_END
}