From e1ffd97abdb28cec893c4a8fb95350dc3367cb25 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Wed, 25 Sep 2024 11:53:10 -0400 Subject: [PATCH] kfdtest: Update KFDPerformanceTest.P2PBandWidthTest for CPX mode Currently, KFDPerformanceTest.P2PBandWidthTest cannot work if there are more than 16 KFD nodes in the system. This limit was put in to match the number of SDMA queues supported on a single node. This patch updates the test to make it run on systems with more than 16 KFD nodes. Signed-off-by: Mukul Joshi Change-Id: I561d0cdef664cae84fb9c13a801052e2001256e5 [ROCm/ROCR-Runtime commit: b81e45f03c9506adac3e9b44420623d4c9134a46] --- .../tests/kfdtest/src/KFDPerformanceTest.cpp | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDPerformanceTest.cpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDPerformanceTest.cpp index fc8344228d..aada774a2f 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDPerformanceTest.cpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDPerformanceTest.cpp @@ -68,7 +68,6 @@ static void testNodeToNodes(HSAuint32 n1, const HSAuint32 *const n2Array, int n, P2PDirection n1Direction, P2PDirection n2Direction, HSAuint64 size, HSAuint64 *speed, HSAuint64 *speed2, std::stringstream *msg, bool isTestOverhead = false, HSAuint64 *time = 0) { - ASSERT_GT(16, unsigned(n - 1)); HSAuint32 n2[n]; void *n1Mem, *n2Mem[n]; HsaMemFlags memFlags = {0}; @@ -148,6 +147,7 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) { std::vector nodes; const bool isSpecified = g_TestDstNodeId != -1 && g_TestNodeId != -1; int numPeers = 0; + const unsigned int maxSdmaQueues = m_numSdmaEngines * m_numSdmaQueuesPerEngine; if (isSpecified) { if (g_TestNodeId != g_TestDstNodeId) { @@ -299,15 +299,43 @@ TEST_F(KFDPerformanceTest, P2PBandWidthTest) { if (n < 2) continue; - if (test_suits[s][1] == OUT) + if (test_suits[s][1] == OUT) { snprintf(str, sizeof(str), "[[%d...%d] -> %d] ", dst.front(), dst.back(), n1); - else - snprintf(str, sizeof(str), "[%d -> [%d...%d]] ", n1, dst.front(), dst.back()); - msg << str << std::endl; - testNodeToNodes(n1, n2, n, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg); + msg << str << std::endl; + testNodeToNodes(n1, n2, n, test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg); - LOG() << std::dec << str << (float)speed / 1024 << " - " << + LOG() << std::dec << str << (float)speed / 1024 << " - " << (float)speed2 / 1024 << " GB/s" << std::endl; + } else { + /* If the total number of peers is greater than the number of SDMA queues supported, + * then we test in the following way: + * 1. Test peers in batches where each batch consists of number of peers equal to the + * max number of SDMA queues. + * 2. Keep repeating step 1 if number of peers left is greater than number of SDMA queues + * supported. + * 3. Test the last batch with the remaining peers left which can be less than the number of + * SDMA queues supported. + * For example, if there are 24 peers and max number of SDMA queues supported is 16, then + * the test will test 16 peers/nodes first and then remaining 8 in the next round. + */ + unsigned int j=0; + unsigned int start_index; + unsigned int end_index; + do { + start_index = maxSdmaQueues * j++; + end_index = start_index + maxSdmaQueues - 1; + + if (end_index + 1 > n) + end_index = n - 1; + + snprintf(str, sizeof(str), "[%d -> [%d...%d]] ", n1, n2[start_index], n2[end_index]); + msg << str << std::endl; + testNodeToNodes(n1, &n2[start_index], end_index - start_index + 1, + test_suits[s][0], test_suits[s][1], size, &speed, &speed2, &msg); + LOG() << std::dec << str << (float)speed / 1024 << " - " << + (float)speed2 / 1024 << " GB/s" << std::endl; + } while(end_index < (n - 1)); + } } }