kfdtest: Enable GPU selection via CLI for multi-GPU tests (#245)

* kfdtest: Enable GPU selection via CLI for multi-GPU tests

Replaced environment variable-based GPU selection with
GPU selection via the command-line parameter --concurrentnodes (-c).
Modified g_TestGPUsNum to be passed in via the command-line
parameter --testnodenum (-t).

Signed-off-by: Alysa Liu <Alysa.Liu@amd.com>

* kfdtest: Enable GPU selection via CLI for multi-GPU tests
Replaced environment variable-based GPU selection with
GPU selection via the command-line parameter --concurrentnodes (-c).
Modified g_TestGPUsNum to be passed in via the command-line
parameter --testnodenum (-t).

---------

Signed-off-by: Alysa Liu <Alysa.Liu@amd.com>
Co-authored-by: Alysa Liu <Alysa.Liu@amd.com>
This commit is contained in:
systems-assistant[bot]
2025-11-03 09:27:38 -05:00
committed by GitHub
parent bb5fd1d4ae
commit 740b27528f
6 changed files with 138 additions and 34 deletions
@@ -78,6 +78,8 @@ fi
# Script option defaults; every option starts out as the empty string.
PLATFORM=""
GDB=""
# Set from -n / --node by the argument-parsing loop
NODE=""
# Set from -c / --concurrentnodes by the argument-parsing loop
CONCURRENTNODES=""
# Set from -t / --testnodenum by the argument-parsing loop
TESTNODENUM=""
# Set to "true" when --high is given
FORCE_HIGH=""
RUN_IN_DOCKER=""
ADDITIONAL_EXCLUDE=""
@@ -95,6 +97,14 @@ printUsage() {
"quoted, space-separated string as an argument"\
"(e.g. -n 1 OR -n \"1 2 3\")"\
"NOTE: Node numbers come from /sys/class/kfd/kfd/topology/nodes/#"
echo " -c , --concurrentnodes Concurrent nodes string for multi-gpu testing."\
"Takes a string comma-separated as an argument"\
"(e.g. -c \"1,2,3\" or --concurrentnodes \"1,2,3\")"\
"use -c \"all\" or --concurrentnodes \"all\" to test on all available nodes"\
"NOTE: Node numbers come from /sys/class/kfd/kfd/topology/nodes/#"
echo " -t , --testnodenum Number of concurrent nodes for multi-gpu testing."\
"Takes an integer as argument"\
"(e.g. -t 2 or --testnodenum 2)"
echo " -l , --list List available nodes"
echo " --high Force clocks to high for test execution"
echo " -d , --docker Run in docker container"
@@ -241,10 +251,15 @@ runKfdTest() {
fi
sudo docker rm kfdtest_docker
else
if [ "$HSA_TEST_GPUS_NUM" != "" ]; then
echo "++++ Starting parallel testing on $HSA_TEST_GPUS_NUM gpu(s) ++++"
$GDB $KFDTEST $gtestFilter $GTEST_ARGS
echo "++++ Finished parallel testing on $HSA_TEST_GPUS_NUM gpu(s) ++++"
if [ -n "$CONCURRENTNODES" ]; then
echo "++++ Starting parallel testing on node(s) $CONCURRENTNODES ++++"
$GDB $KFDTEST "--concurrentnodes=$CONCURRENTNODES" $gtestFilter $GTEST_ARGS
echo "++++ Finished parallel testing on node(s) $CONCURRENTNODES ++++"
exit 0;
elif [ -n "$TESTNODENUM" ]; then
echo "++++ Starting parallel testing on $TESTNODENUM node(s) ++++"
$GDB $KFDTEST "--testnodenum=$TESTNODENUM" $gtestFilter $GTEST_ARGS
echo "++++ Finished parallel testing on $TESTNODENUM node(s) ++++"
exit 0;
else
echo ""
@@ -278,6 +293,10 @@ while [ "$1" != "" ]; do
printGpuNodelist; exit 0 ;;
-n | --node )
shift 1; NODE=$1 ;;
-c | --concurrentnodes )
shift 1; CONCURRENTNODES="$1" ;;
-t | --testnodenum )
shift 1; TESTNODENUM="$1" ;;
--high)
FORCE_HIGH="true" ;;
-d | --docker )
@@ -292,6 +311,23 @@ while [ "$1" != "" ]; do
shift 1
done
# Resolve and validate the --concurrentnodes (-c) selection.
#   not specified -> nothing to do; node discovery is skipped
#   "all"         -> replaced by a comma-separated list of every node
#                    reported by getHsaNodes (error if there are none)
#   "1,2,3"       -> every entry must be a plain number that appears in the
#                    list reported by getHsaNodes, otherwise exit with status 1
if [ -n "$CONCURRENTNODES" ]; then
    validNodes=$(getHsaNodes)
    # Intentionally unquoted: split the whitespace-separated node list
    validNodesArray=($validNodes)

    if [ "$CONCURRENTNODES" == "all" ]; then
        # Without this check an empty node list would leave CONCURRENTNODES
        # empty and the run would silently fall back to the default path.
        if [ ${#validNodesArray[@]} -eq 0 ]; then
            echo "Error: --concurrentnodes all was requested but no valid nodes were found."
            exit 1
        fi
        # Join the array with commas; IFS is only changed inside the subshell
        CONCURRENTNODES=$(IFS=','; echo "${validNodesArray[*]}")
    else
        IFS=',' read -ra concurrentNodesArray <<< "$CONCURRENTNODES"
        for concurrentNode in "${concurrentNodesArray[@]}"; do
            # The numeric check rejects empty entries and entries such as
            # "1 2", which the substring match below would otherwise accept.
            if [[ ! "$concurrentNode" =~ ^[0-9]+$ ]] ||
               [[ ! " ${validNodesArray[*]} " =~ " $concurrentNode " ]]; then
                echo "Error: Invalid node $concurrentNode specified in --concurrentnodes."
                echo "Valid nodes are: $validNodes"
                exit 1
            fi
        done
    fi
fi
# If the SMI is missing, try to find it
SMI="$(find /opt/rocm* -type l -name rocm-smi 2>/dev/null | tail -1)"
if [ -z ${SMI} ]; then
@@ -27,6 +27,9 @@
#include "KFDTestUtil.hpp"
extern unsigned int g_TestGPUsNum;
extern int g_TestNodeId;
extern std::vector<int> g_SelectedNodes;
extern std::string g_ConcurrentNodes;
// Test-case level setup for KFDBaseComponentTest. The body is empty, so no
// state is prepared here.
// NOTE(review): presumably the GoogleTest per-test-case hook — confirm against the class declaration.
void KFDBaseComponentTest::SetUpTestCase() {
}
@@ -88,24 +91,62 @@ void KFDBaseComponentTest::SetUp() {
&m_numSdmaXgmiEngines_GPU[i], &m_numSdmaQueuesPerEngine_GPU[i]);
}
if (!g_ConcurrentNodes.empty()) {
std::set<int> uniqueIndices;
size_t start = 0, end = 0;
while ((end = g_ConcurrentNodes.find(',', start)) != std::string::npos) {
std::string token = g_ConcurrentNodes.substr(start, end - start);
if (!token.empty()) {
int node = std::stoi(token);
if (std::find(gpuNodes.begin(), gpuNodes.end(), node) != gpuNodes.end())
uniqueIndices.insert(node);
else
LOG() << "Node " << node << " is not a GPU node. Skipping." << std::endl;
}
start = end + 1;
}
if (start < g_ConcurrentNodes.size()) {
int node = std::stoi(g_ConcurrentNodes.substr(start));
if (std::find(gpuNodes.begin(), gpuNodes.end(), node) != gpuNodes.end()) {
uniqueIndices.insert(node);
} else {
LOG() << "Node " << node << " is not a GPU node. Skipping." << std::endl;
}
}
g_SelectedNodes.assign(uniqueIndices.begin(), uniqueIndices.end());
g_TestGPUsNum = static_cast<unsigned int>(g_SelectedNodes.size());
} else if (g_TestGPUsNum > 0) {
g_SelectedNodes = gpuNodes;
}
/* adjust g_TestGPUsNum not above MAX_GPU and gpu number at system */
g_TestGPUsNum = std::min(g_TestGPUsNum, (unsigned int)gpuNodes.size());
g_TestGPUsNum = (g_TestGPUsNum <= MAX_GPU) ? g_TestGPUsNum : MAX_GPU;
if (!g_SelectedNodes.empty())
g_SelectedNodes.resize(g_TestGPUsNum);
const testing::TestInfo* curr_test_info =
::testing::UnitTest::GetInstance()->current_test_info();
openlog("KFDTEST", LOG_CONS , LOG_USER);
if (g_TestGPUsNum == 1)
if (g_TestGPUsNum == 1) {
syslog(LOG_INFO, "[Test on Node#%03d] "
"STARTED ========== %s.%s ==========",
m_NodeInfo.HsaDefaultGPUNode(),
g_SelectedNodes.empty() ?
m_NodeInfo.HsaDefaultGPUNode() : g_SelectedNodes[0],
curr_test_info->test_case_name(), curr_test_info->name());
else
} else {
syslog(LOG_INFO, "[Test on %03d Node(s)] "
"STARTED ========== %s.%s ==========",
g_TestGPUsNum,
curr_test_info->test_case_name(), curr_test_info->name());
}
ROUTINE_END
}
@@ -144,7 +185,8 @@ void KFDBaseComponentTest::TearDown() {
if (g_TestGPUsNum == 1)
syslog(LOG_INFO, "[Test on Node#%03d] PASSED"
" ========== %s.%s ==========",
m_NodeInfo.HsaDefaultGPUNode(),
g_SelectedNodes.empty() ?
m_NodeInfo.HsaDefaultGPUNode() : g_SelectedNodes[0],
curr_test_info->test_case_name(), curr_test_info->name());
else
syslog(LOG_INFO, "[Tested on %03d Node(s)] PASSED"
@@ -154,12 +196,13 @@ void KFDBaseComponentTest::TearDown() {
else
if (g_TestGPUsNum == 1)
syslog(LOG_WARNING, "[Test on Node#%03d] FAILED"
syslog(LOG_WARNING, "[Test on Node#%03d] FAILED"
" ========== %s.%s ==========",
m_NodeInfo.HsaDefaultGPUNode(),
g_SelectedNodes.empty() ?
m_NodeInfo.HsaDefaultGPUNode() : g_SelectedNodes[0],
curr_test_info->test_case_name(), curr_test_info->name());
else
syslog(LOG_WARNING, "[Test on %03d Node(s)] FAILED"
syslog(LOG_WARNING, "[Test on %03d Node(s)] FAILED"
" ========== %s.%s ==========",
g_TestGPUsNum,
curr_test_info->test_case_name(), curr_test_info->name());
@@ -336,19 +379,20 @@ static void* KFDTest_GPU(void* ptr) {
}
HSAKMT_STATUS KFDBaseComponentTest::KFDTestMultiGPU(Test_Function test_function,
unsigned int gpu_num) {
const std::vector<int>& gpuNodes,
unsigned int gpu_num) {
HSAKMT_STATUS r = HSAKMT_STATUS_SUCCESS;
int gpu_node;
int err = 0;
int i, j;
if (gpuNodes.empty())
return HSAKMT_STATUS_SUCCESS;
KFDTEST_GPUPARAMETERS kfdtest_GpuParameters[gpu_num];
KFDTEST_PARAMETERS kfdTest_Parameters[gpu_num];
pthread_t pThreadGPU[gpu_num];
const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
for (i = 0; i < gpu_num; i++) {
gpu_node = gpuNodes.at(i);
@@ -386,7 +430,7 @@ err_out:
HSAKMT_STATUS KFDBaseComponentTest::KFDTest_Launch(Test_Function test_function) {
/* test on default GPU only */
if (g_TestGPUsNum == 1) {
if (g_TestNodeId >= 0) {
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
if (defaultGPUNode < 0) {
LOG() << "defaultGPUNode is invalid." << defaultGPUNode <<std::endl;
@@ -405,9 +449,9 @@ HSAKMT_STATUS KFDBaseComponentTest::KFDTest_Launch(Test_Function test_function)
return HSAKMT_STATUS_SUCCESS;
}
/* run test_function on all available GPUs */
/* run test_function on all selected GPUs */
HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
err = KFDTestMultiGPU(test_function, g_TestGPUsNum);
err = KFDTestMultiGPU(test_function, g_SelectedNodes, g_TestGPUsNum);
return err;
}
@@ -31,6 +31,7 @@
#include <amdgpu.h>
#include <amdgpu_drm.h>
#include <sys/param.h>
#include <algorithm>
#include "hsakmt/hsakmt.h"
#include "OSWrapper.hpp"
#include "KFDTestUtil.hpp"
@@ -105,8 +106,9 @@ class KFDBaseComponentTest : public testing::Test {
return m_numSdmaXgmiEngines_GPU[gpuIndex];
}
HSAKMT_STATUS KFDTestMultiGPU(Test_Function test_function,
unsigned int gpu_num);
HSAKMT_STATUS KFDTestMultiGPU(Test_Function test_function,
const std::vector<int>& gpu_indices,
unsigned int gpu_num);
HSAKMT_STATUS KFDTest_Launch(Test_Function test_function);
@@ -29,6 +29,7 @@
#include "Assemble.hpp"
#define KFD_TEST_DEFAULT_TIMEOUT 60000
#define MAX_GPU 64
std::ostream& operator << (std::ostream& out, TESTPROFILE profile) {
switch (profile) {
@@ -48,16 +49,18 @@ std::ostream& operator << (std::ostream& out, TESTPROFILE profile) {
return out;
}
unsigned int g_TestGPUsNum ;
unsigned int g_TestGPUsNum = 0;
unsigned int g_TestRunProfile;
unsigned int g_TestENVCaps;
unsigned int g_TestTimeOut;
int g_TestNodeId;
int g_TestNodeId = -1;
int g_TestDstNodeId;
bool g_IsChildProcess;
bool g_IsEmuMode;
unsigned int g_SleepTime;
unsigned int g_TestGPUFamilyId;
std::string g_ConcurrentNodes = "";
std::vector<int> g_SelectedNodes;
class KFDBaseComponentTest *g_baseTest;
GTEST_API_ int main(int argc, char **argv) {
@@ -69,7 +72,6 @@ GTEST_API_ int main(int argc, char **argv) {
testing::InitGoogleTest(&argc, argv);
CommandLineArguments args;
memset(&args, 0, sizeof(args));
bool success = GetCommandLineArguments(argc, argv, args);
@@ -90,10 +92,18 @@ GTEST_API_ int main(int argc, char **argv) {
g_SleepTime = args.SleepTime;
}
// If --node is not specified, then args.NodeId == -1
g_TestNodeId = args.NodeId;
g_TestDstNodeId = args.DstNodeId;
// If --node is not specified, then args.NodeId == -1
if (!args.ConcurrentNodes.empty()) {
g_ConcurrentNodes = args.ConcurrentNodes;
} else if (args.TestNodeNum > 0) {
g_TestGPUsNum = args.TestNodeNum;
} else {
g_TestNodeId = args.NodeId;
g_TestGPUsNum = 1;
}
g_IsEmuMode = CheckEmuModeEnabled();
LOG() << "Profile: " << (TESTPROFILE)g_TestRunProfile << std::endl;
@@ -107,14 +117,6 @@ GTEST_API_ int main(int argc, char **argv) {
LOG() << "Sleep time in seconds as specified by user: " << std::dec << g_SleepTime << std::endl;
}
char *testGPUsNum = NULL;
/* if HSA_TEST_GPUS_NUM is defined use it, otherwise test on 1 gpu */
testGPUsNum = getenv("HSA_TEST_GPUS_NUM");
if (testGPUsNum)
g_TestGPUsNum = std::max(1, atoi(testGPUsNum));
else
g_TestGPUsNum = 1;
/* init LLVM one time*/
Init_LLVM();
@@ -110,6 +110,8 @@ void ComandLineArgumentsUsage() {
printf("\t--timeout arg\t - Time Out\n");
printf("\t--dst_node\t - For testing multiple nodes");
printf("\t--sleep_time\t - For testing CRIU, etc");
printf("\t--concurrentnodes arg\t - Concurrent nodes string\n");
printf("\t--testnodenum arg\t - Number of concurrent nodes\n");
}
bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs) {
@@ -125,6 +127,8 @@ bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs)
{ "node", required_argument, 0, 0 },
{ "dst_node", required_argument, 0, 0 },
{ "sleep_time", required_argument, 0, 0 },
{ "concurrentnodes", required_argument, 0, 0 },
{ "testnodenum", required_argument, 0, 0 },
{ 0, 0, 0, 0 }
};
@@ -135,6 +139,8 @@ bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs)
rArgs.NodeId = -1;
rArgs.DstNodeId = -1;
rArgs.SleepTime = 0;
rArgs.ConcurrentNodes = "";
rArgs.TestNodeNum = 0;
while (true) {
int c = getopt_long(argc, argv, "", long_options, &option_index);
@@ -211,6 +217,18 @@ bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs)
rArgs.SleepTime = sleepTime;
}
break;
case 7:
{
rArgs.ConcurrentNodes = optarg;
}
break;
case 8:
{
int testNodeNum = atoi(optarg);
if (testNodeNum > 0)
rArgs.TestNodeNum = testNodeNum;
}
break;
}
}
@@ -70,6 +70,8 @@ struct CommandLineArguments {
int DstNodeId;
/* Time in units of seconds */
unsigned int SleepTime;
std::string ConcurrentNodes;
unsigned int TestNodeNum;
};
// It is either MEM_NONE or the bitwise OR of one or more of the following flags