kfdtest: Enable GPU selection via CLI for multi-GPU tests (#245)
* kfdtest: Enable GPU selection via CLI for multi-GPU tests. Replaced environment-variable-based GPU selection with GPU selection via the command-line parameter --concurrentnodes (-c). Modified g_TestGPUsNum to be passed in via the command-line parameter --testnodenum (-t). Signed-off-by: Alysa Liu <Alysa.Liu@amd.com> * kfdtest: Enable GPU selection via CLI for multi-GPU tests. Replaced environment-variable-based GPU selection with GPU selection via the command-line parameter --concurrentnodes (-c). Modified g_TestGPUsNum to be passed in via the command-line parameter --testnodenum (-t). --------- Signed-off-by: Alysa Liu <Alysa.Liu@amd.com> Co-authored-by: Alysa Liu <Alysa.Liu@amd.com>
This commit is contained in:
committato da
GitHub
parent
bb5fd1d4ae
commit
740b27528f
@@ -78,6 +78,8 @@ fi
|
||||
PLATFORM=""
|
||||
GDB=""
|
||||
NODE=""
|
||||
CONCURRENTNODES=""
|
||||
TESTNODENUM=""
|
||||
FORCE_HIGH=""
|
||||
RUN_IN_DOCKER=""
|
||||
ADDITIONAL_EXCLUDE=""
|
||||
@@ -95,6 +97,14 @@ printUsage() {
|
||||
"quoted, space-separated string as an argument"\
|
||||
"(e.g. -n 1 OR -n \"1 2 3\")"\
|
||||
"NOTE: Node numbers come from /sys/class/kfd/kfd/topology/nodes/#"
|
||||
echo " -c , --concurrentnodes Concurrent nodes string for multi-gpu testing."\
|
||||
"Takes a string comma-separated as an argument"\
|
||||
"(e.g. -c \"1,2,3\" or --concurrentnodes \"1,2,3\")"\
|
||||
"use -c \"all\" or --concurrentnodes \"all\" to test on all available nodes"\
|
||||
"NOTE: Node numbers come from /sys/class/kfd/kfd/topology/nodes/#"
|
||||
echo " -t , --testnodenum Number of concurrent nodes for multi-gpu testing."\
|
||||
"Takes an integer as argument"\
|
||||
"(e.g. -t 2 or --testnodenum 2)"
|
||||
echo " -l , --list List available nodes"
|
||||
echo " --high Force clocks to high for test execution"
|
||||
echo " -d , --docker Run in docker container"
|
||||
@@ -241,10 +251,15 @@ runKfdTest() {
|
||||
fi
|
||||
sudo docker rm kfdtest_docker
|
||||
else
|
||||
if [ "$HSA_TEST_GPUS_NUM" != "" ]; then
|
||||
echo "++++ Starting parallel testing on $HSA_TEST_GPUS_NUM gpu(s) ++++"
|
||||
$GDB $KFDTEST $gtestFilter $GTEST_ARGS
|
||||
echo "++++ Finished parallel testing on $HSA_TEST_GPUS_NUM gpu(s) ++++"
|
||||
if [ -n "$CONCURRENTNODES" ]; then
|
||||
echo "++++ Starting parallel testing on node(s) $CONCURRENTNODES ++++"
|
||||
$GDB $KFDTEST "--concurrentnodes=$CONCURRENTNODES" $gtestFilter $GTEST_ARGS
|
||||
echo "++++ Finished parallel testing on node(s) $CONCURRENTNODES ++++"
|
||||
exit 0;
|
||||
elif [ -n "$TESTNODENUM" ]; then
|
||||
echo "++++ Starting parallel testing on $TESTNODENUM node(s) ++++"
|
||||
$GDB $KFDTEST "--testnodenum=$TESTNODENUM" $gtestFilter $GTEST_ARGS
|
||||
echo "++++ Finished parallel testing on $TESTNODENUM node(s) ++++"
|
||||
exit 0;
|
||||
else
|
||||
echo ""
|
||||
@@ -278,6 +293,10 @@ while [ "$1" != "" ]; do
|
||||
printGpuNodelist; exit 0 ;;
|
||||
-n | --node )
|
||||
shift 1; NODE=$1 ;;
|
||||
-c | --concurrentnodes )
|
||||
shift 1; CONCURRENTNODES="$1" ;;
|
||||
-t | --testnodenum )
|
||||
shift 1; TESTNODENUM="$1" ;;
|
||||
--high)
|
||||
FORCE_HIGH="true" ;;
|
||||
-d | --docker )
|
||||
@@ -292,6 +311,23 @@ while [ "$1" != "" ]; do
|
||||
shift 1
|
||||
done
|
||||
|
||||
if [ "$CONCURRENTNODES" == "all" ]; then
|
||||
validNodes=$(getHsaNodes)
|
||||
CONCURRENTNODES=$(echo $validNodes | tr ' ' ',')
|
||||
else
|
||||
validNodes=$(getHsaNodes)
|
||||
validNodesArray=($validNodes)
|
||||
IFS=',' read -ra concurrentNodesArray <<< "$CONCURRENTNODES"
|
||||
|
||||
for concurrentNode in "${concurrentNodesArray[@]}"; do
|
||||
if [[ ! " ${validNodesArray[@]} " =~ " $concurrentNode " ]]; then
|
||||
echo "Error: Invalid node $concurrentNode specified in --concurrentnodes."
|
||||
echo "Valid nodes are: $validNodes"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# If the SMI is missing, try to find it
|
||||
SMI="$(find /opt/rocm* -type l -name rocm-smi 2>/dev/null | tail -1)"
|
||||
if [ -z ${SMI} ]; then
|
||||
|
||||
@@ -27,6 +27,9 @@
|
||||
#include "KFDTestUtil.hpp"
|
||||
|
||||
extern unsigned int g_TestGPUsNum;
|
||||
extern int g_TestNodeId;
|
||||
extern std::vector<int> g_SelectedNodes;
|
||||
extern std::string g_ConcurrentNodes;
|
||||
|
||||
void KFDBaseComponentTest::SetUpTestCase() {
|
||||
}
|
||||
@@ -88,24 +91,62 @@ void KFDBaseComponentTest::SetUp() {
|
||||
&m_numSdmaXgmiEngines_GPU[i], &m_numSdmaQueuesPerEngine_GPU[i]);
|
||||
}
|
||||
|
||||
if (!g_ConcurrentNodes.empty()) {
|
||||
std::set<int> uniqueIndices;
|
||||
size_t start = 0, end = 0;
|
||||
|
||||
while ((end = g_ConcurrentNodes.find(',', start)) != std::string::npos) {
|
||||
std::string token = g_ConcurrentNodes.substr(start, end - start);
|
||||
if (!token.empty()) {
|
||||
int node = std::stoi(token);
|
||||
|
||||
if (std::find(gpuNodes.begin(), gpuNodes.end(), node) != gpuNodes.end())
|
||||
uniqueIndices.insert(node);
|
||||
else
|
||||
LOG() << "Node " << node << " is not a GPU node. Skipping." << std::endl;
|
||||
}
|
||||
start = end + 1;
|
||||
}
|
||||
|
||||
if (start < g_ConcurrentNodes.size()) {
|
||||
int node = std::stoi(g_ConcurrentNodes.substr(start));
|
||||
if (std::find(gpuNodes.begin(), gpuNodes.end(), node) != gpuNodes.end()) {
|
||||
uniqueIndices.insert(node);
|
||||
} else {
|
||||
LOG() << "Node " << node << " is not a GPU node. Skipping." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
g_SelectedNodes.assign(uniqueIndices.begin(), uniqueIndices.end());
|
||||
g_TestGPUsNum = static_cast<unsigned int>(g_SelectedNodes.size());
|
||||
|
||||
} else if (g_TestGPUsNum > 0) {
|
||||
g_SelectedNodes = gpuNodes;
|
||||
}
|
||||
|
||||
/* Clamp g_TestGPUsNum so it exceeds neither the number of GPU nodes in the system nor MAX_GPU */
|
||||
g_TestGPUsNum = std::min(g_TestGPUsNum, (unsigned int)gpuNodes.size());
|
||||
g_TestGPUsNum = (g_TestGPUsNum <= MAX_GPU) ? g_TestGPUsNum : MAX_GPU;
|
||||
|
||||
if (!g_SelectedNodes.empty())
|
||||
g_SelectedNodes.resize(g_TestGPUsNum);
|
||||
|
||||
const testing::TestInfo* curr_test_info =
|
||||
::testing::UnitTest::GetInstance()->current_test_info();
|
||||
|
||||
openlog("KFDTEST", LOG_CONS , LOG_USER);
|
||||
if (g_TestGPUsNum == 1)
|
||||
|
||||
if (g_TestGPUsNum == 1) {
|
||||
syslog(LOG_INFO, "[Test on Node#%03d] "
|
||||
"STARTED ========== %s.%s ==========",
|
||||
m_NodeInfo.HsaDefaultGPUNode(),
|
||||
g_SelectedNodes.empty() ?
|
||||
m_NodeInfo.HsaDefaultGPUNode() : g_SelectedNodes[0],
|
||||
curr_test_info->test_case_name(), curr_test_info->name());
|
||||
else
|
||||
} else {
|
||||
syslog(LOG_INFO, "[Test on %03d Node(s)] "
|
||||
"STARTED ========== %s.%s ==========",
|
||||
g_TestGPUsNum,
|
||||
curr_test_info->test_case_name(), curr_test_info->name());
|
||||
}
|
||||
|
||||
ROUTINE_END
|
||||
}
|
||||
@@ -144,7 +185,8 @@ void KFDBaseComponentTest::TearDown() {
|
||||
if (g_TestGPUsNum == 1)
|
||||
syslog(LOG_INFO, "[Test on Node#%03d] PASSED"
|
||||
" ========== %s.%s ==========",
|
||||
m_NodeInfo.HsaDefaultGPUNode(),
|
||||
g_SelectedNodes.empty() ?
|
||||
m_NodeInfo.HsaDefaultGPUNode() : g_SelectedNodes[0],
|
||||
curr_test_info->test_case_name(), curr_test_info->name());
|
||||
else
|
||||
syslog(LOG_INFO, "[Tested on %03d Node(s)] PASSED"
|
||||
@@ -154,12 +196,13 @@ void KFDBaseComponentTest::TearDown() {
|
||||
|
||||
else
|
||||
if (g_TestGPUsNum == 1)
|
||||
syslog(LOG_WARNING, "[Test on Node#%03d] FAILED"
|
||||
syslog(LOG_WARNING, "[Test on Node#%03d] FAILED"
|
||||
" ========== %s.%s ==========",
|
||||
m_NodeInfo.HsaDefaultGPUNode(),
|
||||
g_SelectedNodes.empty() ?
|
||||
m_NodeInfo.HsaDefaultGPUNode() : g_SelectedNodes[0],
|
||||
curr_test_info->test_case_name(), curr_test_info->name());
|
||||
else
|
||||
syslog(LOG_WARNING, "[Test on %03d Node(s)] FAILED"
|
||||
syslog(LOG_WARNING, "[Test on %03d Node(s)] FAILED"
|
||||
" ========== %s.%s ==========",
|
||||
g_TestGPUsNum,
|
||||
curr_test_info->test_case_name(), curr_test_info->name());
|
||||
@@ -336,19 +379,20 @@ static void* KFDTest_GPU(void* ptr) {
|
||||
}
|
||||
|
||||
HSAKMT_STATUS KFDBaseComponentTest::KFDTestMultiGPU(Test_Function test_function,
|
||||
unsigned int gpu_num) {
|
||||
|
||||
const std::vector<int>& gpuNodes,
|
||||
unsigned int gpu_num) {
|
||||
HSAKMT_STATUS r = HSAKMT_STATUS_SUCCESS;
|
||||
int gpu_node;
|
||||
int err = 0;
|
||||
int i, j;
|
||||
|
||||
if (gpuNodes.empty())
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
|
||||
KFDTEST_GPUPARAMETERS kfdtest_GpuParameters[gpu_num];
|
||||
KFDTEST_PARAMETERS kfdTest_Parameters[gpu_num];
|
||||
pthread_t pThreadGPU[gpu_num];
|
||||
|
||||
const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
|
||||
|
||||
for (i = 0; i < gpu_num; i++) {
|
||||
|
||||
gpu_node = gpuNodes.at(i);
|
||||
@@ -386,7 +430,7 @@ err_out:
|
||||
HSAKMT_STATUS KFDBaseComponentTest::KFDTest_Launch(Test_Function test_function) {
|
||||
|
||||
/* test on default GPU only */
|
||||
if (g_TestGPUsNum == 1) {
|
||||
if (g_TestNodeId >= 0) {
|
||||
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
|
||||
if (defaultGPUNode < 0) {
|
||||
LOG() << "defaultGPUNode is invalid." << defaultGPUNode <<std::endl;
|
||||
@@ -405,9 +449,9 @@ HSAKMT_STATUS KFDBaseComponentTest::KFDTest_Launch(Test_Function test_function)
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
/* run test_function on all available GPUs */
|
||||
/* run test_function on all selected GPUs */
|
||||
HSAKMT_STATUS err = HSAKMT_STATUS_SUCCESS;
|
||||
err = KFDTestMultiGPU(test_function, g_TestGPUsNum);
|
||||
err = KFDTestMultiGPU(test_function, g_SelectedNodes, g_TestGPUsNum);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include <amdgpu.h>
|
||||
#include <amdgpu_drm.h>
|
||||
#include <sys/param.h>
|
||||
#include <algorithm>
|
||||
#include "hsakmt/hsakmt.h"
|
||||
#include "OSWrapper.hpp"
|
||||
#include "KFDTestUtil.hpp"
|
||||
@@ -105,8 +106,9 @@ class KFDBaseComponentTest : public testing::Test {
|
||||
return m_numSdmaXgmiEngines_GPU[gpuIndex];
|
||||
}
|
||||
|
||||
HSAKMT_STATUS KFDTestMultiGPU(Test_Function test_function,
|
||||
unsigned int gpu_num);
|
||||
HSAKMT_STATUS KFDTestMultiGPU(Test_Function test_function,
|
||||
const std::vector<int>& gpu_indices,
|
||||
unsigned int gpu_num);
|
||||
|
||||
HSAKMT_STATUS KFDTest_Launch(Test_Function test_function);
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "Assemble.hpp"
|
||||
|
||||
#define KFD_TEST_DEFAULT_TIMEOUT 60000
|
||||
#define MAX_GPU 64
|
||||
|
||||
std::ostream& operator << (std::ostream& out, TESTPROFILE profile) {
|
||||
switch (profile) {
|
||||
@@ -48,16 +49,18 @@ std::ostream& operator << (std::ostream& out, TESTPROFILE profile) {
|
||||
return out;
|
||||
}
|
||||
|
||||
unsigned int g_TestGPUsNum ;
|
||||
unsigned int g_TestGPUsNum = 0;
|
||||
unsigned int g_TestRunProfile;
|
||||
unsigned int g_TestENVCaps;
|
||||
unsigned int g_TestTimeOut;
|
||||
int g_TestNodeId;
|
||||
int g_TestNodeId = -1;
|
||||
int g_TestDstNodeId;
|
||||
bool g_IsChildProcess;
|
||||
bool g_IsEmuMode;
|
||||
unsigned int g_SleepTime;
|
||||
unsigned int g_TestGPUFamilyId;
|
||||
std::string g_ConcurrentNodes = "";
|
||||
std::vector<int> g_SelectedNodes;
|
||||
class KFDBaseComponentTest *g_baseTest;
|
||||
|
||||
GTEST_API_ int main(int argc, char **argv) {
|
||||
@@ -69,7 +72,6 @@ GTEST_API_ int main(int argc, char **argv) {
|
||||
testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
CommandLineArguments args;
|
||||
memset(&args, 0, sizeof(args));
|
||||
|
||||
bool success = GetCommandLineArguments(argc, argv, args);
|
||||
|
||||
@@ -90,10 +92,18 @@ GTEST_API_ int main(int argc, char **argv) {
|
||||
g_SleepTime = args.SleepTime;
|
||||
}
|
||||
|
||||
// If --node is not specified, then args.NodeId == -1
|
||||
g_TestNodeId = args.NodeId;
|
||||
g_TestDstNodeId = args.DstNodeId;
|
||||
|
||||
// If --node is not specified, then args.NodeId == -1
|
||||
if (!args.ConcurrentNodes.empty()) {
|
||||
g_ConcurrentNodes = args.ConcurrentNodes;
|
||||
} else if (args.TestNodeNum > 0) {
|
||||
g_TestGPUsNum = args.TestNodeNum;
|
||||
} else {
|
||||
g_TestNodeId = args.NodeId;
|
||||
g_TestGPUsNum = 1;
|
||||
}
|
||||
|
||||
g_IsEmuMode = CheckEmuModeEnabled();
|
||||
|
||||
LOG() << "Profile: " << (TESTPROFILE)g_TestRunProfile << std::endl;
|
||||
@@ -107,14 +117,6 @@ GTEST_API_ int main(int argc, char **argv) {
|
||||
LOG() << "Sleep time in seconds as specified by user: " << std::dec << g_SleepTime << std::endl;
|
||||
}
|
||||
|
||||
char *testGPUsNum = NULL;
|
||||
/* if HSA_TEST_GPUS_NUM is defined use it, otherwise test on 1 gpu */
|
||||
testGPUsNum = getenv("HSA_TEST_GPUS_NUM");
|
||||
if (testGPUsNum)
|
||||
g_TestGPUsNum = std::max(1, atoi(testGPUsNum));
|
||||
else
|
||||
g_TestGPUsNum = 1;
|
||||
|
||||
/* init LLVM one time*/
|
||||
Init_LLVM();
|
||||
|
||||
|
||||
@@ -110,6 +110,8 @@ void ComandLineArgumentsUsage() {
|
||||
printf("\t--timeout arg\t - Time Out\n");
|
||||
printf("\t--dst_node\t - For testing multiple nodes");
|
||||
printf("\t--sleep_time\t - For testing CRIU, etc");
|
||||
printf("\t--concurrentnodes arg\t - Concurrent nodes string\n");
|
||||
printf("\t--testnodenum arg\t - Number of concurrent nodes\n");
|
||||
}
|
||||
|
||||
bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs) {
|
||||
@@ -125,6 +127,8 @@ bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs)
|
||||
{ "node", required_argument, 0, 0 },
|
||||
{ "dst_node", required_argument, 0, 0 },
|
||||
{ "sleep_time", required_argument, 0, 0 },
|
||||
{ "concurrentnodes", required_argument, 0, 0 },
|
||||
{ "testnodenum", required_argument, 0, 0 },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
@@ -135,6 +139,8 @@ bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs)
|
||||
rArgs.NodeId = -1;
|
||||
rArgs.DstNodeId = -1;
|
||||
rArgs.SleepTime = 0;
|
||||
rArgs.ConcurrentNodes = "";
|
||||
rArgs.TestNodeNum = 0;
|
||||
|
||||
while (true) {
|
||||
int c = getopt_long(argc, argv, "", long_options, &option_index);
|
||||
@@ -211,6 +217,18 @@ bool GetCommandLineArguments(int argc, char **argv, CommandLineArguments& rArgs)
|
||||
rArgs.SleepTime = sleepTime;
|
||||
}
|
||||
break;
|
||||
case 7:
|
||||
{
|
||||
rArgs.ConcurrentNodes = optarg;
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
{
|
||||
int testNodeNum = atoi(optarg);
|
||||
if (testNodeNum > 0)
|
||||
rArgs.TestNodeNum = testNodeNum;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -70,6 +70,8 @@ struct CommandLineArguments {
|
||||
int DstNodeId;
|
||||
/* Time in units of seconds */
|
||||
unsigned int SleepTime;
|
||||
std::string ConcurrentNodes;
|
||||
unsigned int TestNodeNum;
|
||||
};
|
||||
|
||||
// It is either MEM_NONE or the bitwise OR of one or more of the following flags
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user