Files
andmar-amd 2b4d17078a Improve test script logic and error handling (#1424)
- Fix exclude+gtest_filter logic
 - Improve error handling when detecting upstream branches
2025-11-19 14:14:40 -08:00

362 строки
13 KiB
Bash
Исполняемый файл

#!/bin/bash
#
# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
#
# See if we can find the SHARE/BIN dirs in their expected locations
CWD="${BASH_SOURCE%/*}"
while read candidate; do
if [ -e "$candidate/kfdtest.exclude" ]; then
source "$candidate/kfdtest.exclude"
break
fi
done <<EOF
$KFDTEST_SHARE_DIR
$CWD
$CWD/../share/kfdtest
/opt/rocm/share/kfdtest
EOF
# Keep these checks until automation starts using the package install
if [ -z "${FILTER[core]}" ]; then
if [ -e "$CWD/../bin/kfdtest/kfdtest.exclude" ]; then
source "$CWD/../bin/kfdtest/kfdtest.exclude"
elif [ -e "$CWD/../../share/kfdtest.exclude" ]; then
source "$CWD/../../share/kfdtest.exclude"
fi
fi
# This filter will always exist if we sourced a valid kfdtest.exclude
if [ -z "${FILTER[core]}" ]; then
echo "Unable to locate kfdtest.exclude."
echo "Please set KFDTEST_SHARE_DIR or ensure that kfdtest.exclude is present inside $CWD, $CWD/../share/kfdtest or /opt/rocm/share/kfdtest"
exit 1
fi
# Using "which" produces different results in different
# OSes so use command -v instead. It returns "" if the
# command isn't in the PATH
if [ -z "$(command -v kfdtest)" ]; then
if [ -z "$BIN_DIR" ]; then
if [ -e "${0%/*}/kfdtest" ]; then
BIN_DIR="${0%/*}"
else
# The default location
BIN_DIR="/opt/rocm/bin"
fi
fi
if [ -e "$BIN_DIR/kfdtest" ]; then
KFDTEST="$BIN_DIR/kfdtest"
else
echo "Unable to locate kfdtest."
echo "Please set BIN_DIR, ensure that kfdtest is in $PATH, or ensure that kfdtest is present inside ${0%/*} or /opt/rocm/bin"
exit 1
fi
else
KFDTEST="kfdtest"
fi
PLATFORM=""
GDB=""
NODE=""
CONCURRENTNODES=""
TESTNODENUM=""
FORCE_HIGH=""
RUN_IN_DOCKER=""
ADDITIONAL_EXCLUDE=""
printUsage() {
echo
echo "Usage: $(basename $0) [options ...] [gtest arguments]"
echo
echo "Options:"
echo " -p <platform> , --platform <platform> Only run tests that"\
"pass on the specified platform. Usually you"\
"don't need this option"
echo " -g , --gdb Run in debugger"
echo " -n <node(s)> , --node <node(s)> NodeId(s) to test. Takes a single integer, or a"\
"quoted, space-separated string as an argument"\
"(e.g. -n 1 OR -n \"1 2 3\")"\
"NOTE: Node numbers come from /sys/class/kfd/kfd/topology/nodes/#"
echo " -c , --concurrentnodes Concurrent nodes string for multi-gpu testing."\
"Takes a string comma-separated as an argument"\
"(e.g. -c \"1,2,3\" or --concurrentnodes \"1,2,3\")"\
"use -c \"all\" or --concurrentnodes \"all\" to test on all available nodes"\
"NOTE: Node numbers come from /sys/class/kfd/kfd/topology/nodes/#"
echo " -t , --testnodenum Number of concurrent nodes for multi-gpu testing."\
"Takes an integer as argument"\
"(e.g. -t 2 or --testnodenum 2)"
echo " -l , --list List available nodes"
echo " --high Force clocks to high for test execution"
echo " -d , --docker Run in docker container"
echo " -e <list> , --exclude <list> Additional tests to exclude, in addition to kfdtest.exclude."\
"Takes a colon-separated string as an argument"\
"(e.g. -e KFDEvictTest.*:KFDSVMEvictTest.*)"
echo " -h , --help Prints this help"
echo
echo "Gtest arguments will be forwarded to the app"
echo
echo "Valid platform options: core_sws, core, polaris10, vega10, vega20, pm, all, and so on"
echo "'all' option runs all tests"
return 0
}
# Print gtest_filter for the given Platform
# param - Platform.
getFilter() {
# For regular platforms such as vega10, this will automatically generate
# the valid variable BLACKLIST based on the variable platform.
local platform=$1;
case "$platform" in
all ) gtestFilter="" ;;
* )
if [ -z "${FILTER[$platform]}" ]; then
echo "Unsupported platform $platform. Exiting"
exit 1
fi
gtestFilter="--gtest_filter=${FILTER[$platform]}"
;;
esac
# Check if the loaded driver is upstream (in-box) or DKMS
rdma_get_pages_func=$(cat /proc/kallsyms | grep rdma_get_pages || true)
if [ -z "$rdma_get_pages_func" ]; then
gtestFilter="$gtestFilter:${FILTER[upstream]}"
fi
if [ -n "$ADDITIONAL_EXCLUDE" ]; then
gtestFilter="$gtestFilter:$ADDITIONAL_EXCLUDE"
fi
}
TOPOLOGY_SYSFS_DIR=/sys/devices/virtual/kfd/kfd/topology/nodes
# Prints list of HSA Nodes. HSA Nodes are identified from sysfs KFD topology. The nodes
# should have valid SIMD count
getHsaNodes() {
for i in $(find $TOPOLOGY_SYSFS_DIR -maxdepth 1 -mindepth 1 -type d); do
simdcount=$(cat $i/properties | grep simd_count | awk '{print $2}')
if [ $simdcount != 0 ]; then
hsaNodeList+="$(basename $i) "
fi
done
echo "$hsaNodeList"
}
# Prints GPU Name for the given Node ID. If transitioned to IP discovery,
# use target gfx version
# param - Node ID
getNodeName() {
local nodeId=$1; shift;
local gpuName=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/name)
if [ "$gpuName" == "raven" ]; then
local CpuCoresCount=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep cpu_cores_count | awk '{print $2}')
local SimdCount=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep simd_count | awk '{print $2}')
if [ "$CpuCoresCount" -eq 0 ] && [ "$SimdCount" -gt 0 ]; then
gpuName="raven_dgpuFallback"
fi
elif [ "$gpuName" == "ip discovery" ]; then
if [ -n "$HSA_OVERRIDE_GFX_VERSION" ]; then
gpuName="gfx$(echo "$HSA_OVERRIDE_GFX_VERSION" | awk 'BEGIN {FS="."; RS=""} {printf "%d%x%x", $1, $2, $3 }')"
else
local GfxVersionDec=$(cat $TOPOLOGY_SYSFS_DIR/$nodeId/properties | grep gfx_target_version | awk '{print $2}')
if [[ ${#GfxVersionDec} = 5 ]]; then
GfxVersionDec="0${GfxVersionDec}"
fi
gpuName="gfx$(printf "$GfxVersionDec" | fold -w2 | awk 'BEGIN {FS="\n"; RS=""} {printf "%d%x%x", $1, $2, $3}')"
fi
fi
echo "$gpuName"
}
# Run KfdTest independently. Two global variables set by command-line
# will influence the tests as indicated below
# PLATFORM - If set all tests will run with this platform filter
# NODE - If set tests will be run only on this NODE, else it will be
# run on all available HSA Nodes
runKfdTest() {
if [ "$RUN_IN_DOCKER" == "true" ]; then
if [ `sudo systemctl is-active docker` != "active" ]; then
echo "docker isn't active, install and setup docker first!!!!"
exit 0
fi
PKG_ROOT="$(getPackageRoot)"
fi
if [[ "$GTEST_ARGS" =~ "--gtest_filter" && -n "$ADDITIONAL_EXCLUDE" ]]; then
echo "Cannot use -e and --gtest_filter flags together"
exit 0
fi
if [ "$NODE" == "" ]; then
hsaNodes=$(getHsaNodes)
if [ "$hsaNodes" == "" ]; then
echo "No GPU found in the system."
exit 1
fi
else
hsaNodes=$NODE
fi
for hsaNode in $hsaNodes; do
nodeName=$(getNodeName $hsaNode)
if [ "$PLATFORM" != "" ] && [ "$PLATFORM" != "$nodeName" ]; then
echo "WARNING: Actual ASIC $nodeName treated as $PLATFORM"
nodeName="$PLATFORM"
fi
getFilter $nodeName
if [ "$RUN_IN_DOCKER" == "true" ]; then
if [ "$NODE" == "" ]; then
DEVICE_NODE="/dev/dri"
else
RENDER_NODE=$(($hsaNode + 127))
DEVICE_NODE="/dev/dri/renderD${RENDER_NODE}"
fi
echo "Starting testing node $hsaNode ($nodeName) in docker container"
sudo docker run -it --name kfdtest_docker --user="jenkins" --network=host \
--device=/dev/kfd --device=${DEVICE_NODE} --group-add video --cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined -v $PKG_ROOT:/home/jenkins/rocm \
compute-artifactory.amd.com:5000/yuho/tianli-ubuntu1604-kfdtest:01 \
/home/jenkins/rocm/utils/run_kfdtest.sh -n $hsaNode $gtestFilter $GTEST_ARGS
if [ "$?" = "0" ]; then
echo "Finished node $hsaNode ($nodeName) successfully in docker container"
else
echo "Testing failed for node $hsaNode ($nodeName) in docker container"
fi
sudo docker rm kfdtest_docker
else
if [ -n "$CONCURRENTNODES" ]; then
echo "++++ Starting parallel testing on node(s) $CONCURRENTNODES ++++"
$GDB $KFDTEST "--concurrentnodes=$CONCURRENTNODES" $gtestFilter $GTEST_ARGS
echo "++++ Finished parallel testing on node(s) $CONCURRENTNODES ++++"
exit 0;
elif [ -n "$TESTNODENUM" ]; then
echo "++++ Starting parallel testing on $TESTNODENUM node(s) ++++"
$GDB $KFDTEST "--testnodenum=$TESTNODENUM" $gtestFilter $GTEST_ARGS
echo "++++ Finished parallel testing on $TESTNODENUM node(s) ++++"
exit 0;
else
echo ""
echo "++++ Starting testing node $hsaNode ($nodeName) ++++"
$GDB $KFDTEST "--node=$hsaNode" $gtestFilter $GTEST_ARGS
echo "---- Finished testing node $hsaNode ($nodeName) ----"
fi
fi
done
}
# Prints number of GPUs present in the system
getGPUCount() {
gNodes=$(getHsaNodes)
gNodes=( $gNodes )
gpuCount=${#gNodes[@]}
echo "$gpuCount"
}
while [ "$1" != "" ]; do
case "$1" in
-p | --platform )
shift 1; PLATFORM=$1 ;;
-g | --gdb )
GDB="gdb --args" ;;
-l | --list )
printGpuNodelist; exit 0 ;;
-n | --node )
shift 1; NODE=$1 ;;
-c | --concurrentnodes )
shift 1; CONCURRENTNODES="$1" ;;
-t | --testnodenum )
shift 1; TESTNODENUM="$1" ;;
--high)
FORCE_HIGH="true" ;;
-d | --docker )
RUN_IN_DOCKER="true" ;;
-e | --exclude )
shift 1; ADDITIONAL_EXCLUDE="$1" ;;
-h | --help )
printUsage; exit 0 ;;
*)
GTEST_ARGS=$@; break;;
esac
shift 1
done
if [ "$CONCURRENTNODES" == "all" ]; then
validNodes=$(getHsaNodes)
CONCURRENTNODES=$(echo $validNodes | tr ' ' ',')
else
validNodes=$(getHsaNodes)
validNodesArray=($validNodes)
IFS=',' read -ra concurrentNodesArray <<< "$CONCURRENTNODES"
for concurrentNode in "${concurrentNodesArray[@]}"; do
if [[ ! " ${validNodesArray[@]} " =~ " $concurrentNode " ]]; then
echo "Error: Invalid node $concurrentNode specified in --concurrentnodes."
echo "Valid nodes are: $validNodes"
exit 1
fi
done
fi
# If the SMI is missing, try to find it
SMI="$(find /opt/rocm* -type l -name rocm-smi 2>/dev/null | tail -1)"
if [ -z ${SMI} ]; then
if [ -x ${BIN_DIR}/rocm-smi ]; then
SMI=${BIN_DIR}/rocm-smi
else
SMI=`which rocm-smi`
fi
fi
# If the SMI is still missing, just report and continue
if [ "$FORCE_HIGH" == "true" ]; then
if [ -e "$SMI" ]; then
OLDPERF=$($SMI -p | awk '/Performance Level:/ {print $NF; exit}')
$($SMI --setperflevel high &> /dev/null)
if [ $? != 0 ]; then
echo "SMI failed to set perf level"
OLDPERF=""
fi
else
echo "Unable to set clocks to high, cannot find rocm-smi"
fi
fi
# Set HSA_DEBUG env to run KFDMemoryTest.PtraceAccessInvisibleVram
export HSA_DEBUG=1
runKfdTest
# OLDPERF is only set if FORCE_HIGH and SMI both exist
if [ -n "$OLDPERF" ]; then
$SMI --setperflevel $OLDPERF &> /dev/null
fi