From 259ef6348b1d213a2602eaa446ebf69c3b3a8e99 Mon Sep 17 00:00:00 2001
From: ajanicijamd
Date: Tue, 14 Oct 2025 23:45:08 -0400
Subject: [PATCH] Fixed issues with nic-performance test (#1168)

- On some hosts the wget can finish too soon and PAPI doesn't catch even a
  single network event.
- On some hosts, there are multiple default NICs and the scripts didn't work
  in that case.
- The test script was writing the output of wget to /tmp directory, which
  causes a problem if another user tries to run the same test. Because the
  output file with the same name already exists in the same directory, but
  with a different owner, the test fails

---------

Co-authored-by: David Galiffi
---
Notes (text between the three-dash line and the diffstat is not part of the
commit message):
 * Line breaks and indentation were rebuilt from a copy of this patch that had
   been collapsed onto a few long lines. Blank context lines and the split of
   COMMAND from its arguments in add_test were inferred from the hunk line
   counts. The author e-mail addresses and the text after "$" in that add_test
   context line are truncated in the copy and are left as found.
 * Some added lines were adjusted without changing any hunk size:
   generate_papi_nic_events.sh no longer repeats a header-comment line, loses
   a stray backtick, and quotes the final echo; rocprof-sys-nic-perf.cmake
   locates the script via CMAKE_CURRENT_LIST_DIR instead of CMAKE_SOURCE_DIR.
   The blob ids on the "index" lines therefore no longer match the content.

 .../tests/generate_papi_nic_events.sh         | 34 ++++++++++++++
 .../tests/get_default_nic.sh                  | 36 ++++++---------
 .../tests/rocprof-sys-nic-perf.cmake          | 46 +++++++++----------
 3 files changed, 70 insertions(+), 46 deletions(-)
 create mode 100755 projects/rocprofiler-systems/tests/generate_papi_nic_events.sh

diff --git a/projects/rocprofiler-systems/tests/generate_papi_nic_events.sh b/projects/rocprofiler-systems/tests/generate_papi_nic_events.sh
new file mode 100755
index 0000000000..15c955745d
--- /dev/null
+++ b/projects/rocprofiler-systems/tests/generate_papi_nic_events.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+# This script gets the list of default NICs by calling get_default_nic.sh
+# (which queries the ip command) and generates a space-separated list of
+# PAPI events; 4 for each NIC.
+# For example, if the NIC is enp7s0, the PAPI events are:
+# net:::enp7s0:tx:byte net:::enp7s0:rx:byte net:::enp7s0:tx:packet net:::enp7s0:rx:packet
+
+# Get the directory where this script is located
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+if [ ! -x "$script_dir/get_default_nic.sh" ]; then
+    echo "Error: helper script get_default_nic.sh not found or not executable in $script_dir" >&2
+    exit 1
+fi
+
+# Call the `get_default_nic.sh` script to get the list of default NICs
+# and store it in the nic_list variable
+nic_list="$("$script_dir/get_default_nic.sh")"
+if [ $? -ne 0 ]; then
+    echo "Error: failed to get default NICs" >&2
+    exit 1
+fi
+
+events=()
+
+for nic in $nic_list; do
+    events+=("net:::${nic}:tx:byte" "net:::${nic}:rx:byte" "net:::${nic}:tx:packet" "net:::${nic}:rx:packet")
+done
+
+event_list="${events[*]}"
+echo "$event_list"
diff --git a/projects/rocprofiler-systems/tests/get_default_nic.sh b/projects/rocprofiler-systems/tests/get_default_nic.sh
index 8c7d0af2dc..1e2fced6f5 100755
--- a/projects/rocprofiler-systems/tests/get_default_nic.sh
+++ b/projects/rocprofiler-systems/tests/get_default_nic.sh
@@ -1,27 +1,19 @@
 #!/usr/bin/env bash
 
-# MIT License
-#
-# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
+# Copyright (c) Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
 
 # This script gets the name of the default NIC and writes it to standard output.
+# NOTE: if command "ip r" finds multiple default NICs, this script will output
+# all of them.
+nics=$(ip r | awk '/^default /{print $5}' | sort -u)
 
-ip r | awk '/default/{print $5}'
+# nics="ens50f1 ens50f2 ens50f3 ens50f4" # For testing purposes only
+# nics= # For testing purposes only
+
+if [ -z "$nics" ]; then
+    echo "Error: no default route found" >&2
+    exit 1
+fi
+
+echo "$nics"
diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake
index e0b7fa892b..3826714d2d 100644
--- a/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake
+++ b/projects/rocprofiler-systems/tests/rocprof-sys-nic-perf.cmake
@@ -1,24 +1,5 @@
-# MIT License
-#
-# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
+# Copyright (c) Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
 
 # -------------------------------------------------------------------------------------- #
 #
@@ -33,7 +14,16 @@ execute_process(
     OUTPUT_VARIABLE _network_interface
 )
 
-message(STATUS "Default network interface is ${_network_interface}")
+message(STATUS "The list of default network interfaces is ${_network_interface}")
+
+# Generate the list of all events that we want PAPI to record.
+execute_process(
+    COMMAND "${CMAKE_CURRENT_LIST_DIR}/generate_papi_nic_events.sh"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE _event_list
+)
+
+message(STATUS "The list of all PAPI network events is ${_event_list}")
 
 set(_nic_perf_environment
     "${_base_environment}"
@@ -46,7 +36,7 @@ set(_nic_perf_environment
     "ROCPROFSYS_USE_ROCM=OFF"
     "ROCPROFSYS_TIMEMORY_COMPONENTS=wall_clock,papi_array,network_stats"
     "ROCPROFSYS_NETWORK_INTERFACE=${_network_interface}"
-    "ROCPROFSYS_PAPI_EVENTS=net:::${_network_interface}:tx:byte net:::${_network_interface}:rx:byte net:::${_network_interface}:rx:packet net:::${_network_interface}:tx:packet"
+    "ROCPROFSYS_PAPI_EVENTS=${_event_list}"
     "ROCPROFSYS_SAMPLING_DELAY=0.05"
 )
 
@@ -56,12 +46,20 @@ set(_download_url
     "https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.4.1/rocprofiler-systems-1.0.1-ubuntu-22.04-ROCm-60400-PAPI-OMPT-Python3.sh"
 )
 
+# The second file to download. We are downloading two files (each about 90MB), because
+# we want wget to run for at least 2s even on a fast network. This will give PAPI enough
+# time to collect network metrics.
+set(_download2_url
+    "https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.4.3/rocprofiler-systems-1.0.2-rhel-9.4-PAPI-OMPT-Python3.sh"
+)
+
 # Run the NIC performance test
 add_test(
     NAME nic-performance
     COMMAND
         $ -- wget --no-check-certificate
-        ${_download_url} -O /tmp/rocprofiler-systems.test.bin
+        ${_download_url} ${_download2_url} -O
+        ${PROJECT_BINARY_DIR}/rocprofiler-systems.test.bin
     WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
 )
 