From 25e7109b6437acf2e383ae3b4110af850e14b6ef Mon Sep 17 00:00:00 2001 From: Yiltan Date: Tue, 8 Apr 2025 16:12:22 -0400 Subject: [PATCH] Enable RO CI (#65) --- scripts/functional_tests/driver.sh | 72 +++++++++++++++++++----------- scripts/install_dependencies.sh | 4 +- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh index 698437b312..c28d0795d7 100755 --- a/scripts/functional_tests/driver.sh +++ b/scripts/functional_tests/driver.sh @@ -104,6 +104,8 @@ ExecTest() { NUM_THREADS=$4 MAX_MSG_SIZE=$5 + TIMEOUT=$((5 * 60)) # Timeout in seconds + TEST_NUM=${TEST_NUMBERS[$TEST_NAME]} if [[ "" == "$TEST_NUM" ]] @@ -120,7 +122,10 @@ ExecTest() { # MPI Parameters LAUNCHER=mpirun - OPTIONS=" -n $NUM_RANKS -mca pml ucx -x ROCSHMEM_MAX_NUM_CONTEXTS=$ROCSHMEM_MAX_NUM_CONTEXTS" + OPTIONS=" -n $NUM_RANKS -mca pml ucx -mca osc ucx" + OPTIONS+=" -x ROCSHMEM_MAX_NUM_CONTEXTS=$ROCSHMEM_MAX_NUM_CONTEXTS" + OPTIONS+=" -x UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384" + OPTIONS+=" --map-by numa --timeout $TIMEOUT" if [[ "" != "$HOSTFILE" ]] then @@ -153,7 +158,7 @@ ExecTest() { unset ROCSHMEM_MAX_NUM_CONTEXTS } -TestRMA() { +TestRMAPut() { ############################################################################## # | Name | Ranks | Workgroups | Threads | Max Message Size # ############################################################################## @@ -176,30 +181,6 @@ TestRMA() { ExecTest "teamctxput" 2 4 128 1024 ExecTest "teamctxput" 2 16 256 1024 - ExecTest "get" 2 1 1 1048576 - ExecTest "get" 2 1 1024 512 - ExecTest "get" 2 8 1 1048576 - ExecTest "get" 2 16 128 8 - ExecTest "get" 2 32 256 512 - ExecTest "get" 2 64 1024 8 - - ExecTest "wgget" 2 1 64 1048576 - ExecTest "wgget" 2 2 64 1048576 - ExecTest "wgget" 2 16 64 8 - - ExecTest "waveget" 2 1 64 1048576 - ExecTest "waveget" 2 2 64 1048576 - ExecTest "waveget" 2 2 128 1048576 - ExecTest "waveget" 2 16 128 8 - - ExecTest "teamctxget" 2 4 128 1024 - ExecTest "teamctxget" 2 16 256 1024 - - ExecTest "g" 2 1 1 128 - ExecTest "g" 2 1 1024 2 - ExecTest "g" 2 8 1 32 - ExecTest "g" 2 16 128 4 - ExecTest "p" 2 1 1 128 ExecTest "p" 2 1 1024 2 ExecTest "p" 2 8 1 32 @@ -225,6 +206,37 @@ TestRMA() { ExecTest "teamctxputnbi" 2 4 128 1024 ExecTest "teamctxputnbi" 2 16 256 1024 +} + +TestRMAGet() { + ############################################################################## + # | Name | Ranks | Workgroups | Threads | Max Message Size # + ############################################################################## + ExecTest "get" 2 1 1 1048576 + ExecTest "get" 2 1 1024 512 + ExecTest "get" 2 8 1 1048576 + ExecTest "get" 2 16 128 8 + ExecTest "get" 2 32 256 512 + ExecTest "get" 2 64 1024 8 + + ExecTest "wgget" 2 1 64 1048576 + ExecTest "wgget" 2 2 64 1048576 + ExecTest "wgget" 2 16 64 8 + + ExecTest "waveget" 2 1 64 1048576 + ExecTest "waveget" 2 2 64 1048576 + ExecTest "waveget" 2 2 128 1048576 + ExecTest "waveget" 2 16 128 8 + + ExecTest "teamctxget" 2 4 128 1024 + ExecTest "teamctxget" 2 16 256 1024 + + ExecTest "g" 2 1 1 128 + ExecTest "g" 2 1 1024 1 + ExecTest "g" 2 8 1 32 + ExecTest "g" 2 16 128 4 + + ################################ Non-Blocking ################################ ExecTest "getnbi" 2 1 1 1048576 ExecTest "getnbi" 2 1 1024 512 @@ -246,6 +258,13 @@ TestRMA() { ExecTest "teamctxgetnbi" 2 16 256 1024 } +TestRMA() { + TestRMAPut + if [ "0" == "$ROCSHMEM_DRIVER_DISABLE_GET" ]; then + TestRMAGet + fi +} + TestAMO() { ############################################################################## # | Name | Ranks | Workgroups | Threads | Max Message Size # @@ -429,6 +448,7 @@ LOG_DIR=$3 HOSTFILE=$4 DRIVER_RETURN_STATUS=0 +ROCSHMEM_DRIVER_DISABLE_GET="${ROCSHMEM_DRIVER_DISABLE_GET:-1}" ValidateInput $# ValidateLogDir $LOG_DIR diff --git a/scripts/install_dependencies.sh b/scripts/install_dependencies.sh index b7c2254cb6..bb4ceaf528 100755 --- a/scripts/install_dependencies.sh +++ b/scripts/install_dependencies.sh @@ -16,11 +16,11 @@ mkdir -p $_DEPS_SRC_DIR #Adjust branches and installation location as necessary export _UCX_INSTALL_DIR=$_INSTALL_DIR/ucx export _UCX_REPO=https://github.com/ROCm/ucx.git -export _UCX_COMMIT_HASH=4ef9a097c12ee6f7a8d3e41c317ea2d47e424b32 +export _UCX_COMMIT_HASH=18770fdc1c3b5de202d14a088a14b734d2c4bbf3 export _OMPI_INSTALL_DIR=$_INSTALL_DIR/ompi export _OMPI_REPO=https://github.com/ROCm/ompi.git -export _OMPI_COMMIT_HASH=8a5c2ef25dc8e4528f0d3fd2ec91a6578160af95 +export _OMPI_COMMIT_HASH=720f556508ad3f2cbb17341eb184c2d8565a5133 # Step 1: Build UCX with ROCm support cd $_DEPS_SRC_DIR