diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy index ea3025d62d..ccc326dae2 100644 --- a/.jenkins/extended.groovy +++ b/.jenkins/extended.groovy @@ -9,14 +9,14 @@ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path -def runCI = +def runCI = { nodeDetails, jobName-> def prj = new rocProject('rccl', 'Extended') prj.timeout.test = 600 - prj.paths.build_command = './install.sh -t ' + prj.paths.build_command = './install.sh -t --npkit-enable ' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) @@ -32,7 +32,7 @@ def runCI = commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } - + def testCommand = { platform, project-> @@ -43,14 +43,14 @@ def runCI = def packageCommand = { platform, project-> - + commonGroovy.runPackageCommand(platform, project, jobName) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } -ci: { +ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] @@ -58,17 +58,17 @@ ci: { propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([centos8:['8gfx906']])] - + jobNameList = auxiliary.appendJobNameList(jobNameList) - - propertyList.each + + propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } - jobNameList.each + jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) diff --git a/README.md b/README.md index cbd927c21c..379df2d277 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ The root of this repository has a helper script 'install.sh' to build and instal * `./install.sh -s` -- builds RCCL as a static library (default: shared) * `./install.sh -hcc` -- 
builds RCCL with hcc compiler; note that hcc is now deprecated. (default:hip-clang) * `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm) +* `./install.sh --npkit-enable` -- builds RCCL with the NPKit profiling framework enabled, with all NPKit event-collection flags turned on (default: disabled) ## Manual build ### To build the library : diff --git a/install.sh b/install.sh index df29091cee..b6bfb72064 100755 --- a/install.sh +++ b/install.sh @@ -21,6 +21,7 @@ function display_help() echo " [--address-sanitizer] Build with address sanitizer enabled" echo " [--build_allreduce_only] Build only AllReduce + sum + float kernel" echo " [--rm-legacy-include-dir] Remove legacy include dir Packaging added for file/folder reorg backward compatibility" + echo " [--npkit-enable] Compile with npkit enabled" } # ################################################# @@ -40,6 +41,7 @@ install_dependencies=false build_static=false build_allreduce_only=false build_freorg_bkwdcomp=true +npkit_enabled=false # ################################################# # Parameter parsing @@ -48,7 +50,7 @@ build_freorg_bkwdcomp=true # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ $? 
-eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,dependencies,package_build,tests_build,run_tests_quick,static,run_tests_all,hcc,hip-clang,no_clean,prefix:,address-sanitizer,build_allreduce_only,rm-legacy-include-dir --options hidptrs -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,dependencies,package_build,tests_build,run_tests_quick,static,run_tests_all,hcc,hip-clang,no_clean,prefix:,address-sanitizer,build_allreduce_only,npkit-enable,rm-legacy-include-dir --options hidptrs -- "$@") else echo "Need a new version of getopt" exit 1 @@ -107,6 +109,9 @@ while true; do --rm-legacy-include-dir) build_freorg_bkwdcomp=false shift ;; + --npkit-enable) + npkit_enabled=true + shift ;; --prefix) install_prefix=${2} shift 2 ;; @@ -214,12 +219,95 @@ fi if ($build_allreduce_only); then cmake_common_options="${cmake_common_options} -DBUILD_ALLREDUCE_ONLY=ON" fi + +npkit_options="" +if ($npkit_enabled); then + npkit_options="-DENABLE_NPKIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT \ + -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY \ + -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT \ + -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY \ + -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT \ + 
-DENABLE_NPKIT_EVENT_RECV_ENTRY \ + -DENABLE_NPKIT_EVENT_RECV_EXIT \ + -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY \ + -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT \ + -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY \ + -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT \ + -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY \ + -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT \ + -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY \ + -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT \ + -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY \ + -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT \ + -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY \ + -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT \ + -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_NET_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY \ + -DENABLE_NPKIT_EVENT_NET_RECV_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY \ 
+ -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT \ + -DENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY \ + -DENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT \ + -DENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY \ + -DENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT \ + -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" +fi + check_exit_code "$?" if ($build_tests) || (($run_tests) && [[ ! -f ./test/rccl-UnitTests ]]); then - CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../. + CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DNPKIT_FLAGS="${npkit_options}" -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../. else - CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../. + CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DNPKIT_FLAGS="${npkit_options}" -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../. fi check_exit_code "$?"