From 8a442faa12b4dcbc89cd9af2fba886c08349c22e Mon Sep 17 00:00:00 2001 From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:08:24 -0700 Subject: [PATCH] Nvtx support (#1076) * NVTX support --- CMakeLists.txt | 9 +++++++-- cmake/Generator.cmake | 1 - install.sh | 10 +++++++++- src/include/core.h | 4 ++++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12efe8fc8f..3202833350 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ option(COLLTRACE "Collective Trace Option" option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON) option(ENABLE_IFC "Enable indirect function call" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) +option(NVTX "Enable NVTX" OFF) option(PROFILE "Enable profiling" OFF) option(TIMETRACE "Enable time-trace during compilation" OFF) option(TRACE "Enable additional tracing" OFF) @@ -411,7 +412,7 @@ set(SRC_FILES src/include/trees.h src/include/utils.h src/init.cc -# src/init_nvtx.cc + src/init_nvtx.cc src/misc/archinfo.cc src/misc/argcheck.cc # src/misc/cudawrap.cc @@ -484,6 +485,8 @@ foreach(SRC_FILE ${SRC_FILES}) ) endforeach() +# Generate device/host tables and all the collective functions that are going to be in librccl.so +#================================================================================================== if(ONLY_FUNCS) ## Generate only the specified functions gen_functions(${ONLY_FUNCS}) @@ -525,7 +528,9 @@ if(DEMANGLE_DIR) endif() ## Set RCCL compile definitions -target_compile_definitions(rccl PRIVATE NVTX_NO_IMPL) # NVTX is not supported +if(NOT NVTX) + target_compile_definitions(rccl PRIVATE NVTX_NO_IMPL) +endif() if(COLLTRACE) target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE) endif() diff --git a/cmake/Generator.cmake b/cmake/Generator.cmake index a38cd80690..9b842a6fac 100644 --- a/cmake/Generator.cmake +++ b/cmake/Generator.cmake @@ -21,7 +21,6 @@ # SOFTWARE. set(ALL_PARAMS "ALL_COLLS" "ALL_ALGOS" "ALL_PROTOS" "ALL_REDOPS" "ALL_TYPES") - set(ALL_COLLS "AllGather" "AllReduce" "AllToAllPivot" "Broadcast" "Reduce" "ReduceScatter" "SendRecv") set(ALL_ALGOS "TREE" "RING" "COLLNET_DIRECT" "COLLNET_CHAIN") set(ALL_PROTOS "LL" "LL128" "SIMPLE") diff --git a/install.sh b/install.sh index 3cc42a3649..018c7abb33 100755 --- a/install.sh +++ b/install.sh @@ -25,6 +25,7 @@ install_library=false msccl_kernel_enabled=true num_parallel_jobs=$(nproc) npkit_enabled=false +nvtx_enabled=false run_tests=false run_tests_all=false time_trace=false @@ -50,6 +51,7 @@ function display_help() echo " --amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, seperate by ';' (builds for all supported GPU architectures by default)" echo " --no_clean Don't delete files if they already exist" echo " --npkit-enable Compile with npkit enabled" + echo " --nvtx-enable Compile with nvtx enabled" echo " -p|--package_build Build RCCL package" echo " --prefix Specify custom directory to install RCCL to (default: /opt/rocm)" echo " --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility" @@ -68,7 +70,7 @@ function display_help() # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ $? -eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,nvtx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@") else echo "Need a new version of getopt" exit 1 @@ -97,6 +99,7 @@ while true; do --amdgpu_targets) build_amdgpu_targets=${2}; shift 2 ;; --no_clean) clean_build=false; shift ;; --npkit-enable) npkit_enabled=true; shift ;; + --nvtx-enable) nvtx_enabled=true; shift ;; -p | --package_build) build_package=true; shift ;; --prefix) install_prefix=${2}; shift 2 ;; --rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;; @@ -220,6 +223,11 @@ if ($install_dependencies); then cmake_common_options="${cmake_common_options} -DINSTALL_DEPENDENCIES=ON" fi +# Enable NVTX +if [[ "${nvtx_enabled}" == true ]]; then + cmake_common_options="${cmake_common_options} -DNVTX=ON" +fi + cmake_executable=cmake case "${OS_ID}" in centos|rhel) diff --git a/src/include/core.h b/src/include/core.h index a1d644d295..8ab92765a5 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -60,6 +60,10 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { #include "alloc.h" #include "utils.h" #include "param.h" +#ifdef NVTX_NO_IMPL #include "nvtx_stub.h" +#else +#include "nvtx.h" +#endif #endif // end include guard