From 0c77c6622193d7215061dc5e2c217d1811e67a20 Mon Sep 17 00:00:00 2001 From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com> Date: Wed, 21 Jun 2023 16:16:09 -0400 Subject: [PATCH] Disable Colltrace for --fast option (#778) * Disable Colltrace for --fast option * Limit nprocs for CI --- .jenkins/extended.groovy | 2 +- .jenkins/precheckin.groovy | 2 +- .jenkins/staticlibrary.groovy | 2 +- install.sh | 77 +++++++++++++++--------- src/collectives/device/common.h | 14 ++++- src/enqueue.cc | 7 +++ src/graph/topo.h | 7 ++- tools/time-trace/time_trace_generator.py | 17 +++--- 8 files changed, 85 insertions(+), 43 deletions(-) diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy index 6133305f5f..eb2c857aa8 100644 --- a/.jenkins/extended.groovy +++ b/.jenkins/extended.groovy @@ -17,7 +17,7 @@ def runCI = def prj = new rocProject('rccl', 'Extended') prj.timeout.test = 600 - prj.paths.build_command = './install.sh -t --npkit-enable ' + prj.paths.build_command = './install.sh -t --npkit-enable --limit-nprocs' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index 7c8eef8690..4791b1979f 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -18,7 +18,7 @@ def runCI = def prj = new rocProject('rccl', 'PreCheckin') prj.timeout.test = 300 - prj.paths.build_command = './install.sh -t --fast' + prj.paths.build_command = './install.sh -t --fast --limit-nprocs' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy index e75ff7ec97..8be7f8480f 100644 --- a/.jenkins/staticlibrary.groovy +++ b/.jenkins/staticlibrary.groovy @@ -12,7 +12,7 @@ def runCI = def prj = new rocProject('rccl', 'Static Library PreCheckin') prj.timeout.test = 1440 - 
prj.paths.build_command = './install.sh -t --static' + prj.paths.build_command = './install.sh -t --static --limit-nprocs' def nodes = new dockerNodes(nodeDetails, jobName, prj) diff --git a/install.sh b/install.sh index 57c9f31fcf..5964aeaafc 100755 --- a/install.sh +++ b/install.sh @@ -13,11 +13,12 @@ function display_help() echo " -d|--dependencies Install RCCL depdencencies" echo " --debug Build debug library" echo " --disable_backtrace Build without custom backtrace support" - echo " --fast Quick-build RCCL (local gpu arch only, no backtrace support)" + echo " --disable-colltrace Build without collective trace" + echo " --fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)" echo " -h|--help Prints this help message" echo " -i|--install Install RCCL library (see --prefix argument below)" + echo " -l|--limit-nprocs Limit the number of procs to 16 while building" echo " --local_gpu_only Only compile for local GPU architecture" - echo " --max-jobs Use nproc instead of default number of 16" echo " --no_clean Don't delete files if they already exist" echo " --npkit-enable Compile with npkit enabled" echo " -p|--package_build Build RCCL package" @@ -38,6 +39,7 @@ ROCM_PATH=${ROCM_PATH:="/opt/rocm"} build_address_sanitizer=false build_allreduce_only=false +collective_trace=true install_dependencies=false build_release=true build_bfd=true @@ -53,7 +55,7 @@ build_static=false build_tests=false build_verbose=0 time_trace=false -enable_all_jobs=false +enable_all_jobs=true enable_ninja="" # ################################################# @@ -63,7 +65,7 @@ enable_ninja="" # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ $? 
-eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,fast,help,install,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,tests_build,time-trace,max-jobs,verbose --options hidptrs -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,disable-colltrace,fast,help,install,limit-nprocs,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose --options hidlptrs -- "$@") else echo "Need a new version of getopt" exit 1 @@ -78,27 +80,28 @@ eval set -- "${GETOPT_PARSE}" while true; do case "${1}" in - --address-sanitizer) build_address_sanitizer=true; shift ;; - --build_allreduce_only) build_allreduce_only=true; shift ;; - -d | --dependencies) install_dependencies=true; shift ;; - --debug) build_release=false; shift ;; - --disable_backtrace) build_bfd=false; shift ;; - --fast) build_bfd=false; build_local_gpu_only=true; shift ;; - -h | --help) display_help; exit 0 ;; - -i | --install) install_library=true; shift ;; - --local_gpu_only) build_local_gpu_only=true; shift ;; - --max-jobs) enable_all_jobs=true; shift ;; - --no_clean) clean_build=false; shift ;; - --npkit-enable) npkit_enabled=true; shift ;; - -p | --package_build) build_package=true; shift ;; - --prefix) install_prefix=${2} shift 2 ;; - --rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;; - -r | --run_tests_quick) run_tests=true; shift ;; - --run_tests_all) run_tests=true; run_tests_all=true; shift ;; - --static) build_static=true; shift ;; - -t | --tests_build) build_tests=true; shift ;; - --time-trace) time_trace=true; shift ;; - --verbose) build_verbose=1; shift ;; + --address-sanitizer) build_address_sanitizer=true; shift ;; + --build_allreduce_only) build_allreduce_only=true; shift ;; + 
-d | --dependencies) install_dependencies=true; shift ;; + --debug) build_release=false; shift ;; + --disable_backtrace) build_bfd=false; shift ;; + --disable-colltrace) collective_trace=false; shift ;; + --fast) build_bfd=false; build_local_gpu_only=true; collective_trace=false; shift ;; + -h | --help) display_help; exit 0 ;; + -i | --install) install_library=true; shift ;; + -l | --limit-nprocs) enable_all_jobs=false; shift ;; + --local_gpu_only) build_local_gpu_only=true; shift ;; + --no_clean) clean_build=false; shift ;; + --npkit-enable) npkit_enabled=true; shift ;; + -p | --package_build) build_package=true; shift ;; + --prefix) install_prefix=${2}; shift 2 ;; + --rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;; + -r | --run_tests_quick) run_tests=true; shift ;; + --run_tests_all) run_tests=true; run_tests_all=true; shift ;; + --static) build_static=true; shift ;; + -t | --tests_build) build_tests=true; shift ;; + --time-trace) time_trace=true; shift ;; + --verbose) build_verbose=1; shift ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -199,6 +202,10 @@ if [[ "${build_static}" == true ]]; then cmake_common_options="${cmake_common_options} -DBUILD_SHARED_LIBS=OFF" fi +# Disable collective trace +if [[ "${collective_trace}" == false ]]; then + cmake_common_options="${cmake_common_options} -DCOLLTRACE=OFF" +fi # Install dependencies if ($install_dependencies); then @@ -344,9 +351,19 @@ if ($run_tests); then fi fi -if ($time_trace) then - cd ../../tools/time-trace - chmod +x ./rccl-TimeTrace.sh - echo "Generating RCCL-compile-timeline.html..." - ./rccl-TimeTrace.sh +if ($time_trace); then + search_dir="../../" + time_trace_dir=$(find "$search_dir" -type d -name "time-trace" -print -quit) + + if [ "$time_trace_dir" ]; then + time_trace_script="$time_trace_dir/rccl-TimeTrace.sh" + if [ -x "$time_trace_script" ]; then + echo "Generating RCCL-compile-timeline.html..." 
+ (cd "$time_trace_dir" && ./rccl-TimeTrace.sh) + else + echo "Error: Unable to execute $time_trace_script. Make sure the file has the correct permissions." + fi + else + echo "Error: time-trace folder not found in $search_dir." + fi fi \ No newline at end of file diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index aadffbf273..e8ec8a57c5 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -341,6 +341,10 @@ class ncclFunction { collTrace->type = ncclCollTraceDataType; \ } #else +#define traceColl(launch_type) +#define traceKernelLaunch(firstLaunch) +#define traceKernelEnd() +#define traceAbort() #define traceData(data2, data4, data8_0, data8_1) #endif @@ -519,7 +523,6 @@ __forceinline__ __device__ void ncclKernel( } #endif if (tid == 0) __insert_timestamp(__LINE__); - if (COLLTRACE && tid == 0) traceKernelLaunch(true); while (true) { @@ -563,6 +566,7 @@ __forceinline__ __device__ void ncclKernel( if (COLLTRACE && tid == 0) traceColl(false); } if (COLLTRACE && tid == 0) traceKernelEnd(); + #ifdef ENABLE_PROFILING if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) { __synclds(); @@ -572,6 +576,7 @@ __forceinline__ __device__ void ncclKernel( #endif } +#ifdef ENABLE_COLLTRACE #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ __launch_bounds__(NCCL_MAX_NTHREADS, 1) \ __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \ @@ -582,6 +587,13 @@ __launch_bounds__(NCCL_MAX_NTHREADS, 1) \ __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \ ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(comm, channelMask, workHead); \ } +#else +#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ +__launch_bounds__(NCCL_MAX_NTHREADS, 1) \ +__global__ void NCCL_KERN_NAME(func, algo, proto, 
devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \ + ncclKernel, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, channelMask, workHead); \ +} +#endif // Examples : AllReduce, RING, LL, Sum, uint8 /* Functions for aggregation case */ diff --git a/src/enqueue.cc b/src/enqueue.cc index 591606fa33..61bc8169f4 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -28,11 +28,18 @@ struct ncclKernelMatch { }; typedef void(*ncclKern_t)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); + // Must be consistent with the ncclFuncSet enum +#ifdef ENABLE_COLLTRACE static ncclKernelMatch const ncclKerns[2] = { {(void *)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true}, {(void *)NCCL_KERN_NAME_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t), true}, }; +#else +static ncclKernelMatch const ncclKerns[1] = { + {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true} +}; +#endif static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */); diff --git a/src/graph/topo.h b/src/graph/topo.h index 730e8faa52..72a294837c 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -224,7 +224,10 @@ static float ncclTopoXGMISpeed(int gcn) { return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH; } -#define ncclGetKernelIndex(p_comm) \ - ((p_comm)->collTraceThread ? 1 : 0) +#ifdef ENABLE_COLLTRACE + #define ncclGetKernelIndex(p_comm) ((p_comm)->collTraceThread ? 
1 : 0) +#else + #define ncclGetKernelIndex(p_comm) (0) +#endif #endif diff --git a/tools/time-trace/time_trace_generator.py b/tools/time-trace/time_trace_generator.py index 83ba7333b5..9754b4a533 100644 --- a/tools/time-trace/time_trace_generator.py +++ b/tools/time-trace/time_trace_generator.py @@ -4,9 +4,6 @@ import pandas as pd import plotly.graph_objects as go import argparse -# Specify the path to the .log file -log_file = '../../build/release/time_trace.log' - def generateRandomColors(df, colorList): for _ in range(len(df)): @@ -102,8 +99,14 @@ def plotCompileTime(log_file, minVal): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--min_val", nargs='?', default='5', type=int, help="Ignore any if it's less than the value provided.") - parser.add_argument("--include_linking", action='store_true', help="Include linking when plotting.") + parser.add_argument("--min_val", nargs='?', default='5', type=int, help="Ignore any if it's less than the value provided") + parser.add_argument("--include_linking", action='store_true', help="Include linking when plotting") + parser.add_argument("--log_file_path", type=str, help="Location of the log file generated with --time-trace flag") args = parser.parse_args() - - plotCompileTime(log_file, args.min_val) \ No newline at end of file + + if args.log_file_path is not None: + log_file_path = args.log_file_path + else: + log_file_path = '../../build/release/time_trace.log' + + plotCompileTime(log_file_path, args.min_val) \ No newline at end of file