Disable Colltrace for --fast option (#778)

* Disable Colltrace for --fast option

* Limit nprocs for CI
Tento commit je obsažen v:
Bertan Dogancay
2023-06-21 16:16:09 -04:00
odevzdal GitHub
rodič 399d31ed40
revize 0c77c66221
8 změnil soubory, kde provedl 85 přidání a 43 odebrání
+1 -1
Zobrazit soubor
@@ -17,7 +17,7 @@ def runCI =
def prj = new rocProject('rccl', 'Extended')
prj.timeout.test = 600
prj.paths.build_command = './install.sh -t --npkit-enable '
prj.paths.build_command = './install.sh -t --npkit-enable --limit-nprocs'
// Define test architectures, optional rocm version argument is available
def nodes = new dockerNodes(nodeDetails, jobName, prj)
+1 -1
Zobrazit soubor
@@ -18,7 +18,7 @@ def runCI =
def prj = new rocProject('rccl', 'PreCheckin')
prj.timeout.test = 300
prj.paths.build_command = './install.sh -t --fast'
prj.paths.build_command = './install.sh -t --fast --limit-nprocs'
// Define test architectures, optional rocm version argument is available
def nodes = new dockerNodes(nodeDetails, jobName, prj)
+1 -1
Zobrazit soubor
@@ -12,7 +12,7 @@ def runCI =
def prj = new rocProject('rccl', 'Static Library PreCheckin')
prj.timeout.test = 1440
prj.paths.build_command = './install.sh -t --static'
prj.paths.build_command = './install.sh -t --static --limit-nprocs'
def nodes = new dockerNodes(nodeDetails, jobName, prj)
+47 -30
Zobrazit soubor
@@ -13,11 +13,12 @@ function display_help()
echo " -d|--dependencies Install RCCL dependencies"
echo " --debug Build debug library"
echo " --disable_backtrace Build without custom backtrace support"
echo " --fast Quick-build RCCL (local gpu arch only, no backtrace support)"
echo " --disable-colltrace Build without collective trace"
echo " --fast Quick-build RCCL (local gpu arch only, no backtrace or collective trace support)"
echo " -h|--help Prints this help message"
echo " -i|--install Install RCCL library (see --prefix argument below)"
echo " -l|--limit-nprocs Limit the number of procs to 16 while building"
echo " --local_gpu_only Only compile for local GPU architecture"
echo " --max-jobs Use nproc instead of default number of 16"
echo " --no_clean Don't delete files if they already exist"
echo " --npkit-enable Compile with npkit enabled"
echo " -p|--package_build Build RCCL package"
@@ -38,6 +39,7 @@ ROCM_PATH=${ROCM_PATH:="/opt/rocm"}
build_address_sanitizer=false
build_allreduce_only=false
collective_trace=true
install_dependencies=false
build_release=true
build_bfd=true
@@ -53,7 +55,7 @@ build_static=false
build_tests=false
build_verbose=0
time_trace=false
enable_all_jobs=false
enable_all_jobs=true
enable_ninja=""
# #################################################
@@ -63,7 +65,7 @@ enable_ninja=""
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,fast,help,install,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,tests_build,time-trace,max-jobs,verbose --options hidptrs -- "$@")
# Every option matched by the case statement that follows has to be declared here,
# otherwise getopt reports it as unrecognized before the case statement ever runs.
# 'static' (long) and 'l' (short form of --limit-nprocs) are both matched below,
# so they are part of the specification.
GETOPT_PARSE=$(getopt --name "${0}" --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,disable-colltrace,fast,help,install,limit-nprocs,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose --options hidlptrs -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -78,27 +80,28 @@ eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
--address-sanitizer) build_address_sanitizer=true; shift ;;
--build_allreduce_only) build_allreduce_only=true; shift ;;
-d | --dependencies) install_dependencies=true; shift ;;
--debug) build_release=false; shift ;;
--disable_backtrace) build_bfd=false; shift ;;
--fast) build_bfd=false; build_local_gpu_only=true; shift ;;
-h | --help) display_help; exit 0 ;;
-i | --install) install_library=true; shift ;;
--local_gpu_only) build_local_gpu_only=true; shift ;;
--max-jobs) enable_all_jobs=true; shift ;;
--no_clean) clean_build=false; shift ;;
--npkit-enable) npkit_enabled=true; shift ;;
-p | --package_build) build_package=true; shift ;;
--prefix) install_prefix=${2} shift 2 ;;
--rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;;
-r | --run_tests_quick) run_tests=true; shift ;;
--run_tests_all) run_tests=true; run_tests_all=true; shift ;;
--static) build_static=true; shift ;;
-t | --tests_build) build_tests=true; shift ;;
--time-trace) time_trace=true; shift ;;
--verbose) build_verbose=1; shift ;;
# One arm per option produced by getopt; each arm records the setting and then
# consumes its own argument(s) with shift.
--address-sanitizer) build_address_sanitizer=true; shift ;;
--build_allreduce_only) build_allreduce_only=true; shift ;;
-d | --dependencies) install_dependencies=true; shift ;;
--debug) build_release=false; shift ;;
--disable_backtrace) build_bfd=false; shift ;;
--disable-colltrace) collective_trace=false; shift ;;
# --fast is a bundle: no backtrace support, local GPU arch only, no collective trace
--fast) build_bfd=false; build_local_gpu_only=true; collective_trace=false; shift ;;
-h | --help) display_help; exit 0 ;;
-i | --install) install_library=true; shift ;;
-l | --limit-nprocs) enable_all_jobs=false; shift ;;
--local_gpu_only) build_local_gpu_only=true; shift ;;
--no_clean) clean_build=false; shift ;;
--npkit-enable) npkit_enabled=true; shift ;;
-p | --package_build) build_package=true; shift ;;
# --prefix carries a value: store it, then drop both the flag and the value.
# The ';' is required: without it the assignment is only a temporary environment
# prefix for the 'shift' command, and bash (outside POSIX mode) discards it as
# soon as shift returns, leaving install_prefix unchanged.
--prefix) install_prefix=${2}; shift 2 ;;
--rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;;
-r | --run_tests_quick) run_tests=true; shift ;;
--run_tests_all) run_tests=true; run_tests_all=true; shift ;;
--static) build_static=true; shift ;;
-t | --tests_build) build_tests=true; shift ;;
--time-trace) time_trace=true; shift ;;
--verbose) build_verbose=1; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -199,6 +202,10 @@ if [[ "${build_static}" == true ]]; then
cmake_common_options="${cmake_common_options} -DBUILD_SHARED_LIBS=OFF"
fi
# Append -DCOLLTRACE=OFF to the CMake options when collective trace has been switched off
case "${collective_trace}" in
  false) cmake_common_options+=" -DCOLLTRACE=OFF" ;;
esac
# Install dependencies
if ($install_dependencies); then
@@ -344,9 +351,19 @@ if ($run_tests); then
fi
fi
if ($time_trace) then
cd ../../tools/time-trace
chmod +x ./rccl-TimeTrace.sh
echo "Generating RCCL-compile-timeline.html..."
./rccl-TimeTrace.sh
# Render the compile-time timeline when time tracing was requested
if ($time_trace); then
  search_dir="../../"
  # Take the first directory named "time-trace" found under search_dir (-quit stops the search)
  time_trace_dir=$(find "$search_dir" -type d -name "time-trace" -print -quit)
  if [[ -z "$time_trace_dir" ]]; then
    echo "Error: time-trace folder not found in $search_dir."
  else
    time_trace_script="$time_trace_dir/rccl-TimeTrace.sh"
    if [[ ! -x "$time_trace_script" ]]; then
      echo "Error: Unable to execute $time_trace_script. Make sure the file has the correct permissions."
    else
      echo "Generating RCCL-compile-timeline.html..."
      # Run from inside the script's directory; the subshell leaves the current directory untouched
      (cd "$time_trace_dir" && ./rccl-TimeTrace.sh)
    fi
  fi
fi
+13 -1
Zobrazit soubor
@@ -341,6 +341,10 @@ class ncclFunction {
collTrace->type = ncclCollTraceDataType; \
}
#else
// Fallback definitions for the #else branch of a conditional whose test is not shown here:
// every trace hook expands to nothing, so a call site such as
// `if (COLLTRACE && tid == 0) traceKernelEnd();` reduces to an empty statement.
// Each macro keeps a parameter list so existing call sites still compile unchanged.
#define traceColl(launch_type)
#define traceKernelLaunch(firstLaunch)
#define traceKernelEnd()
#define traceAbort()
#define traceData(data2, data4, data8_0, data8_1)
#endif
@@ -519,7 +523,6 @@ __forceinline__ __device__ void ncclKernel(
}
#endif
if (tid == 0) __insert_timestamp(__LINE__);
if (COLLTRACE && tid == 0) traceKernelLaunch(true);
while (true) {
@@ -563,6 +566,7 @@ __forceinline__ __device__ void ncclKernel(
if (COLLTRACE && tid == 0) traceColl(false);
}
if (COLLTRACE && tid == 0) traceKernelEnd();
#ifdef ENABLE_PROFILING
if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
__synclds();
@@ -572,6 +576,7 @@ __forceinline__ __device__ void ncclKernel(
#endif
}
#ifdef ENABLE_COLLTRACE
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
@@ -582,6 +587,13 @@ __launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(comm, channelMask, workHead); \
}
#else
// Variant used when ENABLE_COLLTRACE is not defined: emits only the NCCL_KERN_NAME kernel
// (no NCCL_KERN_NAME_DEBUG twin). The kernel forwards its arguments to ncclKernel with
// `false` as the final template argument.
// NOTE(review): that last template argument looks like the collective-trace switch — confirm
// against ncclKernel's template parameter list.
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, channelMask, workHead); \
}
#endif
// Examples : AllReduce, RING, LL, Sum, uint8
/* Functions for aggregation case */
+7
Zobrazit soubor
@@ -28,11 +28,18 @@ struct ncclKernelMatch {
};
typedef void(*ncclKern_t)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
// Kernel table; its entries must be consistent with the ncclFuncSet enum.
// The first entry is always present; the NCCL_KERN_NAME_DEBUG entry is only
// compiled in when ENABLE_COLLTRACE is defined, so the table holds 2 or 1 entries.
static ncclKernelMatch const ncclKerns[] = {
  {(void *)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true},
#ifdef ENABLE_COLLTRACE
  {(void *)NCCL_KERN_NAME_DEBUG(SendRecv, RING, SIMPLE, Sum, int8_t), true},
#endif
};
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
+5 -2
Zobrazit soubor
@@ -224,7 +224,10 @@ static float ncclTopoXGMISpeed(int gcn) {
return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH;
}
#define ncclGetKernelIndex(p_comm) \
((p_comm)->collTraceThread ? 1 : 0)
// Selects the kernel-table index for a communicator.
//  - ENABLE_COLLTRACE defined: 1 when the communicator has a collTraceThread, else 0.
//  - ENABLE_COLLTRACE not defined: always 0.
// The guard is #ifdef (not #if) so it agrees with the other ENABLE_COLLTRACE guards in
// the code base and still works when the macro is defined without a value, where
// `#if ENABLE_COLLTRACE` would be a preprocessing error.
#ifdef ENABLE_COLLTRACE
#define ncclGetKernelIndex(p_comm) ((p_comm)->collTraceThread ? 1 : 0)
#else
#define ncclGetKernelIndex(p_comm) (0)
#endif
#endif
+10 -7
Zobrazit soubor
@@ -4,9 +4,6 @@ import pandas as pd
import plotly.graph_objects as go
import argparse
# Specify the path to the .log file
log_file = '../../build/release/time_trace.log'
def generateRandomColors(df, colorList):
for _ in range(len(df)):
@@ -102,8 +99,14 @@ def plotCompileTime(log_file, minVal):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--min_val", nargs='?', default='5', type=int, help="Ignore any if it's less than the value provided.")
parser.add_argument("--include_linking", action='store_true', help="Include linking when plotting.")
parser.add_argument("--min_val", nargs='?', default='5', type=int, help="Ignore any if it's less than the value provided")
parser.add_argument("--include_linking", action='store_true', help="Include linking when plotting")
parser.add_argument("--log_file_path", type=str, help="Location of the log file generated with --time-trace flag")
args = parser.parse_args()
plotCompileTime(log_file, args.min_val)
# Use the path given via --log_file_path when present; otherwise fall back to the hard-coded default
log_file_path = args.log_file_path if args.log_file_path is not None else '../../build/release/time_trace.log'
plotCompileTime(log_file_path, args.min_val)