Add IB verbs logging and enable traces through install.sh (#1511)

* Add IB Verbs logging

* Simplify tracing and undo debug.h changes

* Update debug.h

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update CHANGELOG.md

* Exchange remote comm device index

[ROCm/rccl commit: dc75209dd7]
このコミットが含まれているのは:
Mustafa Abduljabbar
2025-01-31 12:35:39 -05:00
committed by GitHub
コミット f58025185e
5個のファイルの変更29行の追加4行の削除
+2
ファイルの表示
@@ -7,6 +7,8 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
### Added
* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters
* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` generates traces for FIFO and data `ibv_post_send` calls
* `--log-trace` flag to enable traces through the `install.sh` script (e.g. `./install.sh --log-trace`)
### Changed
+9 -1
ファイルの表示
@@ -23,6 +23,7 @@ enable_ninja=""
install_dependencies=false
install_library=false
install_prefix="${ROCM_PATH}"
log_trace=false
msccl_kernel_enabled=true
mscclpp_enabled=true
num_parallel_jobs=$(nproc)
@@ -56,6 +57,7 @@ function display_help()
echo " --amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)"
echo " --no_clean Don't delete files if they already exist"
echo " --npkit-enable Compile with npkit enabled"
echo " --log-trace Build with log trace enabled (i.e. NCCL_DEBUG=TRACE)"
echo " --openmp-test-enable Enable OpenMP in rccl unit tests"
echo " -p|--package_build Build RCCL package"
echo " --prefix Specify custom directory to install RCCL to (default: \`/opt/rocm\`)"
@@ -75,7 +77,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -106,6 +108,7 @@ while true; do
--amdgpu_targets) build_amdgpu_targets=${2}; shift 2 ;;
--no_clean) clean_build=false; shift ;;
--npkit-enable) npkit_enabled=true; shift ;;
--log-trace) log_trace=true; shift ;;
--openmp-test-enable) openmp_test_enabled=true; shift ;;
-p | --package_build) build_package=true; shift ;;
--prefix) install_library=true; install_prefix=${2}; shift 2 ;;
@@ -251,6 +254,11 @@ if [[ "${install_library}" == true ]]; then
cmake_common_options="${cmake_common_options} -DCMAKE_INSTALL_PREFIX=${install_prefix}"
fi
# Enable trace debug level
if [[ "${log_trace}" == true ]]; then
cmake_common_options="${cmake_common_options} -DTRACE=ON"
fi
# Disable ROCTX
if [[ "${roctx_enabled}" == false ]]; then
cmake_common_options="${cmake_common_options} -DROCTX=OFF"
+2
ファイルの表示
@@ -89,6 +89,8 @@ static void ncclDebugInit() {
mask = NCCL_REG;
} else if (strcasecmp(subsys, "PROFILE") == 0) {
mask = NCCL_PROFILE;
} else if (strcasecmp(subsys, "VERBS") == 0) {
mask = NCCL_VERBS;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
+1
ファイルの表示
@@ -32,6 +32,7 @@ typedef enum {
NCCL_BOOTSTRAP = 0x1000,
NCCL_REG = 0x2000,
NCCL_PROFILE = 0x4000,
NCCL_VERBS = 0x8000,
NCCL_ALL = ~0
} ncclDebugLogSubSys;
+15 -3
ファイルの表示
@@ -787,6 +787,8 @@ struct ncclIbDevInfo {
//remote dev info
union ibv_gid remoteGid;
int ibv_dev_index;
};
// Struct containing everything needed to establish connections
@@ -999,7 +1001,7 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base
pthread_mutex_unlock(&ibDev->lock);
// CQ is sized to accommodate the max SQ + RQ WQE completions. If each SQ WQE could be signaled, then,
// for each QP, there can be 2*MAX_REQUESTS completions for SQ and MAX_REQUESTS completions for RQ.
// for each QP, there can be 2*MAX_REQUESTS completions for SQ and MAX_REQUESTS completions for RQ.
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 3*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
return ncclSuccess;
@@ -1189,7 +1191,7 @@ ib_connect_check:
devInfo->ib_port = ibDev->portNum;
devInfo->mtu = ibDev->portAttr.active_mtu;
devInfo->lid = ibDev->portAttr.lid;
devInfo->ibv_dev_index = commDev->base.ibDevN;
// Prepare my fifo
NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
devInfo->fifoRkey = commDev->fifoMr->rkey;
@@ -1269,7 +1271,6 @@ ib_connect:
comm->base.remDevs[i] = remMeta.devs[i];
comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].gid.global.interface_id;
comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].gid.global.subnet_prefix;
// Retain remote sizes fifo info and prepare RDMA ops
comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey;
comm->remSizesFifo.addr = remMeta.fifoAddr;
@@ -1473,6 +1474,7 @@ ib_recv:
meta.devs[i].ib_port = ibDev->portNum;
meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id;
meta.devs[i].ibv_dev_index = rCommDev->base.ibDevN;
// Adjust the MTU
remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu);
@@ -1760,6 +1762,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
reqs[r]->send.offset += chunkSize;
comm->sges[r].addr += chunkSize;
comm->wrs[r].wr.rdma.remote_addr += chunkSize;
TRACE(NCCL_VERBS, "Posted send wr_id=%lu, wr_indx=%d, qp_num=%d, src_nic=%d, dst_nic=%d, dlid=%d, opcode=%d, send_flags=%d, imm_data=%d, remote_addr=%lx, rkey=%x, length=%d, lkey=%x",
comm->wrs[r].wr_id, r, qp->qp->qp_num, comm->devs[qp->devIndex].base.ibDevN , comm->base.remDevs[qp->remDevIdx].ibv_dev_index, comm->base.remDevs[qp->remDevIdx].lid,
comm->wrs[r].opcode, comm->wrs[r].send_flags, comm->wrs[r].imm_data, comm->wrs[r].wr.rdma.remote_addr,
comm->wrs[r].wr.rdma.rkey,comm->wrs[r].sg_list ? comm->wrs[r].sg_list->length : 0, comm->wrs[r].sg_list ? comm->wrs[r].sg_list->lkey : 0);
}
// Select the next qpIndex
@@ -1927,6 +1934,11 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(ctsQp->qp, &wr, &bad_wr));
TRACE(NCCL_VERBS, "Posted send wr_id=%lu, wr_indx=%d, qp_num=%d, src_nic=%d, dst_nic=%d, dlid=%lu, opcode=%d, send_flags=%d, imm_data=%d, remote_addr=%lx, rkey=%x, length=%d, lkey=%x",
wr.wr_id, 0, ctsQp->qp->qp_num, comm->devs[ctsQp->devIndex].base.ibDevN, comm->base.remDevs[ctsQp->remDevIdx].ibv_dev_index, comm->base.remDevs[ctsQp->remDevIdx].lid,
wr.opcode, wr.send_flags, wr.imm_data, wr.wr.rdma.remote_addr, wr.wr.rdma.rkey, wr.sg_list ? wr.sg_list->length : 0, wr.sg_list ? wr.sg_list->lkey : 0);
comm->remFifo.fifoTail++;
return ncclSuccess;