diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 39db45df3a..7efdf3a885 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -7,6 +7,8 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ### Added * `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters +* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data ibv_post_sends +* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`) ### Changed diff --git a/projects/rccl/install.sh b/projects/rccl/install.sh index c174be03ea..9191585345 100755 --- a/projects/rccl/install.sh +++ b/projects/rccl/install.sh @@ -23,6 +23,7 @@ enable_ninja="" install_dependencies=false install_library=false install_prefix="${ROCM_PATH}" +log_trace=false msccl_kernel_enabled=true mscclpp_enabled=true num_parallel_jobs=$(nproc) @@ -56,6 +57,7 @@ function display_help() echo " --amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)" echo " --no_clean Don't delete files if they already exist" echo " --npkit-enable Compile with npkit enabled" + echo " --log-trace Build with log trace enabled (i.e. NCCL_DEBUG=TRACE)" echo " --openmp-test-enable Enable OpenMP in rccl unit tests" echo " -p|--package_build Build RCCL package" echo " --prefix Specify custom directory to install RCCL to (default: \`/opt/rocm\`)" @@ -75,7 +77,7 @@ function display_help() # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ "$?" 
-eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@") else echo "Need a new version of getopt" exit 1 @@ -106,6 +108,7 @@ while true; do --amdgpu_targets) build_amdgpu_targets=${2}; shift 2 ;; --no_clean) clean_build=false; shift ;; --npkit-enable) npkit_enabled=true; shift ;; + --log-trace) log_trace=true; shift ;; --openmp-test-enable) openmp_test_enabled=true; shift ;; -p | --package_build) build_package=true; shift ;; --prefix) install_library=true; install_prefix=${2}; shift 2 ;; @@ -251,6 +254,11 @@ if [[ "${install_library}" == true ]]; then cmake_common_options="${cmake_common_options} -DCMAKE_INSTALL_PREFIX=${install_prefix}" fi +# Enable trace debug level +if [[ "${log_trace}" == true ]]; then + cmake_common_options="${cmake_common_options} -DTRACE=ON" +fi + # Disable ROCTX if [[ "${roctx_enabled}" == false ]]; then cmake_common_options="${cmake_common_options} -DROCTX=OFF" diff --git a/projects/rccl/src/debug.cc b/projects/rccl/src/debug.cc index ba8c85443e..7f48bb4e90 100644 --- a/projects/rccl/src/debug.cc +++ b/projects/rccl/src/debug.cc @@ -89,6 +89,8 @@ static void ncclDebugInit() { mask = NCCL_REG; } else if (strcasecmp(subsys, "PROFILE") == 0) { mask = NCCL_PROFILE; + 
} else if (strcasecmp(subsys, "VERBS") == 0) { + mask = NCCL_VERBS; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } diff --git a/projects/rccl/src/include/nccl_common.h b/projects/rccl/src/include/nccl_common.h index 409941693a..de34202725 100644 --- a/projects/rccl/src/include/nccl_common.h +++ b/projects/rccl/src/include/nccl_common.h @@ -32,6 +32,7 @@ typedef enum { NCCL_BOOTSTRAP = 0x1000, NCCL_REG = 0x2000, NCCL_PROFILE = 0x4000, + NCCL_VERBS = 0x8000, NCCL_ALL = ~0 } ncclDebugLogSubSys; diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index ca995139a3..231ef40f5d 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -787,6 +787,8 @@ struct ncclIbDevInfo { //remote dev info union ibv_gid remoteGid; + + int ibv_dev_index; }; // Struct containing everything needed to establish connections @@ -999,7 +1001,7 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base pthread_mutex_unlock(&ibDev->lock); // CQ is sized to accommodate the max SQ + RQ WQE completions. If each SQ WQE could be signaled, then, - // for each QP, there can be 2*MAX_REQUESTS completions for SQ and MAX_REQUESTS completions for RQ. + // for each QP, there can be 2*MAX_REQUESTS completions for SQ and MAX_REQUESTS completions for RQ. 
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 3*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); return ncclSuccess; @@ -1189,7 +1191,7 @@ ib_connect_check: devInfo->ib_port = ibDev->portNum; devInfo->mtu = ibDev->portAttr.active_mtu; devInfo->lid = ibDev->portAttr.lid; - + devInfo->ibv_dev_index = commDev->base.ibDevN; // Prepare my fifo NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); devInfo->fifoRkey = commDev->fifoMr->rkey; @@ -1269,7 +1271,6 @@ ib_connect: comm->base.remDevs[i] = remMeta.devs[i]; comm->base.remDevs[i].remoteGid.global.interface_id = comm->base.remDevs[i].gid.global.interface_id; comm->base.remDevs[i].remoteGid.global.subnet_prefix = comm->base.remDevs[i].gid.global.subnet_prefix; - // Retain remote sizes fifo info and prepare RDMA ops comm->remSizesFifo.rkeys[i] = remMeta.devs[i].fifoRkey; comm->remSizesFifo.addr = remMeta.fifoAddr; @@ -1473,6 +1474,7 @@ ib_recv: meta.devs[i].ib_port = ibDev->portNum; meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; + meta.devs[i].ibv_dev_index = rCommDev->base.ibDevN; // Adjust the MTU remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); @@ -1760,6 +1762,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { reqs[r]->send.offset += chunkSize; comm->sges[r].addr += chunkSize; comm->wrs[r].wr.rdma.remote_addr += chunkSize; + + TRACE(NCCL_VERBS, "Posted send wr_id=%lu, wr_indx=%d, qp_num=%d, src_nic=%d, dst_nic=%d, dlid=%d, opcode=%d, send_flags=%d, imm_data=%d, remote_addr=%lx, rkey=%x, length=%d, lkey=%x", + comm->wrs[r].wr_id, r, qp->qp->qp_num, comm->devs[qp->devIndex].base.ibDevN , comm->base.remDevs[qp->remDevIdx].ibv_dev_index, 
comm->base.remDevs[qp->remDevIdx].lid, + comm->wrs[r].opcode, comm->wrs[r].send_flags, comm->wrs[r].imm_data, comm->wrs[r].wr.rdma.remote_addr, + comm->wrs[r].wr.rdma.rkey,comm->wrs[r].sg_list ? comm->wrs[r].sg_list->length : 0, comm->wrs[r].sg_list ? comm->wrs[r].sg_list->lkey : 0); } // Select the next qpIndex @@ -1927,6 +1934,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(ctsQp->qp, &wr, &bad_wr)); + + // Trace the fields of the work request just passed to wrap_ibv_post_send; dlid is printed with %d, matching the equivalent trace in ncclIbMultiSend. + TRACE(NCCL_VERBS, "Posted send wr_id=%lu, wr_indx=%d, qp_num=%d, src_nic=%d, dst_nic=%d, dlid=%d, opcode=%d, send_flags=%d, imm_data=%d, remote_addr=%lx, rkey=%x, length=%d, lkey=%x", + wr.wr_id, 0, ctsQp->qp->qp_num, comm->devs[ctsQp->devIndex].base.ibDevN, comm->base.remDevs[ctsQp->remDevIdx].ibv_dev_index, comm->base.remDevs[ctsQp->remDevIdx].lid, + wr.opcode, wr.send_flags, wr.imm_data, wr.wr.rdma.remote_addr, wr.wr.rdma.rkey, wr.sg_list ? wr.sg_list->length : 0, wr.sg_list ? wr.sg_list->lkey : 0); + comm->remFifo.fifoTail++; return ncclSuccess;