Merge commit '3d4813d99196bb349eccd50a925e2addc8f1622c' into develop
This commit is contained in:
+2
-2
@@ -34,7 +34,7 @@ jobs:
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
|
||||
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
|
||||
|
||||
- name: Checkout rccl repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
@@ -143,5 +143,5 @@ jobs:
|
||||
with:
|
||||
amdgpu_families: ${{ inputs.amdgpu_families }}
|
||||
artifact_group: ${{ inputs.artifact_group }}
|
||||
test_runs_on: linux-mi325-1gpu-ossci-rocm-frac
|
||||
test_runs_on: linux-mi325-4gpu-ossci-rocm
|
||||
artifact_run_id: ${{ github.run_id }}
|
||||
|
||||
@@ -39,14 +39,15 @@ jobs:
|
||||
env:
|
||||
VENV_DIR: ${{ github.workspace }}/.venv
|
||||
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
|
||||
OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
|
||||
OUTPUT_ARTIFACTS_DIR: /apps/cvs_tests/dist_new/dist/rocm
|
||||
THEROCK_BIN_DIR: "./build/bin"
|
||||
AWS_SHARED_CREDENTIALS_FILE: /apps/cvs_tests/awsconfig/credentials.ini
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
|
||||
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
@@ -61,20 +62,11 @@ jobs:
|
||||
|
||||
# The following step uses Slurm to run multi-node RCCL tests on the Slurm MI350X cluster.
|
||||
# salloc will hold 4 nodes while the commands inside the block run. After the block completes, salloc automatically releases the nodes.
|
||||
# sbatch script runs rccl_heatmap_cvs script which validates and generates a bandwidth heatmap file for different rccl collectives
|
||||
- name: Test gfx950
|
||||
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
|
||||
run: |
|
||||
salloc -N 4 -p meta64 -t 04:00:00 --exclusive bash -c "
|
||||
source /home/arravikum/TheRock/.venv/bin/activate &&
|
||||
cd /home/arravikum/cvs &&
|
||||
python input/setup.py &&
|
||||
pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
|
||||
--cluster_file ./input/cluster.json \
|
||||
--config_file ./input/mi350_config.json \
|
||||
--log-file=/tmp/rccl_log.log \
|
||||
--html=/home/arravikum/cvs/test_reports/ci_test_report.html \
|
||||
--capture=tee-sys \
|
||||
--self-contained-html"
|
||||
SETUP_NODES=1 sbatch --wait -N4 /apps/cvs_tests/cvs-sbatch/sbatch/default.sbatch
|
||||
|
||||
- name: Configure AWS Credentials for non-forked repos
|
||||
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
|
||||
@@ -91,6 +83,6 @@ jobs:
|
||||
python3 build_tools/github_actions/upload_test_report_script.py \
|
||||
--run-id "${{ github.run_id }}" \
|
||||
--amdgpu-family "${{ inputs.amdgpu_families }}" \
|
||||
--report-path "/home/arravikum/cvs/test_reports" \
|
||||
--report-path "/apps/cvs_tests/test_reports" \
|
||||
--log-destination "/logs/gfx950-dcgpu" \
|
||||
--index-file-name "index_rccl_test_report.html"
|
||||
|
||||
@@ -30,13 +30,16 @@ jobs:
|
||||
name: 'Test single-node'
|
||||
runs-on: ${{ inputs.test_runs_on }}
|
||||
container:
|
||||
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
|
||||
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98
|
||||
options: --ipc host
|
||||
--group-add video
|
||||
--device /dev/kfd
|
||||
--device /dev/dri
|
||||
--group-add 110
|
||||
--ulimit memlock=-1:-1
|
||||
--security-opt seccomp=unconfined
|
||||
--env-file /etc/podinfo/gha-gpu-isolation-settings
|
||||
--user 0:0
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -50,7 +53,7 @@ jobs:
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
repository: "ROCm/TheRock"
|
||||
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
|
||||
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
|
||||
|
||||
- name: Run setup test environment workflow
|
||||
uses: './.github/actions/setup_test_environment'
|
||||
@@ -70,5 +73,5 @@ jobs:
|
||||
# TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
|
||||
run: |
|
||||
pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
|
||||
--log-cli-level=info \
|
||||
-k "not test_rccl_correctness_tests"
|
||||
-k "not test_rccl_correctness_tests" \
|
||||
--log-cli-level=info
|
||||
|
||||
@@ -3,6 +3,6 @@
|
||||
/coverage/
|
||||
build/
|
||||
ext/
|
||||
|
||||
src/transport/net_ib_rocm.cc
|
||||
# Visual Studio Code
|
||||
.vscode
|
||||
.vscode
|
||||
|
||||
@@ -8,3 +8,7 @@
|
||||
url = https://github.com/nlohmann/json.git
|
||||
ignore = dirty
|
||||
shallow = true
|
||||
[submodule "ext-src/rocSHMEM"]
|
||||
path = ext-src/rocSHMEM
|
||||
url = https://github.com/ROCm/rocSHMEM.git
|
||||
branch = develop
|
||||
|
||||
@@ -2,12 +2,29 @@
|
||||
|
||||
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
|
||||
|
||||
## Unreleased - RCCL 2.28.3 for ROCm 7.11
|
||||
|
||||
### Known issues
|
||||
* AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm.
|
||||
* ROCTx feature needs to be verified.
|
||||
* Profiler plugin needs to be verified.
|
||||
|
||||
### Changed
|
||||
* Compatibility with NCCL 2.28.3.
|
||||
* The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script.
|
||||
* MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL.
|
||||
|
||||
## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
|
||||
|
||||
### Changed
|
||||
|
||||
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
|
||||
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
|
||||
* Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
|
||||
* Enabling WarpSpeed in auto mode using `RCCL_WARP_SPEED_AUTO` optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64 MB, and ReduceScatter from 256 MB.
|
||||
* The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.
|
||||
|
||||
### Known issues
|
||||
* AllToAllv/AllToAll hangs when run on a single GPU.
|
||||
|
||||
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
|
||||
|
||||
|
||||
+180
-33
@@ -26,7 +26,7 @@ option(BUILD_TESTS "Build unit test programs"
|
||||
option(COLLTRACE "Collective Trace Option" ON)
|
||||
option(DUMP_ASM "Disassemble and dump" OFF)
|
||||
option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
|
||||
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
|
||||
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" OFF)
|
||||
option(ENABLE_MSCCLPP "Enable MSCCL++" OFF)
|
||||
option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP" OFF)
|
||||
option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF)
|
||||
@@ -42,6 +42,7 @@ option(TIMETRACE "Enable time-trace during compila
|
||||
option(TRACE "Enable additional tracing" OFF)
|
||||
option(FAULT_INJECTION "Enable fault injection" ON)
|
||||
option(QUIET_WARNINGS "Supress compiler warnings" OFF)
|
||||
option(ENABLE_ROCSHMEM "Enable rocSHMEM support in RCCL" OFF)
|
||||
|
||||
# Default GPU architectures to build
|
||||
#==================================================================================================
|
||||
@@ -65,6 +66,11 @@ include(CheckSymbolExists)
|
||||
include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
|
||||
include(cmake/CheckSymbolExistsNoWarn.cmake)
|
||||
|
||||
# Include rocSHMEM build module only if enabled
|
||||
if(ENABLE_ROCSHMEM)
|
||||
include(cmake/ROCSHMEM.cmake)
|
||||
endif()
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
|
||||
|
||||
# Build only for local GPU architecture
|
||||
@@ -80,6 +86,9 @@ endif()
|
||||
# Determine which GPU architectures to build for
|
||||
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
|
||||
|
||||
# ROCM NetIB patch
|
||||
include(cmake/rocmIb.cmake)
|
||||
|
||||
# Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
|
||||
if (BUILD_ADDRESS_SANITIZER)
|
||||
SET(amdgpu_targets "")
|
||||
@@ -252,26 +261,56 @@ find_package(hsa-runtime64 REQUIRED)
|
||||
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
|
||||
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
|
||||
|
||||
## Check for ROCM-smi
|
||||
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
|
||||
if (rocm_smi_FOUND)
|
||||
message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
|
||||
else()
|
||||
message(STATUS "Checking old include directory structure for rocm_smi")
|
||||
set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
|
||||
set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
|
||||
set(ROCM_SMI_LIBRARIES rocm_smi64)
|
||||
## Check for amd-smi if ROCm 7.11.0 or newer
|
||||
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
|
||||
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
|
||||
if(amd_smi_FOUND)
|
||||
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
|
||||
message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
|
||||
set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
|
||||
set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
|
||||
set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
|
||||
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
|
||||
endif()
|
||||
message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
|
||||
set(SMI_LIBRARIES amd_smi)
|
||||
set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT USE_AMDSMI)
|
||||
## Fall back to rocm-smi if amd-smi was not found or ROCm < 7.11.0
|
||||
message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
|
||||
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
|
||||
if(rocm_smi_FOUND)
|
||||
set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
|
||||
set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
|
||||
else()
|
||||
message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
|
||||
set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
|
||||
set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
|
||||
endif()
|
||||
message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
|
||||
set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
|
||||
set(SMI_LIBRARIES rocm_smi64)
|
||||
|
||||
check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
|
||||
|
||||
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
|
||||
file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
|
||||
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
|
||||
if(${matchres} EQUAL -1)
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
|
||||
else()
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
|
||||
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
|
||||
endif ()
|
||||
endif()
|
||||
check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
|
||||
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
|
||||
file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
|
||||
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
|
||||
if(${matchres} EQUAL -1)
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
|
||||
else()
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
|
||||
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
|
||||
endif ()
|
||||
|
||||
## Check for BFD library if custom backtrace is requested
|
||||
if(BUILD_BFD)
|
||||
@@ -318,6 +357,8 @@ if(BUILD_BFD)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
# Check for --amdgpu-kernarg-preload-count
|
||||
check_cxx_compiler_flag("-mllvm --amdgpu-kernarg-preload-count=16" HAVE_KERNARG_PRELOAD)
|
||||
if (HAVE_KERNARG_PRELOAD)
|
||||
@@ -333,6 +374,7 @@ endif()
|
||||
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
|
||||
set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
|
||||
|
||||
|
||||
# Check if any of the supported architectures are in GPU_TARGETS
|
||||
set(ARCH_MATCH_FOUND OFF)
|
||||
set(MSCCLPP_GPU_TARGETS "")
|
||||
@@ -355,6 +397,20 @@ if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
|
||||
message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
|
||||
endif()
|
||||
|
||||
## Disable WARP_SPEED if the build environment is invalid
|
||||
set(WARP_SPEED_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
|
||||
set(ARCH_MATCH_FOUND OFF)
|
||||
foreach(ARCH IN LISTS GPU_TARGETS)
|
||||
if(ARCH IN_LIST WARP_SPEED_SUPPORTED_ARCHS)
|
||||
set(ARCH_MATCH_FOUND ON)
|
||||
endif()
|
||||
endforeach()
|
||||
if (NOT ARCH_MATCH_FOUND)
|
||||
set(ENABLE_WARP_SPEED OFF)
|
||||
message(WARNING "Can only build WARP_SPEED for supported GPU_TARGETS: ${WARP_SPEED_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling WARP_SPEED build")
|
||||
endif()
|
||||
|
||||
|
||||
# cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
|
||||
execute_process(
|
||||
COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
|
||||
@@ -437,9 +493,12 @@ configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h) # Used b
|
||||
set(SRC_FILES
|
||||
src/allocator.cc
|
||||
src/bootstrap.cc
|
||||
src/ce_coll.cc
|
||||
src/channel.cc
|
||||
src/collectives.cc
|
||||
src/commDump.cc
|
||||
src/debug.cc
|
||||
src/dev_runtime.cc
|
||||
src/enqueue.cc
|
||||
src/group.cc
|
||||
src/init.cc
|
||||
@@ -448,11 +507,12 @@ set(SRC_FILES
|
||||
src/msccl.cc
|
||||
src/proxy.cc
|
||||
src/rccl_wrap.cc
|
||||
src/symmetric.cc
|
||||
src/sym_kernels.cc
|
||||
src/transport.cc
|
||||
src/device/all_gather.h
|
||||
src/device/all_reduce.h
|
||||
src/device/alltoall_pivot.h
|
||||
src/device/alltoall_gda.h
|
||||
src/device/broadcast.h
|
||||
src/device/common.h
|
||||
src/device/common_kernel.h
|
||||
@@ -498,6 +558,7 @@ set(SRC_FILES
|
||||
src/include/BfdBacktrace.hpp
|
||||
src/include/bitops.h
|
||||
src/include/bootstrap.h
|
||||
src/include/ce_coll.h
|
||||
src/include/channel.h
|
||||
src/include/checks.h
|
||||
src/include/collectives.h
|
||||
@@ -507,6 +568,7 @@ set(SRC_FILES
|
||||
src/include/cpuset.h
|
||||
# src/include/cudawrap.h
|
||||
src/include/debug.h
|
||||
src/include/dev_runtime.h
|
||||
src/include/device.h
|
||||
src/include/enqueue.h
|
||||
src/include/gdrwrap.h
|
||||
@@ -521,6 +583,7 @@ set(SRC_FILES
|
||||
src/include/ipcsocket.h
|
||||
src/include/mnnvl.h
|
||||
src/include/nccl_common.h
|
||||
src/include/nccl_device.h
|
||||
src/include/net_device.h
|
||||
src/include/net.h
|
||||
src/include/nvmlwrap.h
|
||||
@@ -537,16 +600,16 @@ set(SRC_FILES
|
||||
src/include/register.h
|
||||
src/include/register_inline.h
|
||||
src/include/rccl_float8.h
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/include/rocmwrap.h
|
||||
src/include/roctx.h
|
||||
src/include/recorder.h
|
||||
src/include/scheduler.h
|
||||
src/include/shm.h
|
||||
src/include/shmutils.h
|
||||
src/include/signals.h
|
||||
src/include/socket.h
|
||||
src/include/strongstream.h
|
||||
src/include/symmetric.h
|
||||
src/include/sym_kernels.h
|
||||
src/include/timer.h
|
||||
src/include/transport.h
|
||||
src/include/trees.h
|
||||
@@ -555,12 +618,32 @@ set(SRC_FILES
|
||||
src/include/mlx5/mlx5dvcore.h
|
||||
src/include/mlx5/mlx5dvsymbols.h
|
||||
src/include/mlx5/mlx5dvwrap.h
|
||||
src/include/ionic/ionicdvcore.h
|
||||
src/include/ionic/ionicdvsymbols.h
|
||||
src/include/ionic/ionicdvwrap.h
|
||||
src/include/msccl/msccl_lifecycle.h
|
||||
src/include/msccl/msccl_parser.h
|
||||
src/include/msccl/msccl_scheduler.h
|
||||
src/include/msccl/msccl_setup.h
|
||||
src/include/msccl/msccl_status.h
|
||||
src/include/msccl/msccl_struct.h
|
||||
src/include/nccl_device/comm.h
|
||||
src/include/nccl_device/coop.h
|
||||
src/include/nccl_device/core.h
|
||||
src/include/nccl_device/ll_a2a.h
|
||||
src/include/nccl_device/mem_barrier.h
|
||||
src/include/nccl_device/ptr.h
|
||||
src/include/nccl_device/utility.h
|
||||
src/include/nccl_device/impl/comm__funcs.h
|
||||
src/include/nccl_device/impl/comm__types.h
|
||||
src/include/nccl_device/impl/core__funcs.h
|
||||
src/include/nccl_device/impl/core__types.h
|
||||
src/include/nccl_device/impl/ll_a2a__funcs.h
|
||||
src/include/nccl_device/impl/ll_a2a__types.h
|
||||
src/include/nccl_device/impl/mem_barrier__funcs.h
|
||||
src/include/nccl_device/impl/mem_barrier__types.h
|
||||
src/include/nccl_device/impl/ptr__funcs.h
|
||||
src/include/nccl_device/impl/ptr__types.h
|
||||
src/include/npkit/npkit.h
|
||||
src/include/npkit/npkit_event.h
|
||||
src/include/npkit/npkit_struct.h
|
||||
@@ -608,6 +691,7 @@ set(SRC_FILES
|
||||
src/include/plugin/net/net_v8.h
|
||||
src/include/plugin/net/net_v9.h
|
||||
src/include/plugin/net/net_v10.h
|
||||
src/include/plugin/net/net_v11.h
|
||||
src/include/plugin/profiler/net_ib_v1.h
|
||||
src/include/plugin/profiler/net_ib.h
|
||||
src/include/plugin/profiler/net_socket_v1.h
|
||||
@@ -616,9 +700,11 @@ set(SRC_FILES
|
||||
src/include/plugin/profiler/profiler_v2.h
|
||||
src/include/plugin/profiler/profiler_v3.h
|
||||
src/include/plugin/profiler/profiler_v4.h
|
||||
src/include/plugin/profiler/profiler_v5.h
|
||||
src/include/plugin/tuner/tuner_v2.h
|
||||
src/include/plugin/tuner/tuner_v3.h
|
||||
src/include/plugin/tuner/tuner_v4.h
|
||||
src/include/plugin/tuner/tuner_v5.h
|
||||
src/misc/alt_rsmi.cc
|
||||
src/misc/archinfo.cc
|
||||
src/misc/argcheck.cc
|
||||
@@ -631,11 +717,12 @@ set(SRC_FILES
|
||||
src/misc/ipcsocket.cc
|
||||
src/misc/mlx5dvsymbols.cc
|
||||
src/misc/mlx5dvwrap.cc
|
||||
src/misc/ionicdvsymbols.cc
|
||||
src/misc/ionicdvwrap.cc
|
||||
src/misc/npkit.cc
|
||||
# src/misc/nvmlwrap.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/param.cc
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
src/misc/rocmwrap.cc
|
||||
src/misc/roctx.cc
|
||||
src/misc/recorder.cc
|
||||
@@ -649,6 +736,9 @@ set(SRC_FILES
|
||||
src/misc/msccl/msccl_setup.cc
|
||||
src/misc/msccl/msccl_status.cc
|
||||
src/misc/proxy_trace/proxy_trace.cc
|
||||
src/nccl_device/core.cc
|
||||
src/nccl_device/ll_a2a.cc
|
||||
src/nccl_device/mem_barrier.cc
|
||||
src/plugin/net.cc
|
||||
src/plugin/plugin_open.cc
|
||||
src/plugin/profiler.cc
|
||||
@@ -658,13 +748,16 @@ set(SRC_FILES
|
||||
src/plugin/net/net_v8.cc
|
||||
src/plugin/net/net_v9.cc
|
||||
src/plugin/net/net_v10.cc
|
||||
src/plugin/net/net_v11.cc
|
||||
src/plugin/profiler/profiler_v1.cc
|
||||
src/plugin/profiler/profiler_v2.cc
|
||||
src/plugin/profiler/profiler_v3.cc
|
||||
src/plugin/profiler/profiler_v4.cc
|
||||
src/plugin/profiler/profiler_v5.cc
|
||||
src/plugin/tuner/tuner_v2.cc
|
||||
src/plugin/tuner/tuner_v3.cc
|
||||
src/plugin/tuner/tuner_v4.cc
|
||||
src/plugin/tuner/tuner_v5.cc
|
||||
src/ras/client.cc
|
||||
src/ras/client_support.cc
|
||||
src/ras/collectives.cc
|
||||
@@ -675,10 +768,12 @@ set(SRC_FILES
|
||||
src/register/coll_reg.cc
|
||||
src/register/register.cc
|
||||
src/register/sendrecv_reg.cc
|
||||
src/scheduler/symmetric_sched.cc
|
||||
src/transport/coll_net.cc
|
||||
src/transport/generic.cc
|
||||
src/transport/net.cc
|
||||
src/transport/net_ib.cc
|
||||
src/transport/net_ib_rocm.cc
|
||||
src/transport/net_socket.cc
|
||||
src/transport/nvls.cc
|
||||
src/transport/p2p.cc
|
||||
@@ -695,6 +790,19 @@ set(SRC_FILES
|
||||
src/misc/latency_profiler/CollTraceUtils.cc
|
||||
)
|
||||
|
||||
if(USE_AMDSMI)
|
||||
set(SMI_SOURCES
|
||||
src/include/amdsmi_wrap.h
|
||||
src/misc/amdsmi_wrap.cc
|
||||
)
|
||||
else()
|
||||
set(SMI_SOURCES
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
)
|
||||
endif()
|
||||
list(APPEND SRC_FILES ${SMI_SOURCES})
|
||||
|
||||
if (ENABLE_MSCCL_KERNEL)
|
||||
set(MSCCL_KERNEL_SOURCES
|
||||
src/device/msccl_kernel_impl.h
|
||||
@@ -846,6 +954,8 @@ target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/nccl_device)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/ionic)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
|
||||
target_include_directories(rccl PRIVATE ${HSA_INCLUDE_PATH})
|
||||
@@ -858,26 +968,59 @@ if(ROCTX_ENABLE)
|
||||
target_include_directories(rccl PRIVATE ${ROCTRACER_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
|
||||
## Set RCCL compile definitions
|
||||
if(COLLTRACE)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
|
||||
endif()
|
||||
if(ENABLE_MSCCL_KERNEL)
|
||||
message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
|
||||
target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
|
||||
endif()
|
||||
if(ENABLE_MSCCLPP)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
|
||||
endif()
|
||||
if(HAVE_ROCM_SMI64CONFIG)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
|
||||
|
||||
if(USE_AMDSMI)
|
||||
target_compile_definitions(rccl PRIVATE USE_AMDSMI)
|
||||
else()
|
||||
if(HAVE_ROCM_SMI64CONFIG)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
|
||||
endif()
|
||||
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
endif()
|
||||
endif()
|
||||
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
if(ENABLE_WARP_SPEED)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
|
||||
endif()
|
||||
if(ENABLE_ROCSHMEM)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_ROCSHMEM)
|
||||
endif()
|
||||
|
||||
# ==== rocSHMEM integration (optional) ====
|
||||
|
||||
if (ENABLE_ROCSHMEM)
|
||||
add_rocshmem_targets()
|
||||
# Ensure rocSHMEM is fully built/installed before compiling rccl
|
||||
if (TARGET rocshmem_ext)
|
||||
add_dependencies(rccl rocshmem_ext)
|
||||
endif()
|
||||
|
||||
if (ROCSHMEM_INCLUDE_DIR)
|
||||
target_include_directories(rccl PRIVATE ${ROCSHMEM_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
# Linking against ${ROCSHMEM_LIBRARY} was moved to the target_link_libraries section below (next to the MSCCL++ link step)
|
||||
## target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
|
||||
target_link_libraries(rccl PRIVATE ${IBVERBS})
|
||||
|
||||
endif()
|
||||
|
||||
# NPKit flags
|
||||
## May be better to move these to a separate file
|
||||
if(ENABLE_NPKIT)
|
||||
message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
@@ -1099,8 +1242,7 @@ if(ENABLE_CODE_COVERAGE)
|
||||
message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.")
|
||||
|
||||
target_compile_options(rccl PRIVATE
|
||||
-fvisibility=default -Xarch_host -fprofile-instr-generate
|
||||
-Xarch_host -fcoverage-mapping)
|
||||
-fvisibility=default -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping)
|
||||
|
||||
set(COVERAGE_SHARED_LINKER_FLAGS
|
||||
-fprofile-generate
|
||||
@@ -1169,7 +1311,7 @@ if (FAULT_INJECTION)
|
||||
endif()
|
||||
|
||||
## Set RCCL linked library directories
|
||||
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
|
||||
target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})
|
||||
|
||||
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
|
||||
option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
|
||||
@@ -1201,11 +1343,15 @@ target_link_libraries(rccl PRIVATE Threads::Threads)
|
||||
target_link_libraries(rccl INTERFACE hip::host)
|
||||
target_link_libraries(rccl PRIVATE hip::device)
|
||||
target_link_libraries(rccl PRIVATE dl)
|
||||
target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
|
||||
target_link_libraries(rccl PRIVATE ${SMI_LIBRARIES})
|
||||
target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
|
||||
if(ENABLE_MSCCLPP)
|
||||
target_link_libraries(rccl PRIVATE mscclpp_nccl)
|
||||
endif()
|
||||
if(ENABLE_ROCSHMEM)
|
||||
target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
|
||||
target_link_libraries(rccl PRIVATE ${IBVERBS})
|
||||
endif()
|
||||
|
||||
## Set RCCL link options
|
||||
## Find out available memory
|
||||
@@ -1317,7 +1463,8 @@ if(BUILD_ADDRESS_SANITIZER)
|
||||
else()
|
||||
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
|
||||
endif()
|
||||
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0")
|
||||
|
||||
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}")
|
||||
set(CPACK_DEB_COMPONENT_INSTALL ON)
|
||||
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
|
||||
set(CPACK_RPM_COMPONENT_INSTALL ON)
|
||||
|
||||
@@ -42,7 +42,7 @@ RCCL build & installation helper script
|
||||
--debug Build debug library
|
||||
--enable_backtrace Build with custom backtrace support
|
||||
--disable-colltrace Build without collective trace
|
||||
--disable-msccl-kernel Build without MSCCL kernels
|
||||
--enable-msccl-kernel Build with MSCCL kernels
|
||||
--enable-mscclpp Build with MSCCL++ support
|
||||
--enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
|
||||
--disable-roctx Build without ROCTX logging
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
find_path(ROCSHMEM_INCLUDE_DIR
|
||||
NAMES rocshmem/rocshmem.hpp rocshmem/rocshmem.h
|
||||
HINTS ${ROCSHMEM_INSTALL_DIR}/include/)
|
||||
|
||||
find_library(ROCSHMEM_LIBRARY
|
||||
NAMES rocshmem
|
||||
HINTS ${ROCSHMEM_INSTALL_DIR}/lib)
|
||||
|
||||
## TODO: decide how to handle ibverbs -- should IBVERBS be added to the find_package_handle_standard_args() call below?
|
||||
find_library(IBVERBS ibverbs)
|
||||
|
||||
find_package_handle_standard_args(rocshmem_static DEFAULT_MSG ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY)
|
||||
## TODO: add a rocSHMEM equivalent of mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_NCCL_STATIC_LIB)?
|
||||
@@ -0,0 +1,113 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
include(ExternalProject)
|
||||
|
||||
function(add_rocshmem_targets)
|
||||
|
||||
# Check for an existing installation via the user-provided prefix ROCSHMEM_INSTALL_DIR
|
||||
if(ROCSHMEM_INSTALL_DIR)
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
|
||||
find_package(rocshmem_static)
|
||||
if(NOT IBVERBS)
|
||||
find_library(IBVERBS ibverbs)
|
||||
if(IBVERBS)
|
||||
set(IBVERBS ${IBVERBS} PARENT_SCOPE)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# If no pre-existing installation, build from submodule into ext/rocshmem
|
||||
if(NOT rocshmem_static_FOUND)
|
||||
set(_rccl_root "${CMAKE_SOURCE_DIR}")
|
||||
set(ROCSHMEM_SOURCE "${_rccl_root}/ext-src/rocSHMEM")
|
||||
set(ROCSHMEM_INSTALL_DIR "${_rccl_root}/ext/rocshmem")
|
||||
|
||||
# Make sure submodule exists (same style as MSCCL++: custom rule + target)
|
||||
add_custom_command(
|
||||
OUTPUT "${ROCSHMEM_SOURCE}/CMakeLists.txt"
|
||||
COMMAND git submodule update --init --recursive ext-src/rocSHMEM
|
||||
WORKING_DIRECTORY "${_rccl_root}"
|
||||
COMMENT "Checking out submodule: ext-src/rocSHMEM"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(rocshmem_checkout_submodule
|
||||
DEPENDS "${ROCSHMEM_SOURCE}/CMakeLists.txt")
|
||||
|
||||
# Where our patch files live (like MSCCL++)
|
||||
set(EXT_SOURCE "${_rccl_root}/ext-src")
|
||||
|
||||
# Build and install rocSHMEM. We run `../scripts/build_configs/gda_bnxt`
|
||||
# from a 'build' dir just like the README shows.
|
||||
ExternalProject_Add(rocshmem_ext
|
||||
SOURCE_DIR "${ROCSHMEM_SOURCE}"
|
||||
INSTALL_DIR "${ROCSHMEM_INSTALL_DIR}"
|
||||
UPDATE_DISCONNECTED TRUE
|
||||
LOG_DOWNLOAD FALSE
|
||||
LOG_CONFIGURE FALSE
|
||||
LOG_BUILD FALSE
|
||||
LOG_INSTALL FALSE
|
||||
BUILD_IN_SOURCE TRUE
|
||||
DOWNLOAD_COMMAND "" # using the submodule checkout above
|
||||
TEST_COMMAND ""
|
||||
DEPENDS rocshmem_checkout_submodule
|
||||
|
||||
# rocSHMEM submodule commit hash -> commit b28a56bd54ccc581d05a439ffa466c3dacb3385 (NOTE(review): 39 hex digits; a full SHA-1 has 40 -- confirm the hash)
|
||||
# The project has its own scripts; we replicate the README sequence:
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND
|
||||
${CMAKE_COMMAND} -E make_directory build
|
||||
&& ${CMAKE_COMMAND} -E chdir build bash -lc "../scripts/build_configs/gda_bnxt -DUSE_EXTERNAL_MPI=OFF -DUSE_IPC=ON -DBUILD_EXAMPLES=OFF "
|
||||
&& ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND}
|
||||
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
|
||||
-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
|
||||
-DBUILD_EXAMPLES=OFF ..
|
||||
&& ${CMAKE_COMMAND} -E chdir build ${CMAKE_MAKE_PROGRAM} -j
|
||||
INSTALL_COMMAND
|
||||
${CMAKE_COMMAND} -E chdir build ${CMAKE_MAKE_PROGRAM} install
|
||||
)
|
||||
|
||||
# After build, define the variables RCCL expects
|
||||
set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INSTALL_DIR}/include" PARENT_SCOPE)
|
||||
set(ROCSHMEM_LIBRARY "${ROCSHMEM_INSTALL_DIR}/lib/librocshmem.a" PARENT_SCOPE)
|
||||
find_library(_IBVERBS ibverbs)
|
||||
if(NOT _IBVERBS)
|
||||
message(FATAL_ERROR "libibverbs not found (install rdma-core/libibverbs-dev)")
|
||||
endif()
|
||||
set(IBVERBS ${_IBVERBS} PARENT_SCOPE)
|
||||
|
||||
# Provide a dummy target other code can depend on
|
||||
add_custom_target(rocshmem_static ALL DEPENDS rocshmem_ext)
|
||||
else()
|
||||
# We found a prebuilt rocSHMEM; export variables upward as-is
|
||||
set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INCLUDE_DIR}" PARENT_SCOPE)
|
||||
set(ROCSHMEM_LIBRARY "${ROCSHMEM_LIBRARY}" PARENT_SCOPE)
|
||||
|
||||
find_library(_IBVERBS ibverbs)
|
||||
if(NOT _IBVERBS)
|
||||
message(FATAL_ERROR "libibverbs not found")
|
||||
endif()
|
||||
set(IBVERBS ${_IBVERBS} PARENT_SCOPE)
|
||||
endif()
|
||||
|
||||
endfunction()
|
||||
@@ -0,0 +1,257 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# Dependencies
|
||||
|
||||
# HIP dependency is handled earlier in the project cmake file
|
||||
# when VerifyCompiler.cmake is included.
|
||||
|
||||
# GIT
|
||||
|
||||
# Test dependencies
|
||||
|
||||
# For downloading, building, and installing required dependencies
|
||||
include(cmake/DownloadProject.cmake)
|
||||
|
||||
message(STATUS "Generating ROCM NetIB... ")
|
||||
|
||||
# -------------------------
|
||||
# Configurable paths
|
||||
# -------------------------
|
||||
# Path to RCCL source tree (local clone). Used as the WORKING_DIRECTORY of the
# patch/sed steps below, so the -p1 paths inside the patch file are resolved
# relative to this directory.
|
||||
set(RCCL_SRC_DIR "${CMAKE_SOURCE_DIR}" CACHE PATH "Path to RCCL source directory")
|
||||
# Path to patch file, and path of the source file generated from it.
# NOTE(review): both defaults are anchored to CMAKE_SOURCE_DIR rather than to
# RCCL_SRC_DIR, so overriding RCCL_SRC_DIR alone does not move them - confirm
# this is intended.
|
||||
set(ROCM_NETIB_PATCH_FILE "${CMAKE_SOURCE_DIR}/ext-src/rocm_netib.patch" CACHE FILEPATH "ROCM NETIB Patch file to apply to RCCL")
|
||||
set(ROCM_NETIB_FILE "${CMAKE_SOURCE_DIR}/src/transport/net_ib_rocm.cc" CACHE FILEPATH "Generated ROCM NETIB file")
|
||||
|
||||
# -------------------------
|
||||
# Find tools
|
||||
# -------------------------
|
||||
# Locate the external tools needed to generate the ROCm NetIB source and stop
# early with a clear message if either one is missing.
find_program(PATCH_EXECUTABLE patch)
find_program(SED_EXECUTABLE sed)
if(NOT PATCH_EXECUTABLE)
  message(FATAL_ERROR "patch not found; it is required to generate ${ROCM_NETIB_FILE}")
endif()
if(NOT SED_EXECUTABLE)
  message(FATAL_ERROR "sed not found; it is required to generate ${ROCM_NETIB_FILE}")
endif()

# Report progress with message(): multiple COMMANDs in one execute_process()
# are run as a pipeline, so an `echo` placed before `patch` would have its
# output fed to patch's stdin instead of being shown to the user.
message(STATUS "Applying RCCL ROCM NetIB patch... to ${CMAKE_SOURCE_DIR}")

# Apply the patch relative to RCCL_SRC_DIR (-p1) and write the patched result
# to ROCM_NETIB_FILE (-o) instead of modifying the patched file in place.
# The tool is invoked directly (no shell) so paths containing spaces work.
execute_process(
  COMMAND ${PATCH_EXECUTABLE} -p1 -i ${ROCM_NETIB_PATCH_FILE} -o ${ROCM_NETIB_FILE}
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
  RESULT_VARIABLE _rocm_netib_patch_result
)
# A non-zero status (or a launch error string) means the generated file is
# missing or only partially patched; do not continue with a broken source.
if(NOT "${_rocm_netib_patch_result}" STREQUAL "0")
  message(FATAL_ERROR
    "Failed to apply ${ROCM_NETIB_PATCH_FILE} (result: ${_rocm_netib_patch_result})")
endif()
|
||||
# Symbol renames applied to the generated ROCm NetIB source so that its
# identifiers do not clash with the ones in the original NetIB transport.
# Each entry is a sed substitution. ORDER MATTERS: the expressions are applied
# in sequence, so longer names must be rewritten before their prefixes
# (e.g. ncclIbRegMrDmaBufInternal before ncclIbRegMrDmaBuf before ncclIbRegMr).
# The patterns are basic regular expressions, in which "(" is a literal.
set(_rocm_netib_sed_exprs
  "s/NCCL_PARAM(Ib/NCCL_PARAM(RocmIb/g"
  "s/RCCL_PARAM(Ib/RCCL_PARAM(RocmIb/g"
  "s/ncclParamIb/ncclParamRocmIb/g"
  "s/rcclParamIb/rcclParamRocmIb/g"
  "s/ncclIbMergedDevs/rocmIbMergedDevs/g"
  "s/ncclIbDevs/rocmIbDevs/g"
  "s/ncclIbLock/rocmIbLock/g"
  "s/ibProviderName/rocmIbProviderName/g"
  "s/ncclIbAsyncThread/rocmIbAsyncThread/g"
  "s/ncclIbGdrSupport/rocmIbGdrSupport/g"
  "s/ncclIbDmaBufSupport/rocmIbDmaBufSupport/g"
  "s/ncclIbInitCommDevBase/rocmIbInitCommDevBase/g"
  "s/ncclIbDestroyBase/rocmIbDestroyBase/g"
  "s/ncclIbRtrQp/rocmIbRtrQp/g"
  "s/ncclIbRtsQp/rocmIbRtsQp/g"
  "s/ForceEnableGdrdma/RocmForceEnableGdrdma/g"
  "s/ncclIbCheckVProps/rocmIbCheckVProps/g"
  "s/ncclIbGetRequest/rocmIbGetRequest/g"
  "s/ncclIbFreeRequest/rocmIbFreeRequest/g"
  "s/ncclIbRegMrDmaBufInternal/rocmIbRegMrDmaBufInternal/g"
  "s/ncclIbGetNetCommDevBase/rocmIbGetNetCommDevBase/g"
  "s/ncclIbDeregMrInternal/rocmIbDeregMrInternal/g"
  "s/ncclIbPostFifo/rocmIbPostFifo/g"
  "s/reqTypeStr/rocmIbReqTypeStr/g"
  "s/rcclNetP2pPolicy/rcclRocmNetP2pPolicy/g"
  "s/ncclIbMakeVDeviceInternal/rocmIbMakeVDeviceInternal/g"
  "s/ncclIbMakeVDevice/rocmIbMakeVDevice/g"
  "s/ncclIbInit/rocmIbInit/g"
  "s/ncclIbDevices/rocmIbDevices/g"
  "s/ncclIbGetPhysProperties/rocmIbGetPhysProperties/g"
  "s/ncclIbGetProperties/rocmIbGetProperties/g"
  "s/ncclIbListen(/rocmIbListen(/g"
  "s/ncclIbListen,/rocmIbListen,/g"
  "s/ncclIbConnect(/rocmIbConnect(/g"
  "s/ncclIbConnect /rocmIbConnect /g"
  "s/ncclIbConnect,/rocmIbConnect,/g"
  "s/ncclIbAccept/rocmIbAccept/g"
  "s/ncclIbTest/rocmIbTest/g"
  "s/ncclIbRegMrDmaBuf/rocmIbRegMrDmaBuf/g"
  "s/ncclIbRegMr/rocmIbRegMr/g"
  "s/ncclIbDeregMr/rocmIbDeregMr/g"
  "s/ncclIbIsend/rocmIbIsend/g"
  "s/ncclIbIrecv/rocmIbIrecv/g"
  "s/ncclIbIflush/rocmIbIflush/g"
  "s/ncclIbCloseSend/rocmIbCloseSend/g"
  "s/ncclIbCloseRecv/rocmIbCloseRecv/g"
  "s/ncclIbCloseListen/rocmIbCloseListen/g"
  "s/ncclNetIb/rocmNetIb/g"
  "s/ncclIbFinalize/rocmNetIbFinalize/g"
  "s/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g"
)

# Turn the list into "-e <expr>" argument pairs. Unquoted list expansion splits
# on ';' only, so the expression that contains a space stays a single argument.
set(_rocm_netib_sed_args)
foreach(_rocm_netib_expr IN LISTS _rocm_netib_sed_exprs)
  list(APPEND _rocm_netib_sed_args -e "${_rocm_netib_expr}")
endforeach()

# Run every substitution in a single in-place sed pass. sed applies the -e
# expressions to each line in the order given, which yields the same result as
# running them as separate passes over the file, without spawning one shell
# and one sed process per rename.
execute_process(
  COMMAND ${SED_EXECUTABLE} -i ${_rocm_netib_sed_args} ${ROCM_NETIB_FILE}
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
  RESULT_VARIABLE _rocm_netib_sed_result
)
# Fail loudly instead of compiling a source whose symbols were not renamed.
if(NOT "${_rocm_netib_sed_result}" STREQUAL "0")
  message(FATAL_ERROR
    "Failed to rename symbols in ${ROCM_NETIB_FILE} (result: ${_rocm_netib_sed_result})")
endif()
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
|
||||
@@ -38,13 +38,15 @@ Collect this information about the ROCm version, GPU/accelerator, platform, and
|
||||
|
||||
rocminfo
|
||||
|
||||
* Run these ``rocm-smi`` commands to display the system topology.
|
||||
* Run these ``amd-smi`` commands to display the system topology.
|
||||
|
||||
.. code:: shell
|
||||
|
||||
rocm-smi
|
||||
rocm-smi --showtopo
|
||||
rocm-smi --showdriverversion
|
||||
amd-smi
|
||||
amd-smi topology
|
||||
amd-smi static --driver
|
||||
amd-smi firmware
|
||||
amd-smi xgmi
|
||||
|
||||
* Determine the values of the ``PATH`` and ``LD_LIBRARY_PATH`` environment variables.
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
rocm-docs-core==1.26.0
|
||||
rocm-docs-core==1.29.0
|
||||
|
||||
@@ -25,7 +25,7 @@ breathe==4.35.0
|
||||
# via rocm-docs-core
|
||||
certifi==2024.7.4
|
||||
# via requests
|
||||
cffi==1.16.0
|
||||
cffi==2.0.0
|
||||
# via
|
||||
# cryptography
|
||||
# pynacl
|
||||
@@ -164,7 +164,7 @@ pygments==2.18.0
|
||||
# sphinx
|
||||
pyjwt[crypto]==2.8.0
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
pynacl==1.6.2
|
||||
# via pygithub
|
||||
python-dateutil==2.9.0.post0
|
||||
# via jupyter-client
|
||||
@@ -187,7 +187,7 @@ requests==2.32.4
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.26.0
|
||||
rocm-docs-core==1.29.0
|
||||
# via -r requirements.in
|
||||
rpds-py==0.22.3
|
||||
# via
|
||||
@@ -265,7 +265,7 @@ typing-extensions==4.12.0
|
||||
# pygithub
|
||||
# referencing
|
||||
# sqlalchemy
|
||||
urllib3==2.5.0
|
||||
urllib3==2.6.3
|
||||
# via
|
||||
# pygithub
|
||||
# requests
|
||||
|
||||
@@ -60,36 +60,36 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v10)
|
||||
# API (v11)
|
||||
|
||||
Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclNet_v11` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
|
||||
ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
|
||||
ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
@@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us
|
||||
`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
|
||||
internal ones.
|
||||
|
||||
Every call to `init` returns an opaque context that the plugin uses internally to allocate resources
|
||||
and manage state. Such context is passed to other net plugin calls that create further resources,
|
||||
such as `listen` and `connect`. Every context is uniquely associated to a communicator
|
||||
using the commId. The network can also be initialized with a per communicator configuration using
|
||||
the `config` argument.
|
||||
|
||||
To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
|
||||
function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
|
||||
the plugin code adding the following definitions:
|
||||
@@ -282,7 +288,7 @@ side.
|
||||
`listen`
|
||||
|
||||
To create a connection, NCCL will start by calling `listen` on the receiver side. This function
|
||||
takes a device number as input argument, and should return a local `listenComm` object, and a
|
||||
takes the opaque plugin context returned by `init` and a device number as input arguments, and should return a local `listenComm` object, and a
|
||||
`handle` to pass to the other side, so that the sender side can connect to the receiver.
|
||||
|
||||
The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
|
||||
@@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
|
||||
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
|
||||
succeeds.
|
||||
|
||||
The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
|
||||
The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference
|
||||
the `ncclNetCommConfig_t` passed to the `init` function, which contains a `trafficClass` field.
|
||||
This field can be used by the network plugin to specify the QoS level of the connection. By default,
|
||||
`trafficClass` is set to -1 but can be configured by the application during communicator initialization
|
||||
to select a plugin-supported QoS level.
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
# Sources of the example NCCL network plugin.
set(SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c)

# Build the example plugin as a shared library; the plugin API headers are
# picked up from the local nccl/ directory.
add_library(nccl-net-example SHARED ${SRC_FILES})
target_include_directories(nccl-net-example
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nccl)

# Name the artifact libnccl-net-example to match the Makefile build, compile it
# as position-independent code, and place it in the unit-test plugin directory.
set_property(TARGET nccl-net-example PROPERTY OUTPUT_NAME "nccl-net-example")
set_property(TARGET nccl-net-example PROPERTY PREFIX "lib")
set_property(TARGET nccl-net-example PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nccl-net-example PROPERTY
  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins)
|
||||
@@ -22,7 +22,9 @@
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC 4
|
||||
|
||||
#include "net_v11.h"
|
||||
#include "net_v10.h"
|
||||
#include "net_v9.h"
|
||||
#include "net_v8.h"
|
||||
@@ -33,9 +35,9 @@
|
||||
#include "net_v3.h"
|
||||
#include "net_v2.h"
|
||||
|
||||
typedef ncclNet_v10_t ncclNet_t;
|
||||
typedef ncclNetProperties_v10_t ncclNetProperties_t;
|
||||
typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
|
||||
typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
|
||||
typedef ncclNet_v11_t ncclNet_t;
|
||||
typedef ncclNetProperties_v11_t ncclNetProperties_t;
|
||||
typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
|
||||
typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
|
||||
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
|
||||
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
|
||||
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
|
||||
|
||||
typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
|
||||
|
||||
@@ -27,6 +27,7 @@ typedef struct {
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
|
||||
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
|
||||
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
|
||||
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
|
||||
typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -5,10 +5,9 @@
|
||||
#ifndef NET_V10_H_
|
||||
#define NET_V10_H_
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
|
||||
typedef struct {
|
||||
int ndevs;
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
|
||||
} ncclNetVDeviceProps_v10_t;
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NET_V11_H_
|
||||
#define NET_V11_H_
|
||||
|
||||
typedef struct {
|
||||
int ndevs;
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
|
||||
} ncclNetVDeviceProps_v11_t;
|
||||
|
||||
#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
|
||||
|
||||
typedef struct {
|
||||
// Plugin-specific TC value
|
||||
int trafficClass;
|
||||
} ncclNetCommConfig_v11_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int forceFlush; // Force a flush on receives
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
ncclNetVDeviceProps_v11_t vProps;
|
||||
size_t maxP2pBytes; // Max transfer size for point-to-point operations
|
||||
size_t maxCollBytes; // Max transfer size for collective operations
|
||||
int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request.
|
||||
} ncclNetProperties_v11_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t maxConcurrentPeers;
|
||||
int32_t minConcurrentPeers;
|
||||
int32_t maxFlowsPerPeer;
|
||||
int32_t minFlowsPerPeer;
|
||||
} ncclNetCommAttr_v11_t;
|
||||
|
||||
typedef struct {
|
||||
ncclNetCommAttr_v11_t sendCommAttr;
|
||||
ncclNetCommAttr_v11_t recvCommAttr;
|
||||
uint32_t op;
|
||||
uint32_t algo;
|
||||
uint32_t proto;
|
||||
} ncclNetAttr_v11_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If sizes is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
|
||||
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
|
||||
// what index this new vNIC exists at
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
|
||||
// Finalize the network.
|
||||
ncclResult_t (*finalize)(void* ctx);
|
||||
|
||||
ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr);
|
||||
} ncclNet_v11_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -5,10 +5,9 @@
|
||||
#ifndef NET_V9_H_
|
||||
#define NET_V9_H_
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
|
||||
typedef struct {
|
||||
int ndevs;
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
|
||||
} ncclNetVDeviceProps_v9_t;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
int max_requests = NCCL_NET_MAX_REQUESTS;
|
||||
|
||||
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
||||
@@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
|
||||
@@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
|
||||
__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; }
|
||||
|
||||
#define PLUGIN_NAME "Plugin"
|
||||
|
||||
const ncclNet_v10_t ncclNetPlugin_v10 = {
|
||||
const ncclNet_v11_t ncclNetPlugin_v11 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
@@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
|
||||
.getDeviceMr = pluginGetDeviceMr,
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
.makeVDevice = pluginMakeVDevice,
|
||||
.finalize = pluginFinalize,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) {
|
||||
// Below are default values, if unsure don't change.
|
||||
|
||||
props->name = "Example";
|
||||
// Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
|
||||
props->pciPath = NULL;
|
||||
// Only used to detect NICs with multiple PCI attachments.
|
||||
props->guid = 0;
|
||||
// Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
// If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
|
||||
props->regIsGlobal = 0;
|
||||
// Force flush after receive. Needed if the control path and data path use a different path to the GPU
|
||||
props->forceFlush = 0;
|
||||
// Speed in *Mbps*. 100000 means 100G
|
||||
props->speed = 100000;
|
||||
// Port number, used in conjunction with guid
|
||||
props->port = 0;
|
||||
// Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
|
||||
props->latency = 0;
|
||||
// Maximum number of comm objects we can create.
|
||||
props->maxComms = 1024*1024;
|
||||
// Maximum number of receive operations taken by irecv().
|
||||
props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
|
||||
// Coupling with NCCL network device-side code.
|
||||
props->netDeviceType = NCCL_NET_DEVICE_HOST;
|
||||
props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
|
||||
// Used to tell NCCL core whether this is a virtual device fusing multiple physical devices.
|
||||
props->vProps.ndevs = 1;
|
||||
props->vProps.devs[0] = dev;
|
||||
// maximum transfer sizes the plugin can handle
|
||||
props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
|
||||
props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; }
|
||||
|
||||
const ncclNet_v10_t ncclNetPlugin_v10 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit_v10,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v10,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v10,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
.isend = pluginIsend,
|
||||
.irecv = pluginIrecv,
|
||||
.iflush = pluginIflush,
|
||||
.test = pluginTest,
|
||||
.closeSend = pluginCloseSend,
|
||||
.closeRecv = pluginCloseRecv,
|
||||
.closeListen = pluginCloseListen,
|
||||
.getDeviceMr = pluginGetDeviceMr,
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
.makeVDevice = pluginMakeVDevice_v10,
|
||||
};
|
||||
|
||||
|
||||
__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
|
||||
return pluginInit(logFunction, NULL);
|
||||
return pluginInit_v10(logFunction, NULL);
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
|
||||
return pluginGetProperties(dev, (ncclNetProperties_t*)props);
|
||||
return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props);
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
|
||||
return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
|
||||
return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm);
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
|
||||
@@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = {
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v9,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v9,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr,
|
||||
@@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v8,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v9,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr,
|
||||
@@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v7,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v9,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr_v7,
|
||||
@@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v6,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr_v7,
|
||||
@@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v6,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v6,
|
||||
.accept = pluginAccept_v6,
|
||||
.regMr = pluginRegMr_v7,
|
||||
@@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
|
||||
ncclResult_t ret;
|
||||
do {
|
||||
ncclNetDeviceHandle_v7_t* handle = NULL;
|
||||
ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
|
||||
ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle);
|
||||
} while (ret == ncclSuccess && *sendComm == NULL);
|
||||
return ret;
|
||||
}
|
||||
@@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v4,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v10,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr_v7,
|
||||
@@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
|
||||
}
|
||||
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
|
||||
max_requests = NCCL_NET_MAX_REQUESTS_V3;
|
||||
return pluginInit(logFunction, NULL);
|
||||
return pluginInit_v10(logFunction, NULL);
|
||||
}
|
||||
#include <string.h>
|
||||
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
|
||||
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
|
||||
ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm);
|
||||
memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
|
||||
return ret;
|
||||
}
|
||||
@@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
|
||||
.devices = pluginDevices,
|
||||
.pciPath = pluginPciPath,
|
||||
.ptrSupport = pluginPtrSupport,
|
||||
.listen = pluginListen,
|
||||
.listen = pluginListen_v3,
|
||||
.connect = pluginConnect_v4,
|
||||
.accept = pluginAccept_v4,
|
||||
.regMr = pluginRegMr_v7,
|
||||
|
||||
@@ -49,9 +49,9 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v4)
|
||||
# API (v5)
|
||||
|
||||
Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
@@ -60,15 +60,15 @@ typedef struct {
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// - commId : communicator id
|
||||
// - commName : user assigned communicator name
|
||||
// - commHash : communicator id
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
@@ -76,7 +76,7 @@ typedef struct {
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
@@ -88,13 +88,13 @@ typedef struct {
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v4_t;
|
||||
} ncclProfiler_v5_t;
|
||||
```
|
||||
|
||||
## Error codes
|
||||
@@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
|
||||
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
|
||||
int rank; // rank that generated the event
|
||||
uint64_t type; // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ...
|
||||
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
|
||||
int rank; // rank that generated the event
|
||||
union {
|
||||
struct { // GroupAPI event metadata
|
||||
bool graphCaptured; // Set to true if the Group API event is emitted inside a CUDA graph capture
|
||||
int groupDepth; // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL)
|
||||
// and not called by the user. Any depth greater than 1 means that the user made the Group API call.
|
||||
} groupApi;
|
||||
|
||||
struct { // Collective API call metadata
|
||||
const char* func; // string containing name of the collective operation
|
||||
size_t count; // data count
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
int root; // root rank
|
||||
void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in
|
||||
bool graphCaptured; // Set to true if the Collective API event is emitted inside a CUDA graph capture
|
||||
} collApi;
|
||||
|
||||
struct { // Point-to-point API call metadata
|
||||
const char* func; // string containing name of the p2p operation
|
||||
size_t count; // data count
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
void* stream; // Opaque handle that points to a CUDA stream object
|
||||
bool graphCaptured; // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture
|
||||
} p2pApi;
|
||||
|
||||
struct { // Kernel Launch event metadata
|
||||
void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in
|
||||
} kernelLaunch;
|
||||
|
||||
struct { // collective events metadata
|
||||
uint64_t seqNumber; // sequence number of this collective operation in the communicator
|
||||
const char* func; // string containing name of the collective
|
||||
@@ -164,6 +191,7 @@ typedef struct {
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
const char* proto; // string containing name of the protocol for this collective
|
||||
void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent
|
||||
} coll;
|
||||
|
||||
struct { // point-to-point events metadata
|
||||
@@ -173,6 +201,7 @@ typedef struct {
|
||||
size_t count;
|
||||
int peer; // peer rank for this point-to-point
|
||||
uint8_t nChannels; // number of channels for this p2p
|
||||
void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent
|
||||
} p2p;
|
||||
|
||||
struct { // proxyOp events metadata
|
||||
@@ -198,12 +227,12 @@ typedef struct {
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v4_t;
|
||||
} ncclProfilerEventDescr_v5_t;
|
||||
```
|
||||
|
||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
|
||||
`ncclProfileNetPlugin`.
|
||||
NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`,
|
||||
`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`,
|
||||
`ncclProfileKernelCh` and `ncclProfileNetPlugin`.
|
||||
|
||||
#### stopEvent
|
||||
|
||||
@@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior.
|
||||
|
||||
#### recordEventState
|
||||
|
||||
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
|
||||
`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
|
||||
Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`,
|
||||
`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`.
|
||||
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
|
||||
`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
|
||||
`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
|
||||
|
||||
The state of these events can be updated, along with event attributes, using `recordEventState`.
|
||||
@@ -258,9 +287,21 @@ typedef enum {
|
||||
|
||||
// ncclProfileKernelCh event states
|
||||
ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update
|
||||
} ncclProfilerEventState_v4_t;
|
||||
|
||||
// Group API States
|
||||
ncclProfilerGroupStartApiStop = 23,// state marks the end of a ncclGroupStart() API call
|
||||
ncclProfilerEndGroupApiStart = 24 // state marks the start of a ncclGroupEnd() API call
|
||||
} ncclProfilerEventState_v5_t;
|
||||
```
|
||||
|
||||
NCCL profile API events are generated when the API calls are made, right after NCCL checks
|
||||
for graph capture information. They are the parents of collective, point-to-point and kernel launch events
|
||||
and persist across multiple operations in a group.
|
||||
|
||||
`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the
|
||||
case of graph capture, the event start indicates that the kernel launch operation has been recorded,
|
||||
not launched.
|
||||
|
||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
|
||||
provide a summary of the activity of the proxy progress thread for that channel. Most of the
|
||||
@@ -379,7 +420,7 @@ typedef union {
|
||||
struct { // attribute to update for ncclProfileKernelCh events
|
||||
uint64_t pTimer; // timestamp provided by the NCCL kernel
|
||||
} kernelCh;
|
||||
} ncclProfilerEventStateArgs_v4_t;
|
||||
} ncclProfilerEventStateArgs_v5_t;
|
||||
```
|
||||
|
||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||
@@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur
|
||||
NCCL core events (reported above) are organized into a hierarchy as reported below:
|
||||
|
||||
```
|
||||
Group event
|
||||
Group API event
|
||||
|
|
||||
+- Collective event
|
||||
+- Collective API event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| | |
|
||||
| | +- ProxyStep event
|
||||
| | |
|
||||
| | +- NetPlugin event
|
||||
| |
|
||||
| +- KernelCh event
|
||||
| +- Collective event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| | |
|
||||
| | +- ProxyStep event
|
||||
| | |
|
||||
| | +- NetPlugin event
|
||||
| |
|
||||
| +- KernelCh event
|
||||
|
|
||||
+- Point-to-point event
|
||||
|
|
||||
+- ProxyOp event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| |
|
||||
| +- NetPlugin event
|
||||
|
|
||||
+- KernelCh event
|
||||
+- Point-to-point API event
|
||||
| |
|
||||
| +- Point-to-point event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| | |
|
||||
| | +- ProxyStep event
|
||||
| | |
|
||||
| | +- NetPlugin event
|
||||
| |
|
||||
| +- KernelCh event
|
||||
|
|
||||
+- Kernel Launch event
|
||||
|
||||
ProxyCtrl event
|
||||
```
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
# C++ source files of the example profiler plugin (listed explicitly)
|
||||
set(SRC_FILES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc
|
||||
)
|
||||
|
||||
# Create shared library
|
||||
add_library(nccl-profiler-example SHARED ${SRC_FILES})
|
||||
|
||||
# Set include directories
|
||||
target_include_directories(nccl-profiler-example PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/nccl
|
||||
${CUDAToolkit_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
# Set output name to match Makefile
|
||||
set_target_properties(nccl-profiler-example PROPERTIES
|
||||
OUTPUT_NAME "nccl-profiler-example"
|
||||
PREFIX "lib"
|
||||
POSITION_INDEPENDENT_CODE ON
|
||||
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
|
||||
)
|
||||
|
||||
add_custom_command(TARGET nccl-profiler-example POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so ${CMAKE_BINARY_DIR}/test/unit/plugins
|
||||
)
|
||||
|
||||
# Add custom target for clean (equivalent to Makefile clean target)
|
||||
add_custom_target(clean-profiler-lib
|
||||
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so
|
||||
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so
|
||||
COMMENT "Cleaning libnccl-profiler-example.so"
|
||||
)
|
||||
@@ -4,19 +4,26 @@
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
.DEFAULT_GOAL: build
|
||||
include ../../makefiles/common.mk
|
||||
SRCDIR ?= $(abspath ../..)
|
||||
ROCM_PATH ?= $(wildcard /opt/rocm)
|
||||
CXX = $(ROCM_PATH)/lib/llvm/bin/amdclang++
|
||||
BUILDDIR ?= .
|
||||
NCCLDIR := $(BUILDDIR)
|
||||
HIPIFY_DIR := hipify-profiler
|
||||
|
||||
SRC_FILES := $(wildcard *.c)
|
||||
SRC_FILES := $(wildcard *.cc)
|
||||
HIPIFY_SRC := $(addprefix $(HIPIFY_DIR)/,$(SRC_FILES))
|
||||
|
||||
build: ${BUILDDIR}/librccl-profiler.so
|
||||
build: ${BUILDDIR}/librccl-profiler-example.so
|
||||
|
||||
${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
|
||||
${BUILDDIR}/librccl-profiler-example.so: $(HIPIFY_SRC)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${BUILDDIR}
|
||||
$(CC) -Inccl -fPIC -shared -o $@ $^
|
||||
$(CXX) -D__HIP_PLATFORM_AMD__ -I$(HIPIFY_DIR) -I$(HIPIFY_DIR)/nccl -I$(ROCM_PATH)/include -fPIC -shared -o $@ $^
|
||||
|
||||
$(HIPIFY_DIR)/%.cc: %.cc
|
||||
@mkdir -p $(HIPIFY_DIR)/nccl
|
||||
@cp *.cc *.h $(HIPIFY_DIR)/
|
||||
@cp nccl/*.h $(HIPIFY_DIR)/nccl/
|
||||
@hipify-perl -inplace -quiet-warnings $(HIPIFY_DIR)/*.cc $(HIPIFY_DIR)/*.h
|
||||
|
||||
clean:
|
||||
rm -f ${BUILDDIR}/librccl-profiler.so
|
||||
rm -rf ${BUILDDIR}/librccl-profiler-example.so $(HIPIFY_DIR)
|
||||
@@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of.
|
||||
|
||||
## Building the profiler plugin
|
||||
|
||||
To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
|
||||
You can override `NCCL_HOME` to where the NCCL installation is on your system.
|
||||
To build the example plugin shipped as part of NCCL, just type `make`.
|
||||
|
||||
## Using the profiler plugin
|
||||
|
||||
@@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system.
|
||||
|
||||
As an example, setting:
|
||||
|
||||
`NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
|
||||
`NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
|
||||
|
||||
enables the profiling of the group, the collective and the proxy op events. The same events can be
|
||||
enables the profiling of the group API, the collective and the proxy op events. The same events can be
|
||||
expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
|
||||
in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
|
||||
is that the profiler can easily correlate events that belong to the same NCCL operation and present
|
||||
them accordingly.
|
||||
them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler.
|
||||
|
||||
3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
|
||||
${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
|
||||
@@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events
|
||||
contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
|
||||
generated by remote proxies. A list of pools and their size is reported below:
|
||||
|
||||
- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
|
||||
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
|
||||
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
|
||||
- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256)
|
||||
- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256)
|
||||
- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256)
|
||||
- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256)
|
||||
- `NCCL_PROFILE_COLL_POOL_SIZE` (256)
|
||||
- `NCCL_PROFILE_P2P_POOL_SIZE` (256)
|
||||
- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
|
||||
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
|
||||
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256)
|
||||
|
||||
Remote proxy operations are generated when PXN is in use. Refer to this article for more information
|
||||
about PXN and how it works:
|
||||
@@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace
|
||||
|
||||
```
|
||||
[
|
||||
{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
|
||||
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
|
||||
{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
|
||||
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
|
||||
{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
|
||||
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
|
||||
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
|
||||
{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}},
|
||||
{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}},
|
||||
{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990},
|
||||
{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}},
|
||||
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}},
|
||||
{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}},
|
||||
{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
|
||||
{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995},
|
||||
{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}},
|
||||
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}},
|
||||
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}},
|
||||
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997},
|
||||
{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995},
|
||||
{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
|
||||
{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979},
|
||||
{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}},
|
||||
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993},
|
||||
{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}},
|
||||
{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992},
|
||||
... [ trace truncated for brevity ]
|
||||
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
|
||||
{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
|
||||
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980},
|
||||
{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981},
|
||||
{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001},
|
||||
{}]
|
||||
```
|
||||
|
||||
Details about the fields used in the trace can be found at this link:
|
||||
https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
|
||||
|
||||
The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
|
||||
The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through
|
||||
the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
|
||||
(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
|
||||
one collective and this is what is presented in the traces above).
|
||||
@@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation.
|
||||
- datatype : NCCL datatype
|
||||
- algorithm : algorithm used to process the ncclAllReduce
|
||||
- protocol : protocol used to process the ncclAllReduce
|
||||
- nMaxChannels: max number of channels used to process the ncclAllReduce
|
||||
- nChannels : Number of channels used to process the ncclAllReduce
|
||||
|
||||
If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
|
||||
consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
|
||||
of collective and p2p operations`.
|
||||
|
||||
### Proxy Send
|
||||
The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
|
||||
info in the args field:
|
||||
|
||||
- Channel : id of the channel used by this proxy operation to send data to the peer
|
||||
- Peer : peer rank
|
||||
- Steps : number of network steps required to transfer transSize bytes to the peer
|
||||
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
|
||||
- transSize : bytes transferred across the channel by this proxy operation
|
||||
- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post
|
||||
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
|
||||
- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send
|
||||
- DONE : struct containing the number of network sends completed and the time stamp of the last send completed
|
||||
|
||||
In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
|
||||
which could help identify at which point the network problem occurred.
|
||||
|
||||
The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
|
||||
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
|
||||
entries below are also reported by the profiler.
|
||||
|
||||
#### Proxy SendBufferWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
|
||||
|
||||
#### Proxy SendGPUWait
|
||||
#### Proxy SendGpuWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
|
||||
buffer.
|
||||
@@ -201,31 +164,6 @@ buffer.
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
|
||||
|
||||
### Proxy Recv
|
||||
|
||||
The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
|
||||
info in the args field:
|
||||
|
||||
- Channel : id of the channel used by this proxy operation to recv data from the peer
|
||||
- Peer : peer rank
|
||||
- Steps : number of network steps required to transfer transSize bytes from the peer
|
||||
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
|
||||
- transSize : bytes transferred across the channel by this proxy operation
|
||||
- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted
|
||||
- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed
|
||||
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
|
||||
- DONE : struct containing the number of flush completed and the time stamp for the last flush completed
|
||||
|
||||
The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
|
||||
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
|
||||
entries below are also reported by the profiler.
|
||||
|
||||
|
||||
#### Proxy RecvBufferWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
|
||||
become available.
|
||||
|
||||
#### Proxy RecvWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
|
||||
@@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU
|
||||
|
||||
#### Proxy RecvGPUWait
|
||||
#### Proxy RecvGpuWait
|
||||
|
||||
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "event.h"
|
||||
|
||||
int taskEventQueueEmpty(struct group* g) {
|
||||
return g->eventHead == NULL;
|
||||
}
|
||||
|
||||
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
|
||||
event->next = NULL;
|
||||
if (g->eventHead) g->eventTail->next = event;
|
||||
else g->eventHead = event;
|
||||
g->eventTail = event;
|
||||
}
|
||||
|
||||
struct taskEventBase* taskEventQueueHead(struct group* g) {
|
||||
return g->eventHead;
|
||||
}
|
||||
|
||||
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
|
||||
struct taskEventBase* tmp = g->eventHead;
|
||||
g->eventHead = g->eventHead->next;
|
||||
if (g->eventHead == NULL) g->eventTail = NULL;
|
||||
return tmp;
|
||||
}
|
||||
@@ -10,10 +10,14 @@
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#include <cstring>
|
||||
#include "err.h"
|
||||
#include "profiler.h"
|
||||
#include "queue.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define MAX_CHANNELS 128 // Match RCCL's MAXCHANNELS
|
||||
#define MAX_STEPS 16
|
||||
#define MAX_STEPS 1024
|
||||
#define MAX_OPS 16 // Up to 64K ranks for PAT
|
||||
#define MAX_EVENTS_PER_REQ (8)
|
||||
|
||||
@@ -21,7 +25,7 @@ struct proxyOp;
|
||||
struct proxyStep;
|
||||
|
||||
struct netPlugin {
|
||||
uint8_t type;
|
||||
uint64_t type;
|
||||
int pluginType;
|
||||
int pluginVer;
|
||||
uint8_t pluginEvent;
|
||||
@@ -63,7 +67,7 @@ struct kernelCh {
|
||||
#define PROXY_STEP_MAX_STATES 3
|
||||
|
||||
struct proxyStep {
|
||||
uint8_t type; // type of event: network transfer
|
||||
uint64_t type; // type of event: network transfer
|
||||
int state;
|
||||
int step; // network transfer id in given channel
|
||||
int isSend; // send/recv channel operation
|
||||
@@ -76,7 +80,7 @@ struct proxyStep {
|
||||
};
|
||||
|
||||
struct proxyOp {
|
||||
uint8_t type; // type of event: proxy operation
|
||||
uint64_t type; // type of event: proxy operation
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
pid_t pid;
|
||||
int rank;
|
||||
@@ -97,7 +101,7 @@ struct group;
|
||||
struct context;
|
||||
|
||||
struct proxyCtrl {
|
||||
uint8_t type;
|
||||
uint64_t type;
|
||||
struct context* ctx; // profiler context
|
||||
double startTs;
|
||||
double stopTs;
|
||||
@@ -107,12 +111,12 @@ struct proxyCtrl {
|
||||
|
||||
// task level event base structure
|
||||
struct taskEventBase {
|
||||
uint8_t type; // event type: collective/p2p
|
||||
uint64_t type; // event type: collective/p2p
|
||||
int rank; // rank of the operation in NCCL communicator
|
||||
const char* func; // ncclFunc*
|
||||
int refCount; // number of references for this operation
|
||||
struct group* parent; // parent event group
|
||||
struct taskEventBase* next; // next top level event in group
|
||||
void* parent; // parent API event
|
||||
struct taskEventBase* next; // next top level event
|
||||
double startTs;
|
||||
double stopTs;
|
||||
};
|
||||
@@ -147,7 +151,7 @@ struct p2p {
|
||||
};
|
||||
|
||||
struct group {
|
||||
uint8_t type;
|
||||
uint64_t type;
|
||||
struct context* ctx; // profiler context
|
||||
int groupId;
|
||||
int refCount;
|
||||
@@ -158,6 +162,70 @@ struct group {
|
||||
struct group* next; // next group event in queue
|
||||
};
|
||||
|
||||
struct collApi {
|
||||
uint64_t type;
|
||||
struct groupApi* parent;
|
||||
struct context* ctx; // profiler context
|
||||
int collApiId;
|
||||
int refCount;
|
||||
cudaStream_t stream;
|
||||
const char* func;
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
int root;
|
||||
bool graphCaptured;
|
||||
struct taskEventBase* eventHead; // queue head for task events
|
||||
struct taskEventBase* eventTail; // queue tail for task events
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct collApi* next;
|
||||
};
|
||||
|
||||
struct p2pApi {
|
||||
uint64_t type;
|
||||
struct groupApi* parent;
|
||||
struct context* ctx; // profiler context
|
||||
int p2pApiId;
|
||||
int refCount;
|
||||
const char* func;
|
||||
cudaStream_t stream;
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
bool graphCaptured;
|
||||
struct taskEventBase* eventHead; // queue head for task events
|
||||
struct taskEventBase* eventTail; // queue tail for task events
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct p2pApi* next;
|
||||
};
|
||||
|
||||
struct kernelLaunch {
|
||||
uint64_t type;
|
||||
struct groupApi* parent;
|
||||
cudaStream_t stream;
|
||||
int kernelLaunchId;
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct kernelLaunch* next;
|
||||
};
|
||||
|
||||
struct groupApi {
|
||||
uint64_t type;
|
||||
struct context* ctx;
|
||||
int groupApiId;
|
||||
int refCount;
|
||||
bool graphCaptured;
|
||||
int groupDepth;
|
||||
struct profilerQueue<struct p2pApi, &p2pApi::next> p2pApiEvents;
|
||||
struct profilerQueue<struct collApi, &collApi::next> collApiEvents;
|
||||
struct profilerQueue<struct kernelLaunch, &kernelLaunch::next> kernelLaunchEvents;
|
||||
double endOfncclGroupStartTs;
|
||||
double startOfncclGroupEndTs;
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct groupApi* next;
|
||||
};
|
||||
|
||||
// arrays for different event objects
|
||||
struct context {
|
||||
const char* commName;
|
||||
@@ -165,6 +233,26 @@ struct context {
|
||||
int nranks;
|
||||
int rank;
|
||||
|
||||
int groupApiPoolSize;
|
||||
int groupApiPoolBase;
|
||||
int groupApiPoolIndex;
|
||||
struct groupApi* groupApiPool;
|
||||
|
||||
int collApiPoolSize;
|
||||
int collApiPoolBase;
|
||||
int collApiPoolIndex;
|
||||
struct collApi* collApiPool;
|
||||
|
||||
int p2pApiPoolSize;
|
||||
int p2pApiPoolBase;
|
||||
int p2pApiPoolIndex;
|
||||
struct p2pApi* p2pApiPool;
|
||||
|
||||
int kernelLaunchPoolSize;
|
||||
int kernelLaunchPoolBase;
|
||||
int kernelLaunchPoolIndex;
|
||||
struct kernelLaunch* kernelLaunchPool;
|
||||
|
||||
int groupPoolSize;
|
||||
int groupPoolBase;
|
||||
int groupPoolIndex;
|
||||
@@ -186,9 +274,50 @@ struct context {
|
||||
struct proxyCtrl* proxyCtrlPool;
|
||||
};
|
||||
|
||||
int taskEventQueueEmpty(struct group* g);
|
||||
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
|
||||
struct taskEventBase* taskEventQueueHead(struct group* g);
|
||||
struct taskEventBase* taskEventQueueDequeue(struct group* g);
|
||||
template <typename T>
|
||||
inline int taskEventQueueEmpty(T *obj) {
|
||||
return obj->eventHead == NULL;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) {
|
||||
event->next = NULL;
|
||||
if (obj->eventHead) obj->eventTail->next = event;
|
||||
else obj->eventHead = event;
|
||||
obj->eventTail = event;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline struct taskEventBase* taskEventQueueHead(T *obj) {
|
||||
return obj->eventHead;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline struct taskEventBase* taskEventQueueDequeue(T* obj) {
|
||||
struct taskEventBase* tmp = obj->eventHead;
|
||||
obj->eventHead = obj->eventHead->next;
|
||||
if (obj->eventHead == NULL) obj->eventTail = NULL;
|
||||
return tmp;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void resetTaskEvents(T *obj, struct context* ctx) {
|
||||
while (!taskEventQueueEmpty(obj)) {
|
||||
struct taskEventBase* base = taskEventQueueDequeue(obj);
|
||||
if (base->type == ncclProfileColl) {
|
||||
struct collective* c = (struct collective *)base;
|
||||
// reset event proxyOps & proxySteps
|
||||
memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
|
||||
// release collective events in the group and return them to the collective pool
|
||||
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
|
||||
} else if (base->type == ncclProfileP2p) {
|
||||
struct p2p* p = (struct p2p *)base;
|
||||
// reset event proxyOp and proxySteps
|
||||
memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
|
||||
// release p2p events in the group and return them to the p2p pool
|
||||
__atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -11,17 +11,20 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
ncclProfileKernelCh = (1 << 6), // kernel channel event type
|
||||
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
ncclProfileKernelCh = (1 << 6), // kernel channel event type
|
||||
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
|
||||
ncclProfileGroupApi = (1 << 8), // Group API events
|
||||
ncclProfileCollApi = (1 << 9), // Collective API events
|
||||
ncclProfileP2pApi = (1 << 10), // Point-to-Point API events
|
||||
ncclProfileKernelLaunch = (1 << 11), // Kernel launch events
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
@@ -56,21 +59,27 @@ typedef enum {
|
||||
|
||||
/* Kernel event states */
|
||||
ncclProfilerKernelChStop = 22,
|
||||
|
||||
/* Group API States */
|
||||
ncclProfilerEndGroupApiStart = 23,
|
||||
ncclProfilerBeginGroupApiEnd = 24
|
||||
} ncclProfilerEventState_t;
|
||||
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
|
||||
|
||||
#include "profiler_v5.h"
|
||||
#include "profiler_v4.h"
|
||||
#include "profiler_v3.h"
|
||||
#include "profiler_v2.h"
|
||||
#include "profiler_v1.h"
|
||||
#include "profiler_net.h"
|
||||
|
||||
typedef ncclProfiler_v4_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v5_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V5_H_
|
||||
#define PROFILER_V5_H_
|
||||
#include <stdbool.h>
|
||||
|
||||
typedef struct {
|
||||
uint64_t type; // event type descriptor: ncclProfileGroupApi, ...
|
||||
void* parentObj; // pointer to the profiler parent object
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
int graphCaptured;
|
||||
int groupDepth;
|
||||
} groupApi;
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
int root;
|
||||
void* stream;
|
||||
bool graphCaptured;
|
||||
} collApi;
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
void* stream;
|
||||
bool graphCaptured;
|
||||
} p2pApi;
|
||||
|
||||
struct {
|
||||
void* stream;
|
||||
} kernelLaunch;
|
||||
|
||||
struct {
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
void* parentGroup; // for backward compatibility with v4
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
uint8_t nChannels;
|
||||
void* parentGroup; // for backward compatibility with v4
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId;
|
||||
uint64_t pTimer; // start timestamp from GPU globaltimer
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
int64_t id;
|
||||
void* data;
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v5_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
|
||||
struct {
|
||||
void* data;
|
||||
} netPlugin;
|
||||
|
||||
struct {
|
||||
uint64_t pTimer;
|
||||
} kernelCh;
|
||||
} ncclProfilerEventStateArgs_v5_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// - commId : communicator id
|
||||
// - commName : user assigned communicator name
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v5_t;
|
||||
|
||||
#endif
|
||||
+231
-34
@@ -6,7 +6,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <cstring>
|
||||
#include <linux/limits.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
@@ -22,12 +22,20 @@ static int initialized; // initialization counter for profiler
|
||||
static double startTime; // profiler start time
|
||||
|
||||
static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
|
||||
static const int defaultGroupPoolSize = 16;
|
||||
static const int defaultCollPoolSize = 16;
|
||||
static const int defaultP2pPoolSize = 1024;
|
||||
static const int defaultGroupApiPoolSize = 256;
|
||||
static const int defaultCollApiPoolSize = 256;
|
||||
static const int defaultP2pApiPoolSize = 256;
|
||||
static const int defaultKernelLaunchPoolSize = 256;
|
||||
static const int defaultGroupPoolSize = 256;
|
||||
static const int defaultCollPoolSize = 256;
|
||||
static const int defaultP2pPoolSize = 256;
|
||||
static const int defaultProxyCtrlPoolSize = 16;
|
||||
static const int defaultDetachPoolSize = 128;
|
||||
static const int defaultDetachPoolSize = 256;
|
||||
|
||||
static int groupApiPoolSize;
|
||||
static int collApiPoolSize;
|
||||
static int p2pApiPoolSize;
|
||||
static int kernelLaunchPoolSize;
|
||||
static int groupPoolSize;
|
||||
static int collPoolSize;
|
||||
static int p2pPoolSize;
|
||||
@@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pid_t pid;
|
||||
static int* eActivationMaskPtr;
|
||||
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
pthread_mutex_lock(&lock);
|
||||
if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
|
||||
// first thread initializes event mask, environment and detach pool
|
||||
@@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
|
||||
str = getenv("NCCL_PROFILE_EVENT_MASK");
|
||||
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
|
||||
|
||||
str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE");
|
||||
groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize;
|
||||
|
||||
str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE");
|
||||
collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize;
|
||||
|
||||
str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE");
|
||||
p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize;
|
||||
|
||||
str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE");
|
||||
kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize;
|
||||
|
||||
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
|
||||
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
|
||||
|
||||
@@ -95,12 +115,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
|
||||
|
||||
// pre-allocate memory for event object pools in dedicated profiler context
|
||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||
if (ctx == nullptr) return ncclSystemError;
|
||||
ctx->commName = commName;
|
||||
ctx->commHash = commHash;
|
||||
ctx->commHash = commId;
|
||||
ctx->nranks = nranks;
|
||||
ctx->rank = rank;
|
||||
logFn = logfn;
|
||||
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
|
||||
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commId, nranks, rank);
|
||||
|
||||
ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool));
|
||||
if (ctx->groupApiPool == NULL) goto fail;
|
||||
|
||||
ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool));
|
||||
if (ctx->collApiPool == NULL) goto fail;
|
||||
|
||||
ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool));
|
||||
if (ctx->p2pApiPool == NULL) goto fail;
|
||||
|
||||
ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool));
|
||||
if (ctx->kernelLaunchPool == NULL) goto fail;
|
||||
|
||||
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
||||
if (ctx->groupPool == NULL) goto fail;
|
||||
@@ -130,16 +163,22 @@ fail:
|
||||
if (ctx->p2pPool) free(ctx->p2pPool);
|
||||
if (ctx->collPool) free(ctx->collPool);
|
||||
if (ctx->groupPool) free(ctx->groupPool);
|
||||
if (ctx->collApiPool) free(ctx->collApiPool);
|
||||
if (ctx->p2pApiPool) free(ctx->p2pApiPool);
|
||||
if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool);
|
||||
if (ctx->groupApiPool) free(ctx->groupApiPool);
|
||||
free(ctx);
|
||||
if (detachPool) free(detachPool);
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
static const char* profilerDumpFile;
|
||||
|
||||
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
|
||||
FILE* fh = NULL;
|
||||
char filename[PATH_MAX] = { 0 };
|
||||
struct context* ctx = (struct context *)context;
|
||||
const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
|
||||
const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE");
|
||||
if (dump) {
|
||||
sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
|
||||
fh = fopen(filename, "w");
|
||||
@@ -148,10 +187,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
|
||||
INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);
|
||||
|
||||
// print last N groups/collectives/p2ps
|
||||
int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
|
||||
int end = ctx->groupPoolIndex;
|
||||
// Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy.
|
||||
// Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example.
|
||||
int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0;
|
||||
int end = ctx->groupApiPoolIndex;
|
||||
for (int i = start; i < end; i++) {
|
||||
printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
|
||||
printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]);
|
||||
}
|
||||
|
||||
start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
|
||||
@@ -161,6 +202,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
|
||||
}
|
||||
|
||||
free(ctx->groupPool);
|
||||
free(ctx->collApiPool);
|
||||
free(ctx->p2pApiPool);
|
||||
free(ctx->kernelLaunchPool);
|
||||
free(ctx->groupApiPool);
|
||||
free(ctx->collPool);
|
||||
free(ctx->p2pPool);
|
||||
free(ctx->proxyCtrlPool);
|
||||
@@ -187,7 +232,113 @@ __hidden void updateEvent(void* handle);
|
||||
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
|
||||
*eHandle = NULL;
|
||||
struct context* ctx = (struct context *)context;
|
||||
if (eDescr->type == ncclProfileGroup) {
|
||||
if (eDescr->type == ncclProfileGroupApi) {
|
||||
struct groupApi* event;
|
||||
int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) {
|
||||
// if there are available group API events grab one
|
||||
event = &ctx->groupApiPool[groupApiId%groupApiPoolSize];
|
||||
// Make sure all child events of the picked group API event are cleared
|
||||
while (!profilerQueueEmpty(&event->collApiEvents)) {
|
||||
struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents);
|
||||
resetTaskEvents(collApiEvent, ctx);
|
||||
__atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
while (!profilerQueueEmpty(&event->p2pApiEvents)) {
|
||||
struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents);
|
||||
resetTaskEvents(p2pApiEvent, ctx);
|
||||
__atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
while (!profilerQueueEmpty(&event->kernelLaunchEvents)) {
|
||||
profilerQueueDequeue(&event->kernelLaunchEvents);
|
||||
__atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
event->type = ncclProfileGroupApi;
|
||||
event->ctx = ctx;
|
||||
event->groupApiId = groupApiId;
|
||||
event->graphCaptured = eDescr->groupApi.graphCaptured;
|
||||
event->groupDepth = eDescr->groupApi.groupDepth;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
} else if (eDescr->type == ncclProfileCollApi) {
|
||||
if (eDescr->parentObj == NULL) return ncclSuccess;
|
||||
struct collApi* event;
|
||||
int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) {
|
||||
// if there are available Coll API events grab one
|
||||
event = &ctx->collApiPool[collApiId%collApiPoolSize];
|
||||
resetTaskEvents(event, ctx);
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
event->type = ncclProfileCollApi;
|
||||
event->collApiId = collApiId;
|
||||
event->ctx = ctx;
|
||||
event->func = eDescr->collApi.func;
|
||||
event->stream = (cudaStream_t) eDescr->collApi.stream;
|
||||
event->count = eDescr->collApi.count;
|
||||
event->datatype = eDescr->collApi.datatype;
|
||||
event->root = eDescr->collApi.root;
|
||||
event->graphCaptured = eDescr->collApi.graphCaptured;
|
||||
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
|
||||
event->parent = parent;
|
||||
profilerQueueEnqueue(&parent->collApiEvents, event);
|
||||
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
|
||||
*eHandle = event;
|
||||
} else if (eDescr->type == ncclProfileP2pApi) {
|
||||
if (eDescr->parentObj == NULL) return ncclSuccess;
|
||||
struct p2pApi* event;
|
||||
int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, __ATOMIC_RELAXED)) < p2pApiPoolSize) {
|
||||
// if there are available p2p API events grab one
|
||||
event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize];
|
||||
resetTaskEvents(event, ctx);
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
event->type = ncclProfileP2pApi;
|
||||
event->p2pApiId = p2pApiId;
|
||||
event->ctx = ctx;
|
||||
event->func = eDescr->p2pApi.func;
|
||||
event->stream = (cudaStream_t) eDescr->p2pApi.stream;
|
||||
event->count = eDescr->p2pApi.count;
|
||||
event->datatype = eDescr->p2pApi.datatype;
|
||||
event->graphCaptured = eDescr->p2pApi.graphCaptured;
|
||||
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
|
||||
event->parent = parent;
|
||||
profilerQueueEnqueue(&parent->p2pApiEvents, event);
|
||||
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
|
||||
*eHandle = event;
|
||||
} else if (eDescr->type == ncclProfileKernelLaunch) {
|
||||
if (eDescr->parentObj == NULL) return ncclSuccess;
|
||||
struct kernelLaunch* event;
|
||||
int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) {
|
||||
// if there are available kernel API events grab one
|
||||
event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize];
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
event->type = ncclProfileKernelLaunch;
|
||||
event->stream = (cudaStream_t) eDescr->kernelLaunch.stream;
|
||||
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
|
||||
event->parent = parent;
|
||||
profilerQueueEnqueue(&parent->kernelLaunchEvents, event);
|
||||
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
|
||||
*eHandle = event;
|
||||
} else if (eDescr->type == ncclProfileGroup) {
|
||||
if (eDescr->parentObj == NULL) return ncclSuccess;
|
||||
struct group* event;
|
||||
int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
|
||||
@@ -222,7 +373,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
debugEvent(event, "GroupStart");
|
||||
} else if (eDescr->type == ncclProfileColl) {
|
||||
// the parent might be null if we run out of events
|
||||
struct group* parent = (struct group *)eDescr->parentObj;
|
||||
struct collApi* parent = (struct collApi *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
struct collective* event;
|
||||
@@ -253,12 +404,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->proto = eDescr->coll.proto;
|
||||
*eHandle = event;
|
||||
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
|
||||
// increment the group ref counter so the event will staty open
|
||||
// increment the group ref counter so the event will stay open
|
||||
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "CollStart");
|
||||
} else if (eDescr->type == ncclProfileP2p) {
|
||||
// the parent might be null if we run out of events
|
||||
struct group* parent = (struct group *)eDescr->parentObj;
|
||||
struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
struct p2p* event;
|
||||
@@ -458,8 +609,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
}
|
||||
|
||||
void updateEvent(void* handle) {
|
||||
uint8_t type = *(uint8_t *)handle;
|
||||
if (type == ncclProfileGroup) {
|
||||
uint64_t type = *(uint64_t *)handle;
|
||||
if (type == ncclProfileGroupApi) {
|
||||
struct groupApi* event = (struct groupApi*) handle;
|
||||
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
|
||||
event->stopTs = gettime() - startTime;
|
||||
__atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
} else if (type == ncclProfileCollApi) {
|
||||
struct collApi* event = (struct collApi*) handle;
|
||||
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
|
||||
event->stopTs = gettime() - startTime;
|
||||
__atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
updateEvent(event->parent);
|
||||
return;
|
||||
} else if (type == ncclProfileP2pApi) {
|
||||
struct p2pApi* event = (struct p2pApi*) handle;
|
||||
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
|
||||
event->stopTs = gettime() - startTime;
|
||||
__atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
updateEvent(event->parent);
|
||||
event->stopTs = gettime() - startTime;
|
||||
} else if (type == ncclProfileKernelLaunch) {
|
||||
struct kernelLaunch* event = (struct kernelLaunch*) handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
updateEvent(event->parent);
|
||||
} else if (type == ncclProfileGroup) {
|
||||
struct group* event = (struct group *)handle;
|
||||
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
|
||||
event->stopTs = gettime() - startTime;
|
||||
@@ -527,25 +704,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
|
||||
// the event handle might be null if we run out of events
|
||||
if (eHandle == NULL) return ncclSuccess;
|
||||
|
||||
uint8_t type = *(uint8_t *)eHandle;
|
||||
if (type == ncclProfileGroup) {
|
||||
// stopping the group event in NCCL core does not
|
||||
// mean the group has completed. It means the group
|
||||
// was submitted/enqueued so we need to keep the event open
|
||||
uint64_t type = *(uint64_t *)eHandle;
|
||||
// Stopping API events, Kernel Launch events, collective/p2p task events
|
||||
// in NCCL core do not mean that they are complete. It means that the
|
||||
// operation was enqueued so we need to keep the events open
|
||||
if (type == ncclProfileGroupApi) {
|
||||
struct groupApi* event = (struct groupApi*) eHandle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
} else if (type == ncclProfileCollApi) {
|
||||
struct collApi* event = (struct collApi*) eHandle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
} else if (type == ncclProfileP2pApi) {
|
||||
struct p2pApi* event = (struct p2pApi*) eHandle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
} else if (type == ncclProfileKernelLaunch) {
|
||||
struct kernelLaunch* event = (struct kernelLaunch*) eHandle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
} else if (type == ncclProfileGroup) {
|
||||
struct group* event = (struct group *)eHandle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
} else if (type == ncclProfileColl) {
|
||||
// stopping the collective event in NCCL core does not
|
||||
// mean the collective has completed. It means the collective
|
||||
// was submitted/enqueued so we need to keep the event open
|
||||
struct collective* event = (struct collective *)eHandle;
|
||||
event->base.stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
} else if (type == ncclProfileP2p) {
|
||||
// stopping the p2p event in NCCL core does not
|
||||
// mean the p2p has completed. It means the p2p
|
||||
// was submitted/enqueued so we need to keep the event open
|
||||
struct p2p* event = (struct p2p *)eHandle;
|
||||
event->base.stopTs = gettime() - startTime;
|
||||
return ncclSuccess;
|
||||
@@ -559,8 +746,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
||||
// the event handle might be null if we run out of events
|
||||
if (eHandle == NULL) return ncclSuccess;
|
||||
|
||||
uint8_t type = *(uint8_t *)eHandle;
|
||||
if (type == ncclProfileProxyOp) {
|
||||
uint64_t type = *(uint64_t *)eHandle;
|
||||
if (type == ncclProfileGroupApi) {
|
||||
struct groupApi* event = (struct groupApi*) eHandle;
|
||||
if (eState == ncclProfilerEndGroupApiStart) {
|
||||
event->endOfncclGroupStartTs = gettime() - startTime;
|
||||
} else if (eState == ncclProfilerBeginGroupApiEnd) {
|
||||
event->startOfncclGroupEndTs = gettime() - startTime;
|
||||
}
|
||||
} else if (type == ncclProfileProxyOp) {
|
||||
struct proxyOp* event = (struct proxyOp *)eHandle;
|
||||
if (eState == ncclProfilerProxyOpInProgress_v4) {
|
||||
event->progrTs = gettime() - startTime;
|
||||
@@ -592,6 +786,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
||||
case ncclProfilerProxyStepRecvGPUWait:
|
||||
event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else if (type == ncclProfileProxyCtrl) {
|
||||
struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
|
||||
@@ -609,7 +805,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclProfiler_t ncclProfiler_v4 = {
|
||||
ncclProfiler_t ncclProfiler_v5 = {
|
||||
"Example-profiler",
|
||||
exampleProfilerInit,
|
||||
exampleProfilerStartEvent,
|
||||
@@ -618,14 +814,15 @@ ncclProfiler_t ncclProfiler_v4 = {
|
||||
exampleProfilerFinalize,
|
||||
};
|
||||
|
||||
int exampleProfilerStart(int eActivationMask) {
|
||||
__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) {
|
||||
profilerDumpFile = name;
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||
__atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int exampleProfilerStop(void) {
|
||||
__attribute__((visibility("default"))) int exampleProfilerStop(void) {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||
__atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
|
||||
}
|
||||
@@ -7,7 +7,8 @@
|
||||
#ifndef PLUGIN_H_
|
||||
#define PLUGIN_H_
|
||||
|
||||
int exampleProfilerStart(int eActivationMask);
|
||||
int exampleProfilerStop(void);
|
||||
__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name);
|
||||
__attribute__((visibility("default"))) int exampleProfilerStop(void);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
+93
-6
@@ -5,15 +5,59 @@
|
||||
************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "err.h"
|
||||
#include "profiler.h"
|
||||
#include "event.h"
|
||||
#include "print_event.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))
|
||||
|
||||
// FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
|
||||
// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
|
||||
// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
|
||||
// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET)
|
||||
static __thread int groupApiId;
|
||||
__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n",
|
||||
"Group API", groupApiId, getpid(), 1, event->startTs, event->groupApiId, event->groupDepth);
|
||||
}
|
||||
|
||||
__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"Group API", groupApiId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int p2pApiId;
|
||||
__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": %s, \"GraphCaptured\":%d, \"Stream\": %p}},\n",
|
||||
event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream);
|
||||
}
|
||||
|
||||
__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
event->func, p2pApiId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int collApiId;
|
||||
__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": %s, \"root\": %d, \"GraphCaptured\":%d, \"Stream\": %p}},\n",
|
||||
event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream);
|
||||
}
|
||||
|
||||
__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
event->func, collApiId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int kernelLaunchId;
|
||||
__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": %p}},\n", "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream);
|
||||
}
|
||||
|
||||
__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "KernelLaunch", kernelLaunchId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int groupId;
|
||||
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
|
||||
@@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
|
||||
static __thread int collId;
|
||||
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
|
||||
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
|
||||
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
|
||||
}
|
||||
|
||||
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
|
||||
@@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
|
||||
static __thread int p2pId;
|
||||
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
|
||||
event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
|
||||
event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
|
||||
}
|
||||
|
||||
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
|
||||
@@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
char filename[64] = { 0 };
|
||||
sprintf(filename, "EventDebug-%d", getpid());
|
||||
FILE* fh = fopen(filename, "a+");
|
||||
uint8_t type = *(uint8_t *)eHandle;
|
||||
uint64_t type = *(uint64_t *)eHandle;
|
||||
if (type == ncclProfileGroup) {
|
||||
struct group* event = (struct group *)eHandle;
|
||||
fprintf(fh, "Group event %p tag = %s {\n", event, tag);
|
||||
@@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
|
||||
void printEvent(FILE* fh, void* handle) {
|
||||
if (handle == NULL || fh == NULL) return;
|
||||
uint8_t type = *(uint8_t *)handle;
|
||||
if (type == ncclProfileGroup) {
|
||||
uint64_t type = *(uint64_t *)handle;
|
||||
if (type == ncclProfileGroupApi) {
|
||||
struct groupApi* g = (struct groupApi*) handle;
|
||||
printGroupApiEventHeader(fh, g);
|
||||
struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents);
|
||||
while (kernelLaunchHead != NULL) {
|
||||
printEvent(fh, kernelLaunchHead);
|
||||
kernelLaunchHead = kernelLaunchHead->next;
|
||||
}
|
||||
struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents);
|
||||
while (collApiHead != NULL) {
|
||||
printEvent(fh, collApiHead);
|
||||
collApiHead = collApiHead->next;
|
||||
}
|
||||
struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents);
|
||||
while (p2pApiHead != NULL) {
|
||||
printEvent(fh, p2pApiHead);
|
||||
p2pApiHead = p2pApiHead->next;
|
||||
}
|
||||
printGroupApiEventTrailer(fh, g);
|
||||
} else if (type == ncclProfileCollApi) {
|
||||
struct collApi* collApiEvent = (struct collApi *) handle;
|
||||
printCollApiEventHeader(fh, collApiEvent);
|
||||
struct taskEventBase* base = taskEventQueueHead(collApiEvent);
|
||||
while (base) {
|
||||
struct taskEventBase* next = base->next;
|
||||
printEvent(fh, base);
|
||||
base = next;
|
||||
}
|
||||
printCollApiEventTrailer(fh, collApiEvent);
|
||||
} else if (type == ncclProfileP2pApi) {
|
||||
struct p2pApi* p2pApiEvent = (struct p2pApi *) handle;
|
||||
printP2pApiEventHeader(fh, p2pApiEvent);
|
||||
struct taskEventBase* base = taskEventQueueHead(p2pApiEvent);
|
||||
while (base) {
|
||||
struct taskEventBase* next = base->next;
|
||||
printEvent(fh, base);
|
||||
base = next;
|
||||
}
|
||||
printP2pApiEventTrailer(fh, p2pApiEvent);
|
||||
} else if (type == ncclProfileKernelLaunch) {
|
||||
struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle;
|
||||
printKernelLaunchEventHeader(fh, kernelLaunchEvent);
|
||||
printKernelLaunchEventTrailer(fh, kernelLaunchEvent);
|
||||
} else if (type == ncclProfileGroup) {
|
||||
struct group* g = (struct group *)handle;
|
||||
printGroupEventHeader(fh, g);
|
||||
struct taskEventBase* base = taskEventQueueHead(g);
|
||||
@@ -0,0 +1,50 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef QUEUE_H
|
||||
#define QUEUE_H
|
||||
|
||||
// Intrusive singly-linked FIFO. The elements themselves carry the link: `next`
// is a pointer-to-member selecting which T* field of T chains the elements.
// The queue stores only the two end pointers and never allocates.
template<typename T, T *T::*next>
struct profilerQueue {
  T *head, *tail;
};

// Puts the queue into the empty state (both ends null).
template<typename T, T *T::*next>
inline void profilerQueueConstruct(profilerQueue<T,next> *me) {
  me->head = nullptr;
  me->tail = nullptr;
}

// Returns true when the queue holds no elements.
template<typename T, T *T::*next>
inline bool profilerQueueEmpty(profilerQueue<T,next> *me) {
  return me->head == nullptr;
}

// Returns the oldest element without removing it, or nullptr when empty.
template<typename T, T *T::*next>
inline T* profilerQueueHead(profilerQueue<T,next> *me) {
  return me->head;
}

// Returns the stored tail pointer without removing the element.
template<typename T, T *T::*next>
inline T* profilerQueueTail(profilerQueue<T,next> *me) {
  return me->tail;
}

// Appends x at the tail. x's link field is reset first, so x must not
// currently be a member of another queue that uses the same link field.
template<typename T, T *T::*next>
inline void profilerQueueEnqueue(profilerQueue<T,next> *me, T *x) {
  x->*next = nullptr;
  if (me->head == nullptr) {
    me->head = x;          // first element: it is both head and tail
  } else {
    me->tail->*next = x;   // chain behind the current tail
  }
  me->tail = x;
}

// Removes and returns the oldest element, or nullptr when the queue is empty
// (previously an empty queue caused a null-pointer dereference).
template<typename T, T *T::*next>
inline T* profilerQueueDequeue(profilerQueue<T,next> *me) {
  T *ans = me->head;
  if (ans == nullptr) return nullptr;
  me->head = ans->*next;
  // Removing the last element leaves no valid tail either.
  if (me->head == nullptr) me->tail = nullptr;
  return ans;
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,22 @@
|
||||
# Clones the CoMMA repository next to this Makefile, links the NCCL
# ext-profiler sources into it and builds it with cargo.
#
# None of these targets name a real file, so all of them are phony.
.PHONY: all build-CoMMA clone-CoMMA clean delete

all: build-CoMMA

build-CoMMA: clone-CoMMA
	cd CoMMA && cargo build

# Clone only when the checkout is missing. $(CURDIR) is used instead of $(PWD)
# because PWD is inherited from the environment and does not follow `make -C`.
# The symlink is created only if the clone succeeded.
clone-CoMMA:
	@if [ ! -d CoMMA ] ; then \
		git clone https://github.com/google/CoMMA.git && \
		ln -s "$(CURDIR)/.." CoMMA/third_party/nccl/ext-profiler; \
	fi

# Remove cargo build products but keep the checkout.
clean:
	@if [ -d CoMMA ] ; then \
		cd CoMMA && cargo clean; \
	fi

# Remove the whole checkout.
delete:
	@if [ -d CoMMA ] ; then \
		rm -rf CoMMA; \
	fi
|
||||
@@ -0,0 +1,62 @@
|
||||
#
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
# Variables
# NCCL_HOME uses ?= so that a value from the environment or the make command
# line takes precedence over the in-tree default.
NCCL_HOME ?= ../../build
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO := libnccl-profiler-inspector.so
VERSION_FILE := version.cc

# Compiler and flags
CXX := g++
CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra

ifeq ($(DEBUG), 1)
CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer
endif

# NOTE(review): LDFLAGS and NVLDFLAGS are extended below but are not referenced
# by the link rule in this Makefile; only the CXXFLAGS additions take effect.
ifeq ($(ASAN), 1)
CXXFLAGS += -fsanitize=address
LDFLAGS += -fsanitize=address -static-libasan
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
endif

ifeq ($(UBSAN), 1)
CXXFLAGS += -fsanitize=undefined
LDFLAGS += -fsanitize=undefined -static-libubsan
NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
endif

# Source files
SOURCES := inspector_plugin.cc inspector.cc json.cc

# Default target
all: $(PLUGIN_SO)

# Rule to build the plugin
$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES)
	@echo "Compiling to create $@ from $^"
	$(CXX) $(INC) $(CXXFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^

# Rule to generate version.cc
# FORCE makes this recipe run on every invocation so the embedded git version
# cannot go stale. The file is replaced only when its content changes, which
# keeps its timestamp stable and avoids relinking the plugin needlessly.
# cmp -s with stderr discarded keeps the first build (no version.cc yet) quiet.
$(VERSION_FILE): FORCE
	@GIT_INFO=$$(./utils/extract_git_version.sh); \
	echo '#include "version.h"' > $(VERSION_FILE).tmp; \
	echo 'const char* get_git_version_info() { return "'"$$GIT_INFO"'"; }' >> $(VERSION_FILE).tmp; \
	if ! cmp -s $(VERSION_FILE).tmp $(VERSION_FILE) 2>/dev/null; then \
		echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \
		mv $(VERSION_FILE).tmp $(VERSION_FILE); \
	else \
		echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \
		rm $(VERSION_FILE).tmp; \
	fi

# Empty target used only to force the version check above.
FORCE:

# Clean target
clean:
	rm -f $(VERSION_FILE) $(PLUGIN_SO)

# Phony targets
.PHONY: all clean FORCE
|
||||
@@ -0,0 +1,216 @@
|
||||
# NCCL Inspector Plugin
|
||||
|
||||
The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation.
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs
|
||||
|
||||
## Folder Location
|
||||
|
||||
The Inspector plugin source is located in:
|
||||
|
||||
```
|
||||
ext-profiler/inspector/
|
||||
```
|
||||
|
||||
## Building the Inspector Plugin
|
||||
|
||||
To build the Inspector plugin, run:
|
||||
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
The Makefile looks for NCCL headers under `NCCL_HOME` (default: `../../build`) and for CUDA headers under `CUDA_HOME`. If your installations live elsewhere, set `CUDA_HOME` in your environment or pass both paths as make arguments, for example `make NCCL_HOME=/path/to/nccl/build CUDA_HOME=/usr/local/cuda`.
|
||||
|
||||
### Build Options
|
||||
|
||||
The Makefile supports several build options:
|
||||
|
||||
- **DEBUG=1**: Enable debug build with additional debugging information
|
||||
- **ASAN=1**: Enable Address Sanitizer for memory error detection
|
||||
- **UBSAN=1**: Enable Undefined Behavior Sanitizer
|
||||
|
||||
Example debug build:
|
||||
```bash
|
||||
make DEBUG=1
|
||||
```
|
||||
|
||||
### Build Output
|
||||
|
||||
The build process creates:
|
||||
- `libnccl-profiler-inspector.so`: The main inspector plugin library
|
||||
- `version.cc`: Auto-generated version information from git
|
||||
|
||||
## Using NCCL Inspector
|
||||
|
||||
### Key Differences from Normal NCCL Usage
|
||||
|
||||
The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging:
|
||||
|
||||
**Normal NCCL Run:**
|
||||
```bash
|
||||
# Standard NCCL execution
|
||||
./your_nccl_application
|
||||
```
|
||||
|
||||
**NCCL Inspector Run:**
|
||||
```bash
|
||||
# NCCL Inspector enabled execution
|
||||
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
|
||||
export NCCL_INSPECTOR_ENABLE=1
|
||||
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
|
||||
./your_nccl_application
|
||||
```
|
||||
|
||||
### Required Environment Variables
|
||||
|
||||
- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so`
|
||||
Loads the Inspector plugin into NCCL.
|
||||
- `NCCL_INSPECTOR_ENABLE=1`
|
||||
Enables the Inspector plugin.
|
||||
- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=<interval>`
|
||||
Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`.
|
||||
- `NCCL_INSPECTOR_DUMP_DIR=<output_dir>` (optional)
|
||||
Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-<slurm_job_id>` if running under SLURM.
|
||||
- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional)
|
||||
Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default).
|
||||
|
||||
### Example Usage
|
||||
|
||||
**Single Node:**
|
||||
```bash
|
||||
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
|
||||
export NCCL_INSPECTOR_ENABLE=1
|
||||
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
|
||||
./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8
|
||||
```
|
||||
|
||||
**Multi-Node (SLURM):**
|
||||
```bash
|
||||
# Add these environment variables to your SLURM script
|
||||
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
|
||||
export NCCL_INSPECTOR_ENABLE=1
|
||||
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
|
||||
export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/
|
||||
|
||||
# Then run your normal NCCL application
|
||||
srun your_nccl_application
|
||||
```
|
||||
|
||||
## Example Scripts
|
||||
|
||||
For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory:
|
||||
|
||||
- **Single Node Example**: Basic NCCL performance testing with inspector
|
||||
- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations
|
||||
- **Training Workload Example**: Integration with distributed training workloads
|
||||
|
||||
## Output Example
|
||||
|
||||
Each output file contains JSON objects with the following structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"header": {
|
||||
"id": "0x7f8c496ae9f661",
|
||||
"rank": 2,
|
||||
"n_ranks": 8,
|
||||
"nnodes": 1
|
||||
},
|
||||
"metadata": {
|
||||
"inspector_output_format_version": "v4.0",
|
||||
"git_rev": "",
|
||||
"rec_mechanism": "profiler_plugin",
|
||||
"dump_timestamp_us": 1748030377748202,
|
||||
"hostname": "example-hostname",
|
||||
"pid": 1639453
|
||||
},
|
||||
"coll_perf": {
|
||||
"coll": "AllReduce",
|
||||
"coll_sn": 1407,
|
||||
"coll_msg_size_bytes": 17179869184,
|
||||
"coll_exec_time_us": 61974,
|
||||
"coll_algobw_gbs": 277.210914,
|
||||
"coll_busbw_gbs": 485.119099
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Output Example Verbose
|
||||
|
||||
To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable:
|
||||
|
||||
```bash
|
||||
export NCCL_INSPECTOR_DUMP_VERBOSE=1
|
||||
```
|
||||
|
||||
This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event.
|
||||
|
||||
```json
|
||||
{
|
||||
"header": {
|
||||
"id": "0xe62dedaa97644a",
|
||||
"rank": 4,
|
||||
"n_ranks": 8,
|
||||
"nnodes": 1
|
||||
},
|
||||
"metadata": {
|
||||
"inspector_output_format_version": "v4.0",
|
||||
"git_rev": "9019a1912-dirty",
|
||||
"rec_mechanism": "nccl_profiler_interface",
|
||||
"dump_timestamp_us": 1752867229276385,
|
||||
"hostname": "example-hostname",
|
||||
"pid": 438776
|
||||
},
|
||||
"coll_perf": {
|
||||
"coll": "ReduceScatter",
|
||||
"coll_sn": 1231,
|
||||
"coll_msg_size_bytes": 2147483648,
|
||||
"coll_exec_time_us": 41057,
|
||||
"coll_timing_source": "kernel_gpu",
|
||||
"coll_algobw_gbs": 418.439467,
|
||||
"coll_busbw_gbs": 366.134533,
|
||||
"event_trace_sn": {
|
||||
"coll_start_sn": 1,
|
||||
"coll_stop_sn": 2,
|
||||
"kernel_events": [
|
||||
{
|
||||
"channel_id": 0,
|
||||
"kernel_start_sn": 3,
|
||||
"kernel_stop_sn": 48,
|
||||
"kernel_record_sn": 47
|
||||
}
|
||||
]
|
||||
},
|
||||
"event_trace_ts": {
|
||||
"coll_start_ts": 1752867229235059,
|
||||
"coll_stop_ts": 1752867229235064,
|
||||
"kernel_events": [
|
||||
{
|
||||
"channel_id": 0,
|
||||
"kernel_start_ts": 1752867229235181,
|
||||
"kernel_stop_ts": 1752867229275811,
|
||||
"kernel_record_ts": 1752867229275811
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Multiple such JSON objects are written, one per collective operation per communicator.
|
||||
|
||||
## Output Directory
|
||||
|
||||
- By default, output files are written to:
|
||||
- `nccl-inspector-unknown-jobid` (if no SLURM job ID is present)
|
||||
- `nccl-inspector-<slurm_job_id>` (if running under SLURM)
|
||||
- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable.
|
||||
|
||||
## Additional Notes
|
||||
|
||||
- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments.
|
||||
- For more details, see the source code and comments in `ext-profiler/inspector/`.
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
# NCCL Inspector Performance Summary Exporter
|
||||
|
||||
This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries.
|
||||
Similar exporters can be built to integrate with observability systems such as Elastic, Prometheus, or other custom metrics systems.
|
||||
|
||||
## Features
|
||||
|
||||
- **Performance Analysis**: Generates statistical summaries for collective operations
|
||||
- **Communication Type Classification**: Automatically categorizes communication patterns
|
||||
- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics
|
||||
- **Data Export**: Converts logs to Parquet format for efficient processing
|
||||
- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files
|
||||
- **Parallel Processing**: Utilizes multi-core processing for faster analysis
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.7+
|
||||
- Access to NCCL Inspector log files
|
||||
|
||||
## Installation
|
||||
|
||||
### Clone the Repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/NVIDIA/nccl.git
|
||||
cd nccl/ext-profiler/inspector/exporter/example
|
||||
```
|
||||
|
||||
Install the required dependencies using the provided `requirements.txt` file:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
The script processes NCCL Inspector log files from a specified directory.
|
||||
|
||||
**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files will be output to a directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../../README.md).
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```bash
|
||||
python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs
|
||||
```
|
||||
|
||||
This mode processes all log files in the specified directory and its subdirectories recursively.
|
||||
|
||||
### Command Line Arguments
|
||||
|
||||
- `--input_dir <path>`: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories)
|
||||
- `--output_dir <name>`: **Optional**. Custom output directory name (default: `<input_directory_name>-analysis`)
|
||||
|
||||
## Output
|
||||
|
||||
The tool generates:
|
||||
|
||||
1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory)
|
||||
2. **Summary Directory**: Contains comprehensive analysis results
|
||||
3. **Visualizations**: Scatter plots, histograms, and box plots for each message size
|
||||
4. **CSV Files**: Detailed summaries for each message size and collective type
|
||||
5. **Log File**: Processing log with detailed information
|
||||
|
||||
## Example Output Structure
|
||||
|
||||
```
|
||||
<output_dir_name>/
|
||||
├── output.log
|
||||
├── parquet_files/
|
||||
│ ├── <filename1>.parquet
|
||||
│ ├── <filename2>.parquet
|
||||
│ └── ...
|
||||
└── summary/
|
||||
├── scatter_plot_<comm_type>_<coll_type>.png
|
||||
├── combined_scatter_plot_<comm_type>_<coll_type>.png
|
||||
└── msg_size_<size_in_bytes>_<human_readable_size>/
|
||||
├── histograms/
|
||||
│ └── histogram_<comm_type>_<coll_type>_<size>.png
|
||||
├── boxplots/
|
||||
│ └── boxplot_<comm_type>_<coll_type>_<size>.png
|
||||
└── summary_<comm_type>_<coll_type>_<size>.csv
|
||||
```
|
||||
|
||||
## Supported Communicator Types
|
||||
|
||||
- `single-rank`
|
||||
- `nvlink-only`
|
||||
- `hca-only`
|
||||
- `mixed`
|
||||
|
||||
## Supported Collective Types
|
||||
|
||||
- `AllReduce`
|
||||
- `AllGather`
|
||||
- `ReduceScatter`
|
||||
- `Broadcast`
|
||||
|
||||
## Log File Formats
|
||||
|
||||
### Supported Formats
|
||||
|
||||
- `.log` - Plain text JSON lines
|
||||
- `.log.gz` - Compressed JSON lines
|
||||
- `.jsonl` - JSON lines format
|
||||
- `.jsonl.gz` - Compressed JSON lines
|
||||
|
||||
### Expected JSON Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"header": {
|
||||
"id": "0x9e7a479f95a66c",
|
||||
"rank": 31,
|
||||
"n_ranks": 32,
|
||||
"nnodes": 4
|
||||
},
|
||||
"metadata": {
|
||||
"inspector_output_format_version": "v4.0",
|
||||
"git_rev": "75e61acda-dirty",
|
||||
"rec_mechanism": "nccl_profiler_interface",
|
||||
"dump_timestamp_us": 1749490229087081,
|
||||
"hostname": "example-hostname",
|
||||
"pid": 468528
|
||||
},
|
||||
"coll_perf": {
|
||||
"coll": "ReduceScatter",
|
||||
"coll_sn": 129,
|
||||
"coll_msg_size_bytes": 65536,
|
||||
"coll_exec_time_us": 110,
|
||||
"coll_timing_source": "kernel_gpu",
|
||||
"coll_algobw_gbs": 19.065018,
|
||||
"coll_busbw_gbs": 18.469236
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **No log files found**: Ensure the log directory path is correct and contains valid log files
|
||||
2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment
|
||||
3. **Mixed file formats**: The tool will exit if it finds more than one of the `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` formats anywhere under the input directory (subdirectories are searched recursively). This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings.
|
||||
|
||||
### Log Files
|
||||
|
||||
The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages.
|
||||
|
||||
## Support
|
||||
|
||||
Please refer to the GitHub issues page at https://github.com/NVIDIA/nccl/issues. Your question may already have been asked by another user. If not, feel free to create a new issue and mention the "inspector plugin" in the title.
|
||||
@@ -0,0 +1,548 @@
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import glob
|
||||
import gzip
|
||||
import sys
|
||||
import pandas as pd
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
import json
|
||||
from tqdm.auto import tqdm
|
||||
import duckdb
|
||||
import math
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates
|
||||
from matplotlib.gridspec import GridSpec
|
||||
import os
|
||||
import logging
|
||||
import contextlib
|
||||
from datetime import datetime
|
||||
import numpy as np
|
||||
|
||||
def setup_logging(output_dir):
    """
    Route root-logger output to ``<output_dir>/output.log`` at INFO level.

    Args:
        output_dir: ``pathlib.Path`` of an existing directory; the log file
            is created inside it.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        filename=output_dir / "output.log",
    )
|
||||
|
||||
|
||||
@contextlib.contextmanager
def smart_open(filename, mode="r"):
    """
    Open a plain or gzip-compressed file with the same mode semantics.

    Files whose name ends in ``.gz`` are opened through ``gzip.open``; every
    other file goes through the built-in ``open``.

    Args:
        filename: Path of the file to open (``str`` or ``os.PathLike``).
        mode: File mode. As with ``open()``, a mode without ``"b"`` means
            text mode.

    Yields:
        The open file object; it is closed when the ``with`` block exits.
    """
    path = os.fspath(filename)
    if path.endswith(".gz"):
        if "b" in mode:
            handle = gzip.open(path, mode)
        else:
            # gzip.open() treats a bare "r"/"w"/"a" as *binary*, unlike open().
            # Ask for text explicitly so callers get str lines either way.
            # UTF-8 matches how json.loads() decoded the raw bytes before.
            text_mode = mode if "t" in mode else mode + "t"
            handle = gzip.open(path, text_mode, encoding="utf-8")
    else:
        handle = open(path, mode)

    with handle as f:
        yield f
|
||||
|
||||
|
||||
def get_log_files_and_output_dir():
    """
    Parse the command line and collect the NCCL Inspector logs to process.

    ``--input_dir`` is searched recursively for ``*.log``, ``*.log.gz``,
    ``*.jsonl`` and ``*.jsonl.gz`` files.

    Returns:
        Tuple ``(files, output_dir_name)``: the matching paths as strings,
        and the output directory name (``--output_dir`` when given, otherwise
        ``<input directory name>-analysis``).

    Raises:
        FileNotFoundError: ``--input_dir`` does not exist.
        SystemExit: ``--input_dir`` was not supplied (argparse usage error),
            more than one log format is present, or no log file was found.
    """
    parser = argparse.ArgumentParser(description="Process log files in a directory.")
    parser.add_argument(
        "--input_dir",
        type=str,
        # Without this, omitting the flag fell through to an unbound `files`.
        required=True,
        help="The directory containing NCCL Inspector log files to process.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Custom output directory name (default: auto-generated from input directory)."
    )
    args = parser.parse_args()

    root_dir = Path(args.input_dir)
    if not root_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {root_dir}")

    # One result list per supported format, in a fixed order.
    patterns = ("*.log", "*.log.gz", "*.jsonl", "*.jsonl.gz")
    matches = [
        list(glob.iglob(str(root_dir / "**" / pattern), recursive=True))
        for pattern in patterns
    ]

    if sum(1 for found in matches if found) > 1:
        ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail
        logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!")
        sys.exit(1)

    files = [path for found in matches for path in found]
    if not files:
        print("No inspector logs found")
        sys.exit(1)

    if args.output_dir:
        output_dir_name = args.output_dir
    else:
        # Path(".").name is "", which would produce "-analysis"; fall back to
        # the absolute path's last component in that case.
        base_name = root_dir.name or Path(os.path.abspath(root_dir)).name
        output_dir_name = f"{base_name}-analysis"

    return files, output_dir_name
|
||||
|
||||
def bytes_to_human_readable(size_bytes):
    """
    Convert bytes to human-readable format using decimal (SI) units.

    Uses powers of 1000 (decimal/SI standard):
    - 1 KB = 1,000 bytes
    - 1 MB = 1,000,000 bytes
    - 1 GB = 1,000,000,000 bytes

    Not binary units (powers of 1024):
    - Does NOT use KiB, MiB, GiB (1024-based)

    Args:
        size_bytes: Number of bytes to convert (non-negative)

    Returns:
        Human-readable string (e.g., "1.50MB", "2.34GB")

    Raises:
        ValueError: if size_bytes is negative
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    value = float(size_bytes)
    last = len(size_name) - 1

    # Pick the unit from the *rounded* value so 999,999 bytes becomes
    # "1.00MB" rather than "1000.00KB". Stop at the largest known unit
    # instead of indexing past the table.
    i = 0
    while i < last and round(value / 1000.0 ** i, 2) >= 1000:
        i += 1

    return f"{value / 1000.0 ** i:.2f}{size_name[i]}"
|
||||
|
||||
def timestamp_to_datetime(timestamp_us):
    """
    Format a microsecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS.mmm``.

    The conversion uses ``datetime.fromtimestamp`` without a timezone, so
    the result is in the local time of the machine running the script.
    """
    moment = datetime.fromtimestamp(timestamp_us / 1000000)
    with_micros = moment.strftime('%Y-%m-%d %H:%M:%S.%f')
    # Drop the last three digits: microseconds -> milliseconds.
    return with_micros[:-3]
|
||||
|
||||
def microseconds_to_human_readable(microseconds):
    """
    Render a duration given in microseconds with one decimal place.

    Values below 1,000 stay in microseconds, values below 1,000,000 are
    shown in milliseconds, and everything else in seconds.
    """
    if microseconds < 1000:
        scaled, unit = microseconds, "μs"
    elif microseconds < 1000000:
        scaled, unit = microseconds / 1000, "ms"
    else:
        scaled, unit = microseconds / 1000000, "s"
    return f"{scaled:.1f}{unit}"
|
||||
|
||||
def get_comm_type(row) -> str:
    """
    Classify a communicator from its rank and node counts.

    Args:
        row: Mapping with ``"n_ranks"`` and ``"nnodes"`` entries.

    Returns:
        ``"single-rank"`` for one rank, ``"nvlink-only"`` for one node,
        ``"hca-only"`` when there is exactly one rank per node, and
        ``"mixed"`` otherwise.
    """
    ranks = row["n_ranks"]
    if ranks == 1:
        return "single-rank"

    nodes = row["nnodes"]
    if nodes == 1:
        return "nvlink-only"

    return "hca-only" if ranks == nodes else "mixed"
|
||||
|
||||
def parse_file(filepath: Path, output_dir):
    """
    Convert one NCCL Inspector log (JSON lines) into a parquet file.

    Each valid line contributes one row that merges the record's ``header``,
    ``coll_perf`` and ``metadata`` objects plus a derived ``comm_type``.
    Conversion is skipped when an up-to-date parquet file already exists,
    when the source is empty, or when no valid record is found.

    Args:
        filepath: Path of the log file (plain or ``.gz``).
        output_dir: ``pathlib.Path`` of the directory receiving
            ``<stem>.parquet``.

    Returns:
        None. Problems are logged rather than raised.
    """
    # NOTE(review): the parquet name uses only the file stem, so same-named
    # logs in different subdirectories map to one output file -- confirm
    # whether inspector file names are unique across a run.
    filename = Path(filepath).stem
    parquet_file = output_dir / f"{filename}.parquet"

    # Check if parquet file exists and is newer than source file
    if parquet_file.exists():
        source_mtime = Path(filepath).stat().st_mtime
        parquet_mtime = parquet_file.stat().st_mtime
        if parquet_mtime >= source_mtime:
            logging.info(f"Parquet file {parquet_file} is up to date. Skipping...")
            return
        else:
            logging.info(f"Source file {filepath} is newer than parquet. Regenerating...")

    # Check if file is empty or too small
    file_size = Path(filepath).stat().st_size
    if file_size == 0:
        logging.warning(f"Skipping empty file: {filepath}")
        return

    recs = []
    try:
        with smart_open(str(filepath), "r") as infile:
            for lineno, line in enumerate(infile):
                # Blank lines are not records; skip them quietly.
                if not line.strip():
                    continue

                try:
                    json_recs = json.loads(line)
                except ValueError:
                    # json.JSONDecodeError and UnicodeDecodeError both derive
                    # from ValueError.
                    logging.error(f"Failed to parse line {filepath}:{lineno}")
                    continue

                # Validate that required fields exist
                if not isinstance(json_recs, dict) or not all(
                    key in json_recs for key in ["header", "metadata", "coll_perf"]
                ):
                    logging.error(f"Missing required fields in {filepath}:{lineno}")
                    continue

                # A single malformed record (missing rank/node counts, a
                # non-object section, or a key repeated across sections) must
                # not discard the rest of the file.
                try:
                    header = json_recs["header"]
                    metadata = json_recs["metadata"]
                    comm_type = get_comm_type(header)
                    coll_perf = json_recs["coll_perf"]
                    recs.append(
                        dict(
                            **header,
                            comm_type=comm_type,
                            **coll_perf,
                            **metadata,
                        )
                    )
                except (KeyError, TypeError) as e:
                    logging.error(f"Malformed record in {filepath}:{lineno}: {e}")
                    continue
    except Exception as e:
        # Read-level failures (I/O error, truncated gzip stream, ...).
        logging.error(f"Error reading file {filepath}: {e}")
        return

    # Skip files with no valid records
    if not recs:
        logging.warning(f"No valid records found in file: {filepath}. Skipping...")
        return

    df = pd.DataFrame(recs)
    df.to_parquet(parquet_file)
    logging.info(f"Created parquet file {parquet_file} with {len(recs)} records")
|
||||
|
||||
def create_per_node_parquet_files(files, output_dir):
    """
    Convert every log file to parquet in parallel worker processes.

    Args:
        files: List of log file paths.
        output_dir: Analysis output directory; parquet files are written to
            its ``parquet_files`` subdirectory, which is created if missing.

    Returns:
        ``pathlib.Path`` of the ``parquet_files`` directory.
    """
    output_dir = Path(output_dir) / "parquet_files"
    output_dir.mkdir(parents=True, exist_ok=True)

    # ProcessPoolExecutor rejects max_workers=0, which min() would produce
    # for an empty file list.
    max_workers = max(1, min(64, len(files), os.cpu_count() or 1))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map so the progress bar advances and all
        # work completes before the pool shuts down.
        list(
            tqdm(
                executor.map(parse_file, files, [output_dir] * len(files)),
                total=len(files),
                desc="Processing files",
                unit="file",
            )
        )
    return output_dir
|
||||
|
||||
def generate_scatter_plot(df, comm_type, coll_type, output_file):
    """
    Save a scatter plot of mean bus bandwidth versus sequence number.

    One series is drawn per distinct ``coll_msg_size_bytes`` value; each
    legend entry carries that series' mean bandwidth.

    Args:
        df: DataFrame with ``coll_msg_size_bytes``, ``coll_sn`` and
            ``mean_coll_busbw_gbs`` columns.
        comm_type: Communicator type, used in the title.
        coll_type: Collective type, used in the title.
        output_file: Destination image path.
    """
    plt.figure(figsize=(10, 6), dpi=100)

    size_column = df["coll_msg_size_bytes"]
    for size in size_column.unique():
        subset = df[size_column == size]
        busbw = subset["mean_coll_busbw_gbs"]
        plt.scatter(
            subset["coll_sn"],
            busbw,
            label=f"MsgSize: {bytes_to_human_readable(size)} (Mean: {busbw.mean():.2f} GB/s)",
            alpha=0.5,
        )

    plt.xlabel("Operation Sequence Number")
    plt.ylabel("Mean Collective Bus BW (GB/s)")
    plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}")
    plt.legend(title="Message Size", loc="upper right")
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Scatter plot saved to {output_file}")
|
||||
|
||||
def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3):
    """
    Save a grid of scatter plots, one subplot per message size.

    Args:
        df: DataFrame with ``coll_msg_size_bytes``, ``coll_sn`` and
            ``mean_coll_busbw_gbs`` columns.
        comm_type: Communicator type, used in the figure title.
        coll_type: Collective type, used in the figure title.
        output_file: Destination image path.
        max_cols: Maximum number of subplot columns.
    """
    sizes = df["coll_msg_size_bytes"].unique()
    total = len(sizes)

    # Grid shape: at most max_cols columns, with as many rows as needed.
    num_cols = min(max_cols, total)
    num_rows = (total + num_cols - 1) // num_cols

    fig = plt.figure(figsize=(5 * num_cols, 5 * num_rows), dpi=100)
    grid = GridSpec(num_rows, num_cols, figure=fig)

    for index, size in enumerate(sizes):
        # Fill the grid row by row.
        ax = fig.add_subplot(grid[index // num_cols, index % num_cols])

        subset = df[df["coll_msg_size_bytes"] == size]
        busbw = subset["mean_coll_busbw_gbs"]
        ax.scatter(
            subset["coll_sn"],
            busbw,
            label=f"MsgSize: {bytes_to_human_readable(size)} (Mean: {busbw.mean():.2f} GB/s)",
            alpha=0.5,
        )
        ax.set_xlabel("Op Seq No")
        ax.set_ylabel("Mean Collective Bus BW (GB/s)")
        ax.set_title(f"Message Size: {bytes_to_human_readable(size)}({size})")
        ax.legend(loc="upper right")

    fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Combined scatter plot saved to {output_file}")
|
||||
|
||||
def generate_histogram(df, comm_type, coll_type, output_file, message_size):
    """
    Save a histogram of ``mean_coll_busbw_gbs``.

    The bin count is the integer width of the data range plus one, capped
    at 50.

    Args:
        df: DataFrame with a ``mean_coll_busbw_gbs`` column.
        comm_type: Communicator type, used in the title.
        coll_type: Collective type, used in the title.
        output_file: Destination image path.
        message_size: Message-size label shown in the title.
    """
    plt.figure(figsize=(10, 6), dpi=100)

    busbw = df["mean_coll_busbw_gbs"]
    spread = busbw.max() - busbw.min()
    plt.hist(
        busbw,
        bins=min(50, int(spread) + 1),
        alpha=0.7,
        color="b",
        edgecolor="black",
        linewidth=1.2,
    )
    plt.xlabel("Mean Collective Bus BW (GB/s)")
    plt.ylabel("Frequency")
    plt.title(
        f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}"
    )

    # Whole-number counts on y; bandwidth with an explicit unit on x.
    axes = plt.gca()
    axes.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}"))
    axes.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f} GB/s"))
    axes.xaxis.get_offset_text().set_visible(False)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Histogram saved to {output_file}")
|
||||
|
||||
def generate_boxplot(df, comm_type, coll_type, output_file, message_size):
    """
    Save a horizontal box plot of ``mean_coll_busbw_gbs`` with the minimum,
    median and maximum annotated.

    Args:
        df: DataFrame with a ``mean_coll_busbw_gbs`` column.
        comm_type: Communicator type, used in the title.
        coll_type: Collective type, used in the title.
        output_file: Destination image path.
        message_size: Message-size label shown in the title.
    """
    busbw = df["mean_coll_busbw_gbs"]

    plt.figure(figsize=(10, 6))
    plt.boxplot(
        busbw,
        vert=False,
        patch_artist=True,
        boxprops=dict(linestyle="-", linewidth=2, color="blue"),
        flierprops=dict(marker="o", color="red", alpha=0.5),
        medianprops=dict(linestyle="-", linewidth=2.5, color="orange"),
        whiskerprops=dict(linestyle="--", linewidth=2, color="green"),
        capprops=dict(linestyle="-", linewidth=2, color="black"),
    )

    plt.xlabel("Mean Coll Bus BW (GB/s)")
    plt.title(
        f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})"
    )

    # Adding labels for min, max, and median
    stats = busbw.describe(percentiles=[0.5])
    for caption, key in (("Min", "min"), ("Median", "50%"), ("Max", "max")):
        value = stats[key]
        plt.annotate(
            f"{caption}: {value:.2f}",
            xy=(value, 1),
            xytext=(value, 1.1),
            arrowprops=dict(facecolor="black", shrink=0.05),
        )

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Box plot saved to {output_file}")
|
||||
|
||||
|
||||
def _move_to_invalid(pf, invalid_dir):
    """Move ``pf`` into ``invalid_dir``; return True only if this call moved it."""
    try:
        pf.rename(invalid_dir / pf.name)
    except FileNotFoundError:
        # Another summary worker quarantined the same file first.
        return False
    except OSError as e:
        logging.error(f"Could not move {pf} to {invalid_dir}: {e}")
        return False
    return True


def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name):
    """
    Summarize parquet data per communication and collective type using DuckDB.

    Zero-byte, zero-row or unreadable parquet files are first moved to
    ``parquet_files/invalid``. The remaining files are aggregated per
    ``(id, coll_sn, coll_msg_size_bytes)`` for the requested types.

    Args:
        output_root: ``pathlib.Path`` of the analysis output directory
            (contains ``parquet_files``).
        comm_type: Communicator type to select (``comm_type`` column).
        coll_type: Collective type to select (``coll`` column).
        output_dir_name: Output directory name, used for logging only.

    Returns:
        DataFrame with one row per group, or None when there is no usable
        parquet file, the query fails, or nothing matches.
    """
    # Imported up front: inside the validation loop, a missing pyarrow would
    # be caught as a per-file error and every valid file would be quarantined.
    import pyarrow.parquet as pq

    logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}")

    # Check if there are any parquet files
    parquet_dir = output_root / "parquet_files"
    parquet_files = list(parquet_dir.glob("*.parquet"))
    if not parquet_files:
        logging.warning(f"No parquet files found for {comm_type} and {coll_type}")
        return None

    # Clean up invalid/empty parquet files by moving them to a separate directory
    invalid_dir = parquet_dir / "invalid"
    invalid_dir.mkdir(exist_ok=True)

    # This function runs concurrently in several worker processes that share
    # parquet_dir, so a listed file may vanish at any point below.
    invalid_count = 0
    for pf in parquet_files:
        try:
            if pf.stat().st_size == 0:
                logging.warning(f"Moving zero-byte parquet file {pf} to invalid directory")
            elif pq.ParquetFile(pf).metadata.num_rows == 0:
                # Metadata only; the row data is not read.
                logging.warning(f"Moving empty parquet file {pf} (0 rows) to invalid directory")
            else:
                continue
        except FileNotFoundError:
            # Already quarantined by another worker.
            continue
        except Exception as e:
            logging.warning(f"Moving invalid parquet file {pf} to invalid directory: {e}")

        if _move_to_invalid(pf, invalid_dir):
            invalid_count += 1

    # Check if any valid files remain
    remaining_files = list(parquet_dir.glob("*.parquet"))
    if not remaining_files:
        logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)")
        return None

    logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)")

    # The directory is user-controlled (--output_dir); double any single
    # quote so it cannot terminate the SQL string literal.
    parquet_glob = str(parquet_dir).replace("'", "''")

    try:
        duckdb.execute(
            f"CREATE OR REPLACE VIEW logs AS SELECT * FROM read_parquet('{parquet_glob}/*.parquet')"
        )
        df = duckdb.execute(f"""
            SELECT
                id,
                coll_sn,
                coll_msg_size_bytes,
                AVG(coll_busbw_gbs) as mean_coll_busbw_gbs,
                COUNT(*) as log_count,
                ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks,
                ARRAY_DISTINCT(LIST(nnodes)) as nnodes,
                MIN(dump_timestamp_us) as coll_start_timestamp_us,
                MAX(dump_timestamp_us) as coll_end_timestamp_us,
                (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us
            FROM logs
            WHERE coll = '{coll_type}' and comm_type = '{comm_type}'
            GROUP BY id, coll_sn, coll_msg_size_bytes
            ORDER BY coll_sn
        """).df()
    except Exception as e:
        logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}")
        return None

    if df.empty:
        logging.info(f"No data for {comm_type} and {coll_type}")
        return None

    # Add human-readable formatting
    df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply(
        bytes_to_human_readable
    )

    # Log example of time range data for first few rows
    if len(df) > 0:
        sample_row = df.iloc[0]
        start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us'])
        end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us'])
        duration = microseconds_to_human_readable(sample_row['coll_duration_us'])
        logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, "
                     f"Start: {start_time}, End: {end_time}, Duration: {duration}")

    return df
|
||||
|
||||
|
||||
def generate_visualizations(df, output_root, comm_type, coll_type):
    """
    Generate all visualizations and save CSV files for the processed data.

    Writes two scatter plots covering every message size into
    ``<output_root>/summary``. For each distinct message size it then writes
    a histogram, a box plot and a CSV summary under
    ``summary/msg_size_<bytes>_<human readable>/``.

    Args:
        df: Aggregated DataFrame from ``summarize_data_per_comm_coll_type``.
        output_root: ``pathlib.Path`` of the analysis output directory.
        comm_type: Communicator type, used in file names and titles.
        coll_type: Collective type, used in file names and titles.
    """
    logging.info(f"Generating visualizations for {comm_type} and {coll_type}")

    summary_dir = output_root / "summary"
    summary_dir.mkdir(parents=True, exist_ok=True)

    # Plots covering every message size at once.
    generate_scatter_plot(
        df, comm_type, coll_type,
        summary_dir / f"scatter_plot_{comm_type}_{coll_type}.png",
    )
    generate_combined_scatter_plot(
        df, comm_type, coll_type,
        summary_dir / f"combined_scatter_plot_{comm_type}_{coll_type}.png",
    )

    for msg_size in df["coll_msg_size_bytes"].unique():
        hr_msg_size = bytes_to_human_readable(msg_size)
        msg_size_dir = summary_dir / f"msg_size_{msg_size}_{hr_msg_size}"
        hist_dir = msg_size_dir / "histograms"
        boxplot_dir = msg_size_dir / "boxplots"
        for directory in (msg_size_dir, hist_dir, boxplot_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Copy before adding columns so the caller's frame is left untouched.
        per_size = df[df["coll_msg_size_bytes"] == msg_size].copy()
        per_size["coll_start_datetime"] = per_size["coll_start_timestamp_us"].apply(timestamp_to_datetime)
        per_size["coll_end_datetime"] = per_size["coll_end_timestamp_us"].apply(timestamp_to_datetime)
        per_size["coll_duration_human"] = per_size["coll_duration_us"].apply(microseconds_to_human_readable)

        generate_histogram(
            per_size,
            comm_type,
            coll_type,
            hist_dir / f"histogram_{comm_type}_{coll_type}_{msg_size}.png",
            hr_msg_size,
        )
        generate_boxplot(
            per_size,
            comm_type,
            coll_type,
            boxplot_dir / f"boxplot_{comm_type}_{coll_type}_{msg_size}.png",
            hr_msg_size,
        )

        csv_path = msg_size_dir / f"summary_{comm_type}_{coll_type}_{msg_size}.csv"
        per_size.to_csv(csv_path, index=False)
        logging.info(
            f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {csv_path}"
        )
|
||||
|
||||
|
||||
def generate_summary(output_root, comm_type, coll_type, output_dir_name):
    """
    Summarize one (comm_type, coll_type) pair and plot it when data exists.

    Args:
        output_root: ``pathlib.Path`` of the analysis output directory.
        comm_type: Communicator type to summarize.
        coll_type: Collective type to summarize.
        output_dir_name: Output directory name, used for logging.
    """
    logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}")

    summary_df = summarize_data_per_comm_coll_type(
        output_root, comm_type, coll_type, output_dir_name
    )
    if summary_df is None:
        logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation")
        return

    generate_visualizations(summary_df, output_root, comm_type, coll_type)
|
||||
|
||||
|
||||
def generate_summary_wrapper(args):
    """
    Unpack a ``(output_root, comm_type, coll_type, output_dir_name)`` tuple
    and forward it to ``generate_summary``.

    Exists so the work can be dispatched through ``executor.map``, which
    passes a single argument per task.
    """
    output_root, comm_type, coll_type, output_dir_name = args
    return generate_summary(output_root, comm_type, coll_type, output_dir_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Stage 1: locate the logs and prepare the output directory.
    files, output_dir_name = get_log_files_and_output_dir()
    print(f"Number of log files found: {len(files)}")
    print(f"Output directory: {output_dir_name}")
    output_dir = Path(output_dir_name)
    output_dir.mkdir(parents=True, exist_ok=True)
    setup_logging(output_dir)

    # Stage 2: convert every log file to parquet.
    create_per_node_parquet_files(files, output_dir)

    # Stage 3: one summary task per (communicator type, collective type).
    comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"]
    coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"]
    summary_args = []
    for comm_type in comm_types:
        for coll_type in coll_types:
            summary_args.append((output_dir, comm_type, coll_type, output_dir_name))

    worker_count = min(64, len(summary_args), os.cpu_count() or 1)
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        results = executor.map(generate_summary_wrapper, summary_args)
        # list() drains the lazy map so every task finishes inside the pool.
        list(tqdm(results, total=len(summary_args), desc="Generating summaries"))
    print("Done!")
|
||||
@@ -0,0 +1,6 @@
|
||||
pandas>=1.3.0
|
||||
tqdm>=4.60.0
|
||||
duckdb>=0.8.0
|
||||
matplotlib>=3.3.0
|
||||
pyarrow>=5.0.0
|
||||
numpy>=1.21.0
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,198 @@
|
||||
#pragma once
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#include "json.h"
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define MAX_CHANNELS 64
|
||||
|
||||
/*
 * Evaluate `call`, store its result in `res`, and when the result is not
 * inspectorSuccess log the failing location at INFO level and jump to `label`.
 *
 * The expansion is a single `do { ... } while (0)` statement WITHOUT a
 * trailing semicolon: the call site supplies the ';'. This keeps the macro
 * safe inside unbraced if/else bodies.
 */
#define INS_CHK_GOTO(call, res, label)                                       \
  do {                                                                       \
    res = call;                                                              \
    if (inspectorSuccess != res) {                                           \
      INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \
           inspectorErrorString(res));                                       \
      goto label;                                                            \
    }                                                                        \
  } while (0)
|
||||
|
||||
|
||||
/* Operation kinds; enumerators are consecutive starting at 0. */
typedef enum {
  ncclFuncBroadcast = 0,
  ncclFuncReduce,         /* 1 */
  ncclFuncAllGather,      /* 2 */
  ncclFuncReduceScatter,  /* 3 */
  ncclFuncAllReduce,      /* 4 */
  ncclFuncSendRecv,       /* 5 */
  ncclFuncSend,           /* 6 */
  ncclFuncRecv,           /* 7 */
  ncclNumFuncs            /* 8: count of the entries above */
} ncclFunc_t;
|
||||
|
||||
/*
 * Result codes used throughout the inspector. Values are spelled out so the
 * number printed by the "%d" error logs can be mapped back by eye.
 */
typedef enum {
  inspectorSuccess                = 0,
  inspectorUninitializedError     = 1,
  inspectorMemoryError            = 2,
  inspectorFileOpenError          = 3,
  inspectorDisabledError          = 4,
  inspectorLockError              = 5,
  inspectorPthreadError           = 6,
  inspectorJsonError              = 7,
  inspectorCudaError              = 8,
  inspectorBadHash                = 9,
  inspectorDeleteUnknownCommError = 10,
  inspectorAddDuplicateCommError  = 11,
  inspectorNop                    = 12,
  inspectorNullTally              = 13,
  inspectorGlobalInitError        = 14,
  inspectorReturn                 = 15,
} inspectorResult_t;
|
||||
|
||||
/* Identifies which measurement a reported execution time came from. */
typedef enum {
  inspectorTimingSourceKernelGpu = 0,
  inspectorTimingSourceKernelCpu,      /* 1 */
  inspectorTimingSourceCollectiveCpu,  /* 2 */
} inspectorTimingSource_t;
|
||||
|
||||
/* One recorded trace point. */
struct inspectorEventTraceInfo {
  uint64_t ts;  /* timestamp taken with inspectorGetTime() when recorded */
  uint64_t sn;  /* sequence number drawn from the owning collective's counter */
};
|
||||
|
||||
/* Slots of the per-collective trace array; the last entry is its length. */
typedef enum {
  NCCL_INSP_EVT_TRK_COLL_START = 0,
  NCCL_INSP_EVT_TRK_COLL_STOP,  /* 1 */
  NCCL_INSP_EVT_TRK_COLL_NEVT,  /* 2: number of slots */
} inspectorEventTrkColl_t;
|
||||
|
||||
/* Slots of the per-kernel-channel trace array; the last entry is its length. */
typedef enum {
  NCCL_INSP_EVT_TRK_KERNEL_START = 0,
  NCCL_INSP_EVT_TRK_KERNEL_STOP,    /* 1 */
  NCCL_INSP_EVT_TRK_KERNEL_RECORD,  /* 2 */
  NCCL_INSP_EVT_TRK_KERNEL_NEVT,    /* 3: number of slots */
} inspectorEventTrkKernel_t;
|
||||
|
||||
struct inspectorEventTrkKernelInfo {
|
||||
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT];
|
||||
};
|
||||
|
||||
struct inspectorEventTrkCollInfo {
|
||||
int sn;
|
||||
uint32_t nChannels;
|
||||
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT];
|
||||
struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS];
|
||||
};
|
||||
|
||||
struct inspectorCompletedCollInfo {
|
||||
ncclFunc_t func;
|
||||
uint64_t sn;
|
||||
size_t msgSizeBytes;
|
||||
uint64_t execTimeUsecs;
|
||||
inspectorTimingSource_t timingSource;
|
||||
double algoBwGbs;
|
||||
double busBwGbs;
|
||||
// Event trace information
|
||||
struct inspectorEventTrkCollInfo collEvtTrk;
|
||||
};
|
||||
|
||||
/* Size of the commHashStr buffer in inspectorCommInfo
 * (presumably 16 hex digits plus the terminating NUL - TODO confirm). */
enum { NCCL_COMM_HASH_LENGTH = 17 };
|
||||
|
||||
struct inspectorCommInfo {
|
||||
struct inspectorCommInfo* next;
|
||||
|
||||
const char* commName;
|
||||
uint64_t commHash;
|
||||
char commHashStr[NCCL_COMM_HASH_LENGTH];
|
||||
int rank;
|
||||
int nranks;
|
||||
int nnodes;
|
||||
|
||||
bool dump;
|
||||
struct inspectorCompletedCollInfo completedCollInfo;
|
||||
pthread_rwlock_t guard;
|
||||
};
|
||||
|
||||
/* Per-channel kernel event state, stored inside its parent inspectorCollInfo. */
struct inspectorKernelChInfo {
  /* Must stay the first member: the plugin identifies an event handle by
   * reading *(uint64_t*)handle. Set to ncclProfileKernelCh. */
  uint64_t type;
  int refCount; /*unused*/
  struct inspectorCollInfo *collInfo;  /* parent collective */
  uint8_t channelId;
  uint64_t tsStartUsec;      /* inspectorGetTime() at kernel-channel start */
  uint64_t tsCompletedUsec;  /* inspectorGetTime() at kernel-channel stop */
  uint64_t startGpuClk;      /* pTimer from the start event descriptor */
  uint64_t stopGpuClk;       /* pTimer from the recorded stop state */
};
|
||||
|
||||
struct inspectorCollInfo {
|
||||
uint64_t type;
|
||||
int refCount;
|
||||
struct inspectorCommInfo *commInfo;
|
||||
const char* func;
|
||||
uint64_t sn;
|
||||
size_t msgSizeBytes;
|
||||
uint64_t tsStartUsec;
|
||||
uint64_t tsCompletedUsec;
|
||||
uint32_t nChannels;
|
||||
uint32_t nKernelChStarted;
|
||||
uint32_t nKernelChCompleted;
|
||||
pthread_rwlock_t guard;
|
||||
struct inspectorKernelChInfo kernelCh[MAX_CHANNELS];
|
||||
struct inspectorEventTrkCollInfo collEvtTrk;
|
||||
};
|
||||
|
||||
|
||||
|
||||
extern ncclDebugLogger_t logFn;
|
||||
#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
/*
 * Return the size in bytes of one element of `type`, or -1 when the datatype
 * is not one of the values handled below.
 */
inline int ncclTypeSize(ncclDataType_t type) {
  int bytes = -1;  // result for any datatype not listed below
  switch (type) {
    case ncclInt8:
    case ncclUint8:
    case ncclFloat8e4m3:
    case ncclFloat8e5m2:
      bytes = 1;
      break;
    case ncclFloat16:
    case ncclBfloat16:
      bytes = 2;
      break;
    case ncclInt32:
    case ncclUint32:
    case ncclFloat32:
      bytes = 4;
      break;
    case ncclInt64:
    case ncclUint64:
    case ncclFloat64:
      bytes = 8;
      break;
    default:
      break;
  }
  return bytes;
}
|
||||
|
||||
const char* inspectorErrorString(inspectorResult_t result);
|
||||
|
||||
inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef);
|
||||
inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef);
|
||||
inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef);
|
||||
inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef);
|
||||
inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef);
|
||||
inspectorResult_t inspectorGlobalInit(int rank);
|
||||
inspectorResult_t inspectorGlobalFinalize();
|
||||
uint64_t inspectorGetTime();
|
||||
inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
|
||||
const char* commName, uint64_t commHash,
|
||||
int nNodes, int nranks, int rank);
|
||||
inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo);
|
||||
|
||||
void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl,
|
||||
struct inspectorCollInfo *collInfo);
|
||||
ncclDataType_t inspectorStringToDatatype(const char* str);
|
||||
|
||||
void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
|
||||
struct inspectorCompletedCollInfo *completedColl,
|
||||
ncclFunc_t collType);
|
||||
@@ -0,0 +1,493 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <linux/limits.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#include "profiler.h"
|
||||
#include "inspector.h"
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))
|
||||
|
||||
static int gInitialized;
|
||||
|
||||
static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
|
||||
/*
|
||||
* Description:
|
||||
* Records an event trace with timestamp and sequence number
|
||||
*
|
||||
* Thread Safety:
|
||||
* Not thread-safe - must be called with proper locking. This function
|
||||
* is designed to be called from within locked sections where the
|
||||
* collective info structure is already protected.
|
||||
*
|
||||
* Input:
|
||||
* struct inspectorEventTraceInfo* evtTrace - event trace array
|
||||
* int eventIndex - index in the event trace array (must be valid)
|
||||
* struct inspectorCollInfo* collInfo - collective info structure (must not be NULL)
|
||||
*
|
||||
* Output:
|
||||
* Event trace is updated with current timestamp and next sequence
|
||||
* number from collective
|
||||
*
|
||||
* Return:
|
||||
* uint64_t - the sequence number assigned to this event
|
||||
*
|
||||
* Preconditions:
|
||||
* - collInfo must not be NULL
|
||||
* - eventIndex must be within valid bounds for evtTrace array
|
||||
* - Function must be called from within a locked section
|
||||
*/
|
||||
static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace,
|
||||
int eventIndex,
|
||||
struct inspectorCollInfo* collInfo) {
|
||||
evtTrace[eventIndex].ts = inspectorGetTime();
|
||||
evtTrace[eventIndex].sn = ++collInfo->collEvtTrk.sn; // Increment coll sequence counter
|
||||
|
||||
return evtTrace[eventIndex].sn;
|
||||
}
|
||||
|
||||
/*
|
||||
* Description:
|
||||
*
|
||||
* Initializes the NCCL Inspector plugin and global state for a
|
||||
* communicator.
|
||||
*
|
||||
* Thread Safety:
|
||||
* Thread-safe (uses mutex for initialization).
|
||||
*
|
||||
 * Input (in signature order):
 *   void** context - pointer to plugin context.
 *   uint64_t commHash - communicator hash.
 *   int* eActivationMask - pointer to activation mask output.
 *   const char* commName - communicator name (may be NULL).
 *   int nNodes - number of nodes.
 *   int nranks - number of ranks.
 *   int rank - rank.
 *   ncclDebugLogger_t logfn - logger function pointer.
|
||||
*
|
||||
* Output:
|
||||
* context is set to plugin context; eActivationMask is set.
|
||||
*
|
||||
* Return:
|
||||
* ncclResult_t - success or error code.
|
||||
*
|
||||
*/
|
||||
__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash,
                                          int* eActivationMask,
                                          const char* commName,
                                          int nNodes, int nranks, int rank,
                                          ncclDebugLogger_t logfn) {
  inspectorResult_t res = inspectorSuccess;
  *context = nullptr;
  logFn = logfn;

  // gInitialized counts live communicators; only the first one triggers the
  // global initialisation. gLock serialises the count and the init call.
  pthread_mutex_lock(&gLock);
  gInitialized += 1;
  if (gInitialized == 1) {
    res = inspectorGlobalInit(rank);
    if (res != inspectorSuccess) {
      WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res,
           inspectorErrorString(res));
      gInitialized = 0;  // undo the count so a later init can retry
      pthread_mutex_unlock(&gLock);
      return ncclInternalError;
    }
  }
  pthread_mutex_unlock(&gLock);

  // Register this communicator; on failure jump straight to the result mapping.
  INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context,
                                commName, commHash,
                                nNodes, nranks, rank), res, done);
  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d",
       commName ? commName : "", commHash, nranks, rank);

done:
  return (res == inspectorSuccess) ? ncclSuccess : ncclInternalError;
}
|
||||
|
||||
/*
|
||||
* Description:
|
||||
*
|
||||
* Finalizes the NCCL Inspector plugin and global state for a
|
||||
* communicator.
|
||||
*
|
||||
* Thread Safety:
|
||||
* Thread-safe (uses mutex for finalization).
|
||||
*
|
||||
* Input:
|
||||
* void* context - plugin context.
|
||||
*
|
||||
* Output:
|
||||
* Plugin context is finalized and cleaned up.
|
||||
*
|
||||
* Return:
|
||||
* ncclResult_t - success or error code.
|
||||
*
|
||||
*/
|
||||
__hidden ncclResult_t inspectorPluginFinalize(void* context) {
  struct inspectorCommInfo *commInfo = (struct inspectorCommInfo *)context;
  inspectorDelComm(commInfo);

  // The last communicator to go away tears down the global state.
  pthread_mutex_lock(&gLock);
  gInitialized -= 1;
  if (gInitialized == 0) {
    inspectorGlobalFinalize();
  }
  pthread_mutex_unlock(&gLock);

  return ncclSuccess;
}
|
||||
|
||||
/*
 * Take one reference on collInfo. No locking is done here: the caller must
 * already hold collInfo->guard, or be the only owner of the object.
 */
inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) {
  ++collInfo->refCount;
  return inspectorSuccess;
}
|
||||
|
||||
/* Take one reference on collInfo while holding its write lock. */
inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) {
  pthread_rwlock_t *lock = &collInfo->guard;
  inspectorLockWr(lock);
  inspectorPluginCollInfoRef(collInfo);
  inspectorUnlockRWLock(lock);
  return inspectorSuccess;
}
|
||||
|
||||
/*
 * Drop one reference on collInfo. When the count reaches zero the lock is
 * destroyed, the object is wiped and freed, and inspectorReturn is returned;
 * the caller must not touch collInfo (including its guard) after that.
 * Otherwise returns inspectorSuccess.
 *
 * NOTE(review): the callers in this file invoke this with collInfo->guard
 * write-locked, so the lock is destroyed while held - confirm that
 * inspectorLockDestroy tolerates this.
 */
inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) {
  --collInfo->refCount;
  if (collInfo->refCount != 0) {
    return inspectorSuccess;  // still referenced elsewhere
  }

  // Last reference gone: tear the object down.
  inspectorLockDestroy(&collInfo->guard);
  memset(collInfo, 0, sizeof(struct inspectorCollInfo));
  free(collInfo);
  return inspectorReturn;
}
|
||||
|
||||
/*
 * Drop one reference on collInfo while holding its write lock.
 *
 * Returns whatever inspectorPluginCollInfoDeRef() returned. When that is
 * inspectorReturn the object (and the lock inside it) has already been
 * destroyed and freed, so the lock must NOT be released afterwards - doing
 * so would touch freed memory.
 */
inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) {
  inspectorLockWr(&collInfo->guard);
  inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo);
  if (res != inspectorReturn) {
    // Object is still alive: release the lock we took above.
    inspectorUnlockRWLock(&collInfo->guard);
  }
  return res;
}
|
||||
|
||||
/*
|
||||
* Description:
|
||||
* Initializes a new inspectorCollInfo structure for a collective
|
||||
* event.
|
||||
*
|
||||
* Thread Safety:
|
||||
* Not thread-safe (allocates and initializes a new collective info
|
||||
* structure).
|
||||
*
|
||||
* Input:
|
||||
*
|
||||
* struct inspectorCollInfo **collInfo - pointer to output
|
||||
* collective info struct.
|
||||
* ncclProfilerEventDescr_t *eDescr - event descriptor.
|
||||
*
|
||||
* Output:
|
||||
* collInfo is set to the new collective info struct.
|
||||
*
|
||||
* Return:
|
||||
* None.
|
||||
*/
|
||||
static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo,
|
||||
ncclProfilerEventDescr_t *eDescr,
|
||||
struct inspectorCommInfo *commInfo) {
|
||||
struct inspectorCollInfo *collInfoPtr
|
||||
= (struct inspectorCollInfo*)calloc(1, sizeof(struct inspectorCollInfo));
|
||||
if (collInfoPtr == nullptr) {
|
||||
WARN("Inspector: Failed to allocate memory for collective info structure");
|
||||
*collInfo = nullptr;
|
||||
return;
|
||||
}
|
||||
collInfoPtr->type = ncclProfileColl;
|
||||
collInfoPtr->refCount = 0;
|
||||
inspectorPluginCollInfoRef(collInfoPtr); //self ref; no locks needed
|
||||
collInfoPtr->func = eDescr->coll.func;
|
||||
collInfoPtr->sn = eDescr->coll.seqNumber;
|
||||
collInfoPtr->nChannels = eDescr->coll.nChannels;
|
||||
if (collInfoPtr->nChannels > 0) {
|
||||
inspectorPluginCollInfoRef(collInfoPtr); //extra ref for kernel completion
|
||||
}
|
||||
collInfoPtr->tsStartUsec = inspectorGetTime();
|
||||
collInfoPtr->msgSizeBytes =
|
||||
ncclTypeSize(inspectorStringToDatatype(eDescr->coll.datatype)) * eDescr->coll.count;
|
||||
|
||||
|
||||
collInfoPtr->commInfo = commInfo;
|
||||
collInfoPtr->collEvtTrk.sn = 0;
|
||||
collInfoPtr->collEvtTrk.nChannels = collInfoPtr->nChannels;
|
||||
inspectorRecordEventTrace(collInfoPtr->collEvtTrk.evntTrace,
|
||||
NCCL_INSP_EVT_TRK_COLL_START, collInfoPtr);
|
||||
|
||||
inspectorLockInit(&collInfoPtr->guard);
|
||||
*collInfo = collInfoPtr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Description:
|
||||
*
|
||||
* Initializes a new inspectorKernelChInfo structure for a kernel
|
||||
* channel event.
|
||||
*
|
||||
* Thread Safety:
|
||||
* Not thread-safe (initializes kernel channel info within a
|
||||
* collective info structure).
|
||||
*
|
||||
* Input:
|
||||
* struct inspectorKernelChInfo **kernelChInfo - pointer to output
|
||||
* kernel channel info struct.
|
||||
* ncclProfilerEventDescr_t *eDescr - event descriptor.
|
||||
*
|
||||
* Output:
|
||||
*
|
||||
* kernelChInfo is set to the new kernel channel info struct.
|
||||
*
|
||||
* Return:
|
||||
* None.
|
||||
*/
|
||||
static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo,
|
||||
ncclProfilerEventDescr_t *eDescr) {
|
||||
if (eDescr->parentObj) {
|
||||
uint64_t parentType=*(uint64_t*)eDescr->parentObj;
|
||||
if (parentType == ncclProfileColl) {
|
||||
struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj;
|
||||
if (collInfo && collInfo->type == ncclProfileColl) {
|
||||
inspectorLockWr(&collInfo->guard);
|
||||
struct inspectorEventTraceInfo *krnlEvtTrk =
|
||||
collInfo->collEvtTrk.kernelCh[eDescr->kernelCh.channelId].evntTrace;
|
||||
inspectorRecordEventTrace(krnlEvtTrk,
|
||||
NCCL_INSP_EVT_TRK_KERNEL_START,
|
||||
collInfo);
|
||||
struct inspectorKernelChInfo *kernelChInfoPtr
|
||||
= &collInfo->kernelCh[eDescr->kernelCh.channelId];
|
||||
kernelChInfoPtr->type = ncclProfileKernelCh;
|
||||
kernelChInfoPtr->channelId = eDescr->kernelCh.channelId;
|
||||
kernelChInfoPtr->startGpuClk = eDescr->kernelCh.pTimer;
|
||||
if (kernelChInfoPtr->stopGpuClk == 0) {
|
||||
inspectorPluginCollInfoRef(collInfo); //Pairs with Record Kernel Stop event
|
||||
}
|
||||
kernelChInfoPtr->tsStartUsec = inspectorGetTime();
|
||||
if (collInfo->nKernelChStarted == 0) {
|
||||
collInfo->tsStartUsec = kernelChInfoPtr->tsStartUsec;
|
||||
}
|
||||
collInfo->nKernelChStarted += 1;
|
||||
inspectorPluginCollInfoRef(collInfo); //Pairs with Stop Kernel Event
|
||||
kernelChInfoPtr->collInfo = collInfo;
|
||||
|
||||
*kernelChInfo = kernelChInfoPtr;
|
||||
inspectorUnlockRWLock(&collInfo->guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Description:
|
||||
*
|
||||
* Starts a profiling event for the NCCL Inspector plugin.
|
||||
*
|
||||
* Thread Safety:
|
||||
* Thread-safe (allocates and initializes event structures).
|
||||
*
|
||||
* Input:
|
||||
* void* context - plugin context.
|
||||
* void** eHandle - pointer to event handle output.
|
||||
* ncclProfilerEventDescr_t* eDescr - event descriptor.
|
||||
*
|
||||
* Output:
|
||||
* eHandle is set to the new event structure.
|
||||
*
|
||||
* Return:
|
||||
* ncclResult_t - success or error code.
|
||||
*
|
||||
*/
|
||||
__hidden ncclResult_t inspectorPluginStartEvent(void* context,
                                                void** eHandle,
                                                ncclProfilerEventDescr_t* eDescr) {
  if (eHandle == nullptr) {
    return ncclSuccess;  // nowhere to hand an event back to
  }
  // Clear the handle on every path so the caller never sees a stale value,
  // including the early return below.
  *eHandle = nullptr;

  if (context == nullptr || eDescr == nullptr) {
    INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__);
    return ncclSuccess;
  }

  if (eDescr->type == ncclProfileColl) {
    // New collective: allocate its tracking object (nullptr on failure).
    struct inspectorCollInfo *collEvent = nullptr;
    struct inspectorCommInfo *commInfoCtx = (struct inspectorCommInfo*)context;
    inspectorPluginCollInfoInit(&collEvent, eDescr, commInfoCtx);
    *eHandle = collEvent;
  } else if (eDescr->type == ncclProfileKernelCh) {
    // Kernel channel: the handle points into the parent collective's object.
    struct inspectorKernelChInfo *kernelChEvent = nullptr;
    inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr);
    *eHandle = kernelChEvent;
  }
  // Any other event type is ignored and leaves a null handle.
  return ncclSuccess;
}
|
||||
|
||||
/*
|
||||
* Description:
|
||||
*
|
||||
* Stops a profiling event for the NCCL Inspector plugin.
|
||||
*
|
||||
* Thread Safety:
|
||||
*
|
||||
* Thread-safe (updates event state and performance info).
|
||||
*
|
||||
* Input:
|
||||
*
|
||||
* void *eHandle - event handle.
|
||||
*
|
||||
* Output:
|
||||
*
|
||||
* Event is stopped and performance info may be updated.
|
||||
*
|
||||
* Return:
|
||||
* ncclResult_t - success or error code.
|
||||
*
|
||||
*/
|
||||
__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) {

  if (eHandle == nullptr) {
    INFO(NCCL_INIT,
         "Profiler/Plugin: Event Handle NULL for stop event %s", __func__);
    return ncclSuccess;
  }
  // Every event object starts with its uint64_t type tag.
  uint64_t type = *(uint64_t *)eHandle;
  inspectorResult_t res = inspectorSuccess;

  if (type == ncclProfileColl) {
    struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle;
    // Record collective stop event
    inspectorLockWr(&collInfo->guard);
    inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
                              NCCL_INSP_EVT_TRK_COLL_STOP,
                              collInfo);
    // Drops the self reference taken when the collective was created.
    res = inspectorPluginCollInfoDeRef(collInfo);
    if (res == inspectorReturn) {
      // The object and its lock are gone: nothing left to unlock.
      return ncclSuccess;
    }
    inspectorUnlockRWLock(&collInfo->guard);
    return ncclSuccess;
  } else if (type == ncclProfileKernelCh) {
    struct inspectorKernelChInfo *kernelChInfo
      = (struct inspectorKernelChInfo *)eHandle;
    struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
    if (collInfo && collInfo->type == ncclProfileColl) {
      inspectorLockWr(&collInfo->guard);
      struct inspectorEventTraceInfo *krnlEvtTrk =
        collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
      inspectorRecordEventTrace(krnlEvtTrk,
                                NCCL_INSP_EVT_TRK_KERNEL_STOP,
                                collInfo);
      kernelChInfo->tsCompletedUsec = inspectorGetTime();
      collInfo->nKernelChCompleted += 1;

      // Pairs with the reference taken at kernel-channel start.
      res = inspectorPluginCollInfoDeRef(collInfo);
      if (res == inspectorReturn) {
        WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh");
        return ncclSuccess;
      }
      if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted)
          && (collInfo->nKernelChCompleted == collInfo->nChannels)) {
        // Last channel finished: summarise the collective and publish it.
        struct inspectorCompletedCollInfo completedColl;
        // Saved now because collInfo may be freed by the DeRef below.
        struct inspectorCommInfo *commInfo = collInfo->commInfo;
        collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec;
        inspectorUpdateCollPerf(&completedColl, collInfo);

        // Drops the "kernel completion" reference taken at collective start.
        res = inspectorPluginCollInfoDeRef(collInfo);
        if (res != inspectorReturn) {
          inspectorUnlockRWLock(&collInfo->guard);
        }
        if (commInfo != nullptr) {
          inspectorLockWr(&commInfo->guard);
          inspectorComputeCollBw(commInfo,
                                 &completedColl,
                                 completedColl.func);
          memcpy(&commInfo->completedCollInfo,
                 &completedColl,
                 sizeof(struct inspectorCompletedCollInfo));
          commInfo->dump = true;
          inspectorUnlockRWLock(&commInfo->guard);
        }
        return ncclSuccess;
      }
      inspectorUnlockRWLock(&collInfo->guard);
    }
    return ncclSuccess;
  }
  return ncclSuccess;
}
|
||||
|
||||
/*
|
||||
* Description:
|
||||
*
|
||||
* Records the state of a profiling event for the NCCL Inspector
|
||||
* plugin.
|
||||
*
|
||||
* Thread Safety:
|
||||
*
|
||||
* Thread-safe (updates event state as needed).
|
||||
*
|
||||
* Input:
|
||||
* void* eHandle - event handle.
|
||||
* ncclProfilerEventState_t eState - event state.
|
||||
* ncclProfilerEventStateArgs_t* eStateArgs - event state arguments.
|
||||
*
|
||||
* Output:
|
||||
* Event state is updated as needed.
|
||||
*
|
||||
* Return:
|
||||
* ncclResult_t - success or error code.
|
||||
*
|
||||
*/
|
||||
__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle,
                                                      ncclProfilerEventState_t eState,
                                                      ncclProfilerEventStateArgs_t* eStateArgs) {
  if (eHandle == nullptr || eStateArgs == nullptr) {
    return ncclSuccess;
  }

  // Only the kernel-channel "stop" state is of interest here.
  const uint64_t handleType = *(uint64_t *)eHandle;
  if (handleType != ncclProfileKernelCh || eState != ncclProfilerKernelChStop) {
    return ncclSuccess;
  }

  struct inspectorKernelChInfo *kernelChInfo = (struct inspectorKernelChInfo *)eHandle;
  struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
  if (collInfo == nullptr || collInfo->type != ncclProfileColl) {
    return ncclSuccess;
  }

  inspectorLockWr(&collInfo->guard);
  struct inspectorEventTraceInfo *krnlEvtTrk
    = collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
  inspectorRecordEventTrace(krnlEvtTrk,
                            NCCL_INSP_EVT_TRK_KERNEL_RECORD,
                            collInfo);
  kernelChInfo->stopGpuClk = eStateArgs->kernelCh.pTimer;

  // A non-zero start clock means the start event took a reference for this
  // record; release it. If that frees the object the lock is gone as well.
  if (kernelChInfo->startGpuClk != 0) {
    if (inspectorPluginCollInfoDeRef(collInfo) == inspectorReturn) {
      WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState");
      return ncclSuccess;
    }
  }
  inspectorUnlockRWLock(&collInfo->guard);
  return ncclSuccess;
}
|
||||
|
||||
// Profiler plugin descriptor: the plugin name followed by its entry points.
ncclProfiler_t ncclProfiler_v5 = {
  "Inspector",
  inspectorPluginInit,              // per-communicator init
  inspectorPluginStartEvent,        // event start
  inspectorPluginStopEvent,         // event stop
  inspectorPluginRecordEventState,  // event state update
  inspectorPluginFinalize,          // per-communicator teardown
};
|
||||
@@ -0,0 +1,496 @@
|
||||
#include "json.h"
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// Map a jsonResult_t to its name; unrecognised values yield a generic text.
const char* jsonErrorString(jsonResult_t res) {
  const char* text = "unknown json error";
  switch (res) {
  case jsonSuccess:                   text = "jsonSuccess"; break;
  case jsonFileError:                 text = "jsonFileError"; break;
  case jsonUnknownStateError:         text = "jsonUnknownStateError"; break;
  case jsonEmptyStateError:           text = "jsonEmptyStateError"; break;
  case jsonExpectedNonNoneStateError: text = "jsonExpectedNonNoneStateError"; break;
  case jsonMemoryError:               text = "jsonMemoryError"; break;
  case jsonStringOverflowError:       text = "jsonStringOverflowError"; break;
  case jsonStringBadChar:             text = "jsonStringBadChar"; break;
  case jsonLockError:                 text = "jsonLockError"; break;
  default:                            break;
  }
  return text;
}
|
||||
|
||||
// We use these statics to mantain a stack of states where we are writing.
|
||||
typedef struct jsonFileOutput {
|
||||
jsonState_t* states;
|
||||
size_t state_cap; // Allocated stack capacity
|
||||
size_t state_n; // # of items in the stack.
|
||||
FILE* fp;
|
||||
pthread_mutex_t mutex;
|
||||
} jsonFileOutput;
|
||||
|
||||
// Create a JSON output object writing to `outfile` (opened with mode "w").
//
// On success *jfo receives the new object and jsonSuccess is returned.
// On any failure *jfo is set to NULL, nothing is leaked, and the result is
// jsonMemoryError (allocation), jsonLockError (mutex creation) or
// jsonFileError (file could not be opened).
jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) {
  *jfo = NULL;

  jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(jsonFileOutput));
  if (new_jfo == NULL) {
    return jsonMemoryError;
  }
  if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) {
    free(new_jfo);
    return jsonLockError;
  }
  new_jfo->states = NULL;
  new_jfo->state_cap = 0;
  new_jfo->state_n = 0;
  new_jfo->fp = fopen(outfile, "w");
  if (new_jfo->fp == NULL) {
    // The mutex was created above; release it before dropping the object.
    pthread_mutex_destroy(&new_jfo->mutex);
    free(new_jfo);
    return jsonFileError;
  }
  *jfo = new_jfo;
  return jsonSuccess;
}
|
||||
|
||||
// Write a single newline to the output; always reports jsonSuccess.
jsonResult_t jsonNewline(jsonFileOutput* jfo) {
  fputs("\n", jfo->fp);
  return jsonSuccess;
}
|
||||
|
||||
// Flush buffered output to the file; always reports jsonSuccess.
jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) {
  FILE* stream = jfo->fp;
  fflush(stream);
  return jsonSuccess;
}
|
||||
|
||||
// Acquire the output's mutex; jsonLockError when the lock call fails.
jsonResult_t jsonLockOutput(jsonFileOutput* jfo) {
  const int rc = pthread_mutex_lock(&jfo->mutex);
  return (rc == 0) ? jsonSuccess : jsonLockError;
}
|
||||
|
||||
// Release the output's mutex; jsonLockError when the unlock call fails.
jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) {
  const int rc = pthread_mutex_unlock(&jfo->mutex);
  return (rc == 0) ? jsonSuccess : jsonLockError;
}
|
||||
|
||||
// Release everything owned by `jfo`: the mutex, the state stack, the open
// file and the object itself. `jfo` is invalid after this call regardless of
// the result.
//
// Returns jsonLockError when the mutex could not be destroyed (the remaining
// resources are still released), jsonSuccess otherwise.
jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) {
  // Really should probably complain if we aren't in a valid state
  jsonResult_t res = jsonSuccess;

  if (pthread_mutex_destroy(&jfo->mutex) != 0) {
    // Remember the failure but keep going: the object is freed below either
    // way, so bailing out here would leak the stack and the FILE.
    res = jsonLockError;
  }
  free(jfo->states);  // free(NULL) is a no-op
  if (jfo->fp) {
    fclose(jfo->fp);
  }
  free(jfo);
  return res;
}
|
||||
|
||||
// Copy one multi-byte (2-4 byte) UTF-8 sequence from `in` to `out`.
//
// Returns the number of bytes copied, or 0 when the lead byte does not start
// a 2-, 3- or 4-byte sequence, a continuation byte is not of the form
// 10xxxxxx, or `out_lim` is too small for the whole sequence. Nothing is
// written when 0 is returned.
static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) {
  const unsigned char lead = in[0];
  int seq_len;

  // Classify the lead byte.
  if ((lead & 0xE0) == 0xC0) {
    seq_len = 2;
  } else if ((lead & 0xF0) == 0xE0) {
    seq_len = 3;
  } else if ((lead & 0xF8) == 0xF0) {
    seq_len = 4;
  } else {
    return 0;  // Invalid start byte
  }

  // Validate continuation bytes in order, stopping at the first bad one so a
  // truncated string is never read past its terminator.
  for (int k = 1; k < seq_len; ++k) {
    if ((in[k] & 0xC0) != 0x80) {
      return 0;
    }
  }
  if (out_lim < seq_len) {
    return 0;
  }

  for (int k = 0; k < seq_len; ++k) {
    out[k] = in[k];
  }
  return seq_len;
}
|
||||
|
||||
// This tries to sanitize/quote a string from 'in' into 'out',
|
||||
// assuming 'out' has length 'lim'. We mainly quote ",/,\,\t,\n, and
|
||||
// bail if we encounter non-printable stuff or non-ASCII stuff.
|
||||
// 'in' should be null-terminated, of course.
|
||||
//
|
||||
// We return false if we were not able to copy all of 'in', either for
|
||||
// length reasons or for unhandled characters.
|
||||
static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) {
|
||||
int c = 0;
|
||||
while (*in) {
|
||||
if (c + 1 >= lim) {
|
||||
out[c] = 0;
|
||||
return jsonStringOverflowError;
|
||||
}
|
||||
switch (*in) {
|
||||
case '"':
|
||||
case '\\':
|
||||
case '/':
|
||||
case '\t':
|
||||
case '\n':
|
||||
if (c + 2 > lim) {
|
||||
out[c] = 0;
|
||||
return jsonStringOverflowError;
|
||||
}
|
||||
|
||||
out[c++] = '\\';
|
||||
if (*in == '\n') {
|
||||
out[c++] = 'n';
|
||||
} else if (*in == '\t') {
|
||||
out[c++] = 't';
|
||||
} else {
|
||||
out[c++] = *in;
|
||||
}
|
||||
++in;
|
||||
break;
|
||||
default:
|
||||
if (*in <= 0x1F) {
|
||||
out[c] = 0;
|
||||
return jsonStringBadChar;
|
||||
} else if (*in <= 0x7F) {
|
||||
out[c++] = *in;
|
||||
++in;
|
||||
} else {
|
||||
const int utf8len = utf8copy(out + c, lim - c - 1, in);
|
||||
if (utf8len == 0) {
|
||||
out[c] = 0;
|
||||
return jsonStringBadChar;
|
||||
}
|
||||
c += utf8len;
|
||||
in += utf8len;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
out[c] = 0;
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Larger of two sizes; returns `a` when they are equal.
static size_t max(size_t a, size_t b) {
  return (a < b) ? b : a;
}
|
||||
|
||||
// Push state onto the state stack. Reallocate for extra storage if needed.
|
||||
// Because JSON_NONE is a pseudo-state, don't allow it to be pushed.
|
||||
static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) {
|
||||
if (state == JSON_NONE) {
|
||||
return jsonExpectedNonNoneStateError;
|
||||
}
|
||||
if (jfo->state_cap <= (jfo->state_n + 1)) {
|
||||
jfo->state_cap = max((size_t)16, jfo->state_cap * 2);
|
||||
jfo->states = (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * jfo->state_cap);
|
||||
if (jfo->states == 0) {
|
||||
return jsonMemoryError;
|
||||
}
|
||||
}
|
||||
jfo->states[jfo->state_n++] = state;
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Return the current state at the top of the stack
|
||||
static jsonState_t jsonCurrState(const jsonFileOutput* jfo) {
|
||||
if (jfo->state_n == 0) {
|
||||
return JSON_NONE;
|
||||
}
|
||||
return jfo->states[jfo->state_n - 1];
|
||||
}
|
||||
|
||||
// Replace the stack with state (equivalent to a pop & push if stack is not empty)
|
||||
// Overwrite the top stack entry with `state`. JSON_NONE is rejected with
// jsonExpectedNonNoneStateError; an empty stack yields jsonEmptyStateError.
static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) {
  jsonResult_t outcome = jsonSuccess;
  if (state == JSON_NONE) {
    outcome = jsonExpectedNonNoneStateError;
  } else if (jfo->state_n == 0) {
    outcome = jsonEmptyStateError;
  } else {
    jfo->states[jfo->state_n - 1] = state;
  }
  return outcome;
}
|
||||
|
||||
// Pop the top state off the stack, or return that the state is empty
|
||||
// Remove and return the top stack entry; an empty stack yields JSON_NONE.
static jsonState_t jsonPopState(jsonFileOutput* jfo) {
  if (jfo->state_n > 0) {
    jfo->state_n -= 1;
    return jfo->states[jfo->state_n];
  }
  return JSON_NONE;
}
|
||||
|
||||
// Emit a key and separator. Santize the key.
|
||||
// This is only acceptable if the top state is an object
|
||||
// Emit a ',' separator of we aren't the first item.
|
||||
jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) {
|
||||
switch (jsonCurrState(jfo)) {
|
||||
case JSON_OBJECT_EMPTY:
|
||||
jsonReplaceState(jfo, JSON_OBJECT_SOME);
|
||||
break;
|
||||
case JSON_OBJECT_SOME:
|
||||
fprintf(jfo->fp, ",");
|
||||
break;
|
||||
default:
|
||||
return jsonUnknownStateError;
|
||||
}
|
||||
unsigned char tmp[2048];
|
||||
const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "\"%s\":", tmp);
|
||||
jsonPushState(jfo, JSON_KEY);
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Helper function for inserting values.
|
||||
// Only acceptable after keys, top-level, or in lists.
|
||||
// Emit a preceding ',' if in a list and not the first item.
|
||||
static jsonResult_t jsonValHelper(jsonFileOutput* jfo) {
|
||||
switch (jsonCurrState(jfo)) {
|
||||
case JSON_LIST_EMPTY:
|
||||
jsonReplaceState(jfo, JSON_LIST_SOME);
|
||||
break;
|
||||
case JSON_LIST_SOME:
|
||||
fprintf(jfo->fp, ",");
|
||||
break;
|
||||
case JSON_KEY:
|
||||
jsonPopState(jfo);
|
||||
break;
|
||||
case JSON_NONE:
|
||||
break;
|
||||
default:
|
||||
return jsonUnknownStateError;
|
||||
}
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Start an object
|
||||
jsonResult_t jsonStartObject(jsonFileOutput* jfo) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "{");
|
||||
return jsonPushState(jfo, JSON_OBJECT_EMPTY);
|
||||
}
|
||||
|
||||
// Close an object
|
||||
jsonResult_t jsonFinishObject(jsonFileOutput* jfo) {
|
||||
switch (jsonPopState(jfo)) {
|
||||
case JSON_OBJECT_EMPTY:
|
||||
case JSON_OBJECT_SOME:
|
||||
break;
|
||||
default:
|
||||
return jsonUnknownStateError;
|
||||
}
|
||||
fprintf(jfo->fp, "}");
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Start a list
|
||||
jsonResult_t jsonStartList(jsonFileOutput* jfo) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "[");
|
||||
return jsonPushState(jfo, JSON_LIST_EMPTY);
|
||||
}
|
||||
|
||||
// Close a list
|
||||
jsonResult_t jsonFinishList(jsonFileOutput* jfo) {
|
||||
switch (jsonPopState(jfo)) {
|
||||
case JSON_LIST_EMPTY:
|
||||
case JSON_LIST_SOME:
|
||||
break;
|
||||
default:
|
||||
return jsonUnknownStateError;
|
||||
}
|
||||
fprintf(jfo->fp, "]");
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Write a null value
|
||||
jsonResult_t jsonNull(jsonFileOutput* jfo) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "null");
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Write a (sanititzed) string
|
||||
jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) {
|
||||
if (str == NULL) {
|
||||
jsonNull(jfo);
|
||||
return jsonSuccess;
|
||||
}
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
unsigned char tmp[2048];
|
||||
const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str);
|
||||
if (san_res != jsonSuccess) {
|
||||
return san_res;
|
||||
}
|
||||
fprintf(jfo->fp, "\"%s\"", tmp);
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Write a bool as "true" or "false" strings.
|
||||
jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) {
|
||||
return jsonStr(jfo, val ? "true" : "false");
|
||||
}
|
||||
|
||||
// Write an integer value
|
||||
jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "%d", val);
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Write an integer value
|
||||
jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "%u", val);
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
|
||||
// Write an integer value
|
||||
jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "%lu", val);
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Write a size_t value
|
||||
jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
fprintf(jfo->fp, "%zu", val);
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
// Write a double value
|
||||
jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) {
|
||||
const jsonResult_t res = jsonValHelper(jfo);
|
||||
if (res != jsonSuccess) {
|
||||
return res;
|
||||
}
|
||||
if (val != val) {
|
||||
fprintf(jfo->fp, "\"nan\"");
|
||||
} else {
|
||||
fprintf(jfo->fp, "%lf", val);
|
||||
}
|
||||
return jsonSuccess;
|
||||
}
|
||||
|
||||
#ifdef DO_JSON_TEST
// Self-test harness.
// compile with
//   gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test
// run with:
//   ./json_test
// if something fails, it will print out the error
// if it all works, print out "output matches reference"

// Evaluate a json* call; on failure print its error string and exit(1).
#define JSONCHECK(expr)                                         \
  do {                                                          \
    const jsonResult_t res = (expr);                            \
    if (res != jsonSuccess) {                                   \
      fprintf(stderr, "jsonError: %s\n", jsonErrorString(res)); \
      exit(1);                                                  \
    }                                                           \
  } while (0)

// Writes a small document to test.json, reads it back, and compares it with
// the reference text. Returns 0 on a match and 1 otherwise.
int main() {
  const char refstr[] =
      "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ "
      "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}";

  jsonFileOutput* jfo;
  JSONCHECK(jsonInitFileOutput(&jfo, "test.json"));
  JSONCHECK(jsonStartObject(jfo));
  JSONCHECK(jsonKey(jfo, "number"));
  JSONCHECK(jsonInt(jfo, 123));
  JSONCHECK(jsonKey(jfo, "utfstring"));
  JSONCHECK(
      jsonStr(jfo, "∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),"));
  JSONCHECK(jsonKey(jfo, "list"));
  JSONCHECK(jsonStartList(jfo));
  JSONCHECK(jsonBool(jfo, true));
  JSONCHECK(jsonNull(jfo));
  JSONCHECK(jsonUint64(jfo, 9423812381231ULL));
  JSONCHECK(jsonSize_t(jfo, 3123111));
  JSONCHECK(jsonDouble(jfo, 0.69423413));
  JSONCHECK(jsonFinishList(jfo));
  JSONCHECK(jsonFinishObject(jfo));
  JSONCHECK(jsonFinalizeFileOutput(jfo));

  FILE* fp = fopen("test.json", "r");
  if (fp == NULL) {
    perror("test.json");
    return 1;
  }

  // reflen counts the terminating NUL of refstr, so the expected file size is
  // reflen - 1 bytes. Asking fread for reflen bytes lets us detect a file that
  // is longer than the reference.
  const size_t reflen = sizeof(refstr) / sizeof(char);

  // One spare byte, zero-filled, so the buffer is always NUL-terminated when
  // it is printed with %s and no uninitialised byte is ever compared.
  char buffer[sizeof(refstr) + 1];
  memset(buffer, 0, sizeof(buffer));

  const size_t nread = fread(buffer, sizeof(char), reflen, fp);

  fclose(fp);

  if (nread == reflen - 1 && memcmp(buffer, refstr, reflen - 1) == 0) {
    printf("output matches reference\n");
  } else {
    printf("output %s\nreference %s\n", buffer, refstr);
    return 1;
  }

  return 0;
}

#endif
|
||||
@@ -0,0 +1,83 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// Entries of the writer's state stack.
typedef enum {
  JSON_NONE = 0,          // A pseudo-state meaning that the document is empty
  JSON_KEY = 1,
  JSON_OBJECT_EMPTY = 2,
  JSON_OBJECT_SOME = 3,
  JSON_LIST_EMPTY = 4,
  JSON_LIST_SOME = 5,
} jsonState_t;
|
||||
|
||||
// Result codes returned by the json* functions. jsonSuccess is zero; every
// other value identifies an error (see jsonErrorString for a description).
typedef enum {
  jsonSuccess = 0,
  jsonFileError = 1,
  jsonUnknownStateError = 2,
  jsonEmptyStateError = 3,
  jsonExpectedNonNoneStateError = 4,
  jsonStringOverflowError = 5,
  jsonStringBadChar = 6,
  jsonMemoryError = 7,
  jsonLockError = 8,
} jsonResult_t;
|
||||
|
||||
const char *jsonErrorString(jsonResult_t res);
|
||||
|
||||
typedef struct jsonFileOutput jsonFileOutput;
|
||||
|
||||
jsonResult_t jsonLockOutput(jsonFileOutput *jfo);
|
||||
|
||||
jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo);
|
||||
|
||||
jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo,
|
||||
const char *outfile);
|
||||
|
||||
jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo);
|
||||
|
||||
jsonResult_t jsonNewline(jsonFileOutput *jfo);
|
||||
jsonResult_t jsonFlushOutput(jsonFileOutput *jfo);
|
||||
|
||||
// Emit a key and separator. Sanitize the key.
|
||||
// This is only acceptable if the top state is an object
|
||||
// Emit a ',' separator if we aren't the first item.
|
||||
jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name);
|
||||
|
||||
// Start an object
|
||||
jsonResult_t jsonStartObject(jsonFileOutput *jfo);
|
||||
|
||||
// Close an object
|
||||
jsonResult_t jsonFinishObject(jsonFileOutput *jfo);
|
||||
|
||||
// Start a list
|
||||
jsonResult_t jsonStartList(jsonFileOutput *jfo);
|
||||
|
||||
// Close a list
|
||||
jsonResult_t jsonFinishList(jsonFileOutput *jfo);
|
||||
|
||||
// Emit a null value
|
||||
jsonResult_t jsonNull(jsonFileOutput *jfo);
|
||||
|
||||
// Write a (sanitized) string
|
||||
jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str);
|
||||
|
||||
// Write a bool as "true" or "false" strings.
|
||||
jsonResult_t jsonBool(jsonFileOutput *jfo, bool val);
|
||||
|
||||
// Write an integer value
|
||||
jsonResult_t jsonInt(jsonFileOutput *jfo, const int val);
|
||||
|
||||
// Write an unsigned 32-bit integer value
|
||||
jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val);
|
||||
|
||||
// Write an unsigned 64-bit integer value
|
||||
jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val);
|
||||
|
||||
// Write a size_t value
|
||||
jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val);
|
||||
|
||||
// Write a double value
|
||||
jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val);
|
||||
@@ -0,0 +1,73 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef COMMON_H_
|
||||
#define COMMON_H_
|
||||
|
||||
/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */
|
||||
/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */
|
||||
|
||||
/* Data types */
|
||||
/* Element data types. Several names are aliases sharing one value. */
typedef enum {
  ncclInt8 = 0,
  ncclChar = 0,
  ncclUint8 = 1,
  ncclInt32 = 2,
  ncclInt = 2,
  ncclUint32 = 3,
  ncclInt64 = 4,
  ncclUint64 = 5,
  ncclFloat16 = 6,
  ncclHalf = 6,
  ncclFloat32 = 7,
  ncclFloat = 7,
  ncclFloat64 = 8,
  ncclDouble = 8,
  ncclBfloat16 = 9,
  ncclFloat8e4m3 = 10,
  ncclFloat8e5m2 = 11,
  ncclNumTypes = 12
} ncclDataType_t;
|
||||
|
||||
/* Debug log levels (first argument of ncclDebugLogger_t). */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;
|
||||
|
||||
/* Result codes; ncclSuccess is zero. */
typedef enum {
  ncclSuccess = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError = 2,
  ncclInternalError = 3,
  ncclInvalidArgument = 4,
  ncclInvalidUsage = 5,
  ncclRemoteError = 6,
  ncclInProgress = 7,
  ncclNumResults = 8
} ncclResult_t;
|
||||
|
||||
|
||||
/* Debug log subsystem flags. Each subsystem owns one bit so values can be
 * OR-ed together; NCCL_ALL has every bit set. */
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_PROFILE   = 1 << 14,
  NCCL_RAS       = 1 << 15,
  NCCL_INSPECTOR = 1 << 20,  // big number to avoid short-term conflicts
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;
|
||||
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,85 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_H_
|
||||
#define PROFILER_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Profiler event type flags; one bit per event type.
enum {
  ncclProfileGroup        = 0x001,  // group event type
  ncclProfileColl         = 0x002,  // host collective call event type
  ncclProfileP2p          = 0x004,  // host point-to-point call event type
  ncclProfileProxyOp      = 0x008,  // proxy operation event type
  ncclProfileProxyStep    = 0x010,  // proxy step event type
  ncclProfileProxyCtrl    = 0x020,  // proxy control event type
  ncclProfileKernelCh     = 0x040,  // kernel channel event type
  ncclProfileNetPlugin    = 0x080,  // network plugin-defined events
  ncclProfileGroupApi     = 0x100,  // Group API events
  ncclProfileCollApi      = 0x200,  // Collective API events
  ncclProfileP2pApi       = 0x400,  // Point-to-Point API events
  ncclProfileKernelLaunch = 0x800,  // Kernel launch events
};
|
||||
|
||||
// Profiler event states. Values are assigned explicitly and are not in
// declaration order, so entries must never be renumbered.
typedef enum {
  ncclProfilerProxyOpSendPosted        = 0,   // deprecated in v4
  ncclProfilerProxyOpSendRemFifoWait   = 1,   // deprecated in v4
  ncclProfilerProxyOpSendTransmitted   = 2,   // deprecated in v4
  ncclProfilerProxyOpSendDone          = 3,   // deprecated in v4
  ncclProfilerProxyOpRecvPosted        = 4,   // deprecated in v4
  ncclProfilerProxyOpRecvReceived      = 5,   // deprecated in v4
  ncclProfilerProxyOpRecvTransmitted   = 6,   // deprecated in v4
  ncclProfilerProxyOpRecvDone          = 7,   // deprecated in v4
  ncclProfilerProxyOpInProgress_v4     = 19,

  /* Legacy proxy profiler states */
  ncclProfilerProxyStepSendGPUWait     = 8,
  ncclProfilerProxyStepSendPeerWait_v4 = 20,
  ncclProfilerProxyStepSendWait        = 9,
  ncclProfilerProxyStepRecvWait        = 10,
  ncclProfilerProxyStepRecvFlushWait   = 11,
  ncclProfilerProxyStepRecvGPUWait     = 12,

  /* Legacy proxy control states */
  ncclProfilerProxyCtrlIdle            = 13,
  ncclProfilerProxyCtrlActive          = 14,
  ncclProfilerProxyCtrlSleep           = 15,
  ncclProfilerProxyCtrlWakeup          = 16,
  ncclProfilerProxyCtrlAppend          = 17,
  ncclProfilerProxyCtrlAppendEnd       = 18,

  /* Network defined events states */
  ncclProfilerNetPluginUpdate          = 21,

  /* Kernel event states */
  ncclProfilerKernelChStop             = 22,

  /* Group API States */
  ncclProfilerEndGroupApiStart         = 23,
  ncclProfilerBeginGroupApiEnd         = 24
} ncclProfilerEventState_t;
|
||||
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
|
||||
|
||||
#include "profiler_v5.h"
|
||||
#include "profiler_v4.h"
|
||||
#include "profiler_v3.h"
|
||||
#include "profiler_v2.h"
|
||||
#include "profiler_v1.h"
|
||||
#include "profiler_net.h"
|
||||
|
||||
typedef ncclProfiler_v5_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -0,0 +1,19 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_NET_H_
|
||||
#define PROFILER_NET_H_
|
||||
|
||||
// A network profiler identifier packs two fields into one unsigned value:
// the low NCCL_PROFILER_NET_VER_BITS bits are selected by the VER mask and
// the remaining high bits by the TYPE mask.
#define NCCL_PROFILER_NET_VER_BITS (16)
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)

// Network type codes, positioned in the bits covered by the TYPE mask.
typedef enum {
  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),
  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
} ncclProfilerNetType;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,112 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V1_H_
|
||||
#define PROFILER_V1_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
|
||||
// v1 event descriptor: three common header fields followed by an anonymous
// union holding the per-event-type payload.
typedef struct {
  uint8_t type;     // event type descriptor: ncclProfileColl, ...
  void* parentObj;  // pointer to the profiler parent object (for coll is the group)
  int rank;         // originating rank
  union {
    // payload of the `coll` variant
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      uint8_t func;
      const void* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      uint8_t datatype;
      uint32_t op;
      size_t trafficBytes;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      uint8_t algo;
      uint8_t proto;
      int isCollnet;
      int isNvls;
    } coll;

    // payload of the `p2p` variant
    struct {
      const char* name;
      uint64_t commHash;
      uint8_t func;
      void* buff;
      uint8_t datatype;
      size_t count;
      int peer;
    } p2p;

    // payload of the `proxyOp` variant
    struct {
      pid_t pid;          // pid of the originating process
      uint8_t channelId;  // channel id for this proxy operation
      int peer;           // remote rank for send/recv
      int nSteps;         // number of steps for this proxy operation
      int chunkSize;      // amount of data transferred by this proxy operation
      int isSend;
    } proxyOp;

    // payload of the `proxyStep` variant
    struct {
      int step;
    } proxyStep;
  };
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
// v1 state-transition arguments; one member per event kind that carries
// extra data with a state update.
typedef union {
  struct {
    size_t transSize;
    int steps;
  } proxyOp;

  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside and event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,108 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V2_H_
|
||||
#define PROFILER_V2_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// v2 event descriptor: three common header fields followed by an anonymous
// union holding the per-event-type payload.
typedef struct {
  uint8_t type;     // event type descriptor: ncclProfileColl, ...
  void* parentObj;  // pointer to the profiler parent object (for coll is the group)
  int rank;         // originating rank
  union {
    // payload of the `coll` variant
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      const char* func;
      const void* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      size_t trafficBytes;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;

    // payload of the `p2p` variant
    struct {
      const char* name;
      uint64_t commHash;
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
    } p2p;

    // payload of the `proxyOp` variant
    struct {
      pid_t pid;          // pid of the originating process
      uint8_t channelId;  // channel id for this proxy operation
      int peer;           // remote rank for send/recv
      int nSteps;         // number of steps for this proxy operation
      int chunkSize;      // amount of data transferred by this proxy operation
      int isSend;
    } proxyOp;

    // payload of the `proxyStep` variant
    struct {
      int step;
    } proxyStep;
  };
} ncclProfilerEventDescr_v2_t;
|
||||
|
||||
// v2 state-transition arguments; one member per event kind that carries
// extra data with a state update.
typedef union {
  struct {
    size_t transSize;
    int steps;
  } proxyOp;

  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v2_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside and event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,116 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V3_H_
|
||||
#define PROFILER_V3_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// v3 event descriptor: three common header fields followed by an anonymous
// union holding the per-event-type payload.
typedef struct {
  uint8_t type;     // event type descriptor: ncclProfileColl, ...
  void* parentObj;  // pointer to the profiler parent object (for coll is the group)
  int rank;         // originating rank
  union {
    // payload of the `coll` variant
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      const char* func;
      const void* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;

    // payload of the `p2p` variant
    struct {
      const char* name;
      uint64_t commHash;
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
    } p2p;

    // payload of the `proxyOp` variant
    struct {
      pid_t pid;          // pid of the originating process
      uint8_t channelId;  // channel id for this proxy operation
      int peer;           // remote rank for send/recv
      int nSteps;         // number of steps for this proxy operation
      int chunkSize;      // amount of data transferred by this proxy operation
      int isSend;
    } proxyOp;

    // payload of the `proxyStep` variant
    struct {
      int step;
    } proxyStep;

    // payload of the `kernelCh` variant
    struct {
      uint8_t channelId;
    } kernelCh;

    // payload of the `netPlugin` variant
    struct {
      int64_t id;
      void* data;
    } netPlugin;
  };
} ncclProfilerEventDescr_v3_t;
|
||||
|
||||
// v3 state-transition arguments; one member per event kind that carries
// extra data with a state update.
typedef union {
  struct {
    size_t transSize;
    int steps;
  } proxyOp;

  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v3_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside and event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v3_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,127 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V4_H_
|
||||
#define PROFILER_V4_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// v4 event descriptor: three common header fields followed by an anonymous
// union holding the per-event-type payload.
typedef struct {
  uint8_t type;     // event type descriptor: ncclProfileColl, ...
  void* parentObj;  // pointer to the profiler parent object (for coll is the group)
  int rank;         // originating rank
  union {
    // payload of the `coll` variant
    struct {
      uint64_t seqNumber;
      const char* func;
      const void* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      uint8_t nChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;

    // payload of the `p2p` variant
    struct {
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
      uint8_t nChannels;
    } p2p;

    // payload of the `proxyOp` variant
    struct {
      pid_t pid;          // pid of the originating process
      uint8_t channelId;  // channel id for this proxy operation
      int peer;           // remote rank for send/recv
      int nSteps;         // number of steps for this proxy operation
      int chunkSize;      // amount of data transferred by this proxy operation
      int isSend;
    } proxyOp;

    // payload of the `proxyStep` variant
    struct {
      int step;
    } proxyStep;

    // payload of the `kernelCh` variant
    struct {
      uint8_t channelId;
      uint64_t pTimer;  // start timestamp from GPU globaltimer
    } kernelCh;

    // payload of the `netPlugin` variant
    struct {
      int64_t id;
      void* data;
    } netPlugin;
  };
} ncclProfilerEventDescr_v4_t;
|
||||
|
||||
// v4 state-transition arguments; one member per event kind that carries
// extra data with a state update.
typedef union {
  struct {
    size_t transSize;
  } proxyStep;

  struct {
    int appendedProxyOps;
  } proxyCtrl;

  struct {
    void* data;
  } netPlugin;

  struct {
    uint64_t pTimer;
  } kernelCh;
} ncclProfilerEventStateArgs_v4_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// - commName : user assigned communicator name
|
||||
// - commHash : communicator id
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside and event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v4_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,151 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V5_H_
|
||||
#define PROFILER_V5_H_
|
||||
|
||||
// ncclProfilerEventDescr_v5_t - event descriptor handed to the v5 profiler's
// startEvent callback (see the startEvent member of ncclProfiler_v5_t, which
// takes a pointer to this type).
// Layout: three common fields (type, parentObj, rank) followed by an anonymous
// union holding one sub-struct per kind of event.
// NOTE(review): which union member is meaningful presumably depends on `type`;
// confirm against the code that fills in the descriptor.
// This header includes nothing itself: uint64_t, size_t, bool and pid_t must
// already be declared by the including file.
typedef struct {
|
||||
uint64_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
// Per-event-type payload; all members below share the same storage.
union {
|
||||
struct {
|
||||
bool graphCaptured;
|
||||
int groupDepth;
|
||||
} groupApi;
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
int root;
|
||||
void* stream;
|
||||
bool graphCaptured;
|
||||
} collApi;
|
||||
|
||||
// Same fields as collApi except that there is no `root`.
struct {
|
||||
const char* func;
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
void* stream;
|
||||
bool graphCaptured;
|
||||
} p2pApi;
|
||||
|
||||
struct {
|
||||
void* stream;
|
||||
} kernelLaunch;
|
||||
|
||||
struct {
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
void* parentGroup; // for backward compatibility with v4
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
uint8_t nChannels;
|
||||
void* parentGroup; // for backward compatibility with v4
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
// NOTE(review): presumably non-zero for a send and zero for a receive - confirm.
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId;
|
||||
uint64_t pTimer; // start timestamp from GPU globaltimer
|
||||
} kernelCh;
|
||||
|
||||
// NOTE(review): `id` and `data` look like an opaque identifier/payload pair
// owned by a network plugin - confirm their meaning with the producer.
struct {
|
||||
int64_t id;
|
||||
void* data;
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v5_t;
|
||||
|
||||
// ncclProfilerEventStateArgs_v5_t - argument union passed (by pointer) as the
// eStateArgs parameter of the v5 recordEventState callback declared in
// ncclProfiler_v5_t. Each member carries the attribute update for one kind of
// event; all members share the same storage.
// NOTE(review): which member applies presumably follows from the event the
// handle refers to and the state being recorded - confirm with the caller.
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
|
||||
struct {
|
||||
void* data;
|
||||
} netPlugin;
|
||||
|
||||
struct {
|
||||
uint64_t pTimer;
|
||||
} kernelCh;
|
||||
|
||||
} ncclProfilerEventStateArgs_v5_t;
|
||||
|
||||
// ncclProfiler_v5_t - version 5 of the profiler plugin interface: a name plus
// five function pointers (init, startEvent, stopEvent, recordEventState,
// finalize), each returning ncclResult_t.
// ncclProfilerEventDescr_v5_t and ncclProfilerEventStateArgs_v5_t are declared
// earlier in this header; ncclProfilerEventState_v5_t, ncclResult_t and
// ncclDebugLogger_t are not declared here and must be visible before this
// header is included.
typedef struct {
|
||||
// NOTE(review): presumably a human-readable plugin name - confirm how it is used.
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
//   (passed as void**; NOTE(review): presumably written by the plugin - confirm)
// - commId : communicator id
|
||||
// - commName : user assigned communicator name
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_v5_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input (listed in signature order)
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eState : event state transition
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v5_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_TYPES_H_
|
||||
#define NCCL_TYPES_H_
|
||||
|
||||
/* Data types */
|
||||
/* ncclDataType_t - element data type identifiers, numbered 0 through 9.
 * Several enumerators are aliases that share one value: ncclInt8/ncclChar (0),
 * ncclInt32/ncclInt (2), ncclFloat16/ncclHalf (6), ncclFloat32/ncclFloat (7)
 * and ncclFloat64/ncclDouble (8). The numeric values are explicit, so they do
 * not depend on declaration order. */
typedef enum { ncclInt8 = 0, ncclChar = 0,
|
||||
ncclUint8 = 1,
|
||||
ncclInt32 = 2, ncclInt = 2,
|
||||
ncclUint32 = 3,
|
||||
ncclInt64 = 4,
|
||||
ncclUint64 = 5,
|
||||
ncclFloat16 = 6, ncclHalf = 6,
|
||||
ncclFloat32 = 7, ncclFloat = 7,
|
||||
ncclFloat64 = 8, ncclDouble = 8,
|
||||
ncclBfloat16 = 9,
|
||||
} ncclDataType_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,12 @@
|
||||
#ifndef VERSION_H
|
||||
#define VERSION_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/* get_git_version_info - declared here with C linkage, defined elsewhere.
 * Takes no arguments and returns a pointer to a constant C string.
 * NOTE(review): presumably the string describes the git revision of the
 * build, and is owned by the callee - confirm against the definition.
 * The explicit (void) makes this a real prototype when the header is
 * compiled as C, where an empty parameter list would leave the parameters
 * unspecified; in C++ the two spellings are equivalent. */
const char* get_git_version_info(void);
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif // VERSION_H
|
||||
Submodule
+1
Submodule projects/rccl/ext-src/rocSHMEM added at b28a56bd54
@@ -0,0 +1,803 @@
|
||||
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
|
||||
index 9bfd8dcf..4d3f0a08 100644
|
||||
--- a/src/transport/net_ib.cc
|
||||
+++ b/src/transport/net_ib.cc
|
||||
@@ -29,6 +29,7 @@
|
||||
|
||||
#include "ibvwrap.h"
|
||||
#include "mlx5/mlx5dvwrap.h"
|
||||
+#include "ionic/ionicdvwrap.h"
|
||||
#include "graph/xml.h"
|
||||
|
||||
#define MAXSUFFIXSIZE 16
|
||||
@@ -110,16 +111,38 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
|
||||
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
|
||||
static std::mutex ncclIbMutex;
|
||||
static int ncclIbRelaxedOrderingEnabled = 0;
|
||||
+static bool rcclAinicRoce = 0;
|
||||
+static bool rcclCtsInlineData = 0;
|
||||
+static bool rcclCtsOffloadEnabled = 0;
|
||||
+static bool ncclIbUseInline = 0;
|
||||
+static int ncclIbGdrFlushDisable = 0;
|
||||
+
|
||||
+enum ncclIbChannelType {
|
||||
+ ncclIbChannelTypeCts = 0,
|
||||
+ ncclIbChannelTypeData = 1,
|
||||
+ ncclIbChannelTypeMax = 2
|
||||
+};
|
||||
+
|
||||
+struct ncclChannelToUd {
|
||||
+ int channelId;
|
||||
+ bool udId;
|
||||
+ bool udAllocated;
|
||||
+};
|
||||
+
|
||||
+static ncclChannelToUd nccl_channel_ud_map[MAXCHANNELS][ncclIbChannelTypeMax];
|
||||
+static bool nccl_channel_last_ud[MAX_IB_DEVS][ncclIbChannelTypeMax];
|
||||
|
||||
// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator
|
||||
// rather than once for all communicators. However, the internal plugin implementation
|
||||
// still assumes the plugin is initialized only once across all communicators. The ref
|
||||
// counter makes sure the plugin internally initializes only once. When per communicator
|
||||
// context support is added to the plugin the ref counter can be removed.
|
||||
static int netRefCount;
|
||||
|
||||
#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED"))
|
||||
|
||||
+#define NCCL_CTS_QP_SLOT_INVALID 0xFF
|
||||
+
|
||||
#define NCCL_IB_SL_DEFAULT 0
|
||||
#define NCCL_IB_TC_DEFAULT 0
|
||||
|
||||
@@ -141,6 +164,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
|
||||
NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1);
|
||||
NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
|
||||
RCCL_PARAM(IbQpsPerP2p, "IB_QPS_PER_P2P", 0);
|
||||
+NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
|
||||
+
|
||||
+// AMD AINIC
|
||||
+RCCL_PARAM(CtsInlineData, "CTS_INLINE_DATA", -1);
|
||||
+RCCL_PARAM(CtsOffloadEnabled, "CTS_OFFLOAD_ENABLED", -1);
|
||||
+
|
||||
+extern int64_t rcclParamAinicRoce();
|
||||
|
||||
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
|
||||
__atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
|
||||
@@ -779,6 +809,10 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
|
||||
static int shownIbHcaEnv = 0;
|
||||
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||
if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); }
|
||||
+ if(wrap_ionicdv_symbols() != ncclSuccess) {
|
||||
+ WARN("NET/IB : Failed to open ionicdv symbols. Advance features like AINIC UD load balancing will be disabled.");
|
||||
+ return ncclInternalError;
|
||||
+ }
|
||||
|
||||
// Detect IB cards
|
||||
int nIbDevs = 0;
|
||||
@@ -944,6 +978,23 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
|
||||
ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
|
||||
|
||||
+ ncclIbUseInline = ncclParamIbUseInline();
|
||||
+ ncclIbGdrFlushDisable = ncclParamIbGdrFlushDisable();
|
||||
+
|
||||
+ rcclAinicRoce = ((rcclParamAinicRoce() == 1) ? true : false);
|
||||
+ if (rcclAinicRoce) {
|
||||
+ // for AINIC, these params are defaulted to enabled unless user forces it to disable(0).
|
||||
+ rcclCtsInlineData = ((rcclParamCtsInlineData() == 0) ? false : true);
|
||||
+ rcclCtsOffloadEnabled = ((rcclParamCtsOffloadEnabled() == 0) ? false : true);
|
||||
+ // for AINIC IbUseInline is enabled by default always
|
||||
+ ncclIbUseInline = true;
|
||||
+ // for AINIC GDR flush is disabled by default
|
||||
+ ncclIbGdrFlushDisable = 1;
|
||||
+
|
||||
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : AINIC RoCEv2 optimizations enabled: CTS Inline Data: %s; CTS Offload: %s; "
|
||||
+ "IB Use Inline: enabled; GDR Flush: disabled", rcclCtsInlineData ? "Enabled": "Disabled",
|
||||
+ rcclCtsOffloadEnabled ? "Enabled": "Disabled");
|
||||
+ }
|
||||
}
|
||||
exit:
|
||||
ibContext.trafficClass = config->trafficClass;
|
||||
@@ -1271,6 +1322,8 @@ struct ncclIbListenComm {
|
||||
struct ncclIbCommStage stage;
|
||||
};
|
||||
|
||||
+#define MAX_INLINE_DATA_SIZE 24
|
||||
+
|
||||
struct alignas(64) ncclIbSendFifo {
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
@@ -1281,10 +1334,21 @@ struct alignas(64) ncclIbSendFifo {
|
||||
char padding[16];
|
||||
};
|
||||
|
||||
+struct alignas(32) ncclIbSendFifoCtsInline {
|
||||
+ uint64_t addr;
|
||||
+ uint32_t rkeys[1];
|
||||
+ int size;
|
||||
+ uint8_t nreqs;
|
||||
+ uint16_t tag;
|
||||
+ uint32_t idx;
|
||||
+ char padding[9];
|
||||
+} __attribute__((packed));
|
||||
+
|
||||
struct ncclIbQp {
|
||||
struct ibv_qp* qp;
|
||||
int devIndex;
|
||||
int remDevIdx;
|
||||
+ int8_t ctsQpSlot;
|
||||
};
|
||||
|
||||
struct ncclIbRemSizesFifo {
|
||||
@@ -1331,6 +1395,7 @@ struct ncclIbSendComm {
|
||||
struct ncclIbNetCommBase base;
|
||||
// Start with fifo and ibv structs as they have alignment restrictions
|
||||
struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
|
||||
+ struct ncclIbSendFifoCtsInline fifo_inline[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
|
||||
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
|
||||
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
|
||||
// Each dev correlates to a mergedIbDev
|
||||
@@ -1346,6 +1411,7 @@ struct ncclIbSendComm {
|
||||
static_assert((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset");
|
||||
static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
|
||||
static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
|
||||
+static_assert((sizeof(struct ncclIbSendFifoCtsInline) % 32) == 0, "ncclIbSendFifoCtsInline element size must be 32-byte multiples");
|
||||
static_assert((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned");
|
||||
static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned");
|
||||
|
||||
@@ -1360,6 +1426,7 @@ struct ncclIbGpuFlush {
|
||||
|
||||
struct ncclIbRemFifo {
|
||||
struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
|
||||
+ struct ncclIbSendFifoCtsInline elems_cts_inline[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
|
||||
uint64_t fifoTail;
|
||||
uint64_t addr;
|
||||
uint32_t flags;
|
||||
@@ -1415,20 +1482,59 @@ ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
-ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
|
||||
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
|
||||
+ int access_flags, void* qp_context, struct ncclIbQp* qp,
|
||||
+ int channel_id, bool data_qp, int8_t cts_qp_slot) {
|
||||
struct ibv_qp_init_attr qpInitAttr;
|
||||
+ enum ncclIbChannelType channel_type = (data_qp ? ncclIbChannelTypeData : ncclIbChannelTypeCts);
|
||||
memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
|
||||
qpInitAttr.qp_context = qp_context;
|
||||
qpInitAttr.send_cq = base->cq;
|
||||
qpInitAttr.recv_cq = base->cq;
|
||||
qpInitAttr.qp_type = IBV_QPT_RC;
|
||||
+
|
||||
+ if (rcclAinicRoce) {
|
||||
+ if (!nccl_channel_ud_map[channel_id][channel_type].udAllocated) {
|
||||
+ bool lud = nccl_channel_last_ud[base->ibDevN][channel_type];
|
||||
+ nccl_channel_ud_map[channel_id][channel_type].udId = lud;
|
||||
+ nccl_channel_ud_map[channel_id][channel_type].udAllocated = true;
|
||||
+ nccl_channel_last_ud[base->ibDevN][channel_type] =
|
||||
+ !(nccl_channel_last_ud[base->ibDevN][channel_type]);
|
||||
+ }
|
||||
+ if (nccl_channel_ud_map[channel_id][channel_type].udId) {
|
||||
+ wrap_ionicdv_pd_set_udma_mask(base->pd, IONIC_UDMA_MASK_HIGH);
|
||||
+ } else {
|
||||
+ wrap_ionicdv_pd_set_udma_mask(base->pd, IONIC_UDMA_MASK_LOW);
|
||||
+ }
|
||||
+ qpInitAttr.sq_sig_all |= (1 << 16);
|
||||
+ if (data_qp) {
|
||||
+ qpInitAttr.sq_sig_all |= (1 << 17);
|
||||
+ } else {
|
||||
+ qpInitAttr.sq_sig_all &= (~(1 << 17));
|
||||
+ }
|
||||
+ qpInitAttr.sq_sig_all |= (1 << 18);
|
||||
+
|
||||
+ if (rcclCtsOffloadEnabled) {
|
||||
+ qpInitAttr.sq_sig_all |= (1 << 19);
|
||||
+ } else {
|
||||
+ qpInitAttr.sq_sig_all &= (~(1 << 19));
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
// We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
|
||||
qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
|
||||
qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
|
||||
qpInitAttr.cap.max_send_sge = 1;
|
||||
qpInitAttr.cap.max_recv_sge = 1;
|
||||
- qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ qpInitAttr.cap.max_inline_data = MAX_INLINE_DATA_SIZE;
|
||||
+ } else {
|
||||
+ qpInitAttr.cap.max_inline_data = ncclIbUseInline ? sizeof(struct ncclIbSendFifo) : 0;
|
||||
+ }
|
||||
NCCLCHECK(wrap_ibv_create_qp(&qp->qp, base->pd, &qpInitAttr));
|
||||
+ if (rcclAinicRoce) {
|
||||
+ NCCLCHECK(wrap_ionicdv_qp_set_gda(qp->qp, false, true));
|
||||
+ }
|
||||
struct ibv_qp_attr qpAttr;
|
||||
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
|
||||
qpAttr.qp_state = IBV_QPS_INIT;
|
||||
@@ -1438,6 +1544,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
|
||||
NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
|
||||
TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p",
|
||||
ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd);
|
||||
+ if (rcclAinicRoce) {
|
||||
+ qp->ctsQpSlot = cts_qp_slot;
|
||||
+ }
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1521,7 +1630,7 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
-ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
+ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
|
||||
struct ncclIbCommStage* stage = &handle->stage;
|
||||
@@ -1529,8 +1638,13 @@ ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendCo
|
||||
int ready;
|
||||
uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED;
|
||||
int isP2p = 0;
|
||||
+ int channel_id = 0;
|
||||
*sendComm = NULL;
|
||||
|
||||
+ if (rcclAinicRoce) {
|
||||
+ channel_id = ((ncclNet_ctxt_t *)sendDevComm)->chId;
|
||||
+ }
|
||||
+
|
||||
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
|
||||
if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
|
||||
if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
|
||||
@@ -1612,7 +1726,7 @@ ib_recv_dev_list:
|
||||
for (int q = 0; q < comm->base.nqps; q++) {
|
||||
ncclIbSendCommDev* commDev = comm->devs + devIndex;
|
||||
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
|
||||
- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
|
||||
+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q, channel_id, true, NCCL_CTS_QP_SLOT_INVALID), ret, fail);
|
||||
comm->base.qps[q].devIndex = devIndex;
|
||||
meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num;
|
||||
meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
|
||||
@@ -1637,7 +1751,11 @@ ib_recv_dev_list:
|
||||
devInfo->lid = ibDev->portAttr.lid;
|
||||
devInfo->ibv_dev_index = commDev->base.ibDevN;
|
||||
// Prepare my fifo
|
||||
- NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo_inline, sizeof(struct ncclIbSendFifoCtsInline)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
+ } else {
|
||||
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
+ }
|
||||
devInfo->fifoRkey = commDev->fifoMr->rkey;
|
||||
|
||||
// Pack local GID info
|
||||
@@ -1680,7 +1798,11 @@ ib_recv_dev_list:
|
||||
}
|
||||
}
|
||||
config = (ncclNetCommConfig_t*)ctx;
|
||||
- meta.fifoAddr = (uint64_t)comm->fifo;
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ meta.fifoAddr = (uint64_t)comm->fifo_inline;
|
||||
+ } else {
|
||||
+ meta.fifoAddr = (uint64_t)comm->fifo;
|
||||
+ }
|
||||
meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT;
|
||||
meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT;
|
||||
strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
|
||||
@@ -1825,18 +1947,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
-NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
|
||||
RCCL_PARAM(IbGdrFlushGpuMemNoRelaxedOrdering, "GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING", 1);
|
||||
|
||||
-ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
|
||||
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
|
||||
struct ncclIbCommStage* stage = &lComm->stage;
|
||||
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
|
||||
int ready;
|
||||
int link_layer = IBV_LINK_LAYER_UNSPECIFIED;
|
||||
+ int channel_id = 0;
|
||||
*recvComm = NULL;
|
||||
|
||||
+ if (rcclAinicRoce) {
|
||||
+ channel_id = ((ncclNet_ctxt_t *) recvDevComm)->chId;
|
||||
+ }
|
||||
+
|
||||
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
|
||||
if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
|
||||
if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
|
||||
@@ -1966,7 +2092,7 @@ ib_recv:
|
||||
// Local ibDevN
|
||||
ibDevN = rComm->devs[devIndex].base.ibDevN;
|
||||
ibDev = ncclIbDevs + ibDevN;
|
||||
- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
|
||||
+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp, channel_id, false, q), ret, fail);
|
||||
qp->devIndex = devIndex;
|
||||
devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
|
||||
|
||||
@@ -1992,16 +2118,22 @@ ib_recv:
|
||||
|
||||
useDmaBuf = (ncclIbDmaBufSupport(lComm->dev) == ncclSuccess);
|
||||
rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || useDmaBuf)
|
||||
- && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;
|
||||
+ && (ncclIbGdrFlushDisable == 0)) ? 1 : 0;
|
||||
for (int i = 0; i < rComm->base.vProps.ndevs; i++) {
|
||||
rCommDev = rComm->devs + i;
|
||||
ibDev = ncclIbDevs + rCommDev->base.ibDevN;
|
||||
|
||||
// Retain remote fifo info and prepare my RDMA ops
|
||||
rComm->remFifo.addr = remMeta.fifoAddr;
|
||||
- NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems_cts_inline,
|
||||
+ sizeof(struct ncclIbSendFifoCtsInline)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS,
|
||||
+ IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
+ } else {
|
||||
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
+ }
|
||||
rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
|
||||
- if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
|
||||
+ if (ncclIbUseInline) rComm->remFifo.flags = IBV_SEND_INLINE;
|
||||
|
||||
// Allocate Flush dummy buffer for GPU Direct RDMA
|
||||
if (rComm->flushEnabled) {
|
||||
@@ -2039,7 +2171,7 @@ ib_recv:
|
||||
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
|
||||
rCommDev->gpuFlush.sge.length = 1;
|
||||
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
|
||||
- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
|
||||
+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp, channel_id, true, NCCL_CTS_QP_SLOT_INVALID), ret, fail);
|
||||
struct ncclIbDevInfo devInfo;
|
||||
devInfo.lid = ibDev->portAttr.lid;
|
||||
devInfo.link_layer = ibDev->portAttr.link_layer;
|
||||
@@ -2257,10 +2389,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
|
||||
|
||||
NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0);
|
||||
|
||||
-ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
+ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, bool use_write_op) {
|
||||
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
|
||||
volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
|
||||
- int nreqs = slots[0].nreqs;
|
||||
+ int nreqs;
|
||||
+ if (rcclCtsOffloadEnabled) {
|
||||
+ nreqs = 1;
|
||||
+ } else {
|
||||
+ nreqs = slots[0].nreqs;
|
||||
+ }
|
||||
if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
|
||||
|
||||
uint64_t wr_id = 0ULL;
|
||||
@@ -2272,7 +2409,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
sge->addr=(uintptr_t)reqs[r]->send.data;
|
||||
wr->opcode = IBV_WR_RDMA_WRITE;
|
||||
wr->send_flags = 0;
|
||||
- wr->wr.rdma.remote_addr = slots[r].addr;
|
||||
+ if (rcclCtsOffloadEnabled) {
|
||||
+ wr->wr.rdma.remote_addr = 0xdeadbeef;
|
||||
+ } else {
|
||||
+ wr->wr.rdma.remote_addr = slots[r].addr;
|
||||
+ }
|
||||
wr->next = wr + 1;
|
||||
wr_id += (reqs[r] - comm->base.reqs) << (r*8);
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
@@ -2283,7 +2424,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
// Write size as immediate data. In the case of multi-send, only write
|
||||
// 0 or 1 as size to indicate whether there was data sent or received.
|
||||
uint32_t immData = 0;
|
||||
- if (nreqs == 1) {
|
||||
+ if ((nreqs == 1) && (use_write_op == false)) {
|
||||
immData = reqs[0]->send.size;
|
||||
} else {
|
||||
int* sizes = comm->remSizesFifo.elems[slot];
|
||||
@@ -2293,22 +2434,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
}
|
||||
|
||||
struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
|
||||
- if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
|
||||
- // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
|
||||
- // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
|
||||
- // completion.
|
||||
- lastWr++;
|
||||
- memset(lastWr, 0, sizeof(struct ibv_send_wr));
|
||||
- if (nreqs > 1) {
|
||||
- // Write remote sizes Fifo
|
||||
- lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
|
||||
- lastWr->num_sge = 1;
|
||||
- lastWr->sg_list = &comm->remSizesFifo.sge;
|
||||
+ if (use_write_op == false) {
|
||||
+ if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
|
||||
+ // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
|
||||
+ // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
|
||||
+ // completion.
|
||||
+ lastWr++;
|
||||
+ memset(lastWr, 0, sizeof(struct ibv_send_wr));
|
||||
+ if (nreqs > 1) {
|
||||
+ // Write remote sizes Fifo
|
||||
+ lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
|
||||
+ lastWr->num_sge = 1;
|
||||
+ lastWr->sg_list = &comm->remSizesFifo.sge;
|
||||
+ }
|
||||
}
|
||||
+ lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
|
||||
+ lastWr->imm_data = immData;
|
||||
}
|
||||
lastWr->wr_id = wr_id;
|
||||
- lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
|
||||
- lastWr->imm_data = immData;
|
||||
lastWr->next = NULL;
|
||||
lastWr->send_flags = IBV_SEND_SIGNALED;
|
||||
|
||||
@@ -2324,7 +2467,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
//ncclIbAddEvent(reqs[r], devIndex, &comm->devs[devIndex].base);
|
||||
|
||||
// Select proper rkey (needed even for 0-size send)
|
||||
- comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
|
||||
+ if (rcclCtsOffloadEnabled) {
|
||||
+ comm->wrs[r].wr.rdma.rkey = 0xbade;
|
||||
+ } else {
|
||||
+ comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
|
||||
+ }
|
||||
|
||||
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
|
||||
int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
|
||||
@@ -2340,7 +2487,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
}
|
||||
}
|
||||
|
||||
- if (nreqs > 1) {
|
||||
+ if ((use_write_op == false) && (nreqs > 1)) {
|
||||
// Also make sure lastWr writes remote sizes using the right lkey
|
||||
comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey;
|
||||
lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex];
|
||||
@@ -2398,32 +2545,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
|
||||
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
|
||||
+ bool use_write_op = false;
|
||||
+ if (rcclAinicRoce) {
|
||||
+ use_write_op = (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) ? true : false;
|
||||
+ }
|
||||
|
||||
// Wait for the receiver to have posted the corresponding receive
|
||||
int nreqs = 0;
|
||||
volatile struct ncclIbSendFifo* slots;
|
||||
|
||||
+ if (rcclCtsOffloadEnabled) {
|
||||
+ nreqs = 1;
|
||||
+ }
|
||||
+
|
||||
int slot = (comm->fifoHead) % MAX_REQUESTS;
|
||||
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
|
||||
- slots = comm->fifo[slot];
|
||||
- uint64_t idx = comm->fifoHead+1;
|
||||
- if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
|
||||
- nreqs = slots[0].nreqs;
|
||||
- // Wait until all data has arrived
|
||||
- for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
|
||||
- __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
|
||||
+ if (!rcclCtsOffloadEnabled) {
|
||||
+ slots = comm->fifo[slot];
|
||||
+ uint64_t idx = comm->fifoHead+1;
|
||||
+ if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
|
||||
+ nreqs = slots[0].nreqs;
|
||||
+ // Wait until all data has arrived
|
||||
+ for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
|
||||
+ __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
|
||||
+ }
|
||||
for (int r=0; r<nreqs; r++) {
|
||||
- if (reqs[r] != NULL || slots[r].tag != tag) continue;
|
||||
-
|
||||
- if (size > slots[r].size) size = slots[r].size;
|
||||
- // Sanity checks
|
||||
- if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
|
||||
- char line[SOCKET_NAME_MAXLEN + 1];
|
||||
- union ncclSocketAddress addr;
|
||||
- ncclSocketGetAddr(&comm->base.sock, &addr);
|
||||
- WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
|
||||
- r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
|
||||
- return ncclInternalError;
|
||||
+ if (!rcclCtsOffloadEnabled) {
|
||||
+ if (reqs[r] != NULL || slots[r].tag != tag) continue;
|
||||
+
|
||||
+ if (size > slots[r].size) size = slots[r].size;
|
||||
+ // Sanity checks
|
||||
+ if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
|
||||
+ char line[SOCKET_NAME_MAXLEN + 1];
|
||||
+ union ncclSocketAddress addr;
|
||||
+ ncclSocketGetAddr(&comm->base.sock, &addr);
|
||||
+ WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
|
||||
+ r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
|
||||
+ return ncclInternalError;
|
||||
+ }
|
||||
+ } else{
|
||||
+ if (reqs[r] != NULL) continue;
|
||||
}
|
||||
|
||||
struct ncclIbRequest* req;
|
||||
@@ -2467,10 +2628,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
|
||||
}
|
||||
|
||||
TIME_START(0);
|
||||
- NCCLCHECK(ncclIbMultiSend(comm, slot));
|
||||
+ NCCLCHECK(ncclIbMultiSend(comm, slot, use_write_op));
|
||||
|
||||
// Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks
|
||||
- memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
|
||||
+ if (!rcclCtsOffloadEnabled) {
|
||||
+ memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
|
||||
+ }
|
||||
memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
|
||||
comm->fifoHead++;
|
||||
TIME_STOP(0);
|
||||
@@ -2483,30 +2646,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
|
||||
|
||||
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
|
||||
struct ibv_send_wr wr;
|
||||
+ struct ncclIbSendFifo* localElem = NULL;
|
||||
+ struct ncclIbSendFifoCtsInline* localElemCtsInline = NULL;
|
||||
+ uint64_t localElemRef;
|
||||
+ int qpIndex = 0;
|
||||
+ ncclIbQp* ctsQp = NULL;
|
||||
memset(&wr, 0, sizeof(wr));
|
||||
|
||||
int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
|
||||
req->recv.sizes = comm->sizesFifo[slot];
|
||||
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
|
||||
- struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ localElemCtsInline = comm->remFifo.elems_cts_inline[slot];
|
||||
+ } else {
|
||||
+ localElem = comm->remFifo.elems[slot];
|
||||
+ }
|
||||
|
||||
- // Select the next devIndex (local) and QP to use for posting this CTS message
|
||||
- // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
|
||||
- ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex;
|
||||
- comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
|
||||
+ if (rcclAinicRoce) {
|
||||
+ qpIndex = comm->base.qpIndex;
|
||||
+ ctsQp = comm->base.qps + qpIndex;
|
||||
+ } else {
|
||||
+ // Select the next devIndex (local) and QP to use for posting this CTS message
|
||||
+ // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
|
||||
+ ctsQp = comm->base.qps + comm->base.devIndex;
|
||||
+ comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
|
||||
+ }
|
||||
|
||||
for (int i=0; i<n; i++) {
|
||||
- localElem[i].addr = (uint64_t)data[i];
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ localElemCtsInline[i].addr = (uint64_t)data[i];
|
||||
+
|
||||
+ // Send all applicable rkeys
|
||||
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
|
||||
+ localElemCtsInline[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
|
||||
+
|
||||
+ localElemCtsInline[i].nreqs = n;
|
||||
+ localElemCtsInline[i].size = sizes[i]; // Sanity/Debugging
|
||||
+ localElemCtsInline[i].tag = tags[i];
|
||||
+ localElemCtsInline[i].idx = comm->remFifo.fifoTail+1;
|
||||
+ localElemRef = (uint64_t)localElemCtsInline;
|
||||
+
|
||||
+ } else {
|
||||
+ localElem[i].addr = (uint64_t)data[i];
|
||||
|
||||
- // Send all applicable rkeys
|
||||
- for (int j = 0; j < comm->base.vProps.ndevs; j++)
|
||||
- localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
|
||||
+ // Send all applicable rkeys
|
||||
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
|
||||
+ localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
|
||||
|
||||
- localElem[i].nreqs = n;
|
||||
- localElem[i].size = sizes[i]; // Sanity/Debugging
|
||||
- localElem[i].tag = tags[i];
|
||||
- localElem[i].idx = comm->remFifo.fifoTail+1;
|
||||
+ localElem[i].nreqs = n;
|
||||
+ localElem[i].size = sizes[i]; // Sanity/Debugging
|
||||
+ localElem[i].tag = tags[i];
|
||||
+ localElem[i].idx = comm->remFifo.fifoTail+1;
|
||||
+ localElemRef = (uint64_t)localElem;
|
||||
+ }
|
||||
}
|
||||
wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
|
||||
|
||||
@@ -2514,8 +2707,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
|
||||
wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey;
|
||||
|
||||
// Set the correct sge properties
|
||||
- comm->devs[ctsQp->devIndex].fifoSge.addr = (uint64_t)localElem;
|
||||
- comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
|
||||
+ comm->devs[ctsQp->devIndex].fifoSge.addr = localElemRef;
|
||||
+ if (rcclCtsInlineData) {
|
||||
+ comm->devs[ctsQp->devIndex].fifoSge.length = MAX_INLINE_DATA_SIZE;
|
||||
+ } else {
|
||||
+ comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
|
||||
+ }
|
||||
wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge;
|
||||
wr.num_sge = 1;
|
||||
|
||||
@@ -2545,7 +2742,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
|
||||
//
|
||||
// slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled.
|
||||
// This works out that each fifo posting QP gets drained
|
||||
- if (slot == ctsQp->devIndex) {
|
||||
+ if (rcclAinicRoce) {
|
||||
+ if (slot == ctsQp->ctsQpSlot) {
|
||||
+ wr.send_flags |= IBV_SEND_SIGNALED;
|
||||
+ wr.wr_id = req - comm->base.reqs;
|
||||
+ ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
|
||||
+ }
|
||||
+ } else if (slot == ctsQp->devIndex) {
|
||||
wr.send_flags |= IBV_SEND_SIGNALED;
|
||||
wr.wr_id = req - comm->base.reqs;
|
||||
ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
|
||||
@@ -2560,10 +2763,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
|
||||
|
||||
comm->remFifo.fifoTail++;
|
||||
|
||||
+ if (rcclAinicRoce) {
|
||||
+ // Select the next qpIndex
|
||||
+ comm->base.qpIndex = (comm->base.qpIndex+1) % comm->base.nqps;
|
||||
+ }
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
|
||||
+ ncclResult_t res = ncclSuccess;
|
||||
+ bool netOptRecvCompletionEnabled = false;
|
||||
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
|
||||
if (comm->base.ready == 0) {
|
||||
WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
|
||||
@@ -2573,6 +2782,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
|
||||
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
|
||||
|
||||
+ if (rcclAinicRoce) {
|
||||
+ if (*request == (void *) NCCL_NET_OPTIONAL_RECV_COMPLETION) {
|
||||
+ netOptRecvCompletionEnabled = true;
|
||||
+ }
|
||||
+ }
|
||||
struct ncclIbRequest* req;
|
||||
NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
|
||||
req->type = NCCL_NET_IB_REQ_RECV;
|
||||
@@ -2586,50 +2800,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
|
||||
req->devBases[i] = &comm->devs[i].base;
|
||||
}
|
||||
|
||||
- struct ibv_recv_wr wr;
|
||||
- memset(&wr, 0, sizeof(wr));
|
||||
- wr.wr_id = req - comm->base.reqs;
|
||||
- wr.sg_list = NULL;
|
||||
- wr.num_sge = 0;
|
||||
+ if (!netOptRecvCompletionEnabled) {
|
||||
+ struct ibv_recv_wr wr;
|
||||
+ memset(&wr, 0, sizeof(wr));
|
||||
+ wr.wr_id = req - comm->base.reqs;
|
||||
+ wr.sg_list = NULL;
|
||||
+ wr.num_sge = 0;
|
||||
|
||||
- TIME_START(1);
|
||||
- // Select either all QPs, or one qp per-device
|
||||
- const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
|
||||
+ TIME_START(1);
|
||||
+ // Select either all QPs, or one qp per-device
|
||||
+ const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
|
||||
|
||||
- // Post recvs
|
||||
- struct ibv_recv_wr* bad_wr;
|
||||
- for (int i = 0; i < nqps; i++) {
|
||||
- struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;
|
||||
- ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
|
||||
+ // Post recvs
|
||||
+ struct ibv_recv_wr* bad_wr;
|
||||
+ int qpIndex = comm->base.qpIndex;
|
||||
+ for (int i = 0; i < nqps; i++) {
|
||||
+ struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;
|
||||
+ ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
- // Start a QP event for every request in the multirecv and every qp
|
||||
- for (int r = 0; r < n; r++) {
|
||||
- int nEventHandles = req->pInfo[r].nEventHandles;
|
||||
- assert(nEventHandles < MAX_QPS_PER_REQ);
|
||||
- req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
|
||||
- // Store info for profiler
|
||||
- int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
- req->pInfo[r].data.type = ncclProfileQp;
|
||||
- req->pInfo[r].data.qp.device = qp->devIndex;
|
||||
- req->pInfo[r].data.qp.wr_id = wr.wr_id;
|
||||
- req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
|
||||
- NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));
|
||||
- req->pInfo[r].nEventHandles++;
|
||||
- }
|
||||
+ // Start a QP event for every request in the multirecv and every qp
|
||||
+ for (int r = 0; r < n; r++) {
|
||||
+ int nEventHandles = req->pInfo[r].nEventHandles;
|
||||
+ assert(nEventHandles < MAX_QPS_PER_REQ);
|
||||
+ req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
|
||||
+ // Store info for profiler
|
||||
+ int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
+ req->pInfo[r].data.type = ncclProfileQp;
|
||||
+ req->pInfo[r].data.qp.device = qp->devIndex;
|
||||
+ req->pInfo[r].data.qp.wr_id = wr.wr_id;
|
||||
+ req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
|
||||
+ NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));
|
||||
+ req->pInfo[r].nEventHandles++;
|
||||
+ }
|
||||
#endif
|
||||
- NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr));
|
||||
- comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
|
||||
- }
|
||||
+ NCCLCHECKGOTO(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr), res, err);
|
||||
+ // Don't update comm->base.qpIndex yet, we need to run through this same set of QPs
|
||||
+ // inside ncclIbPostFifo()
|
||||
+ if (rcclAinicRoce) {
|
||||
+ qpIndex = (qpIndex+1)%comm->base.nqps;
|
||||
+ } else {
|
||||
+ comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
- TIME_STOP(1);
|
||||
+ TIME_STOP(1);
|
||||
+ } // netOptRecvCompletionEnabled = false
|
||||
|
||||
// Post to FIFO to notify sender
|
||||
TIME_START(2);
|
||||
- NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
|
||||
+ NCCLCHECKGOTO(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req), res, err);
|
||||
TIME_STOP(2);
|
||||
|
||||
*request = req;
|
||||
return ncclSuccess;
|
||||
+err:
|
||||
+ if (req) {
|
||||
+ ncclIbFreeRequest(req);
|
||||
+ }
|
||||
+ return res;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
||||
@@ -2698,6 +2926,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
|
||||
}
|
||||
#endif
|
||||
|
||||
+#define NCCL_CQ_POLL_MAX_EVENT 16
|
||||
+
|
||||
ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
|
||||
*done = 0;
|
||||
@@ -2731,13 +2961,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
|
||||
int totalWrDone = 0;
|
||||
int wrDone = 0;
|
||||
- struct ibv_wc wcs[4];
|
||||
+ struct ibv_wc wcs[NCCL_CQ_POLL_MAX_EVENT];
|
||||
+ int cqMaxPollEvent = 4;
|
||||
+ if (rcclAinicRoce) {
|
||||
+ cqMaxPollEvent = NCCL_CQ_POLL_MAX_EVENT;
|
||||
+ }
|
||||
|
||||
for (int i = 0; i < NCCL_IB_MAX_DEVS_PER_NIC; i++) {
|
||||
TIME_START(3);
|
||||
// If we expect any completions from this device's CQ
|
||||
if (r->events[i]) {
|
||||
- NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, 4, wcs, &wrDone));
|
||||
+ NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, cqMaxPollEvent,
|
||||
+ wcs, &wrDone));
|
||||
totalWrDone += wrDone;
|
||||
if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
|
||||
if (wrDone == 0) continue;
|
||||
@@ -2889,7 +3124,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
|
||||
}
|
||||
|
||||
ncclNet_t ncclNetIb = {
|
||||
- "IB",
|
||||
+ "ROCM-IB",
|
||||
ncclIbInit,
|
||||
ncclIbDevices,
|
||||
ncclIbGetProperties,
|
||||
@@ -179,4 +179,4 @@ When developing new tuner plugins:
|
||||
- [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/)
|
||||
- Example plugin implementations in this directory
|
||||
|
||||
For questions and support, refer to the NCCL community resources and documentation.
|
||||
For questions and support, refer to the NCCL community resources and documentation.
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
# Compiled shared objects and binaries
|
||||
*.so
|
||||
*.o
|
||||
*.a
|
||||
*.out
|
||||
*.exe
|
||||
*.dll
|
||||
*.dylib
|
||||
*.bin
|
||||
*.elf
|
||||
|
||||
# Python cache
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
|
||||
# Build and test artifacts
|
||||
/build/
|
||||
*.log
|
||||
*.tmp
|
||||
*.swp
|
||||
|
||||
# Ignore all CSV files except scripts/sample_performance_data.csv
|
||||
*.csv
|
||||
!scripts/sample_performance_data.csv
|
||||
|
||||
# Ignore all .conf files except nccl_tuner.conf
|
||||
*.conf
|
||||
!nccl_tuner.conf
|
||||
|
||||
my_configs
|
||||
|
||||
# Ignore test binary
|
||||
test/test_plugin
|
||||
|
||||
# Editor/OS files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Backup files
|
||||
*~
|
||||
*.bak
|
||||
|
||||
# Ignore by convention
|
||||
*.old
|
||||
*.orig
|
||||
|
||||
# Git
|
||||
.git/
|
||||
@@ -0,0 +1,26 @@
|
||||
# Build script for the example NCCL tuner plugin (libnccl-tuner-example.so).

# Source files for the plugin (listed explicitly, not globbed)
set(SRC_FILES
  ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
)

# Create shared library
add_library(nccl-tuner-example SHARED ${SRC_FILES})

# Set include directories
target_include_directories(nccl-tuner-example PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/nccl
)

# Set output name to match Makefile. The library is written to the unit-test
# plugin directory, not to this directory's binary dir.
set_target_properties(nccl-tuner-example PROPERTIES
  OUTPUT_NAME "nccl-tuner-example"
  PREFIX "lib"
  POSITION_INDEPENDENT_CODE ON
  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins
)

# Add custom target for clean (equivalent to Makefile clean target).
# The removed path must match LIBRARY_OUTPUT_DIRECTORY above; pointing at
# CMAKE_CURRENT_BINARY_DIR would silently delete nothing because of `-f`.
add_custom_target(clean-tuner-lib
  COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-tuner-example.so
  COMMENT "Cleaning libnccl-tuner-example.so"
)
|
||||
@@ -45,6 +45,40 @@ typedef enum {
|
||||
|
||||
#define NCCL_ALGO_PROTO_IGNORE -1.0
|
||||
|
||||
#define NCCL_HW_NVLINK 0
|
||||
#define NCCL_HW_PCI 1
|
||||
#define NCCL_HW_NET 2
|
||||
#define NCCL_NUM_HW_LINKS 3
|
||||
|
||||
#define NCCL_VOLTA_COMPCAP_IDX 0
|
||||
#define NCCL_AMPERE_COMPCAP_IDX 1
|
||||
#define NCCL_HOPPER_COMPCAP_IDX 2
|
||||
#define NCCL_BLACKWELL_COMPCAP_IDX 3
|
||||
#define NCCL_NUM_COMPCAPS 4
|
||||
|
||||
#define NCCL_TUNING_SCALE_1NODE 0
|
||||
#define NCCL_TUNING_SCALE_2NODES 1
|
||||
#define NCCL_TUNING_SCALE_4NODES 2
|
||||
#define NCCL_NUM_TUNING_SCALES 3
|
||||
|
||||
typedef struct {
|
||||
int nNvlDomains; // number of NVLink domains
|
||||
int minRanksPerNvlDomain; // minimum ranks across all NVLink domains
|
||||
int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains
|
||||
} ncclNvlDomainInfo_v5_t;
|
||||
|
||||
typedef struct {
|
||||
double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
|
||||
double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
|
||||
double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
|
||||
double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
|
||||
double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
|
||||
|
||||
|
||||
} ncclTunerConstants_v5_t;
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
@@ -52,12 +86,17 @@ typedef struct {
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - commId: communicator identifier
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// - nvlDomainInfo: NVL domain information struct
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
// Input/Output:
|
||||
// - constants: tuner constants
|
||||
ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
|
||||
ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
@@ -87,11 +126,13 @@ typedef struct {
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v4_t;
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclTuner_v5_t;
|
||||
|
||||
typedef ncclTuner_v4_t ncclTuner_t;
|
||||
typedef ncclTuner_v5_t ncclTuner_t;
|
||||
typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t;
|
||||
typedef ncclTunerConstants_v5_t ncclTunerConstants_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -51,6 +51,7 @@ typedef struct {
|
||||
size_t nRanks;
|
||||
size_t nNodes;
|
||||
ncclDebugLogger_t logFunction;
|
||||
ncclNvlDomainInfo_v5_t nvlDomainInfo;
|
||||
} TunerContext;
|
||||
|
||||
// Parse collective type from string
|
||||
@@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
|
||||
__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
|
||||
ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
|
||||
|
||||
if (NULL != constants) {
|
||||
// NCCL constants tuning
|
||||
// Tune NCCL's internal tuning model to improve base algo/proto selection.
|
||||
// Note: Example numbers are for reference only.
|
||||
// Actual numbers may vary depending on the hardware and network topology.
|
||||
// These numbers are not guaranteed to be optimal for all cases.
|
||||
// Limit the tree bandwidth to 15GB/s
|
||||
constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0;
|
||||
|
||||
// Limit the ring bandwidth to 20GB/s
|
||||
constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0;
|
||||
|
||||
// Set NVLSTree base network latency to 24us
|
||||
constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0;
|
||||
}
|
||||
|
||||
TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
|
||||
if (!ctx) return ncclSystemError;
|
||||
|
||||
@@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t
|
||||
ctx->nRanks = nRanks;
|
||||
ctx->nNodes = nNodes;
|
||||
ctx->logFunction = logFunction;
|
||||
if (nvlDomainInfo) {
|
||||
ctx->nvlDomainInfo = *nvlDomainInfo;
|
||||
} else {
|
||||
memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t));
|
||||
}
|
||||
|
||||
if (logFunction) {
|
||||
logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
|
||||
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
|
||||
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains",
|
||||
nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains);
|
||||
}
|
||||
|
||||
// Try to load config file from environment variable or default location
|
||||
@@ -435,7 +460,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginDestroy(void* context) {
|
||||
__hidden ncclResult_t pluginFinalize(void* context) {
|
||||
if (context) {
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
if (ctx->configs) {
|
||||
@@ -446,11 +471,12 @@ __hidden ncclResult_t pluginDestroy(void* context) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
#define PLUGIN_NAME "Example"
|
||||
|
||||
const ncclTuner_v4_t ncclTunerPlugin_v4 = {
|
||||
const ncclTuner_v5_t ncclTunerPlugin_v5 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.getCollInfo = pluginGetCollInfo,
|
||||
.destroy = pluginDestroy
|
||||
.finalize = pluginFinalize
|
||||
};
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
# NCCL Tuner Configuration File (CSV Format)
|
||||
# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
|
||||
#
|
||||
# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
|
||||
# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
|
||||
# Protocols: ll, ll128, simple
|
||||
# Channels: number of channels to use, or -1 to keep default
|
||||
# nNodes: number of nodes to match, or -1 for any number of nodes
|
||||
# nRanks: number of ranks to match, or -1 for any number of ranks
|
||||
# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
|
||||
# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
|
||||
#
|
||||
# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
|
||||
#
|
||||
# allreduce (AR), 4 ranks per node (4PPN)
|
||||
allreduce,33554432,4294967296,ring,simple,16,2,8,-1,-1
|
||||
allreduce,33554432,4294967296,ring,simple,16,4,16,-1,-1
|
||||
allreduce,67108864,4294967296,ring,simple,16,8,32,-1,-1
|
||||
# allreduce (AR), 2 ranks per node (2PPN)
|
||||
allreduce,2097152,4294967296,ring,simple,4,2,4,-1,-1
|
||||
allreduce,16777216,4294967296,ring,simple,4,4,8,-1,-1
|
||||
allreduce,33554432,4294967296,ring,simple,4,8,16,-1,-1
|
||||
# allreduce (AR), 1 rank per node (1PPN)
|
||||
allreduce,134217728,4294967296,ring,simple,4,4,4,-1,-1
|
||||
allreduce,67108864,4294967296,ring,simple,4,8,8,-1,-1
|
||||
|
||||
|
||||
# allgather (AG), 4 ranks per node (4PPN)
|
||||
allgather,8388608,4294967296,ring,simple,16,2,8,-1,-1
|
||||
allgather,16777216,4294967296,ring,simple,16,4,16,-1,-1
|
||||
allgather,16777216,4294967296,ring,simple,16,8,32,-1,-1
|
||||
# allgather (AG), 2 ranks per node (2PPN)
|
||||
allgather,262144,4294967296,ring,simple,4,2,4,-1,-1
|
||||
allgather,16777216,4294967296,ring,simple,4,4,8,-1,-1
|
||||
allgather,33554432,4294967296,ring,simple,4,8,16,-1,-1
|
||||
# allgather (AG), 1 rank per node (1PPN)
|
||||
allgather,262144,2097152,ring,simple,4,2,2,-1,-1
|
||||
allgather,262144,8388608,ring,simple,4,4,4,-1,-1
|
||||
allgather,67108864,4294967296,ring,simple,4,8,8,-1,-1
|
||||
|
||||
# reducescatter (RS), 4 ranks per node (4PPN)
|
||||
reducescatter,1048576,4294967296,ring,simple,16,2,8,-1,-1
|
||||
reducescatter,1048576,4294967296,ring,simple,16,4,16,-1,-1
|
||||
reducescatter,1048576,4294967296,ring,simple,16,8,32,-1,-1
|
||||
# reducescatter (RS), 2 ranks per node (2PPN)
|
||||
reducescatter,262144,33554432,ring,simple,4,2,4,-1,-1
|
||||
reducescatter,262144,4294967296,ring,simple,4,4,8,-1,-1
|
||||
reducescatter,262144,4294967296,ring,simple,4,8,16,-1,-1
|
||||
# reducescatter (RS), 1 rank per node (1PPN)
|
||||
reducescatter,131072,262144,ring,simple,4,2,2,-1,-1
|
||||
reducescatter,1048576,2097152,ring,simple,4,2,2,-1,-1
|
||||
reducescatter,131072,4194304,ring,simple,4,4,4,-1,-1
|
||||
reducescatter,262144,8388608,ring,simple,4,8,8,-1,-1
|
||||
@@ -98,12 +98,12 @@ int test_plugin_init() {
|
||||
void* context = NULL;
|
||||
|
||||
// Test successful initialization
|
||||
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
|
||||
TEST_ASSERT(context != NULL, "Context should be allocated");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
@@ -123,11 +123,11 @@ int test_config_parsing_valid() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
|
||||
ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_valid.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -144,12 +144,12 @@ int test_config_parsing_invalid() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
|
||||
ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
// Should still succeed but with no valid configs loaded
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_invalid.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -165,7 +165,7 @@ int test_collective_matching() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
|
||||
// Create mock cost table
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
@@ -209,7 +209,7 @@ int test_collective_matching() {
|
||||
TEST_ASSERT(nChannels == 4, "Should set 4 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_match.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -226,7 +226,7 @@ int test_size_matching() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
@@ -280,7 +280,7 @@ int test_size_matching() {
|
||||
TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_size.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -298,7 +298,7 @@ int test_topology_matching() {
|
||||
|
||||
// Test with single node setup
|
||||
void* context1 = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node
|
||||
pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL); // 8 ranks, 1 node
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
@@ -316,11 +316,11 @@ int test_topology_matching() {
|
||||
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
|
||||
TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
|
||||
|
||||
pluginDestroy(context1);
|
||||
pluginFinalize(context1);
|
||||
|
||||
// Test with 4 nodes, 32 ranks setup
|
||||
void* context2 = NULL;
|
||||
pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes
|
||||
pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL); // 32 ranks, 4 nodes
|
||||
|
||||
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
|
||||
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
|
||||
@@ -349,7 +349,7 @@ int test_default_channels() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
@@ -369,7 +369,7 @@ int test_default_channels() {
|
||||
TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_default.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -386,7 +386,7 @@ int test_regbuff_matching() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
@@ -437,7 +437,7 @@ int test_regbuff_matching() {
|
||||
TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_regbuff.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -454,7 +454,7 @@ int test_pipeops_matching() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
@@ -504,7 +504,7 @@ int test_pipeops_matching() {
|
||||
TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_pipeops.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -519,7 +519,7 @@ int test_no_match_fallback() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
|
||||
|
||||
void* context = NULL;
|
||||
pluginInit(8, 1, mock_logger, &context);
|
||||
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
|
||||
|
||||
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
|
||||
@@ -543,7 +543,7 @@ int test_no_match_fallback() {
|
||||
TEST_ASSERT(nChannels == 1, "Should use default channels");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink("test_fallback.conf");
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
TEST_PASS();
|
||||
@@ -593,7 +593,7 @@ int test_large_config() {
|
||||
|
||||
// Initialize plugin with large config
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
|
||||
ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
|
||||
TEST_ASSERT(context != NULL, "Context should be allocated");
|
||||
|
||||
@@ -652,7 +652,7 @@ int test_large_config() {
|
||||
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink(large_config_file);
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
|
||||
@@ -684,7 +684,7 @@ int test_very_large_config_stress() {
|
||||
|
||||
// Test initialization with stress config
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
|
||||
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
@@ -705,7 +705,7 @@ int test_very_large_config_stress() {
|
||||
}
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink(stress_config_file);
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
|
||||
@@ -726,7 +726,7 @@ int test_empty_config() {
|
||||
setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
|
||||
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
|
||||
|
||||
TunerContext* ctx = (TunerContext*)context;
|
||||
@@ -751,13 +751,134 @@ int test_empty_config() {
|
||||
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
|
||||
|
||||
// Clean up
|
||||
pluginDestroy(context);
|
||||
pluginFinalize(context);
|
||||
unlink(empty_config_file);
|
||||
unsetenv("NCCL_TUNER_CONFIG_FILE");
|
||||
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test NVLink domain info handling
|
||||
int test_nvl_domain_info() {
|
||||
printf("Testing NVLink domain info handling...\n");
|
||||
|
||||
// Test NVLink domain structure with min/max ranks per domain
|
||||
ncclNvlDomainInfo_v5_t nvl_domain = {
|
||||
.nNvlDomains = 2, // 2 nodes = 2 domains
|
||||
.minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck)
|
||||
.maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity)
|
||||
};
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed");
|
||||
|
||||
// Validate NVLD info structure
|
||||
TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)");
|
||||
TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain");
|
||||
TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain");
|
||||
|
||||
// Clean up
|
||||
pluginFinalize(context);
|
||||
printf("NVLink domain info test passed!\n");
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
int test_tuner_constants() {
|
||||
// Initialize constants to -1.0 for testing purposes
|
||||
ncclTunerConstants_v5_t constants = {
|
||||
// Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
|
||||
.baseLatencies = {
|
||||
{-1.0, -1.0, -1.0}, // NCCL_ALGO_TREE: LL, LL128, Simple
|
||||
{-1.0, -1.0, -1.0}, // NCCL_ALGO_RING: LL, LL128, Simple
|
||||
{-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_DIRECT
|
||||
{-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_CHAIN
|
||||
{-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS
|
||||
{-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS_TREE
|
||||
{-1.0, -1.0, -1.0} // NCCL_ALGO_PAT
|
||||
},
|
||||
|
||||
// Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
|
||||
.hwLatencies = {
|
||||
// NCCL_HW_NVLINK
|
||||
{
|
||||
{-1.0, -1.0, -1.0}, // TREE
|
||||
{-1.0, -1.0, -1.0}, // RING
|
||||
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
|
||||
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
|
||||
{-1.0, -1.0, -1.0}, // NVLS
|
||||
{-1.0, -1.0, -1.0}, // NVLS_TREE
|
||||
{-1.0, -1.0, -1.0} // PAT
|
||||
},
|
||||
// NCCL_HW_PCI
|
||||
{
|
||||
{-1.0, -1.0, -1.0}, // TREE
|
||||
{-1.0, -1.0, -1.0}, // RING
|
||||
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
|
||||
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
|
||||
{-1.0, -1.0, -1.0}, // NVLS
|
||||
{-1.0, -1.0, -1.0}, // NVLS_TREE
|
||||
{-1.0, -1.0, -1.0} // PAT
|
||||
},
|
||||
// NCCL_HW_NET
|
||||
{
|
||||
{-1.0, -1.0, -1.0}, // TREE
|
||||
{-1.0, -1.0, -1.0}, // RING
|
||||
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
|
||||
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
|
||||
{-1.0, -1.0, -1.0}, // NVLS
|
||||
{-1.0, -1.0, -1.0}, // NVLS_TREE
|
||||
{-1.0, -1.0, -1.0} // PAT
|
||||
}
|
||||
},
|
||||
|
||||
// LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
|
||||
.llMaxBws = {
|
||||
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
|
||||
},
|
||||
|
||||
// Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
|
||||
.perChMaxRingLL128Bws = {
|
||||
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
|
||||
},
|
||||
|
||||
// Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
|
||||
.perChMaxTreeLL128Bws = {
|
||||
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
|
||||
},
|
||||
|
||||
// Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
|
||||
.perChMaxTreeBws = {
|
||||
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
|
||||
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
|
||||
}
|
||||
};
|
||||
|
||||
void* context = NULL;
|
||||
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants);
|
||||
TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed");
|
||||
|
||||
// Test that the constants were set correctly
|
||||
TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should be 15GB/s");
|
||||
TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s");
|
||||
TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us");
|
||||
|
||||
// Clean up
|
||||
pluginFinalize(context);
|
||||
TEST_PASS();
|
||||
}
|
||||
|
||||
// Test runner function pointer type
|
||||
typedef int (*TestFunction)(void);
|
||||
|
||||
@@ -783,6 +904,8 @@ TestCase test_cases[] = {
|
||||
{"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
|
||||
{"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
|
||||
{"empty-config", test_empty_config, "Empty configuration file handling"},
|
||||
{"nvl-domain", test_nvl_domain_info, "NVL domain info handling"},
|
||||
{"constants", test_tuner_constants, "Tuner constants initialization"},
|
||||
{NULL, NULL, NULL} // End marker
|
||||
};
|
||||
|
||||
@@ -826,6 +949,7 @@ int main(int argc, char* argv[]) {
|
||||
if (argc == 1) {
|
||||
// No arguments - run all tests
|
||||
for (int i = 0; test_cases[i].name != NULL; i++) {
|
||||
printf("Running test: %s\n", test_cases[i].name);
|
||||
total++;
|
||||
passed += test_cases[i].func();
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ install_dependencies=false
|
||||
install_library=false
|
||||
install_prefix="${ROCM_PATH}"
|
||||
log_trace=false
|
||||
msccl_kernel_enabled=true
|
||||
msccl_kernel_enabled=false
|
||||
mscclpp_enabled=false
|
||||
enable_mscclpp_clip=false
|
||||
num_parallel_jobs=$(nproc)
|
||||
@@ -39,7 +39,9 @@ run_tests_all=false
|
||||
time_trace=false
|
||||
force_reduce_pipeline=false
|
||||
generate_sym_kernels=false
|
||||
warp_speed_enabled=true # note that this flag will be overridden to false for non MI350/MI300 platforms
|
||||
quiet_warnings=false
|
||||
build_rocshmem_support=false
|
||||
|
||||
# #################################################
|
||||
# helper functions
|
||||
@@ -54,7 +56,7 @@ function display_help()
|
||||
echo " --debug Build debug library"
|
||||
echo " --enable_backtrace Build with custom backtrace support"
|
||||
echo " --disable-colltrace Build without collective trace"
|
||||
echo " --disable-msccl-kernel Build without MSCCL kernels"
|
||||
echo " --enable-msccl-kernel Build with MSCCL kernels"
|
||||
echo " --dump-asm Disassemble code and dump assembly with inline code"
|
||||
echo " --enable-mscclpp Build with MSCCL++ support"
|
||||
echo " --enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines"
|
||||
@@ -81,6 +83,7 @@ function display_help()
|
||||
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
|
||||
echo " --generate-sym-kernels Generate symmetric memory kernels"
|
||||
echo " -q|--quiet-warnings Suppress majority of compiler warnings (not recommended)"
|
||||
echo " --rocshmem Build with rocSHMEM support"
|
||||
}
|
||||
|
||||
# #################################################
|
||||
@@ -90,7 +93,7 @@ function display_help()
|
||||
# check if we have a modern version of getopt that can handle whitespace and long parameters
|
||||
getopt -T
|
||||
if [[ "$?" -eq 4 ]]; then
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,verbose -- "$@")
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,disable-warp-speed,verbose,rocshmem -- "$@")
|
||||
else
|
||||
echo "Need a new version of getopt"
|
||||
exit 1
|
||||
@@ -137,7 +140,9 @@ while true; do
|
||||
--verbose) build_verbose=true; shift ;;
|
||||
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
|
||||
--generate-sym-kernels) generate_sym_kernels=true; shift ;;
|
||||
--disable-warp-speed) warp_speed_enabled=false; shift ;;
|
||||
-q | --quiet-warnings) quiet_warnings=true; shift ;;
|
||||
--rocshmem) build_rocshmem_support=true; shift ;;
|
||||
--) shift ; break ;;
|
||||
*) echo "Unexpected command line parameter received; aborting";
|
||||
exit 1
|
||||
@@ -316,12 +321,25 @@ if [[ "${npkit_enabled}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
|
||||
fi
|
||||
|
||||
# Enable WARP_SPEED only on MI350/MI300 platforms
|
||||
if [[ "${warp_speed_enabled}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DENABLE_WARP_SPEED=ON"
|
||||
fi
|
||||
|
||||
# Suppress Warnings
|
||||
if [[ "${quiet_warnings}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DQUIET_WARNINGS=ON"
|
||||
fi
|
||||
|
||||
|
||||
# Enable rocSHMEM support
|
||||
if [[ "${build_rocshmem_support}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=ON"
|
||||
cmake_common_options="${cmake_common_options} -DROCSHMEM_INSTALL_DIR=${ROCSHMEM_INSTALL_DIR}"
|
||||
else
|
||||
cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=OFF"
|
||||
fi
|
||||
|
||||
check_exit_code "$?"
|
||||
|
||||
# Enable ninja build for time tracing
|
||||
|
||||
@@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
|
||||
# You should define NVCC_GENCODE in your environment to the minimal set
|
||||
# of archs to reduce compile time.
|
||||
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
|
||||
-gencode=arch=compute_60,code=sm_60 \
|
||||
CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
|
||||
# SM35 is deprecated from CUDA12.0 onwards
|
||||
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
|
||||
endif
|
||||
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
|
||||
CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
|
||||
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 27
|
||||
NCCL_PATCH := 7
|
||||
NCCL_MINOR := 28
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -10,7 +10,7 @@ build : debian.build txz.build
|
||||
|
||||
BUILDDIR ?= $(abspath ../build)
|
||||
ABSBUILDDIR := $(abspath $(BUILDDIR))
|
||||
TARGETS := debian txz
|
||||
TARGETS := debian txz doc
|
||||
all: ${TARGETS:%=%.build}
|
||||
prep: ${TARGETS:%=%.prep}
|
||||
build: ${TARGETS:%=%.build}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
bin/ncclras /usr/bin
|
||||
include/nccl.h /usr/include
|
||||
include/* /usr/include
|
||||
lib/libnccl.so /usr/lib/${pkg:MultiArch}
|
||||
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
|
||||
|
||||
@@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li
|
||||
# devel
|
||||
install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
|
||||
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
|
||||
cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/
|
||||
install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
|
||||
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
|
||||
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
|
||||
|
||||
# static
|
||||
@@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%doc LICENSE.txt
|
||||
%defattr(-,root,root,-)
|
||||
%{_bindir}/ncclras
|
||||
%{_includedir}/nccl.h
|
||||
%{_includedir}/*
|
||||
%{_libdir}/libnccl.so
|
||||
|
||||
%files static
|
||||
|
||||
@@ -22,7 +22,7 @@ prep: $(TXZTARGETS)
|
||||
build: prep
|
||||
$(MAKE) -C ../../src clean
|
||||
@printf "Building source tar.xz package\n"
|
||||
(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
|
||||
(cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh)
|
||||
mkdir -p $(PKGDIR)
|
||||
mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
|
||||
|
||||
|
||||
@@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix}
|
||||
NCCL_BUILD=${pkg:Revision}
|
||||
|
||||
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
|
||||
if [ "${SRCTXZ_APITESTS}" = "1" ]; then
|
||||
NCCLNAME+="-apitest"
|
||||
fi
|
||||
|
||||
tar --exclude build \
|
||||
|
||||
INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk")
|
||||
|
||||
if [ "${SRCTXZ_APITESTS}" = "1" ]; then
|
||||
# Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES
|
||||
for entry in $(ls $NCCLDIR/test); do
|
||||
if [[ ! " ${INCLUDE_TEST_ENTRIES[@]} " =~ " $entry " ]]; then
|
||||
EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry"
|
||||
fi
|
||||
done
|
||||
else
|
||||
# Exclude the entire test directory
|
||||
EXCLUDE_TEST+=" --exclude test"
|
||||
fi
|
||||
|
||||
tar --exclude fortran \
|
||||
--exclude doc \
|
||||
--exclude plc \
|
||||
--exclude build \
|
||||
--exclude ".git*" \
|
||||
--exclude share \
|
||||
--exclude ompi \
|
||||
--exclude ext-net \
|
||||
--exclude pkg/srctxz \
|
||||
--exclude docker \
|
||||
$EXCLUDE_TEST \
|
||||
--transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
|
||||
|
||||
@@ -0,0 +1,180 @@
|
||||
# Core library sources that live directly in this directory.
set(LIBSRCFILES
  bootstrap.cc
  channel.cc
  ce_coll.cc
  collectives.cc
  debug.cc
  enqueue.cc
  group.cc
  init.cc
  init_nvtx.cc
  proxy.cc
  transport.cc
  mnnvl.cc
  allocator.cc
  sym_kernels.cc
  dev_runtime.cc
)

# A statically linked cudart needs the compatibility shim.
if(CUDARTLIB STREQUAL "cudart_static")
  list(APPEND LIBSRCFILES enhcompat.cc)
endif()

# Emit the pkg-config description into the build tree.
configure_file(
  ${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in
  ${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc
  @ONLY
)

# Descend into each component directory, in the same order as before.
foreach(nccl_component IN ITEMS
    transport
    misc
    register
    graph
    plugin
    device
    nccl_device
    ras
    scheduler)
  add_subdirectory(${nccl_component})
endforeach()

add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)

# Fold the per-component source lists into the library source list.
list(APPEND LIBSRCFILES
  ${TRANSPORT_SOURCES}
  ${MISC_SOURCES}
  ${REGISTER_SOURCES}
  ${GRAPH_SOURCES}
  ${PLUGIN_SOURCES}
  ${RAS_SOURCES}
  ${SYM_SOURCES}
  ${SCHEDULER_SOURCES}
)
|
||||
|
||||
###################### Create a shared NCCL library ############################
|
||||
# Shared library target built from the collected source list.
add_library(nccl SHARED ${LIBSRCFILES})

# Header search paths; PUBLIC so they propagate to targets linking nccl.
target_include_directories(nccl
  PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}/device
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
    ${CUDAToolkit_INCLUDE_DIRS}
    ${CUDAToolkit_INCLUDE_DIRS}/cccl
)
|
||||
|
||||
# Generate include/nccl.h in the build tree by substituting the version
# placeholders in nccl.h.in. DEPENDS ties the rule to the template so that
# editing nccl.h.in triggers regeneration of the header.
add_custom_command(
  OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h
  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include
  COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g"
              -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g"
              -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g"
              -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g"
              -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g"
              ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in
  BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h
)

# Named target so other targets can order themselves after header generation.
add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h)

# The shared library must not start compiling before nccl.h exists.
add_dependencies(nccl nccl_header)
|
||||
|
||||
# All properties of the shared library in one place: versioning and file
# name, CUDA build settings, and where the built library is written.
set_target_properties(nccl PROPERTIES
  # Version and output name
  VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
  SOVERSION ${NCCL_MAJOR}
  OUTPUT_NAME "nccl"
  PREFIX "lib"
  # CUDA specific flags
  CUDA_SEPARABLE_COMPILATION ON
  CUDA_RESOLVE_DEVICE_SYMBOLS ON
  CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
  POSITION_INDEPENDENT_CODE ON
  # Output directory for the shared library
  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)

# Link dependencies (kept PRIVATE; not propagated to consumers).
target_link_libraries(nccl
  PRIVATE
    nccl_device
    pthread
    rt
    dl
    ${CUDAToolkit_LIBRARIES}
    ${EXTRA_LIBS}
)
|
||||
|
||||
###################### Create a ras binary executable ############################
|
||||
# Sources of the stand-alone RAS client executable.
set(RAS_BINSRCFILES ras/client.cc)

add_executable(ncclras)
target_sources(ncclras PRIVATE ${RAS_BINSRCFILES})

# The client includes the generated nccl.h, so it has to wait for it.
add_dependencies(ncclras nccl_header)

target_include_directories(ncclras
  PUBLIC
    ${CMAKE_BINARY_DIR}/include
    ${CUDAToolkit_INCLUDE_DIRS}
)

target_link_libraries(ncclras
  PRIVATE
    pthread
    rt
    dl
)

# Place the ncclras executable under <build>/bin.
set_target_properties(ncclras PROPERTIES
  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
)
|
||||
|
||||
###################### Create a static NCCL library ############################
|
||||
# Static archive built from the same source list as the shared library.
add_library(nccl_static STATIC)
target_sources(nccl_static PRIVATE ${LIBSRCFILES})

# The generated nccl.h must exist before any source is compiled.
add_dependencies(nccl_static nccl_header)

# Header search paths
target_include_directories(nccl_static
  PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}/device
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
    ${CUDAToolkit_INCLUDE_DIRS}
    ${CUDAToolkit_INCLUDE_DIRS}/cccl
)

# CUDA build settings and archive location, set in a single call.
set_target_properties(nccl_static PROPERTIES
  CUDA_SEPARABLE_COMPILATION ON
  CUDA_RESOLVE_DEVICE_SYMBOLS ON
  CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
  POSITION_INDEPENDENT_CODE ON
  ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)

# Link dependencies
target_link_libraries(nccl_static
  PRIVATE
    nccl_device
    pthread
    rt
    dl
    ${CUDAToolkit_LIBRARIES}
    ${EXTRA_LIBS}
)
|
||||
@@ -7,10 +7,12 @@ include ../makefiles/common.mk
|
||||
include ../makefiles/version.mk
|
||||
|
||||
##### src files
|
||||
INCEXPORTS := nccl.h
|
||||
INCEXPORTS := nccl.h nccl_device.h \
|
||||
$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h))
|
||||
|
||||
LIBSRCFILES := \
|
||||
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
|
||||
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
|
||||
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \
|
||||
$(wildcard graph/*.cc) \
|
||||
$(wildcard misc/*.cc) \
|
||||
$(wildcard transport/*.cc) \
|
||||
@@ -19,6 +21,8 @@ LIBSRCFILES := \
|
||||
$(wildcard plugin/net/*.cc) \
|
||||
$(wildcard plugin/tuner/*.cc) \
|
||||
$(wildcard plugin/profiler/*.cc) \
|
||||
$(wildcard nccl_device/*.cc) \
|
||||
$(wildcard scheduler/*.cc) \
|
||||
$(filter-out ras/client.cc,$(wildcard ras/*.cc))
|
||||
BINSRCFILES := ras/client.cc
|
||||
|
||||
@@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
|
||||
mkdir -p $(INCDIR)
|
||||
install -m 644 $< $@
|
||||
|
||||
$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(INCDIR)/nccl_device
|
||||
install -m 644 $< $@
|
||||
|
||||
$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(INCDIR)/nccl_device/impl
|
||||
install -m 644 $< $@
|
||||
|
||||
$(PKGDIR)/%.pc : %.pc
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(PKGDIR)
|
||||
@@ -149,7 +163,7 @@ install : build
|
||||
mkdir -p $(PREFIX)/bin
|
||||
cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
|
||||
cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
|
||||
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
|
||||
cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/
|
||||
cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/
|
||||
|
||||
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
|
||||
|
||||
+332
-62
@@ -7,10 +7,11 @@
|
||||
#include "comm.h"
|
||||
#include "transport.h"
|
||||
#include "group.h"
|
||||
#include "nvtx.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
||||
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCL_NVTX3_FUNC_RANGE;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
#if ROCM_VERSION >= 70000
|
||||
@@ -99,7 +100,7 @@ fail:
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
|
||||
ncclResult_t ncclMemFree_impl(void *ptr) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
NCCL_NVTX3_FUNC_RANGE;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int saveDevice;
|
||||
|
||||
@@ -129,70 +130,339 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// This is a collective function and should be called by all ranks in the communicator
|
||||
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
void* regSymAddr = NULL;
|
||||
size_t allocSize = size;
|
||||
size_t granularity;
|
||||
CUdevice cuDev;
|
||||
CUmemAllocationProp memprop = {};
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
int bit = 0, cnt = 0;
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// ncclSpace:
|
||||
//
|
||||
// This datastructure "cuts" the line of non-negative integers into segments
|
||||
// which alternate between "full" (allocated) and "empty" (not allocated). The
|
||||
// cuts are sorted ascending. The segment after the last cut must be empty
|
||||
// (the unallocated frontier). Knowing this we can deduce whether the segment
|
||||
// ending at cut[i] is full or empty with this formula:
|
||||
// isFull(i) = (i%2 != ncuts%2)
|
||||
|
||||
// aligment must be power of 2 as an input
|
||||
while (bit < sizeof(size_t) * 8) {
|
||||
if (alignment & (1L << bit)) cnt++;
|
||||
if (cnt == 2) {
|
||||
WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment);
|
||||
goto fail;
|
||||
// Put `*a` into its initial state by zero-filling the whole structure.
// No memory is allocated here.
void ncclSpaceConstruct(struct ncclSpace* a) {
|
||||
memset(a, 0, sizeof(*a));
|
||||
}
|
||||
|
||||
// Release the cut array owned by `a`. The structure itself is not freed and
// its fields are not reset, so `a` must be constructed again before reuse.
void ncclSpaceDestruct(struct ncclSpace* a) {
|
||||
free(a->cuts);
|
||||
}
|
||||
|
||||
static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) {
|
||||
// Insert space for two cuts in `a->cuts[]` before `index`.
|
||||
if (a->count + 2 > a->capacity) {
|
||||
a->capacity *= 2;
|
||||
if (a->capacity == 0) a->capacity = 16;
|
||||
int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t));
|
||||
for (int i=0; i < index; i++) cuts1[i] = a->cuts[i];
|
||||
for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i];
|
||||
free(a->cuts);
|
||||
a->cuts = cuts1;
|
||||
} else {
|
||||
for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i];
|
||||
}
|
||||
a->cuts[index+0] = lo;
|
||||
a->cuts[index+1] = hi;
|
||||
a->count += 2;
|
||||
|
||||
// Filter pairs of adjacent repeated values from cuts[]. Since these mark
|
||||
// boundaries where segments transition between full<->empty, dropping such a
|
||||
// pair fuses two adjacent segments together. Examples:
|
||||
// [1,2,3,3,4] -> [1,2,4]
|
||||
// [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because it's a full<->empty transition
|
||||
// [1,2,3,3,3,3,4] -> [1,2,4]
|
||||
// Leading zeros don't have to be in pairs, they are always dropped:
|
||||
// [0,1,2] -> [1,2]
|
||||
// [0,0,1,2] -> [1,2]
|
||||
int r = index, w = index; // Read and write cursors.
|
||||
int64_t prev = r==0 ? 0 : a->cuts[r-1];
|
||||
while (r < a->count) {
|
||||
int64_t cur = a->cuts[r++];
|
||||
a->cuts[w++] = cur;
|
||||
if (prev == cur) { // Repeated value is an empty segment which can be deleted.
|
||||
// Erase last two cuts or just one if we're at the start.
|
||||
w -= w==1 ? 1 : 2;
|
||||
// Zeros can only occur at the beginning (due to being sorted). We want to
|
||||
// drop any number of zeros, but only even numbers of other repeated values.
|
||||
// So set to zero here, which will make prev=0, thus if next value is zero
|
||||
// it will be dropped but if it's not zero then it will need to begin a new
|
||||
// pair to be dropped.
|
||||
cur = 0;
|
||||
}
|
||||
bit++;
|
||||
prev = cur;
|
||||
}
|
||||
// temporarily align the alignment to NCCL_REC_PAGE_SIZE
|
||||
ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
|
||||
|
||||
CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
|
||||
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
memprop.requestedHandleType = ncclCuMemHandleType;
|
||||
memprop.location.id = cuDev;
|
||||
CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
ALIGN_SIZE(allocSize, granularity);
|
||||
|
||||
CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
|
||||
ALIGN_SIZE(comm->symAllocHead, alignment);
|
||||
NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, ®SymAddr), ret, fail);
|
||||
NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
|
||||
comm->symAllocHead += allocSize;
|
||||
*symPtr = regSymAddr;
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
*symPtr = NULL;
|
||||
goto exit;
|
||||
a->count = w;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
|
||||
CUmemGenericAllocationHandle handle;
|
||||
size_t size = 0;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int saveDev = comm->cudaDev;
|
||||
CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail);
|
||||
if (ncclCuMemEnable()) {
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
|
||||
CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail);
|
||||
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
|
||||
CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail);
|
||||
NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail);
|
||||
NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail);
|
||||
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
|
||||
ncclResult_t ncclSpaceAlloc(
|
||||
struct ncclSpace* a, int64_t limit, int64_t size, int align,
|
||||
int64_t* outOffset
|
||||
) {
|
||||
// When allocating we try to locate the first empty segment which can hold
|
||||
// the allocation and move its lower cut upward.
|
||||
int i = a->count%2; // First empty segment ends at cuts[i]
|
||||
size_t off;
|
||||
while (i <= a->count) {
|
||||
size_t lo = i == 0 ? 0 : a->cuts[i-1];
|
||||
size_t hi = i == a->count ? limit : a->cuts[i];
|
||||
off = alignUp(lo, align);
|
||||
if (off + size <= hi) {
|
||||
*outOffset = off;
|
||||
if (i == 0 || off + size == hi) { // Slow path required.
|
||||
insertSegment(a, i, off, off+size);
|
||||
} else { // We can just append to the end of a full segment.
|
||||
a->cuts[i-1] = off + size;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
i += 2; // Next empty segment
|
||||
}
|
||||
exit:
|
||||
CUDACHECK(cudaSetDevice(saveDev));
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Return the range [offset, offset+size) to the free space of `a`.
//
// The range must lie entirely inside a single full (allocated) segment.
// Freeing from either end of a segment just moves one cut; freeing from the
// middle, or freeing a whole segment, goes through insertSegment().
//
// Returns ncclSuccess on success. Returns ncclInternalError (after a WARN)
// when `size` is negative, when no allocation covers `offset`, or when the
// range extends past the allocation that contains `offset`.
ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) {
  // A negative size would pass the bounds check below and insert an
  // out-of-order pair of cuts, corrupting the sorted cut list.
  if (size < 0) {
    WARN("Invalid negative size=0x%lx given for offset=0x%lx", (long)size, (long)offset);
    return ncclInternalError;
  }

  if (a->count == 0 || a->cuts[a->count-1] <= offset) {
    WARN("No allocation found at offset=0x%lx", (long)offset);
    return ncclInternalError;
  }

  // This could be binary search, but since allocate is linear there's no point.
  // The guard above ensures the last cut is > offset, and the last cut always
  // ends a full segment, so this scan stops inside the array.
  int i = 1 - a->count%2; // First full segment ends at cuts[i]
  while (a->cuts[i] <= offset) i += 2;

  int64_t lo = i==0 ? 0 : a->cuts[i-1];
  int64_t hi = a->cuts[i];

  // `hi - offset` is positive here (offset < hi), so comparing against it
  // cannot overflow the way `offset + size` could for a huge size.
  if (offset < lo || size > hi - offset) {
    WARN("Given size=0x%lx extends beyond allocation.", (long)size);
    return ncclInternalError;
  }

  // First try the two fast cases which just shrink a segment from one side.
  if (i != 0 && lo == offset && offset + size != hi) {
    a->cuts[i-1] = offset + size; // Bring bottom up.
  } else if (lo != offset && offset + size == hi) {
    a->cuts[i] = offset; // Bring top down.
  } else { // Slow path.
    insertSegment(a, i, offset, offset+size);
  }
  return ncclSuccess;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// ncclShadowPool:
|
||||
|
||||
// One device allocation subdivided into at most 64 equally sized object slots.
struct ncclShadowPage {
  // Link in the pool's list of pages that still have a free slot.
  struct ncclShadowPage* next;
  // Size in bytes of every slot in this page.
  int objSize;
  // Bit i set => slot i is available.
  uint64_t freeMask;
  // Base device address of the page; slot i lives at devObjs + i*objSize.
  void* devObjs;
};
|
||||
// Hash-table entry pairing a device object with its host-side shadow copy.
struct ncclShadowObject {
  // Next entry in the same hash bucket.
  struct ncclShadowObject* next;
  // Device address; this is the key used for hashing and lookup.
  void* devObj;
  // Host-side shadow storage associated with devObj.
  void* hostObj;
  // Owning page, or null if not allocated in a page but directly in the CUDA mempool.
  struct ncclShadowPage* page;
};
|
||||
|
||||
// Put `pool` into its empty state: no pages, no hash table, no objects.
// hbits == 0 is what ncclShadowPoolAlloc tests to decide whether the pool
// still needs its first-use setup.
void ncclShadowPoolConstruct(struct ncclShadowPool* pool) {
  pool->pages = nullptr;
  pool->table = nullptr;
  pool->count = 0;
  pool->hbits = 0;
}
|
||||
|
||||
// Release everything owned by `pool`: the host bookkeeping objects, the hash
// table, all pages and the CUDA memory pool. A pool that was never used
// (hbits == 0) has nothing to release.
//
// Device frees are queued on a temporary non-blocking stream which is
// synchronized and destroyed before the memory pool itself is destroyed.
ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) {
  if (pool->hbits == 0) return ncclSuccess;

  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  if (pool->count != 0) {
    for (int bucket = 0; bucket < 1<<pool->hbits; bucket++) {
      struct ncclShadowObject* cur = pool->table[bucket];
      while (cur != nullptr) {
        struct ncclShadowObject* following = cur->next;
        struct ncclShadowPage* owner = cur->page;
        if (owner == nullptr) {
          // Object was allocated directly from the mempool.
          cudaFreeAsync(cur->devObj, stream);
        } else if (owner->freeMask == 0) {
          // Full pages are not on the page list. Put this one back so the
          // loop below frees it; the non-zero mask keeps it from being
          // linked a second time by another object of the same page.
          owner->freeMask = 1;
          owner->next = pool->pages;
          pool->pages = owner;
        }
        free(cur);
        cur = following;
      }
    }
  }
  free(pool->table);

  // Free every page's device memory and host descriptor.
  while (struct ncclShadowPage* pg = pool->pages) {
    cudaFreeAsync(pg->devObjs, stream);
    pool->pages = pg->next;
    free(pg);
  }

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaMemPoolDestroy(pool->memPool);
  return ncclSuccess;
}
|
||||
|
||||
// Map a device pointer to a bucket index in [0, 2^hbits).
// The upper and lower halves of the address are mixed, the result is
// multiplied by a 64-bit constant, and the top `hbits` bits are kept.
static int hashBucket(int hbits, void* devObj) {
  uintptr_t key = reinterpret_cast<uintptr_t>(devObj);
  key = (key ^ (key>>32)) * 0x9e3779b97f4a7c13;
  return (uint64_t)key >> (64-hbits);
}
|
||||
|
||||
static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) {
|
||||
int b = hashBucket(pool->hbits, obj->devObj);
|
||||
obj->next = pool->table[b];
|
||||
pool->table[b] = obj;
|
||||
}
|
||||
|
||||
ncclResult_t ncclShadowPoolAlloc(
|
||||
struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj,
|
||||
cudaStream_t stream
|
||||
) {
|
||||
if (size == 0) {
|
||||
if (outDevObj) *outDevObj = nullptr;
|
||||
if (outHostObj) *outHostObj = nullptr;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int hbits = pool->hbits;
|
||||
if (hbits == 0) {
|
||||
cudaMemPoolProps props = {};
|
||||
props.allocType = cudaMemAllocationTypePinned;
|
||||
props.handleTypes = cudaMemHandleTypeNone;
|
||||
props.location.type = cudaMemLocationTypeDevice;
|
||||
cudaGetDevice(&props.location.id);
|
||||
CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props));
|
||||
|
||||
pool->hbits = hbits = 4;
|
||||
pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<hbits);
|
||||
for (int i=0; i < 1<<hbits; i++) pool->table[i] = nullptr;
|
||||
}
|
||||
|
||||
// Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio.
|
||||
if (pool->count+1 > 2<<hbits) {
|
||||
struct ncclShadowObject** table0 = pool->table;
|
||||
struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1));
|
||||
pool->table = table1;
|
||||
pool->hbits = hbits+1;
|
||||
for (int i1=0; i1 < 2<<hbits; i1++) table1[i1] = nullptr;
|
||||
for (int i0=0; i0 < 1<<hbits; i0++) {
|
||||
struct ncclShadowObject* obj = table0[i0];
|
||||
while (obj) {
|
||||
struct ncclShadowObject* next = obj->next;
|
||||
hashInsert(pool, obj);
|
||||
obj = next;
|
||||
}
|
||||
}
|
||||
hbits += 1; // match pool->hbits
|
||||
free(table0);
|
||||
}
|
||||
|
||||
struct ncclShadowPage* page;
|
||||
void *devObj;
|
||||
if ((64<<10)/size >= 3) {
|
||||
int shift = std::max<int>(0, (int)log2Down(size) + 1 - 4);
|
||||
int pageObjSize = ((size + (1<<shift)-1)>>shift)<<shift;
|
||||
struct ncclShadowPage** pagePtr = &pool->pages;
|
||||
while (true) {
|
||||
page = *pagePtr;
|
||||
if (page == nullptr) {
|
||||
size_t pageSize = std::min<size_t>(64<<10, 64*pageObjSize);
|
||||
page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage));
|
||||
page->objSize = pageObjSize;
|
||||
page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize);
|
||||
page->next = pool->pages;
|
||||
pool->pages = page;
|
||||
CUDACHECK(cudaMallocFromPoolAsync(&page->devObjs, pageSize, pool->memPool, stream));
|
||||
CUDACHECK(cudaMemsetAsync(page->devObjs, 0, pageSize, stream));
|
||||
// fall through...
|
||||
}
|
||||
if (page->objSize == pageObjSize) {
|
||||
int slot = popFirstOneBit(&page->freeMask);
|
||||
devObj = (char*)page->devObjs + slot*pageObjSize;
|
||||
if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list.
|
||||
break;
|
||||
}
|
||||
pagePtr = &page->next;
|
||||
}
|
||||
} else {
|
||||
page = nullptr;
|
||||
CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream));
|
||||
CUDACHECK(cudaMemsetAsync(devObj, 0, size, stream));
|
||||
}
|
||||
|
||||
struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc(
|
||||
sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size
|
||||
);
|
||||
obj->page = page;
|
||||
obj->devObj = devObj;
|
||||
obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t));
|
||||
memset(obj->hostObj, 0, size);
|
||||
hashInsert(pool, obj);
|
||||
pool->count += 1;
|
||||
if (outDevObj) *outDevObj = devObj;
|
||||
if (outHostObj) *outHostObj = obj->hostObj;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Remove the object whose device address is `devObj` from the pool and
// release its resources.
//
// A paged object gives its slot back to the owning page; a directly
// allocated object is freed on `stream`. The host-side entry (which also
// holds the host shadow storage) is freed in both cases.
//
// Returns ncclSuccess (also for a null devObj), or ncclInternalError when
// `devObj` is not known to the pool, which includes a pool that has never
// allocated anything.
ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) {
  if (devObj == nullptr) return ncclSuccess;

  // A pool with hbits == 0 has no hash table yet; hashing would shift by 64
  // bits and the lookup would dereference a null table.
  if (pool->hbits == 0) {
    WARN("Device object does not exist in shadow pool.");
    return ncclInternalError;
  }

  int b = hashBucket(pool->hbits, devObj);
  // Walk the bucket through a pointer-to-link so the match can be unlinked in place.
  struct ncclShadowObject** pobj = &pool->table[b];
  while (true) {
    if (*pobj == nullptr) {
      WARN("Device object does not exist in shadow pool.");
      return ncclInternalError;
    }
    if ((*pobj)->devObj == devObj) break;
    pobj = &(*pobj)->next;
  }
  struct ncclShadowObject* obj = *pobj;
  *pobj = obj->next;
  if (obj->page != nullptr) {
    // Full pages are taken off the page list by ncclShadowPoolAlloc; the page
    // is about to regain a free slot, so link it back in.
    if (obj->page->freeMask == 0) {
      obj->page->next = pool->pages;
      pool->pages = obj->page;
    }
    int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize;
    obj->page->freeMask |= uint64_t(1)<<slot;
  } else {
    CUDACHECK(cudaFreeAsync(devObj, stream));
  }
  free(obj);
  pool->count -= 1;
  return ncclSuccess;
}
|
||||
|
||||
// Look up the host shadow that belongs to device object `devObj`.
//
// A null `devObj` maps to a null host pointer. Returns ncclInternalError
// (after logging a warning) when the device object is not registered in the
// pool, including the case of a pool that has never allocated anything;
// *hostObj is not written in that case.
ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) {
  if (devObj == nullptr) {
    *hostObj = nullptr;
    return ncclSuccess;
  }

  // A pool with hbits == 0 has no hash table yet; hashing would shift by 64
  // bits and the lookup would dereference a null table.
  if (pool->hbits == 0) {
    WARN("Device object does not exist in shadow pool.");
    return ncclInternalError;
  }

  int b = hashBucket(pool->hbits, devObj);
  struct ncclShadowObject* obj = pool->table[b];
  // Linear scan of the bucket chain for an exact device-address match.
  while (true) {
    if (obj == nullptr) {
      WARN("Device object does not exist in shadow pool.");
      return ncclInternalError;
    }
    if (obj->devObj == devObj) break;
    obj = obj->next;
  }
  *hostObj = obj->hostObj;
  return ncclSuccess;
}
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "signals.h" // [RCCL]
|
||||
#include "param.h"
|
||||
#include "ras.h"
|
||||
#include <mutex>
|
||||
|
||||
#define BOOTSTRAP_N_CHECK_ABORT 10000
|
||||
#define BOOTSTRAP_TAG_CONNECT (0x1 << 31)
|
||||
@@ -86,13 +87,13 @@ struct bootstrapRootArgs {
|
||||
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
|
||||
static union ncclSocketAddress bootstrapNetIfAddr;
|
||||
static int bootstrapNetInitDone = 0;
|
||||
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static std::mutex bootstrapNetMutex;
|
||||
|
||||
NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0);
|
||||
|
||||
ncclResult_t bootstrapNetInit() {
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
std::lock_guard<std::mutex> lock(bootstrapNetMutex);
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
const char* env = ncclGetEnv("NCCL_COMM_ID");
|
||||
int nIfs = 0;
|
||||
@@ -100,21 +101,18 @@ ncclResult_t bootstrapNetInit() {
|
||||
union ncclSocketAddress remoteAddr;
|
||||
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
|
||||
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
|
||||
&nIfs));
|
||||
if (nIfs <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
return ncclSystemError;
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs));
|
||||
if (nIfs <= 0) {
|
||||
WARN("Bootstrap : no socket interface found");
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
@@ -124,7 +122,6 @@ ncclResult_t bootstrapNetInit() {
|
||||
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
|
||||
bootstrapNetInitDone = 1;
|
||||
}
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -486,7 +483,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
|
||||
static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
|
||||
static int devOOB = -1;
|
||||
if (devOOB < 0) {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
std::lock_guard<std::mutex> lock(bootstrapNetMutex);
|
||||
if (devOOB < 0) {
|
||||
const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
|
||||
if (userIfEnv && strlen(userIfEnv) > 0) {
|
||||
@@ -517,7 +514,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
|
||||
WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
|
||||
else
|
||||
WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
} else {
|
||||
@@ -530,13 +526,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
|
||||
bool hasProp = res == ncclSuccess;
|
||||
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
|
||||
}
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
}
|
||||
*dev = devOOB;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
|
||||
static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
|
||||
void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
|
||||
void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
|
||||
|
||||
@@ -544,7 +539,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
|
||||
do {
|
||||
NCCLCHECK(checkAbort(abortFlag, &abortCounter));
|
||||
if (!*sendComm)
|
||||
NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
|
||||
NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle));
|
||||
if (!*recvComm)
|
||||
NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
|
||||
} while (!*sendComm || !*recvComm);
|
||||
@@ -660,7 +655,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
|
||||
if (ncclParamBootstrapNetEnable()) {
|
||||
// Create net interface for other ranks to contact me (all gather)
|
||||
NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
|
||||
NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
|
||||
NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
|
||||
memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
|
||||
} else {
|
||||
// create socket for ring neighbor to contact me
|
||||
@@ -714,7 +709,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
|
||||
|
||||
// accept and connect the ring network
|
||||
if (ncclParamBootstrapNetEnable()) {
|
||||
NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
|
||||
NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
|
||||
&STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
|
||||
&STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
|
||||
} else {
|
||||
@@ -807,7 +802,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
|
||||
// create a handle for the others to reach out to me
|
||||
if (ncclParamBootstrapNetEnable()) {
|
||||
NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
|
||||
NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
|
||||
NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
|
||||
memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
|
||||
} else {
|
||||
// create socket for ring neighbor to contact me
|
||||
@@ -826,7 +821,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
|
||||
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
|
||||
if (ncclParamBootstrapNetEnable()) {
|
||||
NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
|
||||
NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
|
||||
&STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
|
||||
&STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
|
||||
ret, fail);
|
||||
|
||||
@@ -0,0 +1,615 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "register_inline.h"
|
||||
#include <cuda.h>
|
||||
#include "rocmwrap.h"
|
||||
#include "ce_coll.h"
|
||||
#include "alloc.h"
|
||||
|
||||
// Flag value used for the sync handshake instead of the running sequence
// number while a CUDA graph is being captured.
static const uint32_t GRAPH_SYNC_VALUE = 1;

// Intra-batch synchronization defaults, intended to improve CE collective
// performance at large scale.
// A sync is inserted after every this many copy operations...
static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8;
// ...once the total message size reaches this many bytes (512 MiB).
static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = uint64_t(512) << 20;
|
||||
|
||||
// Set up the copy-engine (CE) collective state of `comm`.
//
// Allocates one buffer holding two per-rank uint32_t flag arrays ("ready"
// followed by "complete", each padded to 16 bytes), registers it as a
// symmetric window and records the resulting pointers, offsets and default
// intra-batch sync settings in comm->ceColl.
ncclResult_t ncclCeInit(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;

  // Size of one flag array; the buffer contains two of them back to back.
  const auto flagArrayBytes = alignUp(comm->nRanks*sizeof(uint32_t), 16);
  size_t ceDevBaseSize = flagArrayBytes * 2;
  uint8_t* ceDevBase;
  ncclWindow_vidmem* ceWinDev;
  ncclWindow_vidmem* ceWinDevHost;

  // The symmetric memory runtime must be up before a window can be registered.
  NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail);
  NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail);
  NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail);
  // Translate the device-side window handle into its host shadow.
  NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail);

  // The ncclDevrWindow is reached through the shadow's winHost field.
  comm->ceColl.ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost;

  // "ready" flags sit at the start of the window, "complete" flags right after.
  comm->ceColl.baseUCSymReadyOffset = 0;
  comm->ceColl.baseUCSymComplOffset = flagArrayBytes;
  comm->ceColl.baseUCSymReadyPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymReadyOffset;
  comm->ceColl.baseUCSymComplPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymComplOffset;

  comm->ceColl.ceSeqNum = 0;
  comm->ceColl.useCompletePtr = false;
  comm->ceColl.intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ;
  comm->ceColl.intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD;
  INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, comm->ceColl.baseUCSymReadyPtr, comm->ceColl.baseUCSymComplPtr, comm->ceColl.ceSeqNum);

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Tear down the CE collective state of `comm`: drop any queued CE init tasks,
// deregister the sync window, free its buffer and clear the cached pointers.
ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;

  // Discard init tasks that are still queued.
  while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) {
    free(ncclIntruQueueDequeue(&comm->ceInitTaskQueue));
  }

  // Nothing more to do unless the sync buffer was set up.
  if (comm->ceColl.baseUCSymReadyPtr != NULL) {
    struct ncclDevrWindow* syncWin = comm->ceColl.ceSyncWin;
    if (syncWin && syncWin->vidmem) {
      // Deregister the window before releasing the memory behind it.
      NCCLCHECKGOTO(ncclCommWindowDeregister(comm, syncWin->vidmem), ret, fail);
      NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail);
    }
    comm->ceColl.ceSyncWin = NULL;
    comm->ceColl.baseUCSymComplPtr = NULL;
    comm->ceColl.baseUCSymReadyPtr = NULL;
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Report whether collective `coll` has a copy-engine implementation.
// True only when the driver version can be queried, is at least 12050
// (CUDA 12.5), and `coll` is AllGather, AlltoAll, Scatter or Gather.
// `red` and `ty` do not influence the answer.
bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
  int driverVersion;
  if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false;

  // CE is supported in CUDA 12.5 and later
  if (driverVersion < 12050) return false;

  return coll == ncclFuncAllGather
      || coll == ncclFuncAlltoAll
      || coll == ncclFuncScatter
      || coll == ncclFuncGather;
}
|
||||
|
||||
// Prepare one synchronization round using the multicast address of the sync
// window.
//
// The local rank's ready/complete flag is written through the multicast
// pointer with an async host-to-device copy on `stream`. One wait entry per
// remote rank is then appended to `batchParams`, starting at *opIdx (which is
// advanced past the appended entries).
//
//   isComplete  - selects the "complete" flag array instead of the "ready" one.
//   batchParams - caller-provided array with room for the appended entries.
//   opIdx       - in/out index of the next free entry in `batchParams`.
//
// The sequence number of `comm` is incremented on every call. While a graph is
// being captured the fixed GRAPH_SYNC_VALUE is used instead of it.
ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;

  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;

  // Source pointer is either the constant graph sync value or the sequence number
  void* srcPtr = capturing ? (void*)&GRAPH_SYNC_VALUE : (void*)&currentSeq;
  // Wait value is either the constant graph sync value or the sequence number
  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;

  // Use multi-cast address as destination pointer
  void* mcDstPtr;
  void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank];
  size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
  NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail);

  // Write our own ready/complete flag to the multi-cast address
  CUDACHECKGOTO(cudaMemcpyAsync(
    mcDstPtr,
    srcPtr,
    sizeof(uint32_t),
    cudaMemcpyHostToDevice,
    stream), ret, fail);

  // Add local wait operations for every other rank
  // NOTE(review): the `.operation` assignment is commented out, so the entry
  // keeps whatever the zero-initialized value means - confirm that this is the
  // intended op type for hipStreamBatchMemOp.
  for (int r = 0; r < comm->nRanks; ++r) {
    if (r == comm->rank) continue;
    batchParams[*opIdx] = {};
    // batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
    batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
    batchParams[*opIdx].waitValue.value = waitValue;
    batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
    (*opIdx)++;
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Prepare one synchronization round using per-peer (unicast) addresses.
//
// Appends to `batchParams`, starting at *opIdx:
//   1. one write entry per remote rank, storing this rank's flag value at the
//      address obtained for that peer from the sync window;
//   2. one wait entry per remote rank, waiting for that rank's flag in the
//      local flag array.
// *opIdx is advanced past everything appended. `isComplete` selects the
// "complete" flag array instead of the "ready" one. The sequence number of
// `comm` is incremented on every call; during graph capture the fixed
// GRAPH_SYNC_VALUE is used instead of it.
ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
                            hipStreamBatchMemOpParams* batchParams,
                            size_t* opIdx) {
  ncclResult_t ret = ncclSuccess;

  // Flag array used for this round.
  uint32_t* const flags = isComplete ? (uint32_t*)comm->ceColl.baseUCSymComplPtr
                                     : (uint32_t*)comm->ceColl.baseUCSymReadyPtr;

  const bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  const uint32_t currentSeq = ++comm->ceColl.ceSeqNum;
  const uint32_t flagValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;

  // Write our own ready/complete flag to remote ranks
  for (int peer = 0; peer < comm->nRanks; ++peer) {
    if (peer == comm->rank) continue;
    // Window offset of this rank's own flag slot.
    void* ownSlot = (void*)&flags[comm->rank];
    size_t ownSlotOffset = (uint8_t*)ownSlot - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
    void* peerDstPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, ownSlotOffset, peer, &peerDstPtr), ret, fail);
    hipStreamBatchMemOpParams& op = batchParams[*opIdx];
    op = {};
    // op.writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
    op.writeValue.address = (CUdeviceptr)peerDstPtr;
    op.writeValue.value = flagValue;
    // op.writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
    ++(*opIdx);
  }

  // Add local wait operations for every other rank
  for (int peer = 0; peer < comm->nRanks; ++peer) {
    if (peer == comm->rank) continue;
    hipStreamBatchMemOpParams& op = batchParams[*opIdx];
    op = {};
    // op.waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
    op.waitValue.address = (CUdeviceptr)((void*)&flags[peer]);
    op.waitValue.value = flagValue;
    op.waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
    ++(*opIdx);
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
|
||||
// Queue one cross-rank synchronization round on `stream`.
//
// The round is built by ncclPrepMCSync when comm->nvlsSupport is set and by
// ncclPrepUCSync otherwise. During graph capture a write of 0 to every rank's
// flag is appended so the flags are reset. All entries are submitted as one
// batch. After a successful submit the ready/complete selector is flipped so
// the next round uses the other flag array.
ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Flag array used by this round (the selector is only flipped at the end).
  uint32_t* const roundFlags = comm->ceColl.useCompletePtr
      ? (uint32_t*)comm->ceColl.baseUCSymComplPtr
      : (uint32_t*)comm->ceColl.baseUCSymReadyPtr;

  // Room for every entry this round can produce.
  size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks;
  size_t opIdx = 0;
  hipStreamBatchMemOpParams* batchParams = nullptr;
  NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail);

  // Fill in the signal/wait entries.
  if (!comm->nvlsSupport) {
    NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx), ret, fail);
  } else {
    NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx, stream), ret, fail);
  }

  // For CUDA graph capture, add reset operation
  if (ncclCudaGraphValid(comm->planner.capturingGraph)) {
    for (int rank = 0; rank < comm->nRanks; rank++, opIdx++) {
      batchParams[opIdx] = {};
      // batchParams[opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
      batchParams[opIdx].writeValue.address = (CUdeviceptr)((void*)&roundFlags[rank]);
      batchParams[opIdx].writeValue.value = 0;
      // batchParams[opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
    }
  }

  // Execute all memory operations in a single batch
  CUCHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);

  // Toggle the flag for next call
  comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr;

exit:
  free(batchParams); // free(nullptr) is a no-op
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Reset `params` and allocate its per-operation arrays with `nRanks` entries
// each (srcs, dsts, sizes, plus attrs/attrIdxs when built with
// CUDART_VERSION >= 12080).
//
// All fields are cleared before anything is allocated, so after a failure the
// structure can still be handed to ncclCeFreeBatchOpsParams.
ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) {
  ncclResult_t ret = ncclSuccess;

  params->srcs = nullptr;
  params->dsts = nullptr;
  params->sizes = nullptr;
  params->numOps = 0;
  params->intraBatchSync = false;
#if CUDART_VERSION >= 12080
  params->attrs = nullptr;
  params->attrIdxs = nullptr;
  params->numAttrs = 0;
#endif

  NCCLCHECKGOTO(ncclCalloc(&params->srcs, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->dsts, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->sizes, nRanks), ret, fail);
#if CUDART_VERSION >= 12080
  NCCLCHECKGOTO(ncclCalloc(&params->attrs, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->attrIdxs, nRanks), ret, fail);
#endif
exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) {
|
||||
if (params->srcs) free(params->srcs);
|
||||
if (params->dsts) free(params->dsts);
|
||||
if (params->sizes) free(params->sizes);
|
||||
#if CUDART_VERSION >= 12080
|
||||
if (params->attrs) free(params->attrs);
|
||||
if (params->attrIdxs) free(params->attrIdxs);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Issue the copies described by `params` one at a time with cudaMemcpyAsync.
// When params->intraBatchSync is set, a cross-rank sync is inserted after
// every comm->ceColl.intraBatchSyncFreq copies, except after the last copy.
static ncclResult_t ceLaunchSequentialCopies(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
  for (int i = 0; i < params->numOps; i++) {
    CUDACHECK(cudaMemcpyAsync(
      (void*)params->dsts[i],
      (void*)params->srcs[i],
      params->sizes[i],
      cudaMemcpyDeviceToDevice,
      stream));

    if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) {
      NCCLCHECK(ncclMemOpSync(comm, stream));
    }
  }
  return ncclSuccess;
}

// Launch all copy operations collected in `params` on `stream`.
//
// The batched copy API (cudaMemcpyBatchAsync) is used only when all of the
// following hold: no graph capture is in progress, the code was built with
// CUDART_VERSION >= 12080, and the driver reports version >= 12080. In every
// other case the copies are issued individually, so a build without the batch
// API never ends up issuing no copies at all.
//
// Returns ncclSuccess immediately when there is nothing to copy.
ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Check if there are any operations to perform
  if (params->numOps == 0) {
    return ncclSuccess;
  }

  // Whether the batch-copy code path exists in this build.
#if CUDART_VERSION >= 12080
  const bool batchApiCompiled = true;
#else
  const bool batchApiCompiled = false;
#endif

  // Check if we are in a CUDA graph capture
  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);

  int driverVersion;
  NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail);

  // cudaMemcpyBatchAsync is not supported during CUDA graph capture
  if (!capturing && batchApiCompiled && driverVersion >= 12080) {
#if CUDART_VERSION >= 12080
    // For CUDA 12.8+, use batch memory copy for better performance
    params->attrs[0] = {};
    params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
    params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
    params->attrIdxs[0] = 0;
    params->numAttrs = 1;

    if (params->intraBatchSync) {
      // Break into multiple batches with sync between them
      int batchSize = comm->ceColl.intraBatchSyncFreq;
      for (int i = 0; i < params->numOps; i += batchSize) {
        int currentBatchSize = (i + batchSize <= params->numOps) ? batchSize : params->numOps - i;

        // The CUDA 13 signature drops the failure-index out-parameter.
        #if CUDART_VERSION >= 13000
        CUDACHECKGOTO(cudaMemcpyBatchAsync(
          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
          params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
        #else
        CUDACHECKGOTO(cudaMemcpyBatchAsync(
          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
          params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
        #endif

        // Sync after each batch
        if (i + batchSize < params->numOps) {
          NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
        }
      }
    } else {
      // Use single batch for all operations
      #if CUDART_VERSION >= 13000
      CUDACHECKGOTO(cudaMemcpyBatchAsync(
        params->dsts, params->srcs, params->sizes, params->numOps,
        params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
      #else
      CUDACHECKGOTO(cudaMemcpyBatchAsync(
        params->dsts, params->srcs, params->sizes, params->numOps,
        params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
      #endif
    }
#endif
  } else {
    // Graph capture, an older driver, or a build without the batch API:
    // fall back to individual transfers.
    NCCLCHECKGOTO(ceLaunchSequentialCopies(comm, params, stream), ret, fail);
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
|
||||
// Copy-engine AllGather: every rank places its chunk of
// args->nElts * args->eltSize bytes at position `rank` of every rank's
// receive buffer.
//
// A cross-rank sync is queued before the copies and another one after them.
// The copies themselves go through ncclCeLaunchBatchOps.
ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes contributed by each rank, and where this rank's chunk lives locally.
  const size_t chunkBytes = args->nElts * args->eltSize;
  uint8_t* const localSrc = (uint8_t*)args->sendBuff;
  uint8_t* const localDst = (uint8_t*)args->recvBuff + comm->rank * chunkBytes;

  struct ncclCeBatchOpsParams batchOpsParams = {};

  // Append one chunk-sized copy to the batch.
  auto queueCopy = [&batchOpsParams, chunkBytes](void* src, void* dst) {
    batchOpsParams.srcs[batchOpsParams.numOps] = src;
    batchOpsParams.dsts[batchOpsParams.numOps] = dst;
    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
    batchOpsParams.numOps++;
  };

  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // Ensure all ranks are ready before starting transfers
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  // Out-of-place only: the local chunk also has to land in the local receive buffer.
  if (localDst != localSrc) {
    queueCopy((void*)localSrc, (void*)localDst);
  }

  // Push the local chunk to every other rank, starting with the next rank and wrapping around.
  for (int step = 1; step < comm->nRanks; step++) {
    const int targetRank = (comm->rank + step) % comm->nRanks;
    // Offset of our slot inside the receive window; resolved against each peer.
    size_t offset = localDst - (uint8_t*)args->recvWin->userPtr;
    void* peerRecvBuff;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, targetRank, &peerRecvBuff), ret, fail);
    queueCopy((void*)localSrc, (void*)peerRecvBuff);
  }

  // Check if we need to perform intra-batch synchronization
  batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);

  // Launch the batch operations
  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Ensure all transfers are complete across all ranks
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Copy-engine AlltoAll.
// For every destination rank d, chunk d of the send buffer is copied into
// slot `comm->rank` of d's receive buffer; the chunk addressed to this rank
// itself is a plain local copy. The batch is bracketed by ncclMemOpSync().
// Returns ncclSuccess or the first error reported by a callee.
ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes exchanged between any pair of ranks.
  const size_t bytesPerPeer = args->nElts * args->eltSize;
  uint8_t* sendBase = (uint8_t*)args->sendBuff;
  uint8_t* recvBase = (uint8_t*)args->recvBuff;
  void* remoteDst;
  size_t winOffset;

  struct ncclCeBatchOpsParams batch = {};
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batch, comm->nRanks * comm->nRanks), ret, fail);

  // All ranks must be ready before any transfer starts.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  // One copy per destination rank, starting with ourselves and wrapping around.
  for (int step = 0; step < comm->nRanks; step++) {
    const int peer = (comm->rank + step) % comm->nRanks;
    uint8_t* chunkSrc = sendBase + peer * bytesPerPeer;
    // Our data always lands at index comm->rank of the receiver's buffer.
    uint8_t* chunkDst = recvBase + comm->rank * bytesPerPeer;

    void* target = (void*)chunkDst;
    if (peer != comm->rank) {
      // Remote destination: translate the local address into the peer's
      // mapping of the receive window.
      winOffset = chunkDst - (uint8_t*)args->recvWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, peer, &remoteDst), ret, fail);
      target = remoteDst;
    }

    const auto idx = batch.numOps++;
    batch.srcs[idx] = (void*)chunkSrc;
    batch.dsts[idx] = target;
    batch.sizes[idx] = bytesPerPeer;
  }

  // Intra-batch synchronization is requested only when the batch has more ops
  // than the sync frequency and moves at least the configured byte threshold.
  batch.intraBatchSync = false;
  if (batch.numOps > comm->ceColl.intraBatchSyncFreq) {
    batch.intraBatchSync = (bytesPerPeer*batch.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
  }

  // Launch the queued copies.
  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batch, stream), ret, fail);

  // All transfers must be complete on every rank before returning.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batch);
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Copy-engine Scatter.
// Only the root rank queues copies: chunk d of its send buffer goes to rank d.
// Non-root ranks launch an empty batch and only take part in the two
// ncclMemOpSync() calls that bracket the transfers.
// Returns ncclSuccess or the first error reported by a callee.
ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes delivered to each rank.
  const size_t bytesPerRank = args->nElts * args->eltSize;
  uint8_t* sendBase = (uint8_t*)args->sendBuff;
  uint8_t* recvBase = (uint8_t*)args->recvBuff;
  int root = args->rootRank;
  void* remoteDst;
  size_t winOffset;

  struct ncclCeBatchOpsParams batch = {};
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batch, comm->nRanks), ret, fail);

  // All ranks must be ready before any transfer starts.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  if (comm->rank == root) {
    // The root's own chunk inside its send buffer.
    uint8_t* ownChunk = sendBase + comm->rank * bytesPerRank;
    // In-place scatter: the receive buffer already aliases that chunk.
    bool inPlace = (recvBase == ownChunk);

    if (!inPlace) {
      const auto idx = batch.numOps++;
      batch.srcs[idx] = (void*)ownChunk;
      batch.dsts[idx] = (void*)recvBase;
      batch.sizes[idx] = bytesPerRank;
    }

    // Hand out the remaining chunks, starting with the next rank and wrapping around.
    for (int step = 1; step < comm->nRanks; step++) {
      const int peer = (comm->rank + step) % comm->nRanks;
      uint8_t* chunkSrc = sendBase + peer * bytesPerRank;
      // For the in-place layout the local stand-in for the peer's destination
      // is shifted by the peer's chunk index; otherwise it is the buffer start.
      uint8_t* chunkDst = recvBase;
      if (inPlace) chunkDst = recvBase + peer * bytesPerRank;

      winOffset = chunkDst - (uint8_t*)args->recvWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, peer, &remoteDst), ret, fail);
      const auto idx = batch.numOps++;
      batch.srcs[idx] = (void*)chunkSrc;
      batch.dsts[idx] = remoteDst;
      batch.sizes[idx] = bytesPerRank;
    }
  }
  // Non-root ranks have nothing to copy.

  // Launch the queued copies.
  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batch, stream), ret, fail);

  // All transfers must be complete on every rank before returning.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batch);
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Copy-engine Gather.
// Every rank queues at most one copy: non-root ranks write their send buffer
// into slot `comm->rank` of the root's receive buffer, and the root copies its
// own contribution into its slot unless it is already there (in-place).
// Returns ncclSuccess or the first error reported by a callee.
ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes contributed by each rank.
  const size_t bytesPerRank = args->nElts * args->eltSize;
  uint8_t* localSrc = (uint8_t*)args->sendBuff;
  // Slot reserved for this rank, expressed relative to the local recvBuff.
  uint8_t* mySlot = (uint8_t*)args->recvBuff + comm->rank * bytesPerRank;
  int root = args->rootRank;
  void* rootSlot;
  size_t winOffset;

  struct ncclCeBatchOpsParams batch = {};
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batch, 1), ret, fail);

  // All ranks must be ready before any transfer starts.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  if (comm->rank != root) {
    // Non-root: resolve our slot inside the root's mapping of the receive window.
    winOffset = mySlot - (uint8_t*)args->recvWin->userPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, root, &rootSlot), ret, fail);
    const auto idx = batch.numOps++;
    batch.srcs[idx] = (void*)localSrc;
    batch.dsts[idx] = rootSlot;
    batch.sizes[idx] = bytesPerRank;
  } else if (localSrc != mySlot) {
    // Root, out-of-place: move our own data into position locally.
    const auto idx = batch.numOps++;
    batch.srcs[idx] = (void*)localSrc;
    batch.dsts[idx] = (void*)mySlot;
    batch.sizes[idx] = bytesPerRank;
  }

  // Launch the queued copies.
  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batch, stream), ret, fail);

  // All transfers must be complete on every rank before returning.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batch);
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// Dispatches the copy-engine collective described by `plan->ceCollArgs` to the
// matching ncclCe* implementation, on the planner's first stream.
// Returns the implementation's result, or ncclInvalidUsage when the function
// has no copy-engine implementation here.
ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  ncclResult_t ret = ncclSuccess;
  cudaStream_t launchStream = comm->planner.streams->stream;
  struct ncclCeCollArgs* ceArgs = plan->ceCollArgs;

  switch (ceArgs->func) {
  case ncclFuncAllGather:
    NCCLCHECKGOTO(ncclCeAllGather(comm, ceArgs, launchStream), ret, done);
    break;
  case ncclFuncAlltoAll:
    NCCLCHECKGOTO(ncclCeAlltoAll(comm, ceArgs, launchStream), ret, done);
    break;
  case ncclFuncScatter:
    NCCLCHECKGOTO(ncclCeScatter(comm, ceArgs, launchStream), ret, done);
    break;
  case ncclFuncGather:
    NCCLCHECKGOTO(ncclCeGather(comm, ceArgs, launchStream), ret, done);
    break;
  default:
    // No copy-engine path for this collective.
    ret = ncclInvalidUsage;
    break;
  }

done:
  return ret;
}
|
||||
+141
-163
@@ -13,16 +13,23 @@
|
||||
#include "nvtx_payload_schemas.h"
|
||||
#include "msccl/msccl_lifecycle.h"
|
||||
|
||||
#ifdef ENABLE_ROCSHMEM
|
||||
#include <rocshmem/rocshmem.hpp>
|
||||
#endif
|
||||
|
||||
using namespace rccl;
|
||||
|
||||
const char* ncclFuncToString(ncclFunc_t fn) {
|
||||
switch (fn) {
|
||||
case ncclFuncAllGather: return "AllGather";
|
||||
case ncclFuncAllReduce: return "AllReduce";
|
||||
case ncclFuncAlltoAll: return "AlltoAll";
|
||||
case ncclFuncBroadcast: return "Broadcast";
|
||||
case ncclFuncGather: return "Gather";
|
||||
case ncclFuncRecv: return "Recv";
|
||||
case ncclFuncReduce: return "Reduce";
|
||||
case ncclFuncReduceScatter: return "ReduceScatter";
|
||||
case ncclFuncScatter: return "Scatter";
|
||||
case ncclFuncSendRecv: return "SendRecv";
|
||||
case ncclFuncSend: return "Send";
|
||||
default: return "Invalid";
|
||||
@@ -81,7 +88,6 @@ const char* ncclProtoToString(int proto) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
|
||||
@@ -91,9 +97,12 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLGATHER_CHUNKSTEPS, comm -> rcclUseOneSlice ? ALLGATHER_SLICESTEPS_SINGLE_NODE : ALLGATHER_SLICESTEPS, nullptr };
|
||||
|
||||
int nRanks;
|
||||
int nRanks, rank;
|
||||
int in_place = 0;
|
||||
const void* srcBuf;
|
||||
void* dstBuf;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
size_t msgSize = sendcount * ncclTypeSize(datatype) * nRanks;
|
||||
|
||||
if (!mscclIsCaller())
|
||||
@@ -108,21 +117,28 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
|
||||
}
|
||||
|
||||
if (rcclUseAllGatherDirect(comm, msgSize)) {
|
||||
INFO(NCCL_INIT, "RCCL DIRECT ALLGATHER count = %zu, msgSize = %zu, comm = %p, stream = %p, rank = %d, sendbuff = %p, recvbuff = %p",
|
||||
sendcount, msgSize, comm, stream, rank, sendbuff, recvbuff);
|
||||
// use direct allgather
|
||||
if (sendcount == 0) return ncclSuccess;
|
||||
size_t rankOffset = sendcount * ncclTypeSize(datatype);
|
||||
if (((char*)sendbuff) == (((char*)recvbuff) + comm->rank * rankOffset)) {
|
||||
if (sendbuff == (((char*)recvbuff) + rank * rankOffset)) {
|
||||
srcBuf = ((char*)recvbuff) + rank * rankOffset;
|
||||
dstBuf = recvbuff;
|
||||
in_place = 1;
|
||||
}
|
||||
} else {
|
||||
srcBuf = sendbuff;
|
||||
dstBuf = recvbuff;
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
|
||||
for (int r = 0; r < nRanks; r++) {
|
||||
int peer = (comm->rank + r) % nRanks;
|
||||
if (in_place && (peer == comm->rank)) {
|
||||
continue;
|
||||
}
|
||||
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, peer, comm, stream));
|
||||
NCCLCHECK(ncclRecv(((char*)recvbuff) + peer * rankOffset, sendcount, datatype, peer, comm, stream));
|
||||
if (r == rank && in_place)
|
||||
continue;
|
||||
|
||||
NCCLCHECK(ncclSend(((char*)srcBuf), sendcount, datatype, r, comm, stream));
|
||||
NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
return ncclSuccess;
|
||||
@@ -132,10 +148,101 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
|
||||
}
|
||||
}
|
||||
|
||||
RCCL_PARAM(AlltoAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream);
|
||||
// AlltoAll entry point: every rank sends `count` elements of `datatype` to
// each rank and receives `count` elements from each rank.
//
// Dispatch order:
//   1. MSCCL, when mscclAvailable(comm) and the call did not come from MSCCL.
//   2. The pivot algorithm, when the topology enables it, enough channels
//      exist, the per-rank payload is at least 744 KiB, its lowest set bit is
//      not 4, and the ALL_TO_ALL_PIVOT_ENABLE parameter is set.
//   3. The GDA path (ENABLE_ROCSHMEM builds only), when rcclUseAllToAllGda()
//      accepts the communicator and the total message fits the threshold.
//   4. The regular AlltoAll enqueue.
//
// Returns the result of the selected enqueue call, or the recorder's error.
ncclResult_t ncclAlltoAll_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));

  // Record the call only when MSCCL is not the caller
  // (presumably to avoid recording MSCCL's fallback re-entry twice).
  if (!mscclIsCaller())
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
  }

  // Bytes exchanged with each rank, and the lowest set bit of that size.
  size_t rankOffset = count * ncclTypeSize(datatype);
  size_t rankAlign = rankOffset & ((~rankOffset) + 1);

  struct ncclInfo info;
  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAlltoAllPivotEnable()) {
    info = { ncclFuncAlltoAllPivot, "AlltoAllPivot",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
  } else {
#ifdef ENABLE_ROCSHMEM
    // Total bytes this rank sends; only the GDA path needs it, so it is
    // computed here to avoid an unused variable in non-ROCSHMEM builds.
    size_t msgSize = rankOffset * comm->nRanks;
    if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {
      // Distinct name: this must not shadow the outer `info`.
      struct ncclInfo gdaInfo = { ncclFuncAllToAllGda, "AllToAllGda",
        sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
        ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };

      return ncclEnqueueCheck(&gdaInfo);
    }
#endif // ENABLE_ROCSHMEM
    info = { ncclFuncAlltoAll, "AlltoAll",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
  }
  return ncclEnqueueCheck(&info);
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAlltoAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
// Variable-count AlltoAll: sends sendcounts[r] elements starting at element
// offset sdispls[r] of `sendbuff` to rank r, and receives recvcounts[r]
// elements from rank r at element offset rdispls[r] of `recvbuff`.
//
// Handled by MSCCL when mscclAvailable(comm) and the call did not come from
// MSCCL; otherwise expressed as one ncclSend/ncclRecv pair per rank inside a
// single group. Returns ncclSuccess or the first error from a callee.
ncclResult_t ncclAlltoAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  // The payload already tolerates a null comm for commHash; guard the
  // comm->rank lookups the same way so tracing cannot dereference null.
  NVTX3_FUNC_WITH_PARAMS(AlltoAllv, NcclNvtxParamsAlltoAllv,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0,
      comm ? sendcounts[comm->rank] * ncclTypeSize(datatype) : 0,
      comm ? recvcounts[comm->rank] * ncclTypeSize(datatype) : 0, datatype));

  // Record the call only when MSCCL is not the caller
  // (presumably to avoid recording MSCCL's fallback re-entry twice).
  if (!mscclIsCaller())
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
  }

  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));

  // Element size is loop-invariant; displacements are given in elements.
  const size_t typeBytes = ncclTypeSize(datatype);

  // NOTE(review): if any NCCLCHECK below returns early, the matching
  // skip(false) is never reached and the recorder stays in skip mode — confirm
  // whether that is acceptable on error paths.
  if (!mscclIsCaller()) Recorder::instance().skip(true);
  NCCLCHECK(ncclGroupStart());
  for (int r=0; r<nRanks; r++) {
    NCCLCHECK(ncclSend(
        ((char*)sendbuff) + sdispls[r]*typeBytes,
        sendcounts[r],
        datatype,
        r,
        comm,
        stream));
    NCCLCHECK(ncclRecv(
        ((char*)recvbuff) + rdispls[r]*typeBytes,
        recvcounts[r],
        datatype,
        r,
        comm,
        stream));
  }
  NCCLCHECK(ncclGroupEnd());
  if (!mscclIsCaller()) Recorder::instance().skip(false);
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
||||
|
||||
|
||||
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
|
||||
@@ -186,104 +293,8 @@ ncclResult_t ncclAllReduceWithBias_impl(const void* sendbuff, void* recvbuff, si
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
|
||||
RCCL_PARAM(AllToAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
|
||||
// AllToAll entry point: every rank sends `count` elements of `datatype` to
// each rank and receives `count` elements from each rank.
// Uses MSCCL when available, the pivot algorithm when its conditions hold,
// and otherwise one ncclSend/ncclRecv pair per rank inside a single group.
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclComm_t comm, hipStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));

  // Record the call only when MSCCL is not the caller.
  if (!mscclIsCaller()) {
    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
  }

  // Bytes exchanged with each rank, and the lowest set bit of that size.
  size_t bytesPerRank = count * ncclTypeSize(datatype);
  size_t lowestSetBit = bytesPerRank & ((~bytesPerRank) + 1);

  // Pivot A2A is decided here, now that the number of channels is known.
  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
      bytesPerRank >= 744 * 1024 && lowestSetBit != 4 && rcclParamAllToAllPivotEnable()) {
    struct ncclInfo pivotInfo = { ncclFuncAllToAllPivot, "AllToAllPivot",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
    return ncclEnqueueCheck(&pivotInfo);
  }

  // Fallback: point-to-point exchange with every rank.
  int rankCount;
  NCCLCHECK(ncclCommCount(comm, &rankCount));
  if (count == 0) return ncclSuccess;

  if (!mscclIsCaller()) Recorder::instance().skip(true);
  NCCLCHECK(ncclGroupStart());
  for (int peer = 0; peer < rankCount; peer++) {
    NCCLCHECK(ncclSend(((char*)sendbuff)+peer*bytesPerRank, count, datatype, peer, comm, stream));
    NCCLCHECK(ncclRecv(((char*)recvbuff)+peer*bytesPerRank, count, datatype, peer, comm, stream));
  }
  NCCLCHECK(ncclGroupEnd());
  if (!mscclIsCaller()) Recorder::instance().skip(false);
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
|
||||
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
|
||||
// Variable-count AllToAll: sends sendcounts[r] elements starting at element
// offset sdispls[r] of `sendbuff` to rank r, and receives recvcounts[r]
// elements from rank r at element offset rdispls[r] of `recvbuff`.
//
// Handled by MSCCL when mscclAvailable(comm) and the call did not come from
// MSCCL; otherwise expressed as one ncclSend/ncclRecv pair per rank inside a
// single group. Returns ncclSuccess or the first error from a callee.
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  // The payload already tolerates a null comm for commHash; guard the
  // comm->rank lookups the same way so tracing cannot dereference null.
  NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0,
      comm ? sendcounts[comm->rank] * ncclTypeSize(datatype) : 0,
      comm ? recvcounts[comm->rank] * ncclTypeSize(datatype) : 0, datatype));

  // Record the call only when MSCCL is not the caller
  // (presumably to avoid recording MSCCL's fallback re-entry twice).
  if (!mscclIsCaller())
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
  }

  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));

  // Element size is loop-invariant; displacements are given in elements.
  const size_t typeBytes = ncclTypeSize(datatype);

  // NOTE(review): if any NCCLCHECK below returns early, the matching
  // skip(false) is never reached and the recorder stays in skip mode — confirm
  // whether that is acceptable on error paths.
  if (!mscclIsCaller()) Recorder::instance().skip(true);
  NCCLCHECK(ncclGroupStart());
  for (int r=0; r<nRanks; r++) {
    NCCLCHECK(ncclSend(
        ((char*)sendbuff) + sdispls[r]*typeBytes,
        sendcounts[r],
        datatype,
        r,
        comm,
        stream));
    NCCLCHECK(ncclRecv(
        ((char*)recvbuff) + rdispls[r]*typeBytes,
        recvcounts[r],
        datatype,
        r,
        comm,
        stream));
  }
  NCCLCHECK(ncclGroupEnd());
  if (!mscclIsCaller()) Recorder::instance().skip(false);
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
|
||||
@@ -315,46 +326,32 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
|
||||
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm* comm, cudaStream_t stream);
|
||||
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm* comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));
|
||||
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
|
||||
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, count, datatype, comm, stream, root));
|
||||
}
|
||||
|
||||
if (mscclAvailable(comm) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
|
||||
count, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
|
||||
}
|
||||
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = sendcount * ncclTypeSize(datatype);
|
||||
if (sendcount == 0) return ncclSuccess;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
if (!mscclIsCaller()) Recorder::instance().skip(true);
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++)
|
||||
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
if (!mscclIsCaller()) Recorder::instance().skip(false);
|
||||
return ncclSuccess;
|
||||
struct ncclInfo info = { ncclFuncGather, "Gather",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
|
||||
@@ -380,8 +377,6 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
|
||||
|
||||
|
||||
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
|
||||
@@ -405,48 +400,32 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
|
||||
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream);
|
||||
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
|
||||
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype));
|
||||
|
||||
if (!mscclIsCaller()) // when msccl falls back to
|
||||
{
|
||||
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
|
||||
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, count, datatype, comm, stream, root));
|
||||
}
|
||||
|
||||
if (mscclAvailable(comm) && !mscclIsCaller()) {
|
||||
return mscclEnqueueCheck(
|
||||
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
|
||||
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
|
||||
count, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
|
||||
}
|
||||
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
size_t rankOffset = recvcount * ncclTypeSize(datatype);
|
||||
if (recvcount == 0) return ncclSuccess;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
if (!mscclIsCaller()) Recorder::instance().skip(true);
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
if (rank == root) {
|
||||
for (int r=0; r<nRanks; r++)
|
||||
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
|
||||
}
|
||||
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
if (!mscclIsCaller()) Recorder::instance().skip(false);
|
||||
return ncclSuccess;
|
||||
struct ncclInfo info = { ncclFuncScatter, "Scatter",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
|
||||
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
|
||||
@@ -472,7 +451,6 @@ ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t da
|
||||
|
||||
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
|
||||
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
|
||||
|
||||
#include "nccl.h"
|
||||
#include <cstring>
|
||||
#include "comm.h"
|
||||
#include "device.h"
|
||||
#include "archinfo.h"
|
||||
|
||||
__attribute__ ((visibility("default")))
|
||||
ncclResult_t ncclCommDump(
|
||||
const ncclComm_t comm,
|
||||
std::unordered_map<std::string, std::string>& map) {
|
||||
if (comm == nullptr) {
|
||||
WARN("ncclCommDump comm is null");
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (comm->proxyState->proxyTrace == nullptr) {
|
||||
WARN("ncclCommDump comm->proxyState->proxyTrace is null");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
WARN("ncclCommDump() ProxyTrace:");
|
||||
WARN("%s", comm->proxyState->proxyTrace->dump().c_str());
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -28,7 +28,7 @@ static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
static uint64_t ncclDebugMask = 0;
|
||||
uint64_t ncclDebugMask = 0;
|
||||
FILE *ncclDebugFile = stdout;
|
||||
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static std::chrono::steady_clock::time_point ncclEpoch;
|
||||
@@ -419,4 +419,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
|
||||
va_end(vargs);
|
||||
pthread_setname_np(thread, threadName);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,60 @@
|
||||
# Run the generator scripts once at configure time and capture their stdout as
# the lists of generated source files. A non-zero exit status aborts the
# configure step: continuing with an empty or partial list would only surface
# later as a confusing build failure.
# NOTE(review): list(TRANSFORM ...) below presumes the scripts print a
# ';'-separated list -- confirm against generate.py.
execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
  OUTPUT_VARIABLE files
  RESULT_VARIABLE generate_result
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if(NOT "${generate_result}" STREQUAL "0")
  message(FATAL_ERROR "generate.py failed with result '${generate_result}'")
endif()
string(STRIP "${files}" files)
# Turn the bare file names into absolute paths inside the gensrc directory.
list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/)

execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
  OUTPUT_VARIABLE symmetric_files
  RESULT_VARIABLE symmetric_generate_result
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if(NOT "${symmetric_generate_result}" STREQUAL "0")
  message(FATAL_ERROR "symmetric/generate.py failed with result '${symmetric_generate_result}'")
endif()
string(STRIP "${symmetric_files}" symmetric_files)
list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/)
|
||||
|
||||
# Build-time rules that re-run the generators whenever a generator script
# changes, so the generated sources carry a proper dependency on it.
# VERBATIM makes CMake escape the arguments itself: ONLY_FUNCS patterns can
# contain '|', '*' and spaces, which the build tool's shell would otherwise be
# free to interpret.
add_custom_command(
  OUTPUT ${files}
  BYPRODUCTS ${files}
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  COMMENT "Generating device source files"
  VERBATIM
)

add_custom_command(
  OUTPUT ${symmetric_files}
  BYPRODUCTS ${symmetric_files}
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  COMMENT "Generating symmetric device source files"
  VERBATIM
)
|
||||
|
||||
# Object library holding the device code: the generated sources plus the two
# hand-written translation units.
add_library(nccl_device OBJECT
  ${files}
  ${symmetric_files}
  ${CMAKE_CURRENT_SOURCE_DIR}/common.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu
)

# Enable separable compilation and device-symbol resolution for the target.
set_property(TARGET nccl_device PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET nccl_device PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

# Header search path for the target (PUBLIC, so it propagates to consumers).
target_include_directories(nccl_device
  PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_SOURCE_DIR}/src/include
    ${CMAKE_SOURCE_DIR}/src/include/plugin
    ${CMAKE_BINARY_DIR}/include
    ${CUDAToolkit_INCLUDE_DIRS}
    ${CUDAToolkit_INCLUDE_DIRS}/cccl
)

# Build the nccl_header target before compiling the device sources.
add_dependencies(nccl_device nccl_header)
|
||||
@@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device
|
||||
MANIFEST := $(OBJDIR)/manifest
|
||||
DEVGLUE_OBJ := $(OBJDIR)/device_glue.o
|
||||
|
||||
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include
|
||||
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin
|
||||
NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
|
||||
CXXFLAGS += $(INCFLAGS)
|
||||
|
||||
@@ -47,7 +47,11 @@ endif
|
||||
define COMPILE_SYM
|
||||
@$(SAY) "Compiling" $2;\
|
||||
mkdir -p $(dir $1);\
|
||||
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1
|
||||
if [[ -n "$3" ]]; then\
|
||||
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\
|
||||
else\
|
||||
touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\
|
||||
fi
|
||||
endef
|
||||
|
||||
DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
|
||||
|
||||
@@ -20,11 +20,20 @@ namespace {
|
||||
const int bid = ncclShmem.channelId - work->channelLo;
|
||||
int npKitCtxIdx = bid; // unused variable - compiler warning
|
||||
#endif
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warp = threadIdx.x / WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
|
||||
#else
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
#endif
|
||||
const int *ringRanks = ring->userRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
ssize_t count, partOffset, partCount, chunkCount;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
|
||||
#else
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
|
||||
#endif
|
||||
ssize_t offset;
|
||||
ssize_t dataOffset;
|
||||
int nelem;
|
||||
@@ -142,7 +151,7 @@ namespace {
|
||||
#endif
|
||||
// Final wait/copy.
|
||||
prims.directRecv(offset, nelem);
|
||||
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
@@ -671,4 +680,4 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
@@ -20,8 +20,14 @@ namespace {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#endif
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warp = threadIdx.x / WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
|
||||
#else
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
#endif
|
||||
int ringIx = ring->index;
|
||||
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
#if defined(ENABLE_NPKIT)
|
||||
const int bid = ncclShmem.channelId - work->channelLo;
|
||||
@@ -31,7 +37,11 @@ namespace {
|
||||
ssize_t gridOffset;
|
||||
ssize_t channelCount;
|
||||
ssize_t chunkCount;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
|
||||
#else
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
|
||||
#endif
|
||||
const ssize_t loopCount = nranks * chunkCount;
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "device.h"
|
||||
#include "collectives.h"
|
||||
#include "primitives.h"
|
||||
|
||||
#ifdef ENABLE_ROCSHMEM
#include <rocshmem/rocshmem.hpp>

// All-to-all specialization that hands the exchange itself to rocSHMEM's
// workgroup-level alltoall. Compiled only when ENABLE_ROCSHMEM is defined.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllToAllGda, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  // tid / nThreads identify the calling thread within the participating group;
  // work supplies the buffers (sendbuff, sndbuff, tempbuff, recvbuff), the
  // per-PE size and the rocSHMEM team.
  __device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
    // Only thread block 0 does any work; every other block returns at once.
    if (blockIdx.x != 0) return;

    const int nPes = rocshmem::rocshmem_n_pes();

    // Stage 1: copy size*nPes elements between sendbuff and sndbuff.
    // NOTE(review): presumably sendbuff is the source and sndbuff the
    // destination -- confirm against the reduceCopy argument order.
    reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
        tid, nThreads, 0, nullptr, false,
        1, (void **)&work->sendbuff,
        1, (void **)&work->sndbuff,
        (work->size*nPes));

    // Stage 2: rocSHMEM byte-wise alltoall over work->team.
    rocshmem::rocshmem_char_alltoall_wg(
        work->team, ((char*)work->tempbuff), ((char*)work->sndbuff), work->size);

    // Stage 3: copy size*nPes elements between tempbuff and recvbuff
    // (same argument layout as stage 1).
    reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
        tid, nThreads, 0, nullptr, false,
        1, (void **)&work->tempbuff,
        1, (void **)&work->recvbuff,
        (work->size*nPes));
  }
};
#endif
|
||||
|
||||
@@ -75,7 +75,7 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
struct RunWorkColl<ncclFuncAlltoAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(tid, nThreads, work);
|
||||
|
||||
@@ -19,7 +19,12 @@ namespace {
|
||||
const int bid = ncclShmem.channelId - work->channelLo;
|
||||
int npKitCtxIdx = bid; // unused variable - compiler warning
|
||||
#endif
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warp = threadIdx.x / WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
|
||||
#else
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
#endif
|
||||
const int rank = ring->userRanks[0];
|
||||
const int nextRank = ring->userRanks[1];
|
||||
const int root = work->root;
|
||||
@@ -27,7 +32,11 @@ namespace {
|
||||
ssize_t chunkCount;
|
||||
ssize_t channelCount;
|
||||
ssize_t gridOffset;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
|
||||
#else
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
|
||||
#endif
|
||||
size_t offset;
|
||||
int nelem;
|
||||
int workNthreads;
|
||||
|
||||
@@ -17,24 +17,24 @@ struct RunWorkNop {
|
||||
__device__ void run() {}
|
||||
};
|
||||
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&argsStorage.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&argsStorage.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&argsStorage.args);
|
||||
}
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&argsStorage.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&argsStorage.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&args4K.args);
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&argsStorage.args);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -27,17 +27,30 @@
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
|
||||
#define __trace_hwreg()
|
||||
#define __trace_hwreg() \
|
||||
collTrace->data_0 = 0;
|
||||
#else
|
||||
#define __trace_hwreg() \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
|
||||
{ int32_t hwid; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwid)); \
|
||||
collTrace->data_0 = hwid >> 4; }
|
||||
#endif
|
||||
|
||||
// __trace_xccid(): fill collTrace->xccId for the trace entry named `collTrace`
// in the expanding scope. On gfx942/gfx950 the value is read from the
// HW_REG_XCC_ID hardware register; every other target records 0.
#if !defined(__gfx942__) && !defined(__gfx950__)
#define __trace_xccid() \
  collTrace->xccId = 0;
#else
#define __trace_xccid() \
  { int32_t xccIdReg; \
    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xccIdReg)); \
    collTrace->xccId = xccIdReg; }
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define INC_COLL_TRACE \
|
||||
uint32_t pos = __hip_atomic_fetch_add(&ncclShmem.collTraceTail->tail, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WORKGROUP)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->tid = threadIdx.x; \
|
||||
collTrace->channelId = ncclShmem.channelId;
|
||||
// TODO: switch to atomicInc after llvm crash is fixed
|
||||
@@ -46,7 +59,8 @@
|
||||
#define traceKernelLaunch(launch_type, ix) { \
|
||||
INC_COLL_TRACE \
|
||||
collTrace->funcIndex = ncclShmem.funcId; \
|
||||
__trace_hwreg()\
|
||||
__trace_hwreg() \
|
||||
__trace_xccid() \
|
||||
collTrace->batchIx = ix; \
|
||||
if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
|
||||
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
|
||||
@@ -63,7 +77,7 @@
|
||||
collTrace->p2p.recvRegistered = p2pWork->recvNetReg; \
|
||||
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
|
||||
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
|
||||
__hip_atomic_store(&collTrace->type, (launch_type) | ncclCollTraceP2pElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
|
||||
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
|
||||
collTrace->coll.nWarps = collWork->nWarps; \
|
||||
@@ -71,7 +85,7 @@
|
||||
collTrace->coll.bid = ncclShmem.channelId - collWork->channelLo; \
|
||||
collTrace->coll.root = collWork->root; \
|
||||
collTrace->opCount = collWork->opCount; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
|
||||
__hip_atomic_store(&collTrace->type, (launch_type) | ncclCollTraceCollElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
} \
|
||||
}
|
||||
#define traceKernelEnd(end_type) { \
|
||||
@@ -81,11 +95,11 @@
|
||||
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
|
||||
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
|
||||
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
|
||||
collTrace->type = (end_type) | ncclCollTraceP2pElemType; \
|
||||
__hip_atomic_store(&collTrace->type, (end_type) | ncclCollTraceP2pElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
|
||||
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
|
||||
collTrace->opCount = collWork->opCount; \
|
||||
collTrace->type = (end_type) | ncclCollTraceCollElemType; \
|
||||
__hip_atomic_store(&collTrace->type, (end_type) | ncclCollTraceCollElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
} \
|
||||
}
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
@@ -94,12 +108,12 @@
|
||||
collTrace->data_0 = data4; \
|
||||
collTrace->opCount = data8_0; \
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
__hip_atomic_store(&collTrace->type, ncclCollTraceDataType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
}
|
||||
#define traceAbort(){\
|
||||
INC_COLL_TRACE\
|
||||
collTrace->funcIndex = ncclShmem.funcId;\
|
||||
collTrace->type = ncclCollTraceAbortType;\
|
||||
__hip_atomic_store(&collTrace->type, ncclCollTraceAbortType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
}
|
||||
#else
|
||||
#define traceKernelLaunch(launch_type, batchIx)
|
||||
@@ -136,9 +150,13 @@ struct ncclShmemData {
|
||||
struct ncclDevKernelArgs args;
|
||||
int channelId;
|
||||
int aborted;
|
||||
alignas(16) struct ncclDevComm comm;
|
||||
alignas(16) struct ncclKernelComm comm;
|
||||
alignas(16) struct ncclDevChannel channel;
|
||||
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warpComm;
|
||||
alignas(16) struct ncclDevChannel warpChannel[NCCL_MAX_GROUPS];
|
||||
int warpChannelId[NCCL_MAX_GROUPS];
|
||||
#endif
|
||||
int batchIx, nextBatchIx;
|
||||
enum ncclDevWorkType workType;
|
||||
uint8_t directMode;
|
||||
@@ -284,10 +302,10 @@ __device__ __forceinline__ void loadWorkBatchToShmem(
|
||||
|
||||
if (WARP_SIZE == 64) {
|
||||
if (uint64_t(batch.offsetBitset) & (1ull<<lane)) {
|
||||
int nWorksBelow = __popc(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
|
||||
int nWorksBelow = __popcll(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
|
||||
fnsOfBitset[nWorksBelow] = lane;
|
||||
}
|
||||
nWorks = __popc(uint64_t(batch.offsetBitset));
|
||||
nWorks = __popcll(uint64_t(batch.offsetBitset));
|
||||
} else {
|
||||
// WARP_SIZE == 32
|
||||
if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
|
||||
@@ -442,10 +460,17 @@ struct RunWorkBatch {
|
||||
if (work->nWarps != workPrev->nWarps) __syncthreads();
|
||||
}
|
||||
int subtn = work->nWarps*WARP_SIZE;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
if (tid < subtn) {
|
||||
if(ncclShmem.warpComm == 0 || Algo != NCCL_ALGO_RING) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
|
||||
else if (ncclShmem.warpChannelId[tid / WARP_SIZE] >= 0) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid % WARP_SIZE, WARP_SIZE, work);
|
||||
}
|
||||
#else
|
||||
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
|
||||
// However, the code ensures that the participation is on a per-warp basis.
|
||||
// coverity[device_thread_diverged:FALSE]
|
||||
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -477,7 +502,7 @@ __device__ __forceinline__ void profiler(int action) {
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
|
||||
}
|
||||
ncclShmem.channel.workCounter += ncclShmem.nWorks;
|
||||
if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
|
||||
if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -489,7 +514,12 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
int x = tid;
|
||||
int total = 0, y;
|
||||
int num = MAXCHANNELS/64 > 0 ? MAXCHANNELS/64 : 1;
|
||||
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warpCount = tn / WARP_SIZE;
|
||||
int localWarpId = tid / WARP_SIZE;
|
||||
int globalWarpId = (warpCount * blockIdx.x) + localWarpId;
|
||||
int laneId = tid % WARP_SIZE;
|
||||
#endif
|
||||
// Copy kernel args to shmem and then only read those. Otherwise the compiler
|
||||
// will end up putting the args into thread local stack which is very wasteful.
|
||||
if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
|
||||
@@ -549,7 +579,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
/* set abort flag to 0 */
|
||||
if (tid == 0) {
|
||||
ncclShmem.aborted = 0;
|
||||
ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
|
||||
ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
|
||||
}
|
||||
|
||||
// Use first 2 warps to load comm and channel, and remaining load work batch.
|
||||
@@ -557,14 +587,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
case 0:
|
||||
{ void* dst = &ncclShmem.comm;
|
||||
void* src = ncclShmem.args.comm;
|
||||
int bytes = sizeof(ncclDevComm);
|
||||
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
|
||||
int bytes = sizeof(ncclKernelComm);
|
||||
static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn.");
|
||||
copyToShmem16(tid, dst, src, bytes);
|
||||
} break;
|
||||
case 1:
|
||||
{ // Get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
{ // Get address of channel without incurring indirect load from ncclKernelComm::channels
|
||||
void* dst = &ncclShmem.channel;
|
||||
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
|
||||
void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
|
||||
int bytes = sizeof(ncclDevChannel);
|
||||
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
|
||||
copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
|
||||
@@ -583,9 +613,52 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
ncclShmem.collTrace = args->comm->collTrace + COLLTRACE_NUM_ITEMS*ncclShmem.channelId;
|
||||
ncclShmem.collTraceTail = args->comm->collTraceTail + ncclShmem.channelId;
|
||||
}
|
||||
#endif
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
if(tid == 0) {
|
||||
ncclShmem.warpComm = args->comm->warpLevelComm;
|
||||
}
|
||||
#endif
|
||||
__syncthreads(); // publish shmem
|
||||
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
// Determine per-warp channel assignment for WarpSpeed enablement
|
||||
total = 0;
|
||||
if(ncclShmem.warpComm == 1) { // If warpComm is enabled, assign warps to channels that have the corresponding channel mask enabled
|
||||
ncclShmem.warpChannelId[localWarpId] = -1;
|
||||
__syncthreads();
|
||||
for (int i = 0; i < num; i++) {
|
||||
if (args->channelMask.masks[i] & (1ull<<laneId)) {
|
||||
y = __popcll(args->channelMask.masks[i] & ((1ull<<laneId)-1));
|
||||
y = total + y;
|
||||
if (globalWarpId == y) {
|
||||
ncclShmem.warpChannelId[localWarpId] = laneId + total;
|
||||
break;
|
||||
}
|
||||
}
|
||||
total = total + __popcll(args->channelMask.masks[i]);
|
||||
}
|
||||
__syncthreads();
|
||||
if(ncclShmem.warpChannelId[localWarpId] >= 0) {
|
||||
void* dst = &ncclShmem.warpChannel[localWarpId];
|
||||
void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
|
||||
int bytes = sizeof(ncclDevChannel);
|
||||
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
|
||||
// assert((tid-localWarpId*WARP_SIZE) >= 0 && (tid-localWarpId*WARP_SIZE) < WARP_SIZE);
|
||||
copyToShmem16(tid-localWarpId*WARP_SIZE, dst, src, bytes);
|
||||
}
|
||||
} else { // If warpComm is disabled, all warps use the same channel as the block
|
||||
if(laneId == 0) {
|
||||
ncclShmem.warpChannelId[localWarpId] = ncclShmem.channelId;
|
||||
}
|
||||
// Use all threads in the warp to copy the channel data in parallel
|
||||
void* dst = &ncclShmem.warpChannel[localWarpId];
|
||||
void* src = &ncclShmem.channel;
|
||||
int bytes = sizeof(ncclDevChannel);
|
||||
copyToShmem16(laneId, dst, src, bytes);
|
||||
}
|
||||
__syncthreads();
|
||||
#endif
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
ncclShmem.prof.count = 0;
|
||||
@@ -648,17 +721,17 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
|
||||
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
|
||||
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
|
||||
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
|
||||
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
|
||||
#endif
|
||||
|
||||
#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \
|
||||
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {}
|
||||
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {}
|
||||
|
||||
#ifdef USE_INDIRECT_FUNCTION_CALL
|
||||
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
|
||||
|
||||
@@ -3,9 +3,10 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
import shutil
|
||||
|
||||
# Order of colls, redops, tys, protos, algos must match src/include/device.h
|
||||
all_colls = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AllToAllPivot"]
|
||||
all_colls = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AlltoAllPivot", "AllToAllGda"]
|
||||
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
|
||||
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
|
||||
all_protos = ["LL","LL128","SIMPLE"]
|
||||
@@ -24,8 +25,11 @@ gensrc = sys.argv[1]
|
||||
|
||||
if os.path.exists(gensrc):
|
||||
for name in os.listdir(gensrc):
|
||||
os.remove(os.path.join(gensrc, name))
|
||||
#os.truncate(os.path.join(gensrc, name), 0)
|
||||
path = os.path.join(gensrc, name)
|
||||
if os.path.isfile(path):
|
||||
os.remove(path)
|
||||
elif os.path.isdir(path):
|
||||
shutil.rmtree(path)
|
||||
else:
|
||||
os.makedirs(gensrc)
|
||||
|
||||
@@ -64,7 +68,7 @@ else:
|
||||
# make ONLY_FUNCS="AllReduce RING SIMPLE * *|ReduceScatter RING LL * f32"
|
||||
# --- or ---
|
||||
# make ONLY_FUNCS="AllReduce RING SIMPLE|ReduceScatter RING LL * f32"
|
||||
# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AllToAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
|
||||
# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AlltoAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
|
||||
|
||||
# Paste all non-None arguments together with `sep`.
|
||||
def paste(sep, *args):
|
||||
@@ -79,14 +83,15 @@ func_pattern = sys.argv[6:7]
|
||||
if func_pattern and func_pattern[0]:
|
||||
func_pattern = func_pattern[0]
|
||||
else:
|
||||
func_pattern = "AllGather|AllReduce|AllToAllPivot|Broadcast|Reduce|ReduceScatter|SendRecv"
|
||||
func_pattern = "AllGather|AllReduce|AlltoAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"
|
||||
|
||||
################################################################################
|
||||
|
||||
algos_of_coll = {
|
||||
"AllGather": ["RING", "PAT"],
|
||||
"AllReduce": ["RING", "TREE"],
|
||||
"AllToAllPivot": ["RING"],
|
||||
"AlltoAllPivot": ["RING"],
|
||||
"AllToAllGda": ["RING"],
|
||||
"Broadcast": ["RING"],
|
||||
"Reduce": ["RING"],
|
||||
"ReduceScatter": ["RING", "PAT"],
|
||||
@@ -96,7 +101,8 @@ algos_of_coll = {
|
||||
protos_of_coll = {
|
||||
"AllGather": all_protos,
|
||||
"AllReduce": all_protos,
|
||||
"AllToAllPivot": ["SIMPLE"],
|
||||
"AlltoAllPivot": ["SIMPLE"],
|
||||
"AllToAllGda": ["SIMPLE"],
|
||||
"Broadcast": all_protos,
|
||||
"Reduce": all_protos,
|
||||
"ReduceScatter": all_protos,
|
||||
@@ -106,7 +112,8 @@ protos_of_coll = {
|
||||
redops_of_coll = {
|
||||
"AllGather": ["Sum"],
|
||||
"AllReduce": all_redops,
|
||||
"AllToAllPivot": ["Sum"],
|
||||
"AlltoAllPivot": ["Sum"],
|
||||
"AllToAllGda": ["Sum"],
|
||||
"Broadcast": ["Sum"],
|
||||
"Reduce": all_redops,
|
||||
"ReduceScatter": all_redops,
|
||||
@@ -116,7 +123,8 @@ redops_of_coll = {
|
||||
tys_of_coll = {
|
||||
"AllGather": ["i8"],
|
||||
"AllReduce": all_tys,
|
||||
"AllToAllPivot": ["i8"],
|
||||
"AlltoAllPivot": ["i8"],
|
||||
"AllToAllGda": ["i8"],
|
||||
"Broadcast": ["i8"],
|
||||
"Reduce": all_tys,
|
||||
"ReduceScatter": all_tys,
|
||||
@@ -126,7 +134,8 @@ tys_of_coll = {
|
||||
acc_of_coll = {
|
||||
"AllGather": ["0"],
|
||||
"AllReduce": all_accs,
|
||||
"AllToAllPivot": ["0"],
|
||||
"AlltoAllPivot": ["0"],
|
||||
"AllToAllGda": ["0"],
|
||||
"Broadcast": ["0"],
|
||||
"Reduce": ["0"],
|
||||
"ReduceScatter": ["0"],
|
||||
@@ -136,7 +145,8 @@ acc_of_coll = {
|
||||
pipelines_of_coll = {
|
||||
"AllGather": ["0"],
|
||||
"AllReduce": all_pipelines,
|
||||
"AllToAllPivot": ["0"],
|
||||
"AlltoAllPivot": ["0"],
|
||||
"AllToAllGda": ["0"],
|
||||
"Broadcast": ["0"],
|
||||
"Reduce": all_pipelines,
|
||||
"ReduceScatter": all_pipelines,
|
||||
@@ -147,7 +157,8 @@ pipelined_types = ["bf16"]
|
||||
coll_camel_to_lower = {
|
||||
"AllGather": "all_gather",
|
||||
"AllReduce": "all_reduce",
|
||||
"AllToAllPivot": "alltoall_pivot",
|
||||
"AlltoAllPivot": "alltoall_pivot",
|
||||
"AllToAllGda": "alltoall_gda",
|
||||
"Broadcast": "broadcast",
|
||||
"Reduce": "reduce",
|
||||
"ReduceScatter": "reduce_scatter",
|
||||
@@ -503,7 +514,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
|
||||
)
|
||||
if fn.coll == "Broadcast":
|
||||
key = ((coll_idx & 0x3F) | ((proto_idx & 0x3F) << 8))
|
||||
if fn.coll in ["SendRecv", "AllToAllPivot"]:
|
||||
if fn.coll in ["SendRecv", "AlltoAllPivot", "AllToAllGda"]:
|
||||
key = ((coll_idx & 0x3F))
|
||||
|
||||
out(f' {{{key}, {fn_id}}}, {comment}\n')
|
||||
|
||||
@@ -93,7 +93,7 @@ __device__ __forceinline__ static void mscclReduce(int c, int numReductions, int
|
||||
|
||||
template<typename T, typename RedOp, typename Proto, bool fullOps>
|
||||
__device__ __forceinline__ void mscclRunInterpreter(
|
||||
struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
|
||||
struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = blockIdx.x;
|
||||
const int nthreads = MSCCL_MAX_NTHREADS;
|
||||
@@ -120,12 +120,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
case 0:
|
||||
dst = &ncclShmem.comm;
|
||||
src = comm;
|
||||
bytes = sizeof(ncclDevComm);
|
||||
bytes = sizeof(ncclKernelComm);
|
||||
break;
|
||||
case 1:
|
||||
// Get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
// Get address of channel without incurring indirect load from ncclKernelComm::channels
|
||||
dst = &ncclShmem.channel;
|
||||
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
|
||||
src = &((ncclKernelCommAndChannels*)comm)->channels[channelId];
|
||||
bytes = sizeof(ncclDevChannel);
|
||||
break;
|
||||
case 2:
|
||||
@@ -146,6 +146,9 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
}
|
||||
if (bytes) copyToShmem8(tid%WARP_SIZE, dst, src, bytes);
|
||||
}
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclShmem.warpComm = 0;
|
||||
#endif
|
||||
__syncthreads(); // publish shmem
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -369,13 +372,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
}
|
||||
|
||||
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
|
||||
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
|
||||
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
|
||||
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL, fullOps>(comm, algo, work); \
|
||||
} \
|
||||
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
|
||||
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
|
||||
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL128, fullOps>(comm, algo, work); \
|
||||
} \
|
||||
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
|
||||
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
|
||||
mscclRunInterpreter<type, Func##devredop<type>, ProtoSimple<MSCCL_CHUNKSTEPS/MSCCL_SLICESTEPS, MSCCL_SLICESTEPS, 0, 2>, fullOps>(comm, algo, work); \
|
||||
}
|
||||
|
||||
|
||||
@@ -654,7 +654,11 @@ public:
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), threadsPerBlock(blockDim.x),
|
||||
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[threadIdx.x / WARP_SIZE];
|
||||
#else
|
||||
auto *channel = &ncclShmem.channel;
|
||||
#endif
|
||||
barriers = &ncclShmem.groups[group].barrier;
|
||||
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
|
||||
int nrecv=0, nsend=0;
|
||||
|
||||
@@ -579,7 +579,11 @@ public:
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), /*compiler warnings*/
|
||||
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)),
|
||||
warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group), threadsPerBlock(blockDim.x){
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[warpInBlock];
|
||||
#else
|
||||
auto *channel = &ncclShmem.channel;
|
||||
#endif
|
||||
barriers = &ncclShmem.groups[group].barrier;
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
|
||||
@@ -502,14 +502,22 @@ private:
|
||||
|
||||
public:
|
||||
static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) {
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
|
||||
#else
|
||||
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
|
||||
#endif
|
||||
peerPtr->send[connIndex].step += steps;
|
||||
st_relaxed_sys_global(peerPtr->send[connIndex].tail, peerPtr->send[connIndex].step);
|
||||
}
|
||||
|
||||
static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) {
|
||||
int spins = 0;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
|
||||
#else
|
||||
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
|
||||
#endif
|
||||
peerPtr->recv[connIndex].step += steps;
|
||||
st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
|
||||
while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
|
||||
@@ -770,13 +778,20 @@ public:
|
||||
struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
|
||||
):
|
||||
tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(ncclShmem.warpComm? tidInBlock / WARP_SIZE : group), threadsPerBlock(blockDim.x){
|
||||
#else
|
||||
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group), threadsPerBlock(blockDim.x){
|
||||
|
||||
#endif
|
||||
barriers = &ncclShmem.groups[group].barrier;
|
||||
// PAT uses the same barrier for each group
|
||||
barriers_pat = &ncclShmem.barrier_pat;
|
||||
this->nworkers = nthreads;
|
||||
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[tidInBlock/WARP_SIZE];
|
||||
#else
|
||||
auto *channel = &ncclShmem.channel;
|
||||
#endif
|
||||
int peer = -1;
|
||||
flags = 0;
|
||||
index = -1;
|
||||
@@ -831,9 +846,9 @@ public:
|
||||
}
|
||||
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(channel->peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
|
||||
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(channel->peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
|
||||
|
||||
// if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
// flags |= AnyNetDeviceUnpack;
|
||||
@@ -861,7 +876,7 @@ public:
|
||||
// Load recv peer
|
||||
int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
|
||||
struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid;
|
||||
struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv;
|
||||
struct ncclConnInfo* conn = peer->conn = channel->peers[recvPeer]->recv+connIndexRecv;
|
||||
peer->step = conn->step;
|
||||
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
peer->stepCache = loadStepValue(peer->tailPtr = conn->tail);
|
||||
@@ -871,7 +886,7 @@ public:
|
||||
// Load send peer
|
||||
int sendPeer = mode == primsModePatAg ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
|
||||
peer = ((struct ncclPatPeer*)sendPeers)+tid;
|
||||
conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend;
|
||||
conn = peer->conn = channel->peers[sendPeer]->send+connIndexSend;
|
||||
peer->step = conn->step;
|
||||
peer->connFifo = conn->connFifo;
|
||||
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
|
||||
@@ -16,7 +16,12 @@ namespace {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#endif
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warp = threadIdx.x / WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
|
||||
#else
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
#endif
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int prevRank = ring->userRanks[nranks-1];
|
||||
@@ -24,7 +29,11 @@ namespace {
|
||||
size_t chunkCount;
|
||||
size_t channelCount;
|
||||
size_t gridOffset;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
#else
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
|
||||
#endif
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
|
||||
@@ -414,7 +414,7 @@ SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __h
|
||||
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
|
||||
#elif ROCM_VERSION < 60000
|
||||
#else
|
||||
SPECIALIZE_REDUCE(FuncSum, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) + (float)(y)))
|
||||
SPECIALIZE_REDUCE(FuncProd, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) * (float)(y)))
|
||||
SPECIALIZE_REDUCE(FuncMinMax, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)(fn.isMinNotMax ? fminf((float)(x), (float)(y)) : fmaxf((float)(x), (float)(y))))
|
||||
|
||||
@@ -16,14 +16,23 @@ namespace {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#endif
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
int warp = threadIdx.x / WARP_SIZE;
|
||||
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
|
||||
#else
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
#endif
|
||||
int const *ringRanks = ring->userRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
size_t count;
|
||||
size_t gridOffset;
|
||||
size_t channelCount;
|
||||
size_t chunkCount;
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
#else
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
#endif
|
||||
size_t offset;
|
||||
size_t dataOffset;
|
||||
uint32_t nelem;
|
||||
|
||||
@@ -1,35 +1,36 @@
|
||||
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "symmetric.h"
|
||||
#include "sym_kernels.h"
|
||||
#include "symmetric/kernel.h"
|
||||
#include "symmetric/primitives.h"
|
||||
|
||||
template<int BytePerPack, int UnrollPacks, int UnrollPeers>
|
||||
static __device__ void bcastDeep(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
char* inputHere, char* outputRank0, bool inPlace, int nIters
|
||||
ncclSymkArgsHandler const& handler, int tn, int t,
|
||||
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
|
||||
ncclSymPtr<char> input, ncclSymPtr<char> output, bool inPlace, int nIters
|
||||
) {
|
||||
using Pack = BytePack<BytePerPack>;
|
||||
int wn = tn/WARP_SIZE;
|
||||
int w = t/WARP_SIZE;
|
||||
int lane = t%WARP_SIZE;
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
|
||||
Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack tmp[UnrollPacks];
|
||||
|
||||
nIters -= w;
|
||||
if (0 < nIters) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = inpHere[u*WARP_SIZE];
|
||||
tmp[u] = inpPacks[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
if (0 < nIters) {
|
||||
while (true) {
|
||||
@@ -47,21 +48,21 @@ static __device__ void bcastDeep(
|
||||
if (partial && dr == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u];
|
||||
outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u];
|
||||
}
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
nIters -= wn;
|
||||
if (nIters <= 0) break;
|
||||
|
||||
// Load data for next iteration.
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = inpHere[u*WARP_SIZE];
|
||||
tmp[u] = inpPacks[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,18 +70,17 @@ static __device__ void bcastDeep(
|
||||
|
||||
template<int UnrollPeers, typename T>
|
||||
static __device__ void bcastEnds(
|
||||
ncclSymPrims& prim, int tn, int t,
|
||||
T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
ncclSymkArgsHandler const& handler, int tn, int t,
|
||||
ncclSymPtr<T> input, ncclSymPtr<T> output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
) {
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
BytePack<sizeof(T)>* inpHere = (BytePack<sizeof(T)>*)inputHere;
|
||||
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
BytePack<sizeof(T)>* inpPacks = (BytePack<sizeof(T)>*)input.localPtr();
|
||||
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
|
||||
#pragma unroll 1
|
||||
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
|
||||
size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i;
|
||||
BytePack<sizeof(T)> tmp = inpHere[elt];
|
||||
BytePack<sizeof(T)> tmp = inpPacks[elt];
|
||||
int dr = inPlace ? 1 : 0;
|
||||
int r = rank + dr;
|
||||
if (r == nRanks) r = 0;
|
||||
@@ -88,14 +88,14 @@ static __device__ void bcastEnds(
|
||||
for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) {
|
||||
#pragma unroll UnrollPeers
|
||||
for (int u=0; u < UnrollPeers; u++) {
|
||||
*add4G(outRank0+elt, r*stride4G) = tmp;
|
||||
outPacks.lsaPtr(r)[elt] = tmp;
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
#pragma unroll UnrollPeers
|
||||
for (int u=0; u < UnrollPeers; u++) {
|
||||
if (dr+u == nRanks) break;
|
||||
*add4G(outRank0+elt, r*stride4G) = tmp;
|
||||
outPacks.lsaPtr(r)[elt] = tmp;
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
@@ -103,95 +103,95 @@ static __device__ void bcastEnds(
|
||||
|
||||
template<typename T>
|
||||
static __device__ void bcast(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts
|
||||
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
|
||||
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
|
||||
ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
|
||||
) {
|
||||
bool inPlace = (input == output);
|
||||
// Mpve to rank=0
|
||||
output = prim.peerPtr(0, output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
|
||||
|
||||
uint32_t nPreBytes = (128u - inputUptr)%128u;
|
||||
uint32_t nPreBytes = (16 - input.offset)%16;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t cursor = nPreBytes;
|
||||
|
||||
constexpr int MinWarpPerBlock = 4;
|
||||
|
||||
if ((inputUptr-outputUptr)%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 1, UnrollPeers = 1;
|
||||
if ((input.offset - output.offset)%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
bcastDeep<BytePerPack, UnrollPacks, UnrollPeers>(
|
||||
prim, tn, t, waitNeeded,
|
||||
(char*)input + cursor, (char*)output + cursor, inPlace,
|
||||
chunks*MinWarpPerBlock
|
||||
handler, tn, t, waitNeeded, bar,
|
||||
(ncclSymPtr<char>)input + cursor,
|
||||
(ncclSymPtr<char>)output + cursor,
|
||||
inPlace, chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
|
||||
constexpr int BytePerPack = 4, UnrollPacks = 1, UnrollPeers = 1;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
|
||||
chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>(
|
||||
prim, tn, t, waitNeeded,
|
||||
(char*)input + cursor, (char*)output + cursor, inPlace,
|
||||
chunks*MinWarpPerBlock
|
||||
handler, tn, t, waitNeeded, bar,
|
||||
(ncclSymPtr<char>)input + cursor,
|
||||
(ncclSymPtr<char>)output + cursor,
|
||||
inPlace, chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
constexpr int UnrollPeers = 8;
|
||||
size_t nSufElts = (nBytes-cursor)/sizeof(T);
|
||||
bcastEnds<UnrollPeers>(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
bcastEnds<UnrollPeers>(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
|
||||
int const& rank = prim.rank;
|
||||
__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkArgsHandler handler{args};
|
||||
ncclLsaBarrierSession<ncclCoopCta> bar{
|
||||
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
|
||||
};
|
||||
int const& rank = handler.comm.rank;
|
||||
|
||||
// Threads numbered over rank.
|
||||
int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int btn = prim.nBlocks*blockDim.x;
|
||||
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
bool waitNeeded = true;
|
||||
handler.forEachWork<char>(
|
||||
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
|
||||
ncclSymPtr<char> input, ncclSymPtr<char> output) {
|
||||
// Threads numbered over rank.
|
||||
int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
block, nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int btn = nBlocks*blockDim.x;
|
||||
|
||||
bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts);
|
||||
bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
waitNeeded = false;
|
||||
}
|
||||
);
|
||||
|
||||
bar.sync(ncclCoopCta(), cuda::memory_order_release);
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static __device__ void bcastMultimem(
|
||||
ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts
|
||||
ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
|
||||
) {
|
||||
// Move output to multimem
|
||||
output = prim.multimemPtr(output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
uint32_t nPreBytes = (16-inputUptr)%16;
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input.localPtr());
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output.multimemPtr(handler.comm.lsaMultimem));
|
||||
uint32_t nPreBytes = (16 - input.offset)%16;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t nSufBytes;
|
||||
|
||||
@@ -230,51 +230,52 @@ static __device__ void bcastMultimem(
|
||||
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
|
||||
BytePack<sizeof(T)> val = *reinterpret_cast<BytePack<sizeof(T)>*>(inputUptr + cursor);
|
||||
multimem_st_global(outputUptr + cursor, val);
|
||||
cursor += tn*sizeof(T);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
|
||||
int const& rank = prim.rank;
|
||||
__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkArgsHandler handler{args};
|
||||
ncclLsaBarrierSession<ncclCoopCta> bar(
|
||||
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
|
||||
);
|
||||
int const& rank = handler.comm.rank;
|
||||
|
||||
char* input = args->input;
|
||||
char* output = args->output;
|
||||
size_t bytes = args->nElts;
|
||||
// Round robin memory to blocks.
|
||||
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int tn = prim.nBlocks*blockDim.x;
|
||||
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
handler.forEachWork<char>(
|
||||
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
|
||||
ncclSymPtr<char> input, ncclSymPtr<char> output) {
|
||||
// Round robin memory to blocks.
|
||||
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
block, nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int tn = nBlocks*blockDim.x;
|
||||
|
||||
bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes);
|
||||
bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts);
|
||||
}
|
||||
);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
bar.sync(ncclCoopCta(), cuda::memory_order_release);
|
||||
}
|
||||
|
||||
template<typename EltType>
|
||||
static __device__ void allgather_LL_body(
|
||||
ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
|
||||
ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
|
||||
EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
|
||||
) {
|
||||
using Pack = BytePack<8>;
|
||||
constexpr int EltPerPack = 8/sizeof(EltType);
|
||||
|
||||
ncclCoopCta cta;
|
||||
int rank = prim.rank;
|
||||
int nRanks = prim.nRanks;
|
||||
constexpr int tn = ncclSymMaxThreads;
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
int t = threadIdx.x;
|
||||
constexpr int tn = ncclSymkMaxThreads;
|
||||
|
||||
#pragma unroll 1
|
||||
while (0 < nElts) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
if (t < nIterPacks) {
|
||||
Pack x = loadPack<Pack>(input, t*EltPerPack, nElts);
|
||||
prim.bcastLL(/*slot=*/nIterPacks*rank + t, x);
|
||||
lla2a.bcast(/*slot=*/nIterPacks*rank + t, x);
|
||||
}
|
||||
|
||||
int tn_div_nPacks = tn/nIterPacks;
|
||||
@@ -287,7 +288,7 @@ static __device__ void allgather_LL_body(
|
||||
#pragma unroll 1
|
||||
for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) {
|
||||
Pack got[Unroll];
|
||||
prim.template recvLL<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
|
||||
lla2a.template recvUnrolled<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
|
||||
#pragma unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
storePack<Pack>(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
|
||||
@@ -302,7 +303,7 @@ static __device__ void allgather_LL_body(
|
||||
if (i + n*tn < nRanks*nIterPacks) n += 1;
|
||||
if (n != 0) {
|
||||
Pack got[Unroll];
|
||||
prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got);
|
||||
lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got);
|
||||
#pragma unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
if (u != 0 && u == n) break;
|
||||
@@ -316,7 +317,7 @@ static __device__ void allgather_LL_body(
|
||||
// The non-unrolled but "obviously correct" implementation for reference.
|
||||
#pragma unroll 1
|
||||
for (int i = t; i < nRanks*nIterPacks; i += tn) {
|
||||
Pack got = prim.template recvLL<Pack>(i);
|
||||
Pack got = lla2a.template recv<Pack>(i);
|
||||
storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
|
||||
peer += tn_div_nPacks;
|
||||
pack += tn_mod_nPacks;
|
||||
@@ -324,7 +325,7 @@ static __device__ void allgather_LL_body(
|
||||
}
|
||||
#endif
|
||||
|
||||
prim.endLL(cta);
|
||||
lla2a.endEpoch(ncclCoopCta());
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
@@ -333,38 +334,41 @@ static __device__ void allgather_LL_body(
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
|
||||
static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
|
||||
ncclSymkArgsHandler handler{args};
|
||||
ncclLLA2ASession<ncclCoopCta> lla2a(
|
||||
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
|
||||
);
|
||||
|
||||
using Pack = BytePack<8>;
|
||||
constexpr int BytePerPack = 8;
|
||||
int nElts = args->nElts;
|
||||
int nPacks = divUp(nElts, BytePerPack);
|
||||
|
||||
uint32_t nPackPerBlock, nPackModBlock;
|
||||
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
|
||||
int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
|
||||
int nBlockPacks = blockPackEnd - blockPackBegin;
|
||||
int nBlockElts = nElts - blockPackBegin*BytePerPack;
|
||||
nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack);
|
||||
char* blockInput = args->input + blockPackBegin*BytePerPack;
|
||||
char* blockOutput = args->output + blockPackBegin*BytePerPack;
|
||||
handler.singleWork<char>(
|
||||
[&]__device__(int nElts, int nAllElts,
|
||||
ncclSymPtr<char> input, ncclSymPtr<char> output) {
|
||||
int nPacks = divUp(nElts, BytePerPack);
|
||||
|
||||
uint32_t lowBits = args->nElts;
|
||||
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
|
||||
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
|
||||
if (__builtin_expect(lowBits%8 == 0, true)) {
|
||||
// NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us
|
||||
allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8);
|
||||
} else {
|
||||
allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts);
|
||||
}
|
||||
char* blockInput = input.localPtr();
|
||||
char* blockOutput = output.localPtr();
|
||||
|
||||
uint32_t lowBits = nElts;
|
||||
lowBits |= (uintptr_t)blockInput;
|
||||
lowBits |= (uintptr_t)blockOutput;
|
||||
if (__builtin_expect(lowBits%8 == 0, true)) {
|
||||
// NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us
|
||||
allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput,
|
||||
nElts/8, nPacks, nAllElts/8);
|
||||
} else {
|
||||
allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts);
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) {
|
||||
ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) {
|
||||
ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true);
|
||||
}
|
||||
|
||||
@@ -1,38 +1,41 @@
|
||||
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "symmetric.h"
|
||||
#include "sym_kernels.h"
|
||||
#include "nccl_device.h"
|
||||
#include "symmetric/kernel.h"
|
||||
#include "symmetric/primitives.h"
|
||||
|
||||
template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
|
||||
static __device__ __forceinline__ void allreduceDeep(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
Red red, char* inputRank0, char* outputRank0, int32_t nIters
|
||||
ncclSymkArgsHandler const& handler, int tn, int t,
|
||||
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
|
||||
Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
|
||||
) {
|
||||
using Pack = BytePack<BytePerPack>;
|
||||
using Acc = typename Red::EltType;
|
||||
using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;
|
||||
|
||||
ncclTeam world = ncclTeamWorld(handler.comm);
|
||||
int wn = tn/WARP_SIZE;
|
||||
int w = t/WARP_SIZE;
|
||||
int lane = t%WARP_SIZE;
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
|
||||
ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack acc0[UnrollPacks];
|
||||
|
||||
nIters -= w;
|
||||
if (0 < nIters) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
|
||||
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
if (0 < nIters) {
|
||||
while (true) {
|
||||
@@ -42,7 +45,7 @@ static __device__ __forceinline__ void allreduceDeep(
|
||||
{ Pack tmp1[UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
|
||||
tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
@@ -67,7 +70,7 @@ static __device__ __forceinline__ void allreduceDeep(
|
||||
if (partial && ur!=0 && dr+ur == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
|
||||
tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
|
||||
}
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
@@ -98,22 +101,22 @@ static __device__ __forceinline__ void allreduceDeep(
|
||||
if (partial && dr == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u];
|
||||
outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u];
|
||||
}
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
nIters -= wn;
|
||||
if (nIters <= 0) break;
|
||||
|
||||
// Load data for next iteration.
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
|
||||
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -121,21 +124,23 @@ static __device__ __forceinline__ void allreduceDeep(
|
||||
|
||||
template<int UnrollPeers, typename Red, typename T>
|
||||
static __device__ __forceinline__ void allreduceEnds(
|
||||
ncclSymPrims& prim, int tn, int t, Red red,
|
||||
T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
ncclSymkArgsHandler const& handler, int tn, int t, Red red,
|
||||
ncclSymPtr<T> input, ncclSymPtr<T> output,
|
||||
size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
) {
|
||||
using Acc = typename Red::EltType;
|
||||
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
|
||||
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
|
||||
ncclTeam world = ncclTeamWorld(handler.comm);
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
|
||||
ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
|
||||
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
|
||||
|
||||
#pragma unroll 1
|
||||
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
|
||||
size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
|
||||
BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
|
||||
BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
|
||||
BytePack<sizeof(Acc)> acc1;
|
||||
BytePack<sizeof(T)> tmp[UnrollPeers];
|
||||
int dr = 1;
|
||||
@@ -154,7 +159,7 @@ static __device__ __forceinline__ void allreduceEnds(
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && u!=0 && dr+u == nRanks) break;
|
||||
tmp[u] = *add4G(inpRank0+elt, r*stride4G);
|
||||
tmp[u] = inpPacks.peerPtr(world, r)[elt];
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
@@ -182,7 +187,7 @@ static __device__ __forceinline__ void allreduceEnds(
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && dr+u == nRanks) break;
|
||||
*add4G(outRank0+elt, r*stride4G) = acc0;
|
||||
outPacks.peerPtr(world, r)[elt] = acc0;
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
@@ -193,35 +198,33 @@ static __device__ __forceinline__ void allreduceEnds(
|
||||
|
||||
template<typename Red, typename T>
|
||||
static __device__ void allreduce(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
Red red, T* input, T* output, size_t nElts
|
||||
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
|
||||
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
|
||||
Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
|
||||
) {
|
||||
int nRanks = prim.nRanks;
|
||||
int nBlocks = prim.nBlocks;
|
||||
// Mpve to rank=0
|
||||
input = prim.peerPtr(0, input);
|
||||
output = prim.peerPtr(0, output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
int const& nRanks_rcp32 = handler.nRanks_rcp32;
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
|
||||
uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);
|
||||
|
||||
uint32_t nPreBytes = (16u - inputUptr)%16u;
|
||||
uint32_t nPreBytes = (16u - input.offset)%16u;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t cursor = nPreBytes;
|
||||
|
||||
constexpr int MinWarpPerBlock = 4;
|
||||
|
||||
if ((inputUptr-outputUptr)%16 == 0) {
|
||||
if ((input.offset - output.offset)%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
allreduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
|
||||
prim, tn, t, waitNeeded, red,
|
||||
(char*)input + cursor, (char*)output + cursor,
|
||||
handler, tn, t, waitNeeded, bar, red,
|
||||
(ncclSymPtr<char>)input + cursor,
|
||||
(ncclSymPtr<char>)output + cursor,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
@@ -229,16 +232,17 @@ static __device__ void allreduce(
|
||||
}
|
||||
}
|
||||
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
|
||||
constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
|
||||
prim, tn, t, waitNeeded, red,
|
||||
(char*)input + cursor, (char*)output + cursor,
|
||||
handler, tn, t, waitNeeded, bar, red,
|
||||
(ncclSymPtr<char>)input + cursor,
|
||||
(ncclSymPtr<char>)output + cursor,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
@@ -246,46 +250,51 @@ static __device__ void allreduce(
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
constexpr int UnrollPeers = 8;
|
||||
size_t nSufElts = (nBytes-cursor)/sizeof(T);
|
||||
allreduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
allreduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
}
|
||||
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
|
||||
int /*const&*/ rank = prim.rank;
|
||||
int /*const&*/ nRanks = prim.nRanks;
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkArgsHandler handler{args};
|
||||
ncclLsaBarrierSession<ncclCoopCta> bar{
|
||||
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
|
||||
};
|
||||
|
||||
// Threads numbered globally such that we round robin warps by rank then block.
|
||||
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
rank, nRanks,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int gtn = nRanks*prim.nBlocks*blockDim.x;
|
||||
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
|
||||
allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts);
|
||||
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
bool waitNeeded = true;
|
||||
handler.forEachWork<T>(
|
||||
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
|
||||
ncclSymPtr<T> input, ncclSymPtr<T> output) {
|
||||
// Threads numbered globally such that we round robin warps by rank then block.
|
||||
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
rank, nRanks,
|
||||
block, nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int gtn = nRanks*nBlocks*blockDim.x;
|
||||
|
||||
allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts);
|
||||
|
||||
waitNeeded = false;
|
||||
}
|
||||
);
|
||||
|
||||
bar.sync(ncclCoopCta(), cuda::memory_order_release);
|
||||
}
|
||||
|
||||
|
||||
template<typename Red, typename T>
|
||||
static __device__ void allreduceMultimem(
|
||||
ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
|
||||
int tn, int t, Red red, T* input, T* output, size_t nElts
|
||||
) {
|
||||
// Mpve to multimem
|
||||
input = prim.multimemPtr(input);
|
||||
output = prim.multimemPtr(output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
@@ -330,106 +339,132 @@ static __device__ void allreduceMultimem(
|
||||
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
|
||||
BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
|
||||
multimem_st_global(outputUptr + cursor, val);
|
||||
cursor += tn*sizeof(T);
|
||||
}
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkArgsHandler handler{args};
|
||||
ncclLsaBarrierSession<ncclCoopCta> bar{
|
||||
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
|
||||
};
|
||||
|
||||
// Threads numbered globally such that we round robin warps by rank then block.
|
||||
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
prim.rank, prim.nRanks,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int gtn = prim.nRanks*prim.nBlocks*blockDim.x;
|
||||
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
auto const& multimem = handler.comm.lsaMultimem;
|
||||
|
||||
allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts);
|
||||
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
handler.forEachWork<T>(
|
||||
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
|
||||
ncclSymPtr<T> input, ncclSymPtr<T> output) {
|
||||
// Threads numbered globally such that we round robin warps by rank then block.
|
||||
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
rank, nRanks,
|
||||
block, nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int gtn = nRanks*nBlocks*blockDim.x;
|
||||
|
||||
allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts);
|
||||
}
|
||||
);
|
||||
|
||||
bar.sync(ncclCoopCta(), cuda::memory_order_release);
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
|
||||
int /*const&*/ rank = prim.rank;
|
||||
using Acc = typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type;
|
||||
Red<Acc> red(args->redOpArg);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
|
||||
ncclSymkArgsHandler handler{args};
|
||||
ncclLLA2ASession<ncclCoopCta> lla2a(
|
||||
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A,
|
||||
blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
|
||||
);
|
||||
|
||||
int const& rank = handler.comm.rank;
|
||||
int const& nRanks = handler.comm.nRanks;
|
||||
using Acc = typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type;
|
||||
Red<Acc> red(handler.devWork->redOpArg);
|
||||
|
||||
using Pack = BytePack<8>;
|
||||
using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
|
||||
constexpr int EltPerPack = 8/sizeof(T);
|
||||
int nElts = args->nElts;
|
||||
int nPacks = divUp(nElts, EltPerPack);
|
||||
|
||||
bool packAligned = 8 <= alignof(T) || (
|
||||
args->nElts*sizeof(T) |
|
||||
(uint32_t)reinterpret_cast<uintptr_t>(args->input) |
|
||||
(uint32_t)reinterpret_cast<uintptr_t>(args->output)
|
||||
)%8 == 0;
|
||||
handler.singleWork<T>(
|
||||
[&]__device__(int nElts, int nAllElts,
|
||||
ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
|
||||
int nPacks = divUp(nElts, EltPerPack);
|
||||
|
||||
uint32_t nPackPerBlock, nPackModBlock;
|
||||
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
int begin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
|
||||
int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
|
||||
T* input = (T*)inputPtr.localPtr();
|
||||
T* output = (T*)outputPtr.localPtr();
|
||||
|
||||
nPacks = end - begin;
|
||||
nElts -= begin*EltPerPack;
|
||||
nElts = min(nElts, nPacks*EltPerPack);
|
||||
T* input = (T*)args->input + begin*EltPerPack;
|
||||
T* output = (T*)args->output + begin*EltPerPack;
|
||||
bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0;
|
||||
|
||||
ncclCoopCta cta;
|
||||
int t = threadIdx.x;
|
||||
int tn = ncclSymMaxThreads;
|
||||
ncclCoopCta cta;
|
||||
int t = threadIdx.x;
|
||||
int tn = ncclSymkMaxThreads;
|
||||
|
||||
if (__builtin_expect(packAligned, true)) {
|
||||
#pragma unroll 1
|
||||
while (0 < nPacks) {
|
||||
if (t < nPacks) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
|
||||
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
|
||||
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
|
||||
storePack((Pack*)output, t, nPacks, out);
|
||||
if (__builtin_expect(packAligned, true)) {
|
||||
#pragma unroll 1
|
||||
while (0 < nPacks) {
|
||||
if (t < nPacks) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
|
||||
lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
|
||||
AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
|
||||
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
|
||||
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
|
||||
return applyCast<T, Acc>(x);
|
||||
},
|
||||
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
|
||||
return applyReduce(red, a, b);
|
||||
}
|
||||
);
|
||||
storePack((Pack*)output, t, nPacks, applyCast<Acc, T>(out));
|
||||
}
|
||||
lla2a.endEpoch(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
} else {
|
||||
#pragma unroll 1
|
||||
while (0 < nElts) {
|
||||
if (t*EltPerPack < nElts) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
|
||||
lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
|
||||
AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
|
||||
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
|
||||
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
|
||||
return applyCast<T, Acc>(x);
|
||||
},
|
||||
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
|
||||
return applyReduce(red, a, b);
|
||||
}
|
||||
);
|
||||
storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(out));
|
||||
}
|
||||
lla2a.endEpoch(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nElts -= tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
}
|
||||
}
|
||||
prim.endLL(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
} else {
|
||||
#pragma unroll 1
|
||||
while (0 < nElts) {
|
||||
if (t*EltPerPack < nElts) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
|
||||
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
|
||||
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
|
||||
storePack(output, t*EltPerPack, nElts, out);
|
||||
}
|
||||
prim.endLL(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nElts -= tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) {
|
||||
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) {
|
||||
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
|
||||
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) {
|
||||
ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user