Merge commit '3d4813d99196bb349eccd50a925e2addc8f1622c' into develop

This commit is contained in:
Ameya Keshava Mallya
2026-01-21 20:28:14 +00:00
melakukan 8d996cc05f
295 mengubah file dengan 27704 tambahan dan 3800 penghapusan
+2 -2
Melihat File
@@ -34,7 +34,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
- name: Checkout rccl repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -143,5 +143,5 @@ jobs:
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
artifact_group: ${{ inputs.artifact_group }}
test_runs_on: linux-mi325-1gpu-ossci-rocm-frac
test_runs_on: linux-mi325-4gpu-ossci-rocm
artifact_run_id: ${{ github.run_id }}
@@ -39,14 +39,15 @@ jobs:
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
OUTPUT_ARTIFACTS_DIR: /apps/cvs_tests/dist_new/dist/rocm
THEROCK_BIN_DIR: "./build/bin"
AWS_SHARED_CREDENTIALS_FILE: /apps/cvs_tests/awsconfig/credentials.ini
steps:
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
@@ -61,20 +62,11 @@ jobs:
# The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster.
# salloc will hold 4 nodes while the commands inside the block run. After the block completes, salloc automatically releases the nodes.
# sbatch script runs rccl_heatmap_cvs script which validates and generates a bandwidth heatmap file for different rccl collectives
- name: Test gfx950
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
run: |
salloc -N 4 -p meta64 -t 04:00:00 --exclusive bash -c "
source /home/arravikum/TheRock/.venv/bin/activate &&
cd /home/arravikum/cvs &&
python input/setup.py &&
pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
--cluster_file ./input/cluster.json \
--config_file ./input/mi350_config.json \
--log-file=/tmp/rccl_log.log \
--html=/home/arravikum/cvs/test_reports/ci_test_report.html \
--capture=tee-sys \
--self-contained-html"
SETUP_NODES=1 sbatch --wait -N4 /apps/cvs_tests/cvs-sbatch/sbatch/default.sbatch
- name: Configure AWS Credentials for non-forked repos
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
@@ -91,6 +83,6 @@ jobs:
python3 build_tools/github_actions/upload_test_report_script.py \
--run-id "${{ github.run_id }}" \
--amdgpu-family "${{ inputs.amdgpu_families }}" \
--report-path "/home/arravikum/cvs/test_reports" \
--report-path "/apps/cvs_tests/test_reports" \
--log-destination "/logs/gfx950-dcgpu" \
--index-file-name "index_rccl_test_report.html"
@@ -30,13 +30,16 @@ jobs:
name: 'Test single-node'
runs-on: ${{ inputs.test_runs_on }}
container:
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98
options: --ipc host
--group-add video
--device /dev/kfd
--device /dev/dri
--group-add 110
--ulimit memlock=-1:-1
--security-opt seccomp=unconfined
--env-file /etc/podinfo/gha-gpu-isolation-settings
--user 0:0
defaults:
run:
shell: bash
@@ -50,7 +53,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
@@ -70,5 +73,5 @@ jobs:
# TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
run: |
pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
--log-cli-level=info \
-k "not test_rccl_correctness_tests"
-k "not test_rccl_correctness_tests" \
--log-cli-level=info
+2 -2
Melihat File
@@ -3,6 +3,6 @@
/coverage/
build/
ext/
src/transport/net_ib_rocm.cc
# Visual Studio Code
.vscode
.vscode
+4
Melihat File
@@ -8,3 +8,7 @@
url = https://github.com/nlohmann/json.git
ignore = dirty
shallow = true
[submodule "ext-src/rocSHMEM"]
path = ext-src/rocSHMEM
url = https://github.com/ROCm/rocSHMEM.git
branch = develop
+18 -1
Melihat File
@@ -2,12 +2,29 @@
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## Unreleased - RCCL 2.28.3 for ROCm 7.11
### Known issues
* AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm.
* ROCTx feature needs to be verified.
* Profiler plugin needs to be verified.
### Changed
* Compatibility with NCCL 2.28.3.
* The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script.
* MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL.
## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
### Changed
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
* Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
* Enabling WarpSpeed in auto mode using `RCCL_WARP_SPEED_AUTO` optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64 MB, and ReduceScatter from 256 MB.
* The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.
### Known issues
* AllToAllv/AllToAll hangs when run on a single GPU.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
+180 -33
Melihat File
@@ -26,7 +26,7 @@ option(BUILD_TESTS "Build unit test programs"
option(COLLTRACE "Collective Trace Option" ON)
option(DUMP_ASM "Disassemble and dump" OFF)
option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" OFF)
option(ENABLE_MSCCLPP "Enable MSCCL++" OFF)
option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP" OFF)
option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF)
@@ -42,6 +42,7 @@ option(TIMETRACE "Enable time-trace during compila
option(TRACE "Enable additional tracing" OFF)
option(FAULT_INJECTION "Enable fault injection" ON)
option(QUIET_WARNINGS "Supress compiler warnings" OFF)
option(ENABLE_ROCSHMEM "Enable rocSHMEM support in RCCL" OFF)
# Default GPU architectures to build
#==================================================================================================
@@ -65,6 +66,11 @@ include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
include(cmake/CheckSymbolExistsNoWarn.cmake)
# Include rocSHMEM build module only if enabled
if(ENABLE_ROCSHMEM)
include(cmake/ROCSHMEM.cmake)
endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build only for local GPU architecture
@@ -80,6 +86,9 @@ endif()
# Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# ROCM NetIB patch
include(cmake/rocmIb.cmake)
# Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
if (BUILD_ADDRESS_SANITIZER)
SET(amdgpu_targets "")
@@ -252,26 +261,56 @@ find_package(hsa-runtime64 REQUIRED)
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
## Check for ROCM-smi
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
if (rocm_smi_FOUND)
message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
else()
message(STATUS "Checking old include directory structure for rocm_smi")
set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
set(ROCM_SMI_LIBRARIES rocm_smi64)
## Check for amd-smi if ROCm 7.11.0 or newer
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
if(amd_smi_FOUND)
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
endif()
message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
set(SMI_LIBRARIES amd_smi)
set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
endif()
endif()
if(NOT USE_AMDSMI)
## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
if(rocm_smi_FOUND)
set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
else()
message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
endif()
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
endif()
message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
set(SMI_LIBRARIES rocm_smi64)
check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
if(${matchres} EQUAL -1)
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
else()
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
endif ()
endif()
check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
if(${matchres} EQUAL -1)
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
else()
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
endif ()
## Check for BFD library if custom backtrace is requested
if(BUILD_BFD)
@@ -318,6 +357,8 @@ if(BUILD_BFD)
endif()
endif()
# Check for --amdgpu-kernarg-preload-count
check_cxx_compiler_flag("-mllvm --amdgpu-kernarg-preload-count=16" HAVE_KERNARG_PRELOAD)
if (HAVE_KERNARG_PRELOAD)
@@ -333,6 +374,7 @@ endif()
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
# Check if any of the supported architectures are in GPU_TARGETS
set(ARCH_MATCH_FOUND OFF)
set(MSCCLPP_GPU_TARGETS "")
@@ -355,6 +397,20 @@ if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
endif()
## Disable WARP_SPEED if the build environment is invalid
set(WARP_SPEED_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
set(ARCH_MATCH_FOUND OFF)
foreach(ARCH IN LISTS GPU_TARGETS)
if(ARCH IN_LIST WARP_SPEED_SUPPORTED_ARCHS)
set(ARCH_MATCH_FOUND ON)
endif()
endforeach()
if (NOT ARCH_MATCH_FOUND)
set(ENABLE_WARP_SPEED OFF)
message(WARNING "Can only build WARP_SPEED for supported GPU_TARGETS: ${WARP_SPEED_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling WARP_SPEED build")
endif()
# cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
execute_process(
COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
@@ -437,9 +493,12 @@ configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h) # Used b
set(SRC_FILES
src/allocator.cc
src/bootstrap.cc
src/ce_coll.cc
src/channel.cc
src/collectives.cc
src/commDump.cc
src/debug.cc
src/dev_runtime.cc
src/enqueue.cc
src/group.cc
src/init.cc
@@ -448,11 +507,12 @@ set(SRC_FILES
src/msccl.cc
src/proxy.cc
src/rccl_wrap.cc
src/symmetric.cc
src/sym_kernels.cc
src/transport.cc
src/device/all_gather.h
src/device/all_reduce.h
src/device/alltoall_pivot.h
src/device/alltoall_gda.h
src/device/broadcast.h
src/device/common.h
src/device/common_kernel.h
@@ -498,6 +558,7 @@ set(SRC_FILES
src/include/BfdBacktrace.hpp
src/include/bitops.h
src/include/bootstrap.h
src/include/ce_coll.h
src/include/channel.h
src/include/checks.h
src/include/collectives.h
@@ -507,6 +568,7 @@ set(SRC_FILES
src/include/cpuset.h
# src/include/cudawrap.h
src/include/debug.h
src/include/dev_runtime.h
src/include/device.h
src/include/enqueue.h
src/include/gdrwrap.h
@@ -521,6 +583,7 @@ set(SRC_FILES
src/include/ipcsocket.h
src/include/mnnvl.h
src/include/nccl_common.h
src/include/nccl_device.h
src/include/net_device.h
src/include/net.h
src/include/nvmlwrap.h
@@ -537,16 +600,16 @@ set(SRC_FILES
src/include/register.h
src/include/register_inline.h
src/include/rccl_float8.h
src/include/rocm_smi_wrap.h
src/include/rocmwrap.h
src/include/roctx.h
src/include/recorder.h
src/include/scheduler.h
src/include/shm.h
src/include/shmutils.h
src/include/signals.h
src/include/socket.h
src/include/strongstream.h
src/include/symmetric.h
src/include/sym_kernels.h
src/include/timer.h
src/include/transport.h
src/include/trees.h
@@ -555,12 +618,32 @@ set(SRC_FILES
src/include/mlx5/mlx5dvcore.h
src/include/mlx5/mlx5dvsymbols.h
src/include/mlx5/mlx5dvwrap.h
src/include/ionic/ionicdvcore.h
src/include/ionic/ionicdvsymbols.h
src/include/ionic/ionicdvwrap.h
src/include/msccl/msccl_lifecycle.h
src/include/msccl/msccl_parser.h
src/include/msccl/msccl_scheduler.h
src/include/msccl/msccl_setup.h
src/include/msccl/msccl_status.h
src/include/msccl/msccl_struct.h
src/include/nccl_device/comm.h
src/include/nccl_device/coop.h
src/include/nccl_device/core.h
src/include/nccl_device/ll_a2a.h
src/include/nccl_device/mem_barrier.h
src/include/nccl_device/ptr.h
src/include/nccl_device/utility.h
src/include/nccl_device/impl/comm__funcs.h
src/include/nccl_device/impl/comm__types.h
src/include/nccl_device/impl/core__funcs.h
src/include/nccl_device/impl/core__types.h
src/include/nccl_device/impl/ll_a2a__funcs.h
src/include/nccl_device/impl/ll_a2a__types.h
src/include/nccl_device/impl/mem_barrier__funcs.h
src/include/nccl_device/impl/mem_barrier__types.h
src/include/nccl_device/impl/ptr__funcs.h
src/include/nccl_device/impl/ptr__types.h
src/include/npkit/npkit.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit_struct.h
@@ -608,6 +691,7 @@ set(SRC_FILES
src/include/plugin/net/net_v8.h
src/include/plugin/net/net_v9.h
src/include/plugin/net/net_v10.h
src/include/plugin/net/net_v11.h
src/include/plugin/profiler/net_ib_v1.h
src/include/plugin/profiler/net_ib.h
src/include/plugin/profiler/net_socket_v1.h
@@ -616,9 +700,11 @@ set(SRC_FILES
src/include/plugin/profiler/profiler_v2.h
src/include/plugin/profiler/profiler_v3.h
src/include/plugin/profiler/profiler_v4.h
src/include/plugin/profiler/profiler_v5.h
src/include/plugin/tuner/tuner_v2.h
src/include/plugin/tuner/tuner_v3.h
src/include/plugin/tuner/tuner_v4.h
src/include/plugin/tuner/tuner_v5.h
src/misc/alt_rsmi.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
@@ -631,11 +717,12 @@ set(SRC_FILES
src/misc/ipcsocket.cc
src/misc/mlx5dvsymbols.cc
src/misc/mlx5dvwrap.cc
src/misc/ionicdvsymbols.cc
src/misc/ionicdvwrap.cc
src/misc/npkit.cc
# src/misc/nvmlwrap.cc
src/misc/nvmlwrap_stub.cc
src/misc/param.cc
src/misc/rocm_smi_wrap.cc
src/misc/rocmwrap.cc
src/misc/roctx.cc
src/misc/recorder.cc
@@ -649,6 +736,9 @@ set(SRC_FILES
src/misc/msccl/msccl_setup.cc
src/misc/msccl/msccl_status.cc
src/misc/proxy_trace/proxy_trace.cc
src/nccl_device/core.cc
src/nccl_device/ll_a2a.cc
src/nccl_device/mem_barrier.cc
src/plugin/net.cc
src/plugin/plugin_open.cc
src/plugin/profiler.cc
@@ -658,13 +748,16 @@ set(SRC_FILES
src/plugin/net/net_v8.cc
src/plugin/net/net_v9.cc
src/plugin/net/net_v10.cc
src/plugin/net/net_v11.cc
src/plugin/profiler/profiler_v1.cc
src/plugin/profiler/profiler_v2.cc
src/plugin/profiler/profiler_v3.cc
src/plugin/profiler/profiler_v4.cc
src/plugin/profiler/profiler_v5.cc
src/plugin/tuner/tuner_v2.cc
src/plugin/tuner/tuner_v3.cc
src/plugin/tuner/tuner_v4.cc
src/plugin/tuner/tuner_v5.cc
src/ras/client.cc
src/ras/client_support.cc
src/ras/collectives.cc
@@ -675,10 +768,12 @@ set(SRC_FILES
src/register/coll_reg.cc
src/register/register.cc
src/register/sendrecv_reg.cc
src/scheduler/symmetric_sched.cc
src/transport/coll_net.cc
src/transport/generic.cc
src/transport/net.cc
src/transport/net_ib.cc
src/transport/net_ib_rocm.cc
src/transport/net_socket.cc
src/transport/nvls.cc
src/transport/p2p.cc
@@ -695,6 +790,19 @@ set(SRC_FILES
src/misc/latency_profiler/CollTraceUtils.cc
)
if(USE_AMDSMI)
set(SMI_SOURCES
src/include/amdsmi_wrap.h
src/misc/amdsmi_wrap.cc
)
else()
set(SMI_SOURCES
src/include/rocm_smi_wrap.h
src/misc/rocm_smi_wrap.cc
)
endif()
list(APPEND SRC_FILES ${SMI_SOURCES})
if (ENABLE_MSCCL_KERNEL)
set(MSCCL_KERNEL_SOURCES
src/device/msccl_kernel_impl.h
@@ -846,6 +954,8 @@ target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/nccl_device)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/ionic)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
target_include_directories(rccl PRIVATE ${HSA_INCLUDE_PATH})
@@ -858,26 +968,59 @@ if(ROCTX_ENABLE)
target_include_directories(rccl PRIVATE ${ROCTRACER_INCLUDE_DIR})
endif()
## Set RCCL compile definitions
if(COLLTRACE)
target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
endif()
if(ENABLE_MSCCL_KERNEL)
message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
endif()
if(ENABLE_MSCCLPP)
target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
endif()
if(HAVE_ROCM_SMI64CONFIG)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
if(USE_AMDSMI)
target_compile_definitions(rccl PRIVATE USE_AMDSMI)
else()
if(HAVE_ROCM_SMI64CONFIG)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
endif()
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
endif()
endif()
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
if(ENABLE_WARP_SPEED)
target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
endif()
if(ENABLE_ROCSHMEM)
target_compile_definitions(rccl PRIVATE ENABLE_ROCSHMEM)
endif()
# ==== rocSHMEM integration (optional) ====
if (ENABLE_ROCSHMEM)
add_rocshmem_targets()
# Ensure rocSHMEM is fully built/installed before compiling rccl
if (TARGET rocshmem_ext)
add_dependencies(rccl rocshmem_ext)
endif()
if (ROCSHMEM_INCLUDE_DIR)
target_include_directories(rccl PRIVATE ${ROCSHMEM_INCLUDE_DIR})
endif()
# Moved to where MSCCL target_links
## target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
target_link_libraries(rccl PRIVATE ${IBVERBS})
endif()
# NPKit flags
## May be better to move these to a separate file
if(ENABLE_NPKIT)
message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT)
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -1099,8 +1242,7 @@ if(ENABLE_CODE_COVERAGE)
message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.")
target_compile_options(rccl PRIVATE
-fvisibility=default -Xarch_host -fprofile-instr-generate
-Xarch_host -fcoverage-mapping)
-fvisibility=default -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping)
set(COVERAGE_SHARED_LINKER_FLAGS
-fprofile-generate
@@ -1169,7 +1311,7 @@ if (FAULT_INJECTION)
endif()
## Set RCCL linked library directories
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
@@ -1201,11 +1343,15 @@ target_link_libraries(rccl PRIVATE Threads::Threads)
target_link_libraries(rccl INTERFACE hip::host)
target_link_libraries(rccl PRIVATE hip::device)
target_link_libraries(rccl PRIVATE dl)
target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
target_link_libraries(rccl PRIVATE ${SMI_LIBRARIES})
target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
if(ENABLE_MSCCLPP)
target_link_libraries(rccl PRIVATE mscclpp_nccl)
endif()
if(ENABLE_ROCSHMEM)
target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
target_link_libraries(rccl PRIVATE ${IBVERBS})
endif()
## Set RCCL link options
## Find out available memory
@@ -1317,7 +1463,8 @@ if(BUILD_ADDRESS_SANITIZER)
else()
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
endif()
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0")
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}")
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)
+1 -1
Melihat File
@@ -42,7 +42,7 @@ RCCL build & installation helper script
--debug Build debug library
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--disable-msccl-kernel Build without MSCCL kernels
--enable-msccl-kernel Build with MSCCL kernels
--enable-mscclpp Build with MSCCL++ support
--enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
--disable-roctx Build without ROCTX logging
@@ -0,0 +1,35 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for a pre-installed rocSHMEM, resolved through
# find_package(rocshmem_static).
#
# Input:
#   ROCSHMEM_INSTALL_DIR  - optional install prefix used as a search hint.
# Output:
#   rocshmem_static_FOUND - set by find_package_handle_standard_args() when
#                           both variables below were located.
#   ROCSHMEM_INCLUDE_DIR  - directory containing rocshmem/rocshmem.hpp or
#                           rocshmem/rocshmem.h.
#   ROCSHMEM_LIBRARY      - path of the rocshmem library.
#   IBVERBS               - path of libibverbs, or IBVERBS-NOTFOUND.

# find_package_handle_standard_args() is defined by this module; include it
# explicitly instead of relying on an earlier find module having loaded it.
include(FindPackageHandleStandardArgs)

find_path(ROCSHMEM_INCLUDE_DIR
    NAMES rocshmem/rocshmem.hpp rocshmem/rocshmem.h
    HINTS ${ROCSHMEM_INSTALL_DIR}/include/)

find_library(ROCSHMEM_LIBRARY
    NAMES rocshmem
    HINTS ${ROCSHMEM_INSTALL_DIR}/lib)

# TODO: decide whether ibverbs should become a required variable of the
# find_package_handle_standard_args() call below. For now it is looked up
# but does not influence rocshmem_static_FOUND.
find_library(IBVERBS ibverbs)

find_package_handle_standard_args(rocshmem_static DEFAULT_MSG ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY)

# Keep the cache entries created by the lookups above out of the default
# cmake-gui / ccmake view.
mark_as_advanced(ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY IBVERBS)
+113
Melihat File
@@ -0,0 +1,113 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
include(ExternalProject)
# add_rocshmem_targets()
#
# Locates rocSHMEM for RCCL, or arranges for it to be built from the
# ext-src/rocSHMEM submodule when no usable installation is found.
#
# Input:
#   ROCSHMEM_INSTALL_DIR - optional prefix of an existing rocSHMEM install.
# Variables set in the caller's scope:
#   ROCSHMEM_INCLUDE_DIR - rocSHMEM include directory.
#   ROCSHMEM_LIBRARY     - rocSHMEM library to link.
#   IBVERBS              - path of libibverbs.
# Targets created only when building from the submodule:
#   rocshmem_checkout_submodule, rocshmem_ext, rocshmem_static.
# Configuration stops with FATAL_ERROR when libibverbs cannot be found.
function(add_rocshmem_targets)
  # libibverbs is required whichever way rocSHMEM is obtained, so resolve it
  # once up front instead of repeating the lookup in every branch.
  find_library(_IBVERBS ibverbs)
  if(NOT _IBVERBS)
    message(FATAL_ERROR "libibverbs not found (install rdma-core/libibverbs-dev)")
  endif()
  set(IBVERBS ${_IBVERBS} PARENT_SCOPE)

  # Check for an existing installation via the user-provided prefix
  # ROCSHMEM_INSTALL_DIR.
  if(ROCSHMEM_INSTALL_DIR)
    # Function-local change: lets find_package() see the find module kept in
    # the cmake/ directory.
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
    find_package(rocshmem_static)
  endif()

  # If no pre-existing installation, build from submodule into ext/rocshmem
  if(NOT rocshmem_static_FOUND)
    set(_rccl_root "${CMAKE_SOURCE_DIR}")
    set(ROCSHMEM_SOURCE "${_rccl_root}/ext-src/rocSHMEM")
    set(ROCSHMEM_INSTALL_DIR "${_rccl_root}/ext/rocshmem")

    # Make sure submodule exists (same style as MSCCL++: custom rule + target)
    add_custom_command(
      OUTPUT "${ROCSHMEM_SOURCE}/CMakeLists.txt"
      COMMAND git submodule update --init --recursive ext-src/rocSHMEM
      WORKING_DIRECTORY "${_rccl_root}"
      COMMENT "Checking out submodule: ext-src/rocSHMEM"
      VERBATIM
    )
    add_custom_target(rocshmem_checkout_submodule
      DEPENDS "${ROCSHMEM_SOURCE}/CMakeLists.txt")

    # Where our patch files live (like MSCCL++)
    set(EXT_SOURCE "${_rccl_root}/ext-src")

    # Build and install rocSHMEM. We run `../scripts/build_configs/gda_bnxt`
    # from a 'build' dir just like the README shows.
    ExternalProject_Add(rocshmem_ext
      SOURCE_DIR "${ROCSHMEM_SOURCE}"
      INSTALL_DIR "${ROCSHMEM_INSTALL_DIR}"
      UPDATE_DISCONNECTED TRUE
      LOG_DOWNLOAD FALSE
      LOG_CONFIGURE FALSE
      LOG_BUILD FALSE
      LOG_INSTALL FALSE
      BUILD_IN_SOURCE TRUE
      DOWNLOAD_COMMAND "" # using the submodule checkout above
      TEST_COMMAND ""
      DEPENDS rocshmem_checkout_submodule
      # Rocshmem submodule commit hash -> commit b28a56bd54ccc581d05a439ffa466c3dacb3385
      # The project has its own scripts; we replicate the README sequence:
      CONFIGURE_COMMAND ""
      # Each step is a separate COMMAND: ExternalProject runs them in order and
      # stops at the first failure. A literal `&&` is not guaranteed to be
      # interpreted by a shell, so it is not used to chain the steps.
      BUILD_COMMAND
        ${CMAKE_COMMAND} -E make_directory build
      COMMAND
        ${CMAKE_COMMAND} -E chdir build bash -lc "../scripts/build_configs/gda_bnxt -DUSE_EXTERNAL_MPI=OFF -DUSE_IPC=ON -DBUILD_EXAMPLES=OFF "
      COMMAND
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND}
          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
          -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
          -DBUILD_EXAMPLES=OFF ..
      COMMAND
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_MAKE_PROGRAM} -j
      INSTALL_COMMAND
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_MAKE_PROGRAM} install
    )

    # Define the variables RCCL expects. These paths only exist once the
    # rocshmem_ext target has been built and installed.
    set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INSTALL_DIR}/include" PARENT_SCOPE)
    set(ROCSHMEM_LIBRARY "${ROCSHMEM_INSTALL_DIR}/lib/librocshmem.a" PARENT_SCOPE)

    # Provide a dummy target other code can depend on. Target-level ordering
    # is expressed with add_dependencies(); DEPENDS on add_custom_target() is
    # meant for files and custom-command outputs.
    add_custom_target(rocshmem_static ALL)
    add_dependencies(rocshmem_static rocshmem_ext)
  else()
    # We found a prebuilt rocSHMEM; export variables upward as-is
    set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INCLUDE_DIR}" PARENT_SCOPE)
    set(ROCSHMEM_LIBRARY "${ROCSHMEM_LIBRARY}" PARENT_SCOPE)
  endif()
endfunction()
+257
Melihat File
@@ -0,0 +1,257 @@
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

# Configure-time generation of the ROCm NetIB transport source: the patch in
# ext-src/ is applied from the RCCL source root and the patched result is
# written to ROCM_NETIB_FILE (patch -o).
message(STATUS "Generating ROCM NetIB... ")

# -------------------------
# Configurable paths
# -------------------------
# Path to RCCL source tree (local clone)
set(RCCL_SRC_DIR "${CMAKE_SOURCE_DIR}" CACHE PATH "Path to RCCL source directory")
# Path to patch file
set(ROCM_NETIB_PATCH_FILE "${CMAKE_SOURCE_DIR}/ext-src/rocm_netib.patch" CACHE FILEPATH "ROCM NETIB Patch file to apply to RCCL")
# Output of the patch step
set(ROCM_NETIB_FILE "${CMAKE_SOURCE_DIR}/src/transport/net_ib_rocm.cc" CACHE FILEPATH "Generated ROCM NETIB file")

# -------------------------
# Find tools
# -------------------------
find_program(PATCH_EXECUTABLE patch)
find_program(SED_EXECUTABLE sed)
if(NOT PATCH_EXECUTABLE)
  message(FATAL_ERROR "patch not found; it is required to generate ${ROCM_NETIB_FILE}")
endif()

# Print the progress line through message(): chaining it as a second COMMAND
# would make execute_process() pipe it into patch's stdin and hide it.
message(STATUS "Applying RCCL ROCM NetIB patch... to ${CMAKE_SOURCE_DIR}")

# Invoke the discovered tool directly (no shell) so paths containing spaces
# survive, and capture the exit status so a failed patch stops the configure
# step instead of surfacing later as a missing or stale source file.
execute_process(
  COMMAND "${PATCH_EXECUTABLE}" -p1 -i "${ROCM_NETIB_PATCH_FILE}" -o "${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY "${RCCL_SRC_DIR}"
  RESULT_VARIABLE _rocm_netib_patch_result
)
if(NOT _rocm_netib_patch_result EQUAL 0)
  message(FATAL_ERROR
    "Applying ${ROCM_NETIB_PATCH_FILE} failed (result: ${_rocm_netib_patch_result}); "
    "${ROCM_NETIB_FILE} was not generated correctly")
endif()
# Rename the InfiniBand transport symbols inside the generated ROCM_NETIB_FILE.
# NOTE(review): presumably this avoids clashes with the identically named
# symbols of the stock net_ib transport -- confirm.
#
# All substitutions run in ONE sed process. sed applies the -e expressions in
# list order to every line, which gives the same result as running them one
# after another over the whole file. Keep the order: longer names are listed
# before their prefixes (e.g. ncclIbMakeVDeviceInternal before
# ncclIbMakeVDevice). '(' is a literal character in sed basic regular
# expressions, so it needs no escaping here.
find_program(SED_EXECUTABLE sed)
if(NOT SED_EXECUTABLE)
  message(FATAL_ERROR "sed not found; it is required to post-process ${ROCM_NETIB_FILE}")
endif()

set(_rocm_netib_sed_script
  "s/NCCL_PARAM(Ib/NCCL_PARAM(RocmIb/g"
  "s/RCCL_PARAM(Ib/RCCL_PARAM(RocmIb/g"
  "s/ncclParamIb/ncclParamRocmIb/g"
  "s/rcclParamIb/rcclParamRocmIb/g"
  "s/ncclIbMergedDevs/rocmIbMergedDevs/g"
  "s/ncclIbDevs/rocmIbDevs/g"
  "s/ncclIbLock/rocmIbLock/g"
  "s/ibProviderName/rocmIbProviderName/g"
  "s/ncclIbAsyncThread/rocmIbAsyncThread/g"
  "s/ncclIbGdrSupport/rocmIbGdrSupport/g"
  "s/ncclIbDmaBufSupport/rocmIbDmaBufSupport/g"
  "s/ncclIbInitCommDevBase/rocmIbInitCommDevBase/g"
  "s/ncclIbDestroyBase/rocmIbDestroyBase/g"
  "s/ncclIbRtrQp/rocmIbRtrQp/g"
  "s/ncclIbRtsQp/rocmIbRtsQp/g"
  "s/ForceEnableGdrdma/RocmForceEnableGdrdma/g"
  "s/ncclIbCheckVProps/rocmIbCheckVProps/g"
  "s/ncclIbGetRequest/rocmIbGetRequest/g"
  "s/ncclIbFreeRequest/rocmIbFreeRequest/g"
  "s/ncclIbRegMrDmaBufInternal/rocmIbRegMrDmaBufInternal/g"
  "s/ncclIbGetNetCommDevBase/rocmIbGetNetCommDevBase/g"
  "s/ncclIbDeregMrInternal/rocmIbDeregMrInternal/g"
  "s/ncclIbPostFifo/rocmIbPostFifo/g"
  "s/reqTypeStr/rocmIbReqTypeStr/g"
  "s/rcclNetP2pPolicy/rcclRocmNetP2pPolicy/g"
  "s/ncclIbMakeVDeviceInternal/rocmIbMakeVDeviceInternal/g"
  "s/ncclIbMakeVDevice/rocmIbMakeVDevice/g"
  "s/ncclIbInit/rocmIbInit/g"
  "s/ncclIbDevices/rocmIbDevices/g"
  "s/ncclIbGetPhysProperties/rocmIbGetPhysProperties/g"
  "s/ncclIbGetProperties/rocmIbGetProperties/g"
  "s/ncclIbListen(/rocmIbListen(/g"
  "s/ncclIbListen,/rocmIbListen,/g"
  "s/ncclIbConnect(/rocmIbConnect(/g"
  "s/ncclIbConnect /rocmIbConnect /g"
  "s/ncclIbConnect,/rocmIbConnect,/g"
  "s/ncclIbAccept/rocmIbAccept/g"
  "s/ncclIbTest/rocmIbTest/g"
  "s/ncclIbRegMrDmaBuf/rocmIbRegMrDmaBuf/g"
  "s/ncclIbRegMr/rocmIbRegMr/g"
  "s/ncclIbDeregMr/rocmIbDeregMr/g"
  "s/ncclIbIsend/rocmIbIsend/g"
  "s/ncclIbIrecv/rocmIbIrecv/g"
  "s/ncclIbIflush/rocmIbIflush/g"
  "s/ncclIbCloseSend/rocmIbCloseSend/g"
  "s/ncclIbCloseRecv/rocmIbCloseRecv/g"
  "s/ncclIbCloseListen/rocmIbCloseListen/g"
  "s/ncclNetIb/rocmNetIb/g"
  "s/ncclIbFinalize/rocmNetIbFinalize/g"
  "s/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g"
)

# Turn the script into "-e <expr>" argument pairs. Arguments are passed to sed
# directly (no shell), so no shell quoting is involved.
set(_rocm_netib_sed_args)
foreach(_rocm_netib_expr IN LISTS _rocm_netib_sed_script)
  list(APPEND _rocm_netib_sed_args -e "${_rocm_netib_expr}")
endforeach()

execute_process(
  COMMAND "${SED_EXECUTABLE}" -i ${_rocm_netib_sed_args} "${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY "${RCCL_SRC_DIR}"
  RESULT_VARIABLE _rocm_netib_sed_result
)
if(NOT _rocm_netib_sed_result EQUAL 0)
  message(FATAL_ERROR
    "Renaming symbols in ${ROCM_NETIB_FILE} failed (result: ${_rocm_netib_sed_result})")
endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@@ -38,13 +38,15 @@ Collect this information about the ROCm version, GPU/accelerator, platform, and
rocminfo
* Run these ``rocm-smi`` commands to display the system topology.
* Run these ``amd-smi`` commands to display the system topology.
.. code:: shell
rocm-smi
rocm-smi --showtopo
rocm-smi --showdriverversion
amd-smi
amd-smi topology
amd-smi static --driver
amd-smi firmware
amd-smi xgmi
* Determine the values of the ``PATH`` and ``LD_LIBRARY_PATH`` environment variables.
+1 -1
Melihat File
@@ -1 +1 @@
rocm-docs-core==1.26.0
rocm-docs-core==1.29.0
@@ -25,7 +25,7 @@ breathe==4.35.0
# via rocm-docs-core
certifi==2024.7.4
# via requests
cffi==1.16.0
cffi==2.0.0
# via
# cryptography
# pynacl
@@ -164,7 +164,7 @@ pygments==2.18.0
# sphinx
pyjwt[crypto]==2.8.0
# via pygithub
pynacl==1.5.0
pynacl==1.6.2
# via pygithub
python-dateutil==2.9.0.post0
# via jupyter-client
@@ -187,7 +187,7 @@ requests==2.32.4
# via
# pygithub
# sphinx
rocm-docs-core==1.26.0
rocm-docs-core==1.29.0
# via -r requirements.in
rpds-py==0.22.3
# via
@@ -265,7 +265,7 @@ typing-extensions==4.12.0
# pygithub
# referencing
# sqlalchemy
urllib3==2.5.0
urllib3==2.6.3
# via
# pygithub
# requests
+16 -9
Melihat File
@@ -60,36 +60,36 @@ of newer ones.
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v10)
# API (v11)
Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
Below is the main `ncclNet_v11` struct. Each function is explained in later sections.
```
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
@@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us
`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
internal ones.
Every call to `init` returns an opaque context that the plugin uses internally to allocate resources
and manage state. Such context is passed to other net plugin calls that create further resources,
such as `listen` and `connect`. Every context is uniquely associated to a communicator
using the commId. The network can also be initialized with a per communicator configuration using
the `config` argument.
To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
the plugin code adding the following definitions:
@@ -282,7 +288,7 @@ side.
`listen`
To create a connection, NCCL will start by calling `listen` on the receiver side. This function
takes a device number as input argument, and should return a local `listenComm` object, and a
takes the opaque plugin context returned by `init` and a device number as input arguments, and should return a local `listenComm` object, and a
`handle` to pass to the other side, so that the sender side can connect to the receiver.
The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
@@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
succeeds.
The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference
the `ncclNetCommConfig_t` passed to the `init` function and containing a trafficClass field.
This field can be used by the network plugin to specify the QoS level of the connection. By default,
`trafficClass` is set to -1 but can be configured by the application during communicator initialization
to select a plugin-supported QoS level.
@@ -0,0 +1,19 @@
# Example NCCL net plugin: one C source compiled into a shared library.
set(SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c)

add_library(nccl-net-example SHARED ${SRC_FILES})

# The versioned plugin API headers live in the nccl/ subdirectory.
target_include_directories(nccl-net-example
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nccl)

# Produce libnccl-net-example (same file name as the Makefile build), built as
# position-independent code and placed next to the unit-test plugins.
set_property(TARGET nccl-net-example PROPERTY OUTPUT_NAME "nccl-net-example")
set_property(TARGET nccl-net-example PROPERTY PREFIX "lib")
set_property(TARGET nccl-net-example PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nccl-net-example PROPERTY
  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins)
+6 -4
Melihat File
@@ -22,7 +22,9 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32
#define NCCL_NET_MAX_DEVS_PER_NIC 4
#include "net_v11.h"
#include "net_v10.h"
#include "net_v9.h"
#include "net_v8.h"
@@ -33,9 +35,9 @@
#include "net_v3.h"
#include "net_v2.h"
// Unversioned aliases used by plugin code. They track the newest interface
// revision included above (v11). Each alias is declared exactly once: a second
// declaration naming the v10 types would be a conflicting typedef.
typedef ncclNet_v11_t ncclNet_t;
typedef ncclNetProperties_v11_t ncclNetProperties_t;
typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
#endif // end include guard
@@ -12,7 +12,7 @@
// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
@@ -27,6 +27,7 @@ typedef struct {
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
#endif
@@ -5,10 +5,9 @@
#ifndef NET_V10_H_
#define NET_V10_H_
#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
// v10 description of a (possibly virtual) device: ndevs device indices stored
// in devs. The array bound is the NCCL_NET_MAX_DEVS_PER_NIC constant shared by
// all interface versions; the member is declared once, since a second "devs"
// member would not compile.
typedef struct {
  int ndevs;
  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
} ncclNetVDeviceProps_v10_t;
@@ -0,0 +1,120 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V11_H_
#define NET_V11_H_
// Properties of a (possibly virtual) NIC, expressed as a list of device
// indices. Passed to makeVDevice() and embedded in ncclNetProperties_v11_t.
// NOTE(review): presumably only the first ndevs entries of devs are
// meaningful -- confirm with the NCCL core.
typedef struct {
int ndevs;
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
} ncclNetVDeviceProps_v11_t;
// Sentinel for "no traffic class selected" (presumably the default value of
// trafficClass below -- confirm).
#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
// Per-communicator network configuration handed to init().
typedef struct {
// Plugin-specific TC value
int trafficClass;
} ncclNetCommConfig_v11_t;
// Device properties filled in by getProperties() for one device (v11 layout).
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int forceFlush; // Force a flush on receives
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
// Device indices making up this (possibly virtual) device.
ncclNetVDeviceProps_v11_t vProps;
size_t maxP2pBytes; // Max transfer size for point-to-point operations
size_t maxCollBytes; // Max transfer size for collective operations
int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request.
} ncclNetProperties_v11_t;
// Attributes for one direction of communication; used for both the send and
// the receive side in ncclNetAttr_v11_t.
// NOTE(review): judging by the field names these are min/max bounds on
// concurrent peers and on flows per peer -- the exact semantics are not
// described in this header; confirm with the NCCL core.
typedef struct {
int32_t maxConcurrentPeers;
int32_t minConcurrentPeers;
int32_t maxFlowsPerPeer;
int32_t minFlowsPerPeer;
} ncclNetCommAttr_v11_t;
// Argument of setNetAttr(): send-side and receive-side attributes plus three
// identifiers.
// NOTE(review): op/algo/proto presumably identify the NCCL operation,
// algorithm and protocol the attributes apply to; their encoding is not
// defined in this header -- confirm.
typedef struct {
ncclNetCommAttr_v11_t sendCommAttr;
ncclNetCommAttr_v11_t recvCommAttr;
uint32_t op;
uint32_t algo;
uint32_t proto;
} ncclNetAttr_v11_t;
// Function table exported by a network plugin for the v11 interface.
// Compared with the per-device entry points, init/listen/connect/finalize/
// setNetAttr additionally carry an opaque plugin context (ctx).
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network. The plugin returns an opaque context through ctx;
// that context is associated with the communicator identified by commId and
// is later passed back to listen, connect, finalize and setNetAttr. config
// carries the per-communicator configuration.
ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
// ctx is the context returned by init.
ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
// ctx is the context returned by init.
ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
// what index this new vNIC exists at
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
// Finalize the network.
// ctx is the context returned by init (presumably released here -- confirm).
ncclResult_t (*finalize)(void* ctx);
// Pass network attributes for the given context to the plugin.
// NOTE(review): when this is called and how the attributes are used is not
// described in this header -- confirm with the NCCL core.
ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr);
} ncclNet_v11_t;
#endif // end include guard
@@ -5,10 +5,9 @@
#ifndef NET_V9_H_
#define NET_V9_H_
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
// v9 description of a (possibly virtual) device: ndevs device indices stored
// in devs. The array bound is the NCCL_NET_MAX_DEVS_PER_NIC constant shared by
// all interface versions; the member is declared once, since a second "devs"
// member would not compile.
typedef struct {
  int ndevs;
  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
} ncclNetVDeviceProps_v9_t;
typedef struct {
+84 -17
Melihat File
@@ -11,7 +11,7 @@
int max_requests = NCCL_NET_MAX_REQUESTS;
// init entry point for the current (v11) interface: takes the context output,
// the communicator id and the per-communicator config in addition to the
// logger/profiler callbacks. The example plugin sets nothing up and reports
// success. Only this definition is kept; a second pluginInit with the older
// two-argument signature would be a redefinition (that variant is
// pluginInit_v10).
__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
@@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
return ncclSuccess;
}
// listen/connect stubs for the current (v11) interface: both take the plugin
// context as first argument, and connect no longer receives the comm config.
// The example implements neither, so both report ncclInternalError. The
// context-less variants are pluginListen_v10 / pluginConnect_v10; defining
// them here under the same names would be a redefinition.
__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
@@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; }
#define PLUGIN_NAME "Plugin"
const ncclNet_v10_t ncclNetPlugin_v10 = {
const ncclNet_v11_t ncclNetPlugin_v11 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
@@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice,
.finalize = pluginFinalize,
};
// v10 init: only the logger and profiler callbacks are supplied. The example
// plugin has nothing to initialise, so it simply reports success.
__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  (void)logFunction;
  (void)profFunction;
  return ncclSuccess;
}
// Fills *props (v10 layout) with the example plugin's default values for
// device `dev`. When in doubt, leave the defaults unchanged.
__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) {
  /* ---- Identity ---- */
  props->name = "Example";
  // Set for proper topology detection,
  // e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
  props->pciPath = NULL;
  // guid is only used to detect NICs with multiple PCI attachments;
  // port is used in conjunction with it.
  props->guid = 0;
  props->port = 0;

  /* ---- Memory registration ---- */
  // Add NCCL_PTR_CUDA when GPU Direct RDMA is supported and regMr accepts CUDA pointers.
  props->ptrSupport = NCCL_PTR_HOST;
  // Set to 1 if regMr has a fast registration cache; with 0, user buffer
  // registration may be disabled.
  props->regIsGlobal = 0;
  // Force a flush after receive; needed when the control path and the data
  // path take different routes to the GPU.
  props->forceFlush = 0;

  /* ---- Performance ---- */
  // Speed is in *Mbps*: 100000 means 100G.
  props->speed = 100000;
  // Custom latency, to help tuning when latency is high; 0 selects the
  // default NCCL values.
  props->latency = 0;

  /* ---- Limits ---- */
  // Maximum number of comm objects we can create.
  props->maxComms = 1024*1024;
  // Maximum number of receive operations taken by irecv().
  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
  // Maximum transfer sizes the plugin can handle.
  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;

  /* ---- Coupling with NCCL network device-side code ---- */
  props->netDeviceType = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;

  /* ---- Virtual device ---- */
  // Tells NCCL core whether this is a virtual device fusing several physical
  // devices; here it is just the device itself.
  props->vProps.ndevs = 1;
  props->vProps.devs[0] = dev;

  return ncclSuccess;
}
// v10 stubs (signatures without the plugin context). The example plugin does
// not implement these operations, so each one reports ncclInternalError.

__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) {
  (void)d;
  (void)handle;
  (void)listenComm;
  return ncclInternalError;
}

__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) {
  (void)dev;
  (void)config;
  (void)handle;
  (void)sendComm;
  (void)sendDevComm;
  return ncclInternalError;
}

__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) {
  (void)d;
  (void)props;
  return ncclInternalError;
}
// Exported v10 function table. Entry points whose signature differs from the
// current interface use the *_v10 variants; the rest are shared.
const ncclNet_v10_t ncclNetPlugin_v10 = {
  .name          = PLUGIN_NAME,

  /* discovery / setup */
  .init          = pluginInit_v10,
  .devices       = pluginDevices,
  .getProperties = pluginGetProperties_v10,

  /* connection establishment */
  .listen        = pluginListen_v10,
  .connect       = pluginConnect_v10,
  .accept        = pluginAccept,

  /* memory registration */
  .regMr         = pluginRegMr,
  .regMrDmaBuf   = pluginRegMrDmaBuf,
  .deregMr       = pluginDeregMr,

  /* data path */
  .isend         = pluginIsend,
  .irecv         = pluginIrecv,
  .iflush        = pluginIflush,
  .test          = pluginTest,

  /* teardown */
  .closeSend     = pluginCloseSend,
  .closeRecv     = pluginCloseRecv,
  .closeListen   = pluginCloseListen,

  /* device offload / virtual NICs */
  .getDeviceMr   = pluginGetDeviceMr,
  .irecvConsumed = pluginIrecvConsumed,
  .makeVDevice   = pluginMakeVDevice_v10,
};
// v9 init: same as v10 but without a profiler callback, so forward to the v10
// implementation with a NULL callback. It must not call pluginInit: that
// function now has the five-argument v11 signature.
__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
  return pluginInit_v10(logFunction, NULL);
}
// v9 getProperties: forwards to the v10 implementation.
// NOTE(review): the cast assumes ncclNetProperties_v9_t and
// ncclNetProperties_v10_t have the same layout -- confirm against net_v9.h and
// net_v10.h. The v11-typed pluginGetProperties must not be used here.
__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
  return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props);
}
// v9 connect: has no comm config argument, so forward to the v10
// implementation with a NULL config. pluginConnect itself now takes the v11
// plugin context as its first argument and cannot be called with these
// arguments.
__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
  return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm);
}
__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
@@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v9,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr,
@@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v8,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr,
@@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v7,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr_v7,
@@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr_v7,
@@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr_v7,
@@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
ncclResult_t ret;
do {
ncclNetDeviceHandle_v7_t* handle = NULL;
ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle);
} while (ret == ncclSuccess && *sendComm == NULL);
return ret;
}
@@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v4,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
@@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
}
// v3 init: lowers the global request limit to the v3 value, then runs the
// context-less v10 init with no profiler callback. pluginInit has the v11
// signature and cannot be called with these two arguments.
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
  max_requests = NCCL_NET_MAX_REQUESTS_V3;
  return pluginInit_v10(logFunction, NULL);
}
#include <string.h>
// v3 listen: the caller's handle buffer only holds NCCL_NET_HANDLE_MAXSIZE_V4
// bytes, so listen into a full-size local buffer and copy back the part that
// fits. Uses the context-less v10 listen; `ret` is declared once.
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
  // Zero-initialised so that a failing listen, which may not write the
  // handle, never copies indeterminate bytes out to the caller.
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0};
  ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm);
  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return ret;
}
@@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
.devices = pluginDevices,
.pciPath = pluginPciPath,
.ptrSupport = pluginPtrSupport,
.listen = pluginListen,
.listen = pluginListen_v3,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
+84 -37
Melihat File
@@ -49,9 +49,9 @@ of newer ones.
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v4)
# API (v5)
Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections.
```
typedef struct {
@@ -60,15 +60,15 @@ typedef struct {
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commId : communicator id
// - commName : user assigned communicator name
// - commHash : communicator id
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
@@ -76,7 +76,7 @@ typedef struct {
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
@@ -88,13 +88,13 @@ typedef struct {
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
} ncclProfiler_v5_t;
```
## Error codes
@@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct.
```
typedef struct {
uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
int rank; // rank that generated the event
uint64_t type; // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ...
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
int rank; // rank that generated the event
union {
struct { // GroupAPI event metadata
bool graphCaptured; // Set to true if the Group API event is emitted inside a CUDA graph capture
int groupDepth; // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL)
// and not called by the user. Any depth greater than 1 means that the user made the Group API call.
} groupApi;
struct { // Collective API call metadata
const char* func; // string containing name of the collective operation
size_t count; // data count
const char* datatype; // string containing the name of the datatype
int root; // root rank
void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in
bool graphCaptured; // Set to true if the Collective API event is emitted inside a CUDA graph capture
} collApi;
struct { // Point-to-point API call metadata
const char* func; // string containing name of the p2p operation
size_t count; // data count
const char* datatype; // string containing the name of the datatype
void* stream; // Opaque handle that points to a CUDA stream object
bool graphCaptured; // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture
} p2pApi;
struct { // Kernel Launch event metadata
void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in
} kernelLaunch;
struct { // collective events metadata
uint64_t seqNumber; // sequence number of this collective operation in the communicator
const char* func; // string containing name of the collective
@@ -164,6 +191,7 @@ typedef struct {
uint8_t nWarps; // number of GPU warps for this collective
const char* algo; // string containing name of the algorithm for this collective
const char* proto; // string containing name of the protocol for this collective
void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent
} coll;
struct { // point-to-point events metadata
@@ -173,6 +201,7 @@ typedef struct {
size_t count;
int peer; // peer rank for this point-to-point
uint8_t nChannels; // number of channels for this p2p
void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent
} p2p;
struct { // proxyOp events metadata
@@ -198,12 +227,12 @@ typedef struct {
void* data; // pointer to network plugin defined event
} netPlugin;
};
} ncclProfilerEventDescr_v4_t;
} ncclProfilerEventDescr_v5_t;
```
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
`ncclProfileNetPlugin`.
NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`,
`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`,
`ncclProfileKernelCh` and `ncclProfileNetPlugin`.
#### stopEvent
@@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior.
#### recordEventState
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`,
`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`.
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
The state of these events can be updated, along with event attributes, using `recordEventState`.
@@ -258,9 +287,21 @@ typedef enum {
// ncclProfileKernelCh event states
ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update
} ncclProfilerEventState_v4_t;
// Group API States
ncclProfilerGroupStartApiStop = 23,// state marks the end of a ncclGroupStart() API call
ncclProfilerEndGroupApiStart = 24 // state marks the start of a ncclGroupEnd() API call
} ncclProfilerEventState_v5_t;
```
NCCL profile API events are generated when the API calls are made, right after NCCL checks
for graph capture information. They parent collective, point-to-point and kernel launch events
and persist across multiple operations in a group.
`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the
case of graph capture, the event start indicates that the kernel launch operation has been recorded,
not launched.
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
provide a summary of the activity of the proxy progress thread for that channel. Most of the
@@ -379,7 +420,7 @@ typedef union {
struct { // attribute to update for ncclProfileKernelCh events
uint64_t pTimer; // timestamp provided by the NCCL kernel
} kernelCh;
} ncclProfilerEventStateArgs_v4_t;
} ncclProfilerEventStateArgs_v5_t;
```
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
@@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur
NCCL core events (reported above) are organized into a hierarchy as reported below:
```
Group event
Group API event
|
+- Collective event
+- Collective API event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
| +- Collective event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
|
+- Point-to-point event
|
+- ProxyOp event
| |
| +- ProxyStep event
| |
| +- NetPlugin event
|
+- KernelCh event
+- Point-to-point API event
| |
| +- Point-to-point event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
|
+- Kernel Launch event
ProxyCtrl event
```
@@ -0,0 +1,34 @@
# Source files of the example profiler plugin, listed explicitly (no globbing)
set(SRC_FILES
${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc
${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc
)
# Build the example profiler plugin as a shared library
add_library(nccl-profiler-example SHARED ${SRC_FILES})
# Private include paths: the local nccl/ header directory and ${CUDAToolkit_INCLUDE_DIRS}
target_include_directories(nccl-profiler-example PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/nccl
${CUDAToolkit_INCLUDE_DIRS}
)
# Name the artifact lib + nccl-profiler-example and place it in <build>/lib,
# compiled as position-independent code. The post-build step below expects the
# resulting file to be libnccl-profiler-example.so.
# NOTE(review): the original comment said the name matches the Makefile - confirm
# against the Makefile's actual output name.
set_target_properties(nccl-profiler-example PROPERTIES
OUTPUT_NAME "nccl-profiler-example"
PREFIX "lib"
POSITION_INDEPENDENT_CODE ON
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
)
# After every build of the target, create <build>/test/unit/plugins if needed
# and copy the freshly built library there.
add_custom_command(TARGET nccl-profiler-example POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so ${CMAKE_BINARY_DIR}/test/unit/plugins
)
# Manual clean target (equivalent to the Makefile clean target): removes both the
# built library in <build>/lib and its copy under <build>/test/unit/plugins.
add_custom_target(clean-profiler-lib
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so
COMMENT "Cleaning libnccl-profiler-example.so"
)
+15 -8
Melihat File
@@ -4,19 +4,26 @@
# See LICENSE.txt for license information
#
.DEFAULT_GOAL: build
include ../../makefiles/common.mk
SRCDIR ?= $(abspath ../..)
ROCM_PATH ?= $(wildcard /opt/rocm)
CXX = $(ROCM_PATH)/lib/llvm/bin/amdclang++
BUILDDIR ?= .
NCCLDIR := $(BUILDDIR)
HIPIFY_DIR := hipify-profiler
SRC_FILES := $(wildcard *.c)
SRC_FILES := $(wildcard *.cc)
HIPIFY_SRC := $(addprefix $(HIPIFY_DIR)/,$(SRC_FILES))
build: ${BUILDDIR}/librccl-profiler.so
build: ${BUILDDIR}/librccl-profiler-example.so
${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
${BUILDDIR}/librccl-profiler-example.so: $(HIPIFY_SRC)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${BUILDDIR}
$(CC) -Inccl -fPIC -shared -o $@ $^
$(CXX) -D__HIP_PLATFORM_AMD__ -I$(HIPIFY_DIR) -I$(HIPIFY_DIR)/nccl -I$(ROCM_PATH)/include -fPIC -shared -o $@ $^
$(HIPIFY_DIR)/%.cc: %.cc
@mkdir -p $(HIPIFY_DIR)/nccl
@cp *.cc *.h $(HIPIFY_DIR)/
@cp nccl/*.h $(HIPIFY_DIR)/nccl/
@hipify-perl -inplace -quiet-warnings $(HIPIFY_DIR)/*.cc $(HIPIFY_DIR)/*.h
clean:
rm -f ${BUILDDIR}/librccl-profiler.so
rm -rf ${BUILDDIR}/librccl-profiler-example.so $(HIPIFY_DIR)
+59 -121
Melihat File
@@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of.
## Building the profiler plugin
To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
You can override `NCCL_HOME` to where the NCCL installation is on your system.
To build the example plugin shipped as part of NCCL, just type `make`.
## Using the profiler plugin
@@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system.
As an example, setting:
`NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
`NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
enables the profiling of the group, the collective and the proxy op events. The same events can be
enables the profiling of the group API, the collective and the proxy op events. The same events can be
expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
in NCCL all the events above the requested one (in the event hierarchy) are also captured. The advantage
is that the profiler can easily correlate events that belong to the same NCCL operation and present
them accordingly.
them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler.
3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
@@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events
contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
generated by remote proxies. A list of pools and their size is reported below:
- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256)
- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256)
- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256)
- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256)
- `NCCL_PROFILE_COLL_POOL_SIZE` (256)
- `NCCL_PROFILE_P2P_POOL_SIZE` (256)
- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256)
Remote proxy operations are generated when PXN is in use. Refer to this article for more information
about PXN and how it works:
@@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace
```
[
{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}},
{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}},
{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990},
{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}},
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}},
{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}},
{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995},
{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997},
{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995},
{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979},
{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993},
{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}},
{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992},
... [ trace truncated for brevity ]
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980},
{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981},
{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001},
{}]
```
Details about the fields used in the trace can be found at this link:
https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through
the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
one collective and this is what is presented in the traces above).
@@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation.
- datatype : NCCL datatype
- algorithm : algorithm used to process the ncclAllReduce
- protocol : protocol used to process the ncclAllReduce
- nMaxChannels: max number of channels used to process the ncclAllReduce
- nChannels : Number of channels used to process the ncclAllReduce
If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
of collective and p2p operations`.
### Proxy Send
The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to send data to the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes to the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send
- DONE : struct containing the number of network sends completed and the time stamp of the last send completed
In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
which could help identify at which point the network problem occurred.
The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy SendBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
#### Proxy SendGPUWait
#### Proxy SendGpuWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
buffer.
@@ -201,31 +164,6 @@ buffer.
Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
### Proxy Recv
The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to recv data from the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes from the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted
- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
- DONE : struct containing the number of flush completed and the time stamp for the last flush completed
The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy RecvBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
become available.
#### Proxy RecvWait
Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
@@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po
Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU
#### Proxy RecvGPUWait
#### Proxy RecvGpuWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
@@ -1,30 +0,0 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "event.h"
// Returns 1 when the group's task-event queue holds no events, 0 otherwise.
int taskEventQueueEmpty(struct group* g) {
  return g->eventHead ? 0 : 1;
}
// Appends `event` at the tail of the group's singly linked task-event queue.
// The event's `next` link is cleared first, so it always becomes the last node.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
  event->next = NULL;
  if (g->eventHead == NULL) {
    // empty queue: the new event is also the head
    g->eventHead = event;
  } else {
    // non-empty queue: chain the new event after the current tail
    g->eventTail->next = event;
  }
  g->eventTail = event;
}
// Returns the first event of the group's task-event queue without removing it
// (NULL when the queue is empty).
struct taskEventBase* taskEventQueueHead(struct group* g) {
  struct taskEventBase* front = g->eventHead;
  return front;
}
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
struct taskEventBase* tmp = g->eventHead;
g->eventHead = g->eventHead->next;
if (g->eventHead == NULL) g->eventTail = NULL;
return tmp;
}
+142 -13
Melihat File
@@ -10,10 +10,14 @@
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include <cstring>
#include "err.h"
#include "profiler.h"
#include "queue.h"
#include <cuda_runtime.h>
#define MAX_CHANNELS 128 // Match RCCL's MAXCHANNELS
#define MAX_STEPS 16
#define MAX_STEPS 1024
#define MAX_OPS 16 // Up to 64K ranks for PAT
#define MAX_EVENTS_PER_REQ (8)
@@ -21,7 +25,7 @@ struct proxyOp;
struct proxyStep;
struct netPlugin {
uint8_t type;
uint64_t type;
int pluginType;
int pluginVer;
uint8_t pluginEvent;
@@ -63,7 +67,7 @@ struct kernelCh {
#define PROXY_STEP_MAX_STATES 3
struct proxyStep {
uint8_t type; // type of event: network transfer
uint64_t type; // type of event: network transfer
int state;
int step; // network transfer id in given channel
int isSend; // send/recv channel operation
@@ -76,7 +80,7 @@ struct proxyStep {
};
struct proxyOp {
uint8_t type; // type of event: proxy operation
uint64_t type; // type of event: proxy operation
uint8_t channelId; // channel id for this proxy operation
pid_t pid;
int rank;
@@ -97,7 +101,7 @@ struct group;
struct context;
struct proxyCtrl {
uint8_t type;
uint64_t type;
struct context* ctx; // profiler context
double startTs;
double stopTs;
@@ -107,12 +111,12 @@ struct proxyCtrl {
// task level event base structure
struct taskEventBase {
uint8_t type; // event type: collective/p2p
uint64_t type; // event type: collective/p2p
int rank; // rank of the operation in NCCL communicator
const char* func; // ncclFunc*
int refCount; // number of references for this operation
struct group* parent; // parent event group
struct taskEventBase* next; // next top level event in group
void* parent; // parent API event
struct taskEventBase* next; // next top level event
double startTs;
double stopTs;
};
@@ -147,7 +151,7 @@ struct p2p {
};
struct group {
uint8_t type;
uint64_t type;
struct context* ctx; // profiler context
int groupId;
int refCount;
@@ -158,6 +162,70 @@ struct group {
struct group* next; // next group event in queue
};
// Collective API event: one record per collective call issued under a group API event.
// Instances live in context::collApiPool and are filled in by the plugin's startEvent
// handler from the collApi member of the event descriptor.
struct collApi {
// event type tag (ncclProfileCollApi); must stay the first member because event
// handles are classified by reading *(uint64_t*)handle
uint64_t type;
struct groupApi* parent; // enclosing group API event (taken from the descriptor's parentObj)
struct context* ctx; // profiler context
int collApiId; // value of ctx->collApiPoolIndex at allocation; pool slot is collApiId % pool size
int refCount; // outstanding references; updateEvent records stopTs when this drops to 0
cudaStream_t stream; // stream reported in the descriptor (stored as an opaque handle)
const char* func; // function name string from the descriptor (pointer is stored, not copied)
size_t count; // element count from the descriptor
const char* datatype; // datatype name string from the descriptor (pointer is stored, not copied)
int root; // root value from the descriptor
bool graphCaptured; // graphCaptured flag from the descriptor
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs; // NOTE(review): presumably start time relative to profiler start; confirm where it is assigned
double stopTs; // stop time, stored as gettime() - startTime
struct collApi* next; // link used by groupApi::collApiEvents (profilerQueue<collApi, &collApi::next>)
};
// Point-to-point API event: one record per p2p call issued under a group API event.
// Instances live in context::p2pApiPool and are filled in by the plugin's startEvent
// handler from the p2pApi member of the event descriptor.
struct p2pApi {
// event type tag (ncclProfileP2pApi); must stay the first member because event
// handles are classified by reading *(uint64_t*)handle
uint64_t type;
struct groupApi* parent; // enclosing group API event (taken from the descriptor's parentObj)
struct context* ctx; // profiler context
int p2pApiId; // value of ctx->p2pApiPoolIndex at allocation; pool slot is p2pApiId % pool size
int refCount; // outstanding references; updateEvent records stopTs when this drops to 0
const char* func; // function name string from the descriptor (pointer is stored, not copied)
cudaStream_t stream; // stream reported in the descriptor (stored as an opaque handle)
size_t count; // element count from the descriptor
const char* datatype; // datatype name string from the descriptor (pointer is stored, not copied)
bool graphCaptured; // graphCaptured flag from the descriptor
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs; // NOTE(review): presumably start time relative to profiler start; confirm where it is assigned
double stopTs; // stop time, stored as gettime() - startTime
struct p2pApi* next; // link used by groupApi::p2pApiEvents (profilerQueue<p2pApi, &p2pApi::next>)
};
// Kernel launch event recorded under a group API event.
// Instances live in context::kernelLaunchPool.
struct kernelLaunch {
// event type tag (ncclProfileKernelLaunch); must stay the first member because event
// handles are classified by reading *(uint64_t*)handle
uint64_t type;
struct groupApi* parent; // enclosing group API event (taken from the descriptor's parentObj)
cudaStream_t stream; // stream reported in the descriptor (stored as an opaque handle)
// NOTE(review): the kernel-launch branch of the plugin's startEvent handler does not
// appear to assign this field, so it keeps the calloc'ed value; confirm
int kernelLaunchId;
double startTs; // NOTE(review): presumably start time relative to profiler start; confirm where it is assigned
double stopTs; // stop time, stored as gettime() - startTime
struct kernelLaunch* next; // link used by groupApi::kernelLaunchEvents
};
// Group API event: top of the v5 event hierarchy. It owns the queues of collective API,
// p2p API and kernel launch events started with this event as their parent.
// Instances live in context::groupApiPool.
struct groupApi {
// event type tag (ncclProfileGroupApi); must stay the first member because event
// handles are classified by reading *(uint64_t*)handle
uint64_t type;
struct context* ctx; // profiler context that owns the pool this event came from
int groupApiId; // value of ctx->groupApiPoolIndex at allocation; pool slot is groupApiId % pool size
int refCount; // incremented for every child API/kernel-launch event; decremented by updateEvent
bool graphCaptured; // graphCaptured value from the descriptor (an int there, narrowed to bool here)
int groupDepth; // groupDepth value from the descriptor
struct profilerQueue<struct p2pApi, &p2pApi::next> p2pApiEvents; // child p2p API events
struct profilerQueue<struct collApi, &collApi::next> collApiEvents; // child collective API events
struct profilerQueue<struct kernelLaunch, &kernelLaunch::next> kernelLaunchEvents; // child kernel launch events
double endOfncclGroupStartTs; // recorded on the ncclProfilerEndGroupApiStart state transition
double startOfncclGroupEndTs; // recorded on the ncclProfilerBeginGroupApiEnd state transition
double startTs; // start time, stored as gettime() - startTime when the event is started
double stopTs; // stop time, stored as gettime() - startTime
struct groupApi* next; // NOTE(review): presumably a queue link for group API events; no user is visible here
};
// arrays for different event objects
struct context {
const char* commName;
@@ -165,6 +233,26 @@ struct context {
int nranks;
int rank;
int groupApiPoolSize;
int groupApiPoolBase;
int groupApiPoolIndex;
struct groupApi* groupApiPool;
int collApiPoolSize;
int collApiPoolBase;
int collApiPoolIndex;
struct collApi* collApiPool;
int p2pApiPoolSize;
int p2pApiPoolBase;
int p2pApiPoolIndex;
struct p2pApi* p2pApiPool;
int kernelLaunchPoolSize;
int kernelLaunchPoolBase;
int kernelLaunchPoolIndex;
struct kernelLaunch* kernelLaunchPool;
int groupPoolSize;
int groupPoolBase;
int groupPoolIndex;
@@ -186,9 +274,50 @@ struct context {
struct proxyCtrl* proxyCtrlPool;
};
int taskEventQueueEmpty(struct group* g);
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
struct taskEventBase* taskEventQueueHead(struct group* g);
struct taskEventBase* taskEventQueueDequeue(struct group* g);
// Returns 1 when `obj` has no queued task events, 0 otherwise.
// T is any event type exposing an `eventHead` pointer member.
template <typename T>
inline int taskEventQueueEmpty(T *obj) {
  return obj->eventHead ? 0 : 1;
}
// Appends `event` at the tail of obj's singly linked task-event queue.
// T is any event type exposing `eventHead` / `eventTail` pointer members.
// The event's `next` link is cleared first, so it always becomes the last node.
template <typename T>
inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) {
  event->next = NULL;
  if (obj->eventHead == NULL) {
    // empty queue: the new event is also the head
    obj->eventHead = event;
  } else {
    // non-empty queue: chain the new event after the current tail
    obj->eventTail->next = event;
  }
  obj->eventTail = event;
}
// Returns the first queued task event of `obj` without removing it
// (NULL when the queue is empty).
template <typename T>
inline struct taskEventBase* taskEventQueueHead(T *obj) {
  struct taskEventBase* front = obj->eventHead;
  return front;
}
// Removes and returns the first queued task event of `obj`.
// T is any event type exposing `eventHead` / `eventTail` pointer members.
// Returns NULL when the queue is empty instead of dereferencing a NULL head.
// When the last event is removed the tail pointer is cleared as well.
template <typename T>
inline struct taskEventBase* taskEventQueueDequeue(T* obj) {
  struct taskEventBase* head = obj->eventHead;
  if (head == NULL) return NULL;  // nothing to dequeue
  obj->eventHead = head->next;
  if (obj->eventHead == NULL) obj->eventTail = NULL;
  return head;
}
// Drains the task-event queue of `obj` and hands every drained event back to its pool.
//   obj - event exposing `eventHead` / `eventTail` (used through the taskEventQueue* helpers)
//   ctx - profiler context whose collective / p2p pool base counters are advanced
// Collective events get their per-channel proxy-op counters zeroed; p2p events get
// their proxy-op storage zeroed. Events of any other type are only unlinked.
template <typename T>
inline void resetTaskEvents(T *obj, struct context* ctx) {
  for (;;) {
    if (taskEventQueueEmpty(obj)) return;
    struct taskEventBase* task = taskEventQueueDequeue(obj);
    if (task->type == ncclProfileP2p) {
      struct p2p* p2pTask = (struct p2p *)task;
      // clear the proxy operations (and the steps stored inside them)
      memset(&p2pTask->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
      // advancing the base counter makes one more p2p pool slot available
      __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
      continue;
    }
    if (task->type == ncclProfileColl) {
      struct collective* collTask = (struct collective *)task;
      // clear the per-channel proxy operation counters
      memset(collTask->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
      // advancing the base counter makes one more collective pool slot available
      __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
    }
  }
}
#endif
@@ -11,17 +11,20 @@
#include <stdlib.h>
#include "common.h"
#include "err.h"
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileKernelCh = (1 << 6), // kernel channel event type
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileKernelCh = (1 << 6), // kernel channel event type
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
ncclProfileGroupApi = (1 << 8), // Group API events
ncclProfileCollApi = (1 << 9), // Collective API events
ncclProfileP2pApi = (1 << 10), // Point-to-Point API events
ncclProfileKernelLaunch = (1 << 11), // Kernel launch events
};
typedef enum {
@@ -56,21 +59,27 @@ typedef enum {
/* Kernel event states */
ncclProfilerKernelChStop = 22,
/* Group API States */
ncclProfilerEndGroupApiStart = 23,
ncclProfilerBeginGroupApiEnd = 24
} ncclProfilerEventState_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
#include "profiler_v5.h"
#include "profiler_v4.h"
#include "profiler_v3.h"
#include "profiler_v2.h"
#include "profiler_v1.h"
#include "profiler_net.h"
typedef ncclProfiler_v4_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v5_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
#endif // end include guard
@@ -0,0 +1,152 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V5_H_
#define PROFILER_V5_H_
#include <stdbool.h>
// v5 event descriptor passed to the plugin's startEvent callback.
// `type` selects which member of the anonymous union is meaningful
// (e.g. the groupApi member for ncclProfileGroupApi events).
typedef struct {
uint64_t type; // event type descriptor: ncclProfileGroupApi, ...
void* parentObj; // pointer to the profiler parent object
int rank; // originating rank
union {
// ncclProfileGroupApi
struct {
int graphCaptured; // note: an int here, whereas collApi/p2pApi below declare it as bool
int groupDepth;
} groupApi;
// ncclProfileCollApi
struct {
const char* func;
size_t count;
const char* datatype;
int root;
void* stream; // stream handle passed as an opaque pointer
bool graphCaptured;
} collApi;
// ncclProfileP2pApi
struct {
const char* func;
size_t count;
const char* datatype;
void* stream; // stream handle passed as an opaque pointer
bool graphCaptured;
} p2pApi;
// ncclProfileKernelLaunch
struct {
void* stream; // stream handle passed as an opaque pointer
} kernelLaunch;
// ncclProfileColl
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
void* parentGroup; // for backward compatibility with v4
} coll;
// ncclProfileP2p
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
void* parentGroup; // for backward compatibility with v4
} p2p;
// ncclProfileProxyOp
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// ncclProfileProxyStep
struct {
int step;
} proxyStep;
// ncclProfileKernelCh
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
// ncclProfileNetPlugin
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v5_t;
// Optional argument of the v5 recordEventState callback: carries the event attribute
// update associated with a state transition. One union member per event kind that
// reports such updates.
typedef union {
// proxy step events
struct {
size_t transSize;
} proxyStep;
// proxy control events
struct {
int appendedProxyOps;
} proxyCtrl;
// network plugin events
struct {
void* data;
} netPlugin;
// kernel channel events
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v5_t;
// v5 profiler plugin interface: the table of callbacks a profiler plugin provides.
typedef struct {
// name of the profiler plugin
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commId : communicator id
// - commName : user assigned communicator name
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v5_t;
#endif
@@ -6,7 +6,7 @@
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <cstring>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
@@ -22,12 +22,20 @@ static int initialized; // initialization counter for profiler
static double startTime; // profiler start time
static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
static const int defaultGroupPoolSize = 16;
static const int defaultCollPoolSize = 16;
static const int defaultP2pPoolSize = 1024;
static const int defaultGroupApiPoolSize = 256;
static const int defaultCollApiPoolSize = 256;
static const int defaultP2pApiPoolSize = 256;
static const int defaultKernelLaunchPoolSize = 256;
static const int defaultGroupPoolSize = 256;
static const int defaultCollPoolSize = 256;
static const int defaultP2pPoolSize = 256;
static const int defaultProxyCtrlPoolSize = 16;
static const int defaultDetachPoolSize = 128;
static const int defaultDetachPoolSize = 256;
static int groupApiPoolSize;
static int collApiPoolSize;
static int p2pApiPoolSize;
static int kernelLaunchPoolSize;
static int groupPoolSize;
static int collPoolSize;
static int p2pPoolSize;
@@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pid_t pid;
static int* eActivationMaskPtr;
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
pthread_mutex_lock(&lock);
if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
// first thread initializes event mask, environment and detach pool
@@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
str = getenv("NCCL_PROFILE_EVENT_MASK");
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE");
groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize;
str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE");
collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize;
str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE");
p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize;
str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE");
kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize;
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
@@ -95,12 +115,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
// pre-allocate memory for event object pools in dedicated profiler context
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
if (ctx == nullptr) return ncclSystemError;
ctx->commName = commName;
ctx->commHash = commHash;
ctx->commHash = commId;
ctx->nranks = nranks;
ctx->rank = rank;
logFn = logfn;
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commId, nranks, rank);
ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool));
if (ctx->groupApiPool == NULL) goto fail;
ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool));
if (ctx->collApiPool == NULL) goto fail;
ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool));
if (ctx->p2pApiPool == NULL) goto fail;
ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool));
if (ctx->kernelLaunchPool == NULL) goto fail;
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
if (ctx->groupPool == NULL) goto fail;
@@ -130,16 +163,22 @@ fail:
if (ctx->p2pPool) free(ctx->p2pPool);
if (ctx->collPool) free(ctx->collPool);
if (ctx->groupPool) free(ctx->groupPool);
if (ctx->collApiPool) free(ctx->collApiPool);
if (ctx->p2pApiPool) free(ctx->p2pApiPool);
if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool);
if (ctx->groupApiPool) free(ctx->groupApiPool);
free(ctx);
if (detachPool) free(detachPool);
return ncclSystemError;
}
static const char* profilerDumpFile;
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
FILE* fh = NULL;
char filename[PATH_MAX] = { 0 };
struct context* ctx = (struct context *)context;
const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE");
if (dump) {
sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
fh = fopen(filename, "w");
@@ -148,10 +187,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);
// print last N groups/collectives/p2ps
int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
int end = ctx->groupPoolIndex;
// Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy.
// Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example.
int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0;
int end = ctx->groupApiPoolIndex;
for (int i = start; i < end; i++) {
printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]);
}
start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
@@ -161,6 +202,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
}
free(ctx->groupPool);
free(ctx->collApiPool);
free(ctx->p2pApiPool);
free(ctx->kernelLaunchPool);
free(ctx->groupApiPool);
free(ctx->collPool);
free(ctx->p2pPool);
free(ctx->proxyCtrlPool);
@@ -187,7 +232,113 @@ __hidden void updateEvent(void* handle);
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
*eHandle = NULL;
struct context* ctx = (struct context *)context;
if (eDescr->type == ncclProfileGroup) {
if (eDescr->type == ncclProfileGroupApi) {
struct groupApi* event;
int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) {
// if there are available group API events grab one
event = &ctx->groupApiPool[groupApiId%groupApiPoolSize];
// Make sure all child events of the picked group API event are cleared
while (!profilerQueueEmpty(&event->collApiEvents)) {
struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents);
resetTaskEvents(collApiEvent, ctx);
__atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
}
while (!profilerQueueEmpty(&event->p2pApiEvents)) {
struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents);
resetTaskEvents(p2pApiEvent, ctx);
__atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
}
while (!profilerQueueEmpty(&event->kernelLaunchEvents)) {
profilerQueueDequeue(&event->kernelLaunchEvents);
__atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED);
}
} else {
// else drop this event
__atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileGroupApi;
event->ctx = ctx;
event->groupApiId = groupApiId;
event->graphCaptured = eDescr->groupApi.graphCaptured;
event->groupDepth = eDescr->groupApi.groupDepth;
event->startTs = gettime() - startTime;
*eHandle = event;
} else if (eDescr->type == ncclProfileCollApi) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct collApi* event;
int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) {
// if there are available Coll API events grab one
event = &ctx->collApiPool[collApiId%collApiPoolSize];
resetTaskEvents(event, ctx);
} else {
// else drop this event
__atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileCollApi;
event->collApiId = collApiId;
event->ctx = ctx;
event->func = eDescr->collApi.func;
event->stream = (cudaStream_t) eDescr->collApi.stream;
event->count = eDescr->collApi.count;
event->datatype = eDescr->collApi.datatype;
event->root = eDescr->collApi.root;
event->graphCaptured = eDescr->collApi.graphCaptured;
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
event->parent = parent;
profilerQueueEnqueue(&parent->collApiEvents, event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
*eHandle = event;
} else if (eDescr->type == ncclProfileP2pApi) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct p2pApi* event;
int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, __ATOMIC_RELAXED)) < p2pApiPoolSize) {
// if there are available p2p API events grab one
event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize];
resetTaskEvents(event, ctx);
} else {
// else drop this event
__atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileP2pApi;
event->p2pApiId = p2pApiId;
event->ctx = ctx;
event->func = eDescr->p2pApi.func;
event->stream = (cudaStream_t) eDescr->p2pApi.stream;
event->count = eDescr->p2pApi.count;
event->datatype = eDescr->p2pApi.datatype;
event->graphCaptured = eDescr->p2pApi.graphCaptured;
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
event->parent = parent;
profilerQueueEnqueue(&parent->p2pApiEvents, event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
*eHandle = event;
} else if (eDescr->type == ncclProfileKernelLaunch) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct kernelLaunch* event;
int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) {
// if there are available kernel API events grab one
event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileKernelLaunch;
event->stream = (cudaStream_t) eDescr->kernelLaunch.stream;
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
event->parent = parent;
profilerQueueEnqueue(&parent->kernelLaunchEvents, event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
*eHandle = event;
} else if (eDescr->type == ncclProfileGroup) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct group* event;
int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
@@ -222,7 +373,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
debugEvent(event, "GroupStart");
} else if (eDescr->type == ncclProfileColl) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
struct collApi* parent = (struct collApi *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct collective* event;
@@ -253,12 +404,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
event->proto = eDescr->coll.proto;
*eHandle = event;
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
// increment the group ref counter so the event will staty open
// increment the group ref counter so the event will stay open
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "CollStart");
} else if (eDescr->type == ncclProfileP2p) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct p2p* event;
@@ -458,8 +609,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
}
void updateEvent(void* handle) {
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
uint64_t type = *(uint64_t *)handle;
if (type == ncclProfileGroupApi) {
struct groupApi* event = (struct groupApi*) handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
__atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED);
}
} else if (type == ncclProfileCollApi) {
struct collApi* event = (struct collApi*) handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
__atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
}
updateEvent(event->parent);
return;
} else if (type == ncclProfileP2pApi) {
struct p2pApi* event = (struct p2pApi*) handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
__atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
}
updateEvent(event->parent);
event->stopTs = gettime() - startTime;
} else if (type == ncclProfileKernelLaunch) {
struct kernelLaunch* event = (struct kernelLaunch*) handle;
event->stopTs = gettime() - startTime;
updateEvent(event->parent);
} else if (type == ncclProfileGroup) {
struct group* event = (struct group *)handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
@@ -527,25 +704,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
// the event handle might be null if we run out of events
if (eHandle == NULL) return ncclSuccess;
uint8_t type = *(uint8_t *)eHandle;
if (type == ncclProfileGroup) {
// stopping the group event in NCCL core does not
// mean the group has completed. It means the group
// was submitted/enqueued so we need to keep the event open
uint64_t type = *(uint64_t *)eHandle;
// Stopping API events, Kernel Launch events, collective/p2p task events
// in NCCL core do not mean that they are complete. It means that the
// operation was enqueued so we need to keep the events open
if (type == ncclProfileGroupApi) {
struct groupApi* event = (struct groupApi*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileCollApi) {
struct collApi* event = (struct collApi*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileP2pApi) {
struct p2pApi* event = (struct p2pApi*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileKernelLaunch) {
struct kernelLaunch* event = (struct kernelLaunch*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileGroup) {
struct group* event = (struct group *)eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileColl) {
// stopping the collective event in NCCL core does not
// mean the collective has completed. It means the collective
// was submitted/enqueued so we need to keep the event open
struct collective* event = (struct collective *)eHandle;
event->base.stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileP2p) {
// stopping the p2p event in NCCL core does not
// mean the p2p has completed. It means the p2p
// was submitted/enqueued so we need to keep the event open
struct p2p* event = (struct p2p *)eHandle;
event->base.stopTs = gettime() - startTime;
return ncclSuccess;
@@ -559,8 +746,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
// the event handle might be null if we run out of events
if (eHandle == NULL) return ncclSuccess;
uint8_t type = *(uint8_t *)eHandle;
if (type == ncclProfileProxyOp) {
uint64_t type = *(uint64_t *)eHandle;
if (type == ncclProfileGroupApi) {
struct groupApi* event = (struct groupApi*) eHandle;
if (eState == ncclProfilerEndGroupApiStart) {
event->endOfncclGroupStartTs = gettime() - startTime;
} else if (eState == ncclProfilerBeginGroupApiEnd) {
event->startOfncclGroupEndTs = gettime() - startTime;
}
} else if (type == ncclProfileProxyOp) {
struct proxyOp* event = (struct proxyOp *)eHandle;
if (eState == ncclProfilerProxyOpInProgress_v4) {
event->progrTs = gettime() - startTime;
@@ -592,6 +786,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
case ncclProfilerProxyStepRecvGPUWait:
event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
break;
default:
break;
}
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
@@ -609,7 +805,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
return ncclSuccess;
}
ncclProfiler_t ncclProfiler_v4 = {
ncclProfiler_t ncclProfiler_v5 = {
"Example-profiler",
exampleProfilerInit,
exampleProfilerStartEvent,
@@ -618,14 +814,15 @@ ncclProfiler_t ncclProfiler_v4 = {
exampleProfilerFinalize,
};
int exampleProfilerStart(int eActivationMask) {
__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) {
profilerDumpFile = name;
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
__atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
}
return ncclSuccess;
}
int exampleProfilerStop(void) {
__attribute__((visibility("default"))) int exampleProfilerStop(void) {
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
__atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
}
@@ -7,7 +7,8 @@
#ifndef PLUGIN_H_
#define PLUGIN_H_
int exampleProfilerStart(int eActivationMask);
int exampleProfilerStop(void);
__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name);
__attribute__((visibility("default"))) int exampleProfilerStop(void);
#endif
@@ -5,15 +5,59 @@
************************************************************************/
#include <stdio.h>
#include "err.h"
#include "profiler.h"
#include "event.h"
#include "print_event.h"
#include <cuda_runtime.h>
#define __hidden __attribute__ ((visibility("hidden")))
// FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET)
// Per-thread counter used as the chrome-tracing "id" of Group API records.
// The header prints the current value; the trailer prints it and then
// post-increments it, so a header/trailer pair shares one id.
static __thread int groupApiId;
// Writes the begin record ("ph": "b") for a Group API event to fh.
// The record carries the event's start timestamp plus its groupApiId and
// groupDepth fields as "args". The "tid" field is always written as 1.
__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n",
"Group API", groupApiId, getpid(), 1, event->startTs, event->groupApiId, event->groupDepth);
}
// Writes the end record ("ph": "e") for a Group API event to fh, using the
// event's stop timestamp, and advances the per-thread id counter.
__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"Group API", groupApiId++, getpid(), 1, event->stopTs);
}
// Per-thread counter used as the chrome-tracing "id" of P2P API records.
// The header prints the current value; the trailer prints it and then
// post-increments it, so a header/trailer pair shares one id.
static __thread int p2pApiId;
// Writes the begin record ("ph": "b") for a P2P API event to fh.
// "datatype" (a %s string) and "Stream" (a %p pointer) are wrapped in double
// quotes: neither a bare word nor a bare 0x... / (nil) token is a legal JSON
// value, and an unquoted one makes the trace unparseable. This matches how
// printCollEventHeader already quotes its "Datatype" field.
__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
          event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream);
}
// Writes the end record ("ph": "e") for a P2P API event to fh, using the
// event's stop timestamp, and advances the per-thread id counter.
__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          event->func, p2pApiId++, getpid(), 1, event->stopTs);
}
// Per-thread counter used as the chrome-tracing "id" of Collective API
// records. The header prints the current value; the trailer prints it and then
// post-increments it, so a header/trailer pair shares one id.
static __thread int collApiId;
// Writes the begin record ("ph": "b") for a Collective API event to fh.
// "datatype" (a %s string) and "Stream" (a %p pointer) are wrapped in double
// quotes: neither a bare word nor a bare 0x... / (nil) token is a legal JSON
// value, and an unquoted one makes the trace unparseable. This matches how
// printCollEventHeader already quotes its "Datatype" field.
__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"root\": %d, \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
          event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream);
}
// Writes the end record ("ph": "e") for a Collective API event to fh, using
// the event's stop timestamp, and advances the per-thread id counter.
__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          event->func, collApiId++, getpid(), 1, event->stopTs);
}
// Per-thread counter used as the chrome-tracing "id" of kernel launch records.
// The header prints the current value; the trailer prints it and then
// post-increments it, so a header/trailer pair shares one id.
static __thread int kernelLaunchId;
// Writes the begin record ("ph": "b") for a kernel launch event to fh.
// The "groupId" argument carries event->kernelLaunchId (key name kept as-is
// for compatibility with existing trace consumers).
// "Stream" (a %p pointer) is wrapped in double quotes because a bare
// 0x... / (nil) token is not a legal JSON value.
__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": \"%p\"}},\n",
          "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream);
}
// Writes the end record ("ph": "e") for a kernel launch event to fh, using the
// event's stop timestamp, and advances the per-thread id counter.
__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          "KernelLaunch", kernelLaunchId++, getpid(), 1, event->stopTs);
}
static __thread int groupId;
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
@@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
static __thread int collId;
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
}
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
@@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
static __thread int p2pId;
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
}
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
@@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) {
char filename[64] = { 0 };
sprintf(filename, "EventDebug-%d", getpid());
FILE* fh = fopen(filename, "a+");
uint8_t type = *(uint8_t *)eHandle;
uint64_t type = *(uint64_t *)eHandle;
if (type == ncclProfileGroup) {
struct group* event = (struct group *)eHandle;
fprintf(fh, "Group event %p tag = %s {\n", event, tag);
@@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) {
void printEvent(FILE* fh, void* handle) {
if (handle == NULL || fh == NULL) return;
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
uint64_t type = *(uint64_t *)handle;
if (type == ncclProfileGroupApi) {
struct groupApi* g = (struct groupApi*) handle;
printGroupApiEventHeader(fh, g);
struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents);
while (kernelLaunchHead != NULL) {
printEvent(fh, kernelLaunchHead);
kernelLaunchHead = kernelLaunchHead->next;
}
struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents);
while (collApiHead != NULL) {
printEvent(fh, collApiHead);
collApiHead = collApiHead->next;
}
struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents);
while (p2pApiHead != NULL) {
printEvent(fh, p2pApiHead);
p2pApiHead = p2pApiHead->next;
}
printGroupApiEventTrailer(fh, g);
} else if (type == ncclProfileCollApi) {
struct collApi* collApiEvent = (struct collApi *) handle;
printCollApiEventHeader(fh, collApiEvent);
struct taskEventBase* base = taskEventQueueHead(collApiEvent);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printCollApiEventTrailer(fh, collApiEvent);
} else if (type == ncclProfileP2pApi) {
struct p2pApi* p2pApiEvent = (struct p2pApi *) handle;
printP2pApiEventHeader(fh, p2pApiEvent);
struct taskEventBase* base = taskEventQueueHead(p2pApiEvent);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printP2pApiEventTrailer(fh, p2pApiEvent);
} else if (type == ncclProfileKernelLaunch) {
struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle;
printKernelLaunchEventHeader(fh, kernelLaunchEvent);
printKernelLaunchEventTrailer(fh, kernelLaunchEvent);
} else if (type == ncclProfileGroup) {
struct group* g = (struct group *)handle;
printGroupEventHeader(fh, g);
struct taskEventBase* base = taskEventQueueHead(g);
@@ -0,0 +1,50 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef QUEUE_H
#define QUEUE_H

// Intrusive singly-linked FIFO queue.
//
// T is the element type and `next` is a pointer-to-member selecting the T*
// field inside T that links one element to the following one. The queue only
// stores head/tail pointers; it never allocates or frees elements.
template<typename T, T *T::*next>
struct profilerQueue {
  T *head, *tail;
};

// Resets the queue to the empty state (both head and tail are null).
template<typename T, T *T::*next>
inline void profilerQueueConstruct(profilerQueue<T,next> *me) {
  me->head = nullptr;
  me->tail = nullptr;
}

// Returns true when the queue holds no elements.
template<typename T, T *T::*next>
inline bool profilerQueueEmpty(profilerQueue<T,next> *me) {
  return me->head == nullptr;
}

// Returns the first element without removing it, or nullptr when empty.
template<typename T, T *T::*next>
inline T* profilerQueueHead(profilerQueue<T,next> *me) {
  return me->head;
}

// Returns the last element without removing it, or nullptr when empty.
template<typename T, T *T::*next>
inline T* profilerQueueTail(profilerQueue<T,next> *me) {
  return me->tail;
}

// Appends x at the tail. x's link field is overwritten with nullptr.
template<typename T, T *T::*next>
inline void profilerQueueEnqueue(profilerQueue<T,next> *me, T *x) {
  x->*next = nullptr;
  // Empty queue: x becomes the head; otherwise hook x after the current tail.
  (me->head ? me->tail->*next : me->head) = x;
  me->tail = x;
}

// Removes and returns the first element, or nullptr when the queue is empty.
// The returned element's link field is left untouched.
template<typename T, T *T::*next>
inline T* profilerQueueDequeue(profilerQueue<T,next> *me) {
  T *ans = me->head;
  // Guard: following `next` through a null head would be undefined behavior.
  if (ans == nullptr) return nullptr;
  me->head = ans->*next;
  if (me->head == nullptr) me->tail = nullptr;
  return ans;
}

#endif
@@ -0,0 +1,22 @@
# None of these targets produce a file of the same name, so mark them all
# phony; otherwise a stray file called e.g. "clean" would disable the target.
.PHONY: all build-CoMMA clone-CoMMA clean delete

all: build-CoMMA

# Build the CoMMA checkout with cargo (cloning it first if needed).
build-CoMMA: clone-CoMMA
	cd CoMMA && cargo build

# Clone CoMMA once and link the parent directory of this Makefile into its
# third_party tree. $(CURDIR) is used instead of $(PWD) because PWD is
# inherited from the calling shell and is wrong under `make -C <dir>`.
# `&&` keeps the symlink from being attempted when the clone fails.
clone-CoMMA:
	@if [ ! -d CoMMA ] ; then \
		git clone https://github.com/google/CoMMA.git && \
		ln -s $(CURDIR)/.. CoMMA/third_party/nccl/ext-profiler; \
	fi

# Remove cargo build artifacts but keep the checkout.
clean:
	@if [ -d CoMMA ] ; then \
		cd CoMMA && cargo clean; \
	fi

# Remove the CoMMA checkout entirely.
delete:
	@if [ -d CoMMA ] ; then \
		rm -rf CoMMA; \
	fi
@@ -0,0 +1,62 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Variables
# `?=` so that NCCL_HOME can come from the environment as well as from the
# make command line; the built-in default is the in-tree NCCL build directory.
# CUDA_HOME is taken from the environment / command line as-is.
NCCL_HOME ?= ../../build
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO := libnccl-profiler-inspector.so
VERSION_FILE := version.cc

# Compiler and flags
CXX := g++
CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra

ifeq ($(DEBUG), 1)
CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer
endif

ifeq ($(ASAN), 1)
CXXFLAGS += -fsanitize=address
LDFLAGS += -fsanitize=address -static-libasan
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
endif

ifeq ($(UBSAN), 1)
CXXFLAGS += -fsanitize=undefined
LDFLAGS += -fsanitize=undefined -static-libubsan
NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
endif

# Source files
SOURCES := inspector_plugin.cc inspector.cc json.cc

# Default target
all: $(PLUGIN_SO)

# Rule to build the plugin (compiles and links all sources in one step)
$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES)
	@echo "Compiling to create $@ from $^"
	$(CXX) $(INC) $(CXXFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^

# Rule to generate version.cc
# FORCE makes the recipe run on every build so a changed git revision is
# picked up. The file is only replaced when its content differs, so an
# unchanged version does not bump the timestamp and trigger a rebuild.
# `cmp -s` stays quiet when version.cc does not exist yet (first build).
$(VERSION_FILE): FORCE
	@GIT_INFO=$$(./utils/extract_git_version.sh); \
	echo '#include "version.h"' > $(VERSION_FILE).tmp; \
	echo 'const char* get_git_version_info() { return "'$$GIT_INFO'"; }' >> $(VERSION_FILE).tmp; \
	if ! cmp -s $(VERSION_FILE).tmp $(VERSION_FILE); then \
		echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \
		mv $(VERSION_FILE).tmp $(VERSION_FILE); \
	else \
		echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \
		rm $(VERSION_FILE).tmp; \
	fi

# Empty rule used only as an always-out-of-date prerequisite.
FORCE:

# Clean target
clean:
	rm -f $(VERSION_FILE) $(PLUGIN_SO)

# Phony targets
.PHONY: all clean FORCE
@@ -0,0 +1,216 @@
# NCCL Inspector Plugin
The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation.
## Related Documentation
- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs
## Folder Location
The Inspector plugin source is located in:
```
ext-profiler/inspector/
```
## Building the Inspector Plugin
To build the Inspector plugin, run:
```bash
make
```
By default the Makefile looks for the NCCL headers in the in-tree build directory (`../../build`) and for the CUDA headers under `$CUDA_HOME`. To use other locations, pass `CUDA_HOME` and `NCCL_HOME` as make arguments, for example `make NCCL_HOME=/path/to/nccl/build CUDA_HOME=/usr/local/cuda`.
### Build Options
The Makefile supports several build options:
- **DEBUG=1**: Enable debug build with additional debugging information
- **ASAN=1**: Enable Address Sanitizer for memory error detection
- **UBSAN=1**: Enable Undefined Behavior Sanitizer
Example debug build:
```bash
make DEBUG=1
```
### Build Output
The build process creates:
- `libnccl-profiler-inspector.so`: The main inspector plugin library
- `version.cc`: Auto-generated version information from git
## Using NCCL Inspector
### Key Differences from Normal NCCL Usage
The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging:
**Normal NCCL Run:**
```bash
# Standard NCCL execution
./your_nccl_application
```
**NCCL Inspector Run:**
```bash
# NCCL Inspector enabled execution
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
export NCCL_INSPECTOR_ENABLE=1
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
./your_nccl_application
```
### Required Environment Variables
- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so`
Loads the Inspector plugin into NCCL.
- `NCCL_INSPECTOR_ENABLE=1`
Enables the Inspector plugin.
- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=<interval>`
Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`.
- `NCCL_INSPECTOR_DUMP_DIR=<output_dir>` (optional)
Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-<slurm_job_id>` if running under SLURM.
- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional)
Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default).
### Example Usage
**Single Node:**
```bash
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
export NCCL_INSPECTOR_ENABLE=1
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8
```
**Multi-Node (SLURM):**
```bash
# Add these environment variables to your SLURM script
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
export NCCL_INSPECTOR_ENABLE=1
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/
# Then run your normal NCCL application
srun your_nccl_application
```
## Example Scripts
For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory:
- **Single Node Example**: Basic NCCL performance testing with inspector
- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations
- **Training Workload Example**: Integration with distributed training workloads
## Output Example
Each output file contains JSON objects with the following structure:
```json
{
"header": {
"id": "0x7f8c496ae9f661",
"rank": 2,
"n_ranks": 8,
"nnodes": 1
},
"metadata": {
"inspector_output_format_version": "v4.0",
"git_rev": "",
"rec_mechanism": "profiler_plugin",
"dump_timestamp_us": 1748030377748202,
"hostname": "example-hostname",
"pid": 1639453
},
"coll_perf": {
"coll": "AllReduce",
"coll_sn": 1407,
"coll_msg_size_bytes": 17179869184,
"coll_exec_time_us": 61974,
"coll_algobw_gbs": 277.210914,
"coll_busbw_gbs": 485.119099
}
}
```
## Output Example Verbose
To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable:
```bash
export NCCL_INSPECTOR_DUMP_VERBOSE=1
```
This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event.
```json
{
"header": {
"id": "0xe62dedaa97644a",
"rank": 4,
"n_ranks": 8,
"nnodes": 1
},
"metadata": {
"inspector_output_format_version": "v4.0",
"git_rev": "9019a1912-dirty",
"rec_mechanism": "nccl_profiler_interface",
"dump_timestamp_us": 1752867229276385,
"hostname": "example-hostname",
"pid": 438776
},
"coll_perf": {
"coll": "ReduceScatter",
"coll_sn": 1231,
"coll_msg_size_bytes": 2147483648,
"coll_exec_time_us": 41057,
"coll_timing_source": "kernel_gpu",
"coll_algobw_gbs": 418.439467,
"coll_busbw_gbs": 366.134533,
"event_trace_sn": {
"coll_start_sn": 1,
"coll_stop_sn": 2,
"kernel_events": [
{
"channel_id": 0,
"kernel_start_sn": 3,
"kernel_stop_sn": 48,
"kernel_record_sn": 47
}
]
},
"event_trace_ts": {
"coll_start_ts": 1752867229235059,
"coll_stop_ts": 1752867229235064,
"kernel_events": [
{
"channel_id": 0,
"kernel_start_ts": 1752867229235181,
"kernel_stop_ts": 1752867229275811,
"kernel_record_ts": 1752867229275811
}
]
}
}
}
```
Multiple such JSON objects are written, one per collective operation per communicator.
## Output Directory
- By default, output files are written to:
- `nccl-inspector-unknown-jobid` (if no SLURM job ID is present)
- `nccl-inspector-<slurm_job_id>` (if running under SLURM)
- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable.
## Additional Notes
- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments.
- For more details, see the source code and comments in `ext-profiler/inspector/`.
@@ -0,0 +1,151 @@
# NCCL Inspector Performance Summary Exporter
This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries.
One can build similar exporters to integrate with various observability systems like Elastic, Prometheus or other Custom Metric systems.
## Features
- **Performance Analysis**: Generates statistical summaries for collective operations
- **Communication Type Classification**: Automatically categorizes communication patterns
- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics
- **Data Export**: Converts logs to Parquet format for efficient processing
- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files
- **Parallel Processing**: Utilizes multi-core processing for faster analysis
## Requirements
- Python 3.7+
- Access to NCCL Inspector log files
## Installation
### Clone the Repository
```bash
git clone https://github.com/NVIDIA/nccl.git
cd nccl/ext-profiler/inspector/exporter/example
```
Install the required dependencies using the provided `requirements.txt` file:
```bash
pip install -r requirements.txt
```
## Usage
The script processes NCCL Inspector log files from a specified directory.
**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files are written to the directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../README.md).
### Basic Usage
```bash
python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs
```
This mode processes all log files in the specified directory and its subdirectories recursively.
### Command Line Arguments
- `--input_dir <path>`: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories)
- `--output_dir <name>`: **Optional**. Custom output directory name (default: `<input_directory_name>-analysis`)
## Output
The tool generates:
1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory)
2. **Summary Directory**: Contains comprehensive analysis results
3. **Visualizations**: Scatter plots, histograms, and box plots for each message size
4. **CSV Files**: Detailed summaries for each message size and collective type
5. **Log File**: Processing log with detailed information
## Example Output Structure
```
<output_dir_name>/
├── output.log
├── parquet_files/
│ ├── <filename1>.parquet
│ ├── <filename2>.parquet
│ └── ...
└── summary/
├── scatter_plot_<comm_type>_<coll_type>.png
├── combined_scatter_plot_<comm_type>_<coll_type>.png
└── msg_size_<human_readable_size>/
├── histograms/
│ └── histogram_<comm_type>_<coll_type>_<size>.png
├── boxplots/
│ └── boxplot_<comm_type>_<coll_type>_<size>.png
└── summary_<comm_type>_<coll_type>_<size>.csv
```
## Supported Communicator Types
- `single-rank`
- `nvlink-only`
- `hca-only`
- `mixed`
## Supported Collective Types
- `AllReduce`
- `AllGather`
- `ReduceScatter`
- `Broadcast`
## Log File Formats
### Supported Formats
- `.log` - Plain text JSON lines
- `.log.gz` - Compressed JSON lines
- `.jsonl` - JSON lines format
- `.jsonl.gz` - Compressed JSON lines
### Expected JSON Structure
```json
{
"header": {
"id": "0x9e7a479f95a66c",
"rank": 31,
"n_ranks": 32,
"nnodes": 4
},
"metadata": {
"inspector_output_format_version": "v4.0",
"git_rev": "75e61acda-dirty",
"rec_mechanism": "nccl_profiler_interface",
"dump_timestamp_us": 1749490229087081,
"hostname": "example-hostname",
"pid": 468528
},
"coll_perf": {
"coll": "ReduceScatter",
"coll_sn": 129,
"coll_msg_size_bytes": 65536,
"coll_exec_time_us": 110,
"coll_timing_source": "kernel_gpu",
"coll_algobw_gbs": 19.065018,
"coll_busbw_gbs": 18.469236
}
}
```
## Troubleshooting
### Common Issues
1. **No log files found**: Ensure the log directory path is correct and contains valid log files
2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment
3. **Mixed file formats**: The tool will exit if it detects mixed `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files in the same directory. This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings.
### Log Files
The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages.
## Support
Please check the GitHub issues page at https://github.com/NVIDIA/nccl/issues first; your question may already have been asked by another user. If not, feel free to create a new issue and mention the "inspector plugin" in the title.
@@ -0,0 +1,548 @@
from pathlib import Path
import argparse
import glob
import gzip
import sys
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import json
from tqdm.auto import tqdm
import duckdb
import math
import matplotlib.pyplot as plt
import matplotlib.dates
from matplotlib.gridspec import GridSpec
import os
import logging
import contextlib
from datetime import datetime
import numpy as np
def setup_logging(output_dir):
    """Route INFO-and-above log records to ``output.log`` inside ``output_dir``.

    Args:
        output_dir: Path-like object supporting the ``/`` operator
            (e.g. ``pathlib.Path``) naming the directory for the log file.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
        filename=output_dir / "output.log",
    )
@contextlib.contextmanager
def smart_open(filename, mode="r"):
    """Open a plain or gzip-compressed file with the same text/binary semantics.

    Files whose name ends in ``.gz`` are opened with ``gzip.open``; everything
    else with the built-in ``open``.

    Args:
        filename: Path of the file to open (``str`` or ``os.PathLike``).
        mode: Mode string as accepted by ``open``. When neither ``"b"`` nor
            ``"t"`` is given, text mode is used for both kinds of file.

    Yields:
        The opened file object; it is closed when the ``with`` block exits.
    """
    name = os.fspath(filename)
    if name.endswith(".gz"):
        # gzip.open() treats a bare "r"/"w"/"a" as *binary*, unlike open().
        # Ask for text explicitly so callers get str lines either way.
        if "b" not in mode and "t" not in mode:
            mode += "t"
        opener = gzip.open
    else:
        opener = open
    with opener(name, mode) as f:
        yield f
def get_log_files_and_output_dir():
    """Parse the command line and collect the inspector log files to process.

    ``--input_dir`` is mandatory: everything below depends on it, so a missing
    value is reported by argparse as a usage error instead of failing later on
    an undefined variable.

    Returns:
        tuple: ``(files, output_dir_name)`` where ``files`` is the list of log
        file paths found recursively under the input directory and
        ``output_dir_name`` is ``--output_dir`` if given, otherwise
        ``"<input directory name>-analysis"``.

    Raises:
        FileNotFoundError: If the input directory does not exist.
        SystemExit: On a usage error, when several log formats are mixed in
            the input tree, or when no log file is found.
    """
    parser = argparse.ArgumentParser(description="Process log files in a directory.")
    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="The directory containing NCCL Inspector log files to process.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Custom output directory name (default: auto-generated from input directory)."
    )
    args = parser.parse_args()

    root_dir = Path(args.input_dir)
    if not root_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {root_dir}")

    # One recursive search per supported extension, kept separate so that a
    # mix of formats can be detected.
    patterns = ("*.log", "*.log.gz", "*.jsonl", "*.jsonl.gz")
    found = [
        list(glob.iglob(str(root_dir / "**" / pattern), recursive=True))
        for pattern in patterns
    ]

    if sum(1 for group in found if group) > 1:
        ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail
        logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!")
        sys.exit(1)

    files = [path for group in found for path in group]
    if not files:
        print("No inspector logs found")
        sys.exit(1)

    # Generate output directory name from input directory
    if args.output_dir:
        output_dir_name = args.output_dir
    else:
        output_dir_name = f"{root_dir.name}-analysis"

    return files, output_dir_name
def bytes_to_human_readable(size_bytes):
    """
    Convert bytes to human-readable format using decimal (SI) units.

    Uses powers of 1000 (decimal/SI standard):
    - 1 KB = 1,000 bytes
    - 1 MB = 1,000,000 bytes
    - 1 GB = 1,000,000,000 bytes

    Not binary units (powers of 1024):
    - Does NOT use KiB, MiB, GiB (1024-based)

    Args:
        size_bytes: Number of bytes to convert (non-negative)

    Returns:
        Human-readable string (e.g., "1.50MB", "2.34GB"). Values below one
        byte are reported in "B"; values beyond the largest unit stay in "YB".

    Raises:
        ValueError: If size_bytes is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Derive the unit from the number of integer digits rather than log10():
    # this is exact, works for 0 < size_bytes < 1 (integer part 0), and the
    # clamp keeps values >= 1e27 from indexing past the last unit.
    digits = len(str(int(size_bytes)))
    i = min((digits - 1) // 3, len(size_name) - 1)
    s = size_bytes / (1000.0 ** i)
    return f"{s:.2f}{size_name[i]}"
def timestamp_to_datetime(timestamp_us):
    """Format a microsecond epoch timestamp as a local-time string.

    The result has millisecond precision: the last three digits of the
    microsecond field are dropped.
    """
    seconds = timestamp_us / 1000000
    with_micros = datetime.fromtimestamp(seconds).strftime('%Y-%m-%d %H:%M:%S.%f')
    return with_micros[:-3]
def microseconds_to_human_readable(microseconds):
    """Render a duration given in microseconds with the largest fitting unit.

    Durations of one second or more are shown in seconds, those of one
    millisecond or more in milliseconds, and anything shorter in
    microseconds; one decimal place is always printed.
    """
    if microseconds >= 1000000:
        return f"{microseconds/1000000:.1f}s"
    if microseconds >= 1000:
        return f"{microseconds/1000:.1f}ms"
    return f"{microseconds:.1f}μs"
def get_comm_type(row) -> str:
    """Classify a communicator from its ``n_ranks`` and ``nnodes`` entries.

    Returns:
        "single-rank" when there is exactly one rank, "nvlink-only" when all
        ranks are on one node, "hca-only" when there is one rank per node,
        and "mixed" otherwise.
    """
    n_ranks = row["n_ranks"]
    if n_ranks == 1:
        return "single-rank"

    # "nnodes" is only looked up once the single-rank case is ruled out.
    nnodes = row["nnodes"]
    if nnodes == 1:
        return "nvlink-only"

    return "hca-only" if n_ranks == nnodes else "mixed"
def parse_file(filepath: Path, output_dir):
    """Convert one inspector log file into a parquet file in ``output_dir``.

    The parquet file is named after the source file's stem. Work is skipped
    when an existing parquet file is at least as new as the source, when the
    source is empty, or when no line yields a valid record. Each valid line is
    a JSON object with "header", "metadata" and "coll_perf" sections, which
    are flattened into a single row together with the derived ``comm_type``.

    Args:
        filepath: Path of the log file to read (plain or ``.gz``).
        output_dir: Directory (``Path``-like) receiving the parquet file.
    """
    source = Path(filepath)
    parquet_file = output_dir / f"{source.stem}.parquet"

    # Reuse a previous conversion unless the source changed after it.
    if parquet_file.exists():
        source_mtime = source.stat().st_mtime
        parquet_mtime = parquet_file.stat().st_mtime
        if parquet_mtime >= source_mtime:
            logging.info(f"Parquet file {parquet_file} is up to date. Skipping...")
            return
        logging.info(f"Source file {filepath} is newer than parquet. Regenerating...")

    if source.stat().st_size == 0:
        logging.warning(f"Skipping empty file: {filepath}")
        return

    required_sections = ("header", "metadata", "coll_perf")
    recs = []
    try:
        with smart_open(filepath, "r") as infile:
            for lineno, line in enumerate(infile):
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    logging.error(f"Failed to parse line {filepath}:{lineno}")
                    continue

                # Lines missing any of the three sections are dropped.
                if any(section not in entry for section in required_sections):
                    logging.error(f"Missing required fields in {filepath}:{lineno}")
                    continue

                header = entry["header"]
                recs.append(
                    dict(
                        **header,
                        comm_type=get_comm_type(header),
                        **entry["coll_perf"],
                        **entry["metadata"],
                    )
                )
    except Exception as e:
        # Any other failure abandons the whole file.
        logging.error(f"Error reading file {filepath}: {e}")
        return

    if not recs:
        logging.warning(f"No valid records found in file: {filepath}. Skipping...")
        return

    pd.DataFrame(recs).to_parquet(parquet_file)
    logging.info(f"Created parquet file {parquet_file} with {len(recs)} records")
def create_per_node_parquet_files(files, output_dir):
    """Convert every log file to parquet in parallel worker processes.

    Args:
        files: List of log file paths to convert.
        output_dir: Root output directory; parquet files are written to its
            ``parquet_files`` subdirectory, which is created if needed.

    Returns:
        Path of the ``parquet_files`` directory.
    """
    output_dir = Path(output_dir) / "parquet_files"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Never more workers than files, CPUs or 64, but always at least one:
    # ProcessPoolExecutor raises ValueError for max_workers=0 (empty list).
    max_workers = max(1, min(64, len(files), os.cpu_count() or 1))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map so all conversions finish (and the
        # progress bar advances) before the pool is shut down.
        list(
            tqdm(
                executor.map(parse_file, files, [output_dir] * len(files)),
                total=len(files),
                desc="Processing files",
                unit="file",
            )
        )
    return output_dir
def generate_scatter_plot(df, comm_type, coll_type, output_file):
    """Plot mean bus bandwidth against sequence number, one series per message size.

    Args:
        df: DataFrame with ``coll_msg_size_bytes``, ``coll_sn`` and
            ``mean_coll_busbw_gbs`` columns.
        comm_type: Communicator type shown in the title.
        coll_type: Collective type shown in the title.
        output_file: Destination of the saved figure.
    """
    plt.figure(figsize=(10, 6), dpi=100)

    for msg_size in df["coll_msg_size_bytes"].unique():
        subset = df[df["coll_msg_size_bytes"] == msg_size]
        busbw = subset["mean_coll_busbw_gbs"]
        series_label = (
            f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {busbw.mean():.2f} GB/s)"
        )
        plt.scatter(subset["coll_sn"], busbw, label=series_label, alpha=0.5)

    plt.xlabel("Operation Sequence Number")
    plt.ylabel("Mean Collective Bus BW (GB/s)")
    plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}")
    plt.legend(title="Message Size", loc="upper right")
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Scatter plot saved to {output_file}")
def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3):
    """Save a grid of scatter plots, one subplot per distinct message size.

    Each subplot shows mean bus bandwidth against operation sequence number.

    Args:
        df: DataFrame with ``coll_msg_size_bytes``, ``coll_sn`` and
            ``mean_coll_busbw_gbs`` columns.
        comm_type: Communicator type shown in the figure title.
        coll_type: Collective type shown in the figure title.
        output_file: Destination of the saved figure.
        max_cols: Maximum number of subplot columns (values below 1 are
            treated as 1).
    """
    distinct_msg_sizes = df["coll_msg_size_bytes"].unique()
    num_plots = len(distinct_msg_sizes)

    # Nothing to draw; the grid computation below would divide by zero.
    if num_plots == 0:
        logging.warning(
            f"No data for combined scatter plot ({comm_type}, {coll_type}); skipping {output_file}"
        )
        return

    # Compute number of rows and columns (at least one column)
    num_cols = max(1, min(max_cols, num_plots))
    num_rows = (num_plots + num_cols - 1) // num_cols  # ceil-div

    # Create figure with GridSpec
    fig = plt.figure(figsize=(5 * num_cols, 5 * num_rows), dpi=100)
    gs = GridSpec(num_rows, num_cols, figure=fig)

    for i, msg_size in enumerate(distinct_msg_sizes):
        row, col = divmod(i, num_cols)  # row-major position in the grid
        ax = fig.add_subplot(gs[row, col])

        df_msg_size = df[df["coll_msg_size_bytes"] == msg_size]
        mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean()
        ax.scatter(
            df_msg_size["coll_sn"],
            df_msg_size["mean_coll_busbw_gbs"],
            label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)",
            alpha=0.5,
        )
        ax.set_xlabel("Op Seq No")
        ax.set_ylabel("Mean Collective Bus BW (GB/s)")
        ax.set_title(f"Message Size: {bytes_to_human_readable(msg_size)}({msg_size})")
        ax.legend(loc="upper right")

    fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98)
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Combined scatter plot saved to {output_file}")
def generate_histogram(df, comm_type, coll_type, output_file, message_size):
    """Save a histogram of the mean collective bus bandwidth values.

    The bin count is the integer width of the value range plus one, capped
    at 50.

    Args:
        df: DataFrame with a ``mean_coll_busbw_gbs`` column.
        comm_type: Communicator type shown in the title.
        coll_type: Collective type shown in the title.
        output_file: Destination of the saved figure.
        message_size: Message size text shown in the title.
    """
    busbw = df["mean_coll_busbw_gbs"]

    plt.figure(figsize=(10, 6), dpi=100)
    value_span = busbw.max() - busbw.min()
    bin_count = min(50, int(value_span) + 1)

    plt.hist(
        busbw,
        bins=bin_count,
        alpha=0.7,
        color="b",
        edgecolor="black",
        linewidth=1.2,
    )
    plt.xlabel("Mean Collective Bus BW (GB/s)")
    plt.ylabel("Frequency")
    plt.title(
        f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}"
    )

    # Whole-number counts on y, two-decimal GB/s labels on x.
    axes = plt.gca()
    axes.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}"))
    axes.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f} GB/s"))
    axes.xaxis.get_offset_text().set_visible(False)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Histogram saved to {output_file}")
def generate_boxplot(df, comm_type, coll_type, output_file, message_size):
    """Save a horizontal box plot of the mean collective bus bandwidth values.

    The minimum, median and maximum are annotated with labelled arrows.

    Args:
        df: DataFrame with a ``mean_coll_busbw_gbs`` column.
        comm_type: Communicator type shown in the title.
        coll_type: Collective type shown in the title.
        output_file: Destination of the saved figure.
        message_size: Message size text shown in the title.
    """
    busbw = df["mean_coll_busbw_gbs"]

    plt.figure(figsize=(10, 6))
    plt.boxplot(
        busbw,
        vert=False,
        patch_artist=True,
        boxprops=dict(linestyle="-", linewidth=2, color="blue"),
        flierprops=dict(marker="o", color="red", alpha=0.5),
        medianprops=dict(linestyle="-", linewidth=2.5, color="orange"),
        whiskerprops=dict(linestyle="--", linewidth=2, color="green"),
        capprops=dict(linestyle="-", linewidth=2, color="black"),
    )
    plt.xlabel("Mean Coll Bus BW (GB/s)")
    plt.title(
        f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})"
    )

    # Adding labels for min, median, and max (drawn in that order)
    stats = busbw.describe(percentiles=[0.5])
    for caption, stat_key in (("Min", "min"), ("Median", "50%"), ("Max", "max")):
        value = stats[stat_key]
        plt.annotate(
            f"{caption}: {value:.2f}",
            xy=(value, 1),
            xytext=(value, 1.1),
            arrowprops=dict(facecolor="black", shrink=0.05),
        )

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Box plot saved to {output_file}")
def _quarantine_parquet(pf, invalid_dir, message):
    """Log ``message`` and move ``pf`` into ``invalid_dir``.

    Returns True when this call moved the file, False when the file no longer
    exists at its original location (for example because a concurrent worker
    summarizing the same directory already moved it).
    """
    logging.warning(message)
    try:
        pf.rename(invalid_dir / pf.name)
    except FileNotFoundError:
        return False
    return True


def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name):
    """Summarize parquet data per communication and collective type using DuckDB.

    Args:
        output_root: ``Path`` whose ``parquet_files`` sub-directory is scanned.
        comm_type: Value matched against the ``comm_type`` column.
        coll_type: Value matched against the ``coll`` column.
        output_dir_name: Used for logging only.

    Returns:
        A DataFrame with one row per (id, coll_sn, coll_msg_size_bytes) group,
        or ``None`` when there are no usable parquet files, the query fails, or
        the query yields no rows.

    Zero-byte, zero-row and unreadable parquet files are moved to
    ``parquet_files/invalid`` before querying.
    """
    logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}")

    parquet_dir = output_root / "parquet_files"
    parquet_files = list(parquet_dir.glob("*.parquet"))
    if not parquet_files:
        logging.warning(f"No parquet files found for {comm_type} and {coll_type}")
        return None

    # Imported once, outside the per-file try block: a missing pyarrow must
    # surface as an ImportError instead of quarantining every file as invalid.
    import pyarrow.parquet as pq

    invalid_dir = parquet_dir / "invalid"
    invalid_dir.mkdir(exist_ok=True)
    invalid_count = 0
    for pf in parquet_files:
        try:
            if pf.stat().st_size == 0:
                reason = f"Moving zero-byte parquet file {pf} to invalid directory"
            elif pq.ParquetFile(pf).metadata.num_rows == 0:
                # Only the parquet footer metadata is inspected here.
                reason = f"Moving empty parquet file {pf} (0 rows) to invalid directory"
            else:
                continue
        except FileNotFoundError:
            # The file disappeared between glob() and the check; nothing to move.
            continue
        except Exception as e:
            reason = f"Moving invalid parquet file {pf} to invalid directory: {e}"
        if _quarantine_parquet(pf, invalid_dir, reason):
            invalid_count += 1

    remaining_files = list(parquet_dir.glob("*.parquet"))
    if not remaining_files:
        logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)")
        return None
    logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)")

    # Escape single quotes so the path cannot terminate the SQL string literal.
    parquet_glob = f"{parquet_dir}/*.parquet".replace("'", "''")
    try:
        duckdb.execute(
            f"CREATE OR REPLACE VIEW logs AS SELECT * FROM read_parquet('{parquet_glob}')"
        )
        # Filter values are bound as parameters rather than interpolated.
        df = duckdb.execute(
            """
            SELECT
                id,
                coll_sn,
                coll_msg_size_bytes,
                AVG(coll_busbw_gbs) as mean_coll_busbw_gbs,
                COUNT(*) as log_count,
                ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks,
                ARRAY_DISTINCT(LIST(nnodes)) as nnodes,
                MIN(dump_timestamp_us) as coll_start_timestamp_us,
                MAX(dump_timestamp_us) as coll_end_timestamp_us,
                (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us
            FROM logs
            WHERE coll = ? and comm_type = ?
            GROUP BY id, coll_sn, coll_msg_size_bytes
            ORDER BY coll_sn
            """,
            [coll_type, comm_type],
        ).df()
    except Exception as e:
        logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}")
        return None

    if df.empty:
        logging.info(f"No data for {comm_type} and {coll_type}")
        return None

    df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply(
        bytes_to_human_readable
    )

    # Log the time range of the first row as a sanity sample (df is non-empty here).
    sample_row = df.iloc[0]
    start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us'])
    end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us'])
    duration = microseconds_to_human_readable(sample_row['coll_duration_us'])
    logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, "
                 f"Start: {start_time}, End: {end_time}, Duration: {duration}")
    return df
def generate_visualizations(df, output_root, comm_type, coll_type):
    """Write plots and per-message-size CSV summaries under ``output_root/summary``.

    Two scatter plots covering all message sizes are produced first. Then, for
    every distinct ``coll_msg_size_bytes`` value, a histogram, a box plot and a
    CSV (with added human-readable time columns) are written into a dedicated
    ``msg_size_*`` directory.
    """
    logging.info(f"Generating visualizations for {comm_type} and {coll_type}")

    summary_dir = output_root / "summary"
    summary_dir.mkdir(parents=True, exist_ok=True)

    # Plots spanning all message sizes.
    generate_scatter_plot(
        df, comm_type, coll_type,
        summary_dir / f"scatter_plot_{comm_type}_{coll_type}.png",
    )
    generate_combined_scatter_plot(
        df, comm_type, coll_type,
        summary_dir / f"combined_scatter_plot_{comm_type}_{coll_type}.png",
    )

    for msg_size in df["coll_msg_size_bytes"].unique():
        size_label = bytes_to_human_readable(msg_size)
        size_dir = summary_dir / f"msg_size_{msg_size}_{size_label}"
        hist_dir = size_dir / "histograms"
        box_dir = size_dir / "boxplots"
        for directory in (size_dir, hist_dir, box_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Work on a copy so the derived columns do not touch the caller's frame.
        subset = df[df["coll_msg_size_bytes"] == msg_size].copy()
        subset["coll_start_datetime"] = subset["coll_start_timestamp_us"].apply(timestamp_to_datetime)
        subset["coll_end_datetime"] = subset["coll_end_timestamp_us"].apply(timestamp_to_datetime)
        subset["coll_duration_human"] = subset["coll_duration_us"].apply(microseconds_to_human_readable)

        hist_path = hist_dir / f"histogram_{comm_type}_{coll_type}_{msg_size}.png"
        generate_histogram(
            subset, comm_type, coll_type, hist_path, bytes_to_human_readable(msg_size)
        )

        box_path = box_dir / f"boxplot_{comm_type}_{coll_type}_{msg_size}.png"
        generate_boxplot(
            subset, comm_type, coll_type, box_path, bytes_to_human_readable(msg_size)
        )

        output_file = size_dir / f"summary_{comm_type}_{coll_type}_{msg_size}.csv"
        subset.to_csv(output_file, index=False)
        logging.info(
            f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {output_file}"
        )
def generate_summary(output_root, comm_type, coll_type, output_dir_name):
    """Summarize one (comm_type, coll_type) pair and, if data exists, visualize it."""
    logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}")

    summary_df = summarize_data_per_comm_coll_type(
        output_root, comm_type, coll_type, output_dir_name
    )
    if summary_df is None:
        # Nothing to plot for this combination.
        logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation")
        return

    generate_visualizations(summary_df, output_root, comm_type, coll_type)
def generate_summary_wrapper(args):
    """Unpack an argument tuple and forward it to ``generate_summary``.

    Exists so the work can be dispatched through a single-argument ``map``.
    """
    outcome = generate_summary(*args)
    return outcome
if __name__ == "__main__":
    # Discover the input logs and the output location.
    files, output_dir_name = get_log_files_and_output_dir()
    print(f"Number of log files found: {len(files)}")
    print(f"Output directory: {output_dir_name}")

    output_dir = Path(output_dir_name)
    output_dir.mkdir(parents=True, exist_ok=True)
    setup_logging(output_dir)

    # Stage 1: convert the raw logs into parquet files.
    create_per_node_parquet_files(files, output_dir)

    # Stage 2: one summary task per (communication type, collective type) pair.
    comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"]
    coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"]
    summary_args = []
    for comm_type in comm_types:
        for coll_type in coll_types:
            summary_args.append((output_dir, comm_type, coll_type, output_dir_name))

    # Never spawn more workers than tasks or CPUs, and cap the pool at 64.
    cpu_total = os.cpu_count() or 1
    max_workers = min(64, len(summary_args), cpu_total)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(generate_summary_wrapper, summary_args)
        # Draining the iterator drives the progress bar and re-raises worker errors.
        list(
            tqdm(
                results,
                total=len(summary_args),
                desc="Generating summaries",
            )
        )
    print("Done!")
@@ -0,0 +1,6 @@
pandas>=1.3.0
tqdm>=4.60.0
duckdb>=0.8.0
matplotlib>=3.3.0
pyarrow>=5.0.0
numpy>=1.21.0
File diff ditekan karena terlalu besar Load Diff
@@ -0,0 +1,198 @@
#pragma once
#include <pthread.h>
#include "json.h"
#include "common.h"
#include "version.h"
#define MAX_CHANNELS 64
// Evaluate `call`, store its result in `res`, and if the result is not
// inspectorSuccess log the file/line and the inspectorErrorString() text at
// INFO level, then jump to `label`.
// NOTE(review): the expansion ends in `while (0);`. With that trailing
// semicolon, `INS_CHK_GOTO(...);` expands to two statements, which breaks use
// as the unbraced body of an if/else. Confirm every call site terminates the
// macro with its own `;` before removing it.
#define INS_CHK_GOTO(call, res, label) \
do { \
res = call; \
if (inspectorSuccess != res) { \
INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \
inspectorErrorString(res)); \
goto label; \
} \
} while (0);
// Identifiers for the collective / point-to-point operations tracked by the
// inspector. ncclNumFuncs is the number of real entries (0..7).
// NOTE(review): presumably this must stay in sync with NCCL's own function
// enumeration — confirm against the NCCL headers.
typedef enum {
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncSendRecv = 5,
ncclFuncSend = 6,
ncclFuncRecv = 7,
ncclNumFuncs = 8
} ncclFunc_t;
// Result codes returned by inspector functions. inspectorSuccess (0) is the
// only value INS_CHK_GOTO treats as success; inspectorErrorString() maps a
// code to text.
typedef enum {
inspectorSuccess = 0,
inspectorUninitializedError,
inspectorMemoryError,
inspectorFileOpenError,
inspectorDisabledError,
inspectorLockError,
inspectorPthreadError,
inspectorJsonError,
inspectorCudaError,
inspectorBadHash,
inspectorDeleteUnknownCommError,
inspectorAddDuplicateCommError,
inspectorNop,
inspectorNullTally,
inspectorGlobalInitError,
// Not an error: returned by inspectorPluginCollInfoDeRef() when the last
// reference was dropped and the object has been freed.
inspectorReturn,
} inspectorResult_t;
// Identifies which measurement a completed collective's execution time came
// from (stored in inspectorCompletedCollInfo::timingSource).
// NOTE(review): the exact meaning of each source is inferred from the names
// only (GPU kernel clock, CPU-side kernel timestamps, CPU-side collective
// timestamps) — confirm against the code that sets it.
typedef enum {
inspectorTimingSourceKernelGpu = 0,
inspectorTimingSourceKernelCpu = 1,
inspectorTimingSourceCollectiveCpu = 2,
} inspectorTimingSource_t;
// One recorded trace point: a timestamp and the sequence number assigned to
// the event within its collective.
struct inspectorEventTraceInfo {
uint64_t ts;
uint64_t sn;
};
// Indices into inspectorEventTrkCollInfo::evntTrace; the final *_NEVT entry is
// the array length.
typedef enum {
NCCL_INSP_EVT_TRK_COLL_START = 0,
NCCL_INSP_EVT_TRK_COLL_STOP = 1,
NCCL_INSP_EVT_TRK_COLL_NEVT = 2,
} inspectorEventTrkColl_t;
// Indices into inspectorEventTrkKernelInfo::evntTrace; the final *_NEVT entry
// is the array length.
typedef enum {
NCCL_INSP_EVT_TRK_KERNEL_START = 0,
NCCL_INSP_EVT_TRK_KERNEL_STOP = 1,
NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2,
NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3,
} inspectorEventTrkKernel_t;
// Trace points for a single kernel channel.
struct inspectorEventTrkKernelInfo {
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT];
};
// Trace points for one collective plus all of its kernel channels.
struct inspectorEventTrkCollInfo {
// Running event counter; pre-incremented each time a trace point is recorded.
int sn;
uint32_t nChannels;
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT];
// Indexed by channel id; sized by MAX_CHANNELS.
struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS];
};
// Snapshot describing a collective once it has completed. Filled by
// inspectorUpdateCollPerf() / inspectorComputeCollBw() and then copied into
// inspectorCommInfo::completedCollInfo.
struct inspectorCompletedCollInfo {
ncclFunc_t func;
uint64_t sn;
size_t msgSizeBytes;
uint64_t execTimeUsecs;
inspectorTimingSource_t timingSource;
double algoBwGbs;
double busBwGbs;
// Event trace information
struct inspectorEventTrkCollInfo collEvtTrk;
};
// Size of the commHashStr buffer in inspectorCommInfo.
// NOTE(review): presumably 16 hex digits plus the terminating NUL — confirm.
enum {
NCCL_COMM_HASH_LENGTH = 17
};
// Per-communicator state; the plugin hands a pointer to this out as its
// opaque context.
struct inspectorCommInfo {
// Intrusive link to another communicator entry.
struct inspectorCommInfo* next;
const char* commName;
uint64_t commHash;
char commHashStr[NCCL_COMM_HASH_LENGTH];
int rank;
int nranks;
int nnodes;
// Set to true (under `guard`) after completedCollInfo has been refreshed.
bool dump;
struct inspectorCompletedCollInfo completedCollInfo;
// Protects the mutable fields of this structure.
pthread_rwlock_t guard;
};
// Per-channel kernel event state; instances live inside
// inspectorCollInfo::kernelCh and are handed out as event handles.
struct inspectorKernelChInfo {
// Must remain the first member: the plugin reads `*(uint64_t*)handle` to tell
// event kinds apart.
uint64_t type;
int refCount; /*unused*/
// Owning collective.
struct inspectorCollInfo *collInfo;
uint8_t channelId;
uint64_t tsStartUsec;
uint64_t tsCompletedUsec;
uint64_t startGpuClk;
uint64_t stopGpuClk;
};
// Heap-allocated, reference-counted state for one in-flight collective.
struct inspectorCollInfo {
// Must remain the first member: the plugin reads `*(uint64_t*)handle` to tell
// event kinds apart.
uint64_t type;
// Managed by inspectorPluginCollInfoRef()/DeRef(); the object is freed when
// it reaches zero.
int refCount;
struct inspectorCommInfo *commInfo;
const char* func;
uint64_t sn;
size_t msgSizeBytes;
uint64_t tsStartUsec;
uint64_t tsCompletedUsec;
uint32_t nChannels;
uint32_t nKernelChStarted;
uint32_t nKernelChCompleted;
pthread_rwlock_t guard;
// Indexed by channel id; sized by MAX_CHANNELS.
struct inspectorKernelChInfo kernelCh[MAX_CHANNELS];
struct inspectorEventTrkCollInfo collEvtTrk;
};
extern ncclDebugLogger_t logFn;
#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
// Return the size in bytes of one element of `type`, or -1 when the type is
// not one of the recognised data types.
inline int ncclTypeSize(ncclDataType_t type) {
  int bytes = -1;
  switch (type) {
    case ncclInt8:
    case ncclUint8:
    case ncclFloat8e4m3:
    case ncclFloat8e5m2:
      bytes = 1;
      break;
    case ncclFloat16:
    case ncclBfloat16:
      bytes = 2;
      break;
    case ncclInt32:
    case ncclUint32:
    case ncclFloat32:
      bytes = 4;
      break;
    case ncclInt64:
    case ncclUint64:
    case ncclFloat64:
      bytes = 8;
      break;
    default:
      break;
  }
  return bytes;
}
const char* inspectorErrorString(inspectorResult_t result);
inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorGlobalInit(int rank);
inspectorResult_t inspectorGlobalFinalize();
uint64_t inspectorGetTime();
inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
const char* commName, uint64_t commHash,
int nNodes, int nranks, int rank);
inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo);
void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl,
struct inspectorCollInfo *collInfo);
ncclDataType_t inspectorStringToDatatype(const char* str);
void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
struct inspectorCompletedCollInfo *completedColl,
ncclFunc_t collType);
@@ -0,0 +1,493 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "profiler.h"
#include "inspector.h"
#define __hidden __attribute__ ((visibility("hidden")))
static int gInitialized;
static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Description:
 *   Stamps one slot of an event trace array with the current time and the
 *   next per-collective event sequence number.
 *
 * Thread Safety:
 *   Not thread-safe. The caller must already hold whatever lock protects
 *   collInfo.
 *
 * Input:
 *   struct inspectorEventTraceInfo* evtTrace - event trace array
 *   int eventIndex - slot to fill (caller guarantees it is in range)
 *   struct inspectorCollInfo* collInfo - owning collective (must not be NULL)
 *
 * Output:
 *   evtTrace[eventIndex] holds the timestamp and sequence number;
 *   collInfo->collEvtTrk.sn has been advanced by one.
 *
 * Return:
 *   uint64_t - the sequence number assigned to this event
 */
static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace,
                                          int eventIndex,
                                          struct inspectorCollInfo* collInfo) {
  struct inspectorEventTraceInfo* slot = &evtTrace[eventIndex];
  slot->ts = inspectorGetTime();
  collInfo->collEvtTrk.sn += 1;  // advance the collective's event counter
  slot->sn = collInfo->collEvtTrk.sn;
  return slot->sn;
}
/*
 * Description:
 *
 *   Initializes the NCCL Inspector plugin for one communicator. The first
 *   successful call also performs the process-wide initialization; every
 *   call takes one reference on that global state.
 *
 * Thread Safety:
 *   The global reference count is protected by gLock.
 *
 * Input:
 *   void** context - pointer to plugin context (output).
 *   uint64_t commHash - communicator hash.
 *   int* eActivationMask - pointer to activation mask (output).
 *   const char* commName - communicator name (may be NULL).
 *   int nNodes - number of nodes.
 *   int nranks - number of ranks.
 *   int rank - rank.
 *   ncclDebugLogger_t logfn - logger function pointer.
 *
 * Output:
 *   On success *context is the communicator's inspectorCommInfo and
 *   *eActivationMask enables collective and kernel-channel events.
 *   On failure *context is left as set by inspectorAddComm (nullptr before
 *   the call) and the global reference taken by this call is released again.
 *
 * Return:
 *   ncclSuccess, or ncclInternalError when global or per-communicator
 *   initialization fails.
 *
 */
__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash,
                                          int* eActivationMask,
                                          const char* commName,
                                          int nNodes, int nranks, int rank,
                                          ncclDebugLogger_t logfn) {
  inspectorResult_t res = inspectorSuccess;
  *context = nullptr;
  logFn = logfn;

  pthread_mutex_lock(&gLock);
  if (++gInitialized == 1) {
    res = inspectorGlobalInit(rank);
    if (res != inspectorSuccess) {
      WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res,
           inspectorErrorString(res));
      gInitialized = 0;
      pthread_mutex_unlock(&gLock);
      return ncclInternalError;
    }
  }
  pthread_mutex_unlock(&gLock);

  INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context,
                                commName, commHash,
                                nNodes, nranks, rank), res, fail);
  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d",
       commName ? commName : "", commHash, nranks, rank);
  return ncclSuccess;

fail:
  // Undo the reference taken above so a failed init does not keep the global
  // state alive; mirrors what inspectorPluginFinalize does.
  pthread_mutex_lock(&gLock);
  if (--gInitialized == 0) {
    inspectorGlobalFinalize();
  }
  pthread_mutex_unlock(&gLock);
  return ncclInternalError;
}
/*
 * Description:
 *
 *   Tears down the plugin state of one communicator and releases its
 *   reference on the global inspector state; the last reference triggers
 *   the global finalization.
 *
 * Thread Safety:
 *   The global reference count is protected by gLock.
 *
 * Input:
 *   void* context - plugin context returned by inspectorPluginInit.
 *
 * Output:
 *   The communicator is removed from the inspector.
 *
 * Return:
 *   ncclResult_t - always ncclSuccess.
 *
 */
__hidden ncclResult_t inspectorPluginFinalize(void* context) {
  struct inspectorCommInfo* commInfo = (struct inspectorCommInfo*)context;
  inspectorDelComm(commInfo);

  pthread_mutex_lock(&gLock);
  gInitialized -= 1;
  if (gInitialized == 0) {
    // Last communicator gone: release the process-wide state.
    inspectorGlobalFinalize();
  }
  pthread_mutex_unlock(&gLock);

  return ncclSuccess;
}
// Take one reference on collInfo. Performs no locking; the caller must hold
// collInfo->guard or otherwise own the object exclusively.
inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) {
  ++collInfo->refCount;
  return inspectorSuccess;
}
// Take one reference on collInfo while holding its write lock.
inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) {
  pthread_rwlock_t *guard = &collInfo->guard;
  inspectorLockWr(guard);
  inspectorPluginCollInfoRef(collInfo);
  inspectorUnlockRWLock(guard);
  return inspectorSuccess;
}
// Drop one reference on collInfo without taking any lock.
// Returns inspectorReturn when this was the last reference: the guard has
// been destroyed and the object zeroed and freed, so the caller must not
// touch collInfo (or unlock its guard) afterwards. Otherwise returns
// inspectorSuccess.
// NOTE(review): the callers in this file invoke this while holding
// collInfo->guard, so the lock is destroyed while still held — confirm this
// is acceptable for the rwlock implementation in use.
inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) {
  collInfo->refCount -= 1;
  if (collInfo->refCount != 0) {
    return inspectorSuccess;
  }
  inspectorLockDestroy(&collInfo->guard);
  memset(collInfo, 0, sizeof(struct inspectorCollInfo));
  free(collInfo);
  return inspectorReturn;
}
// Drop one reference on collInfo while holding its write lock.
// Returns inspectorReturn when the object was freed by this call, in which
// case collInfo must not be used again; otherwise inspectorSuccess.
inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) {
  inspectorLockWr(&collInfo->guard);
  inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo);
  if (res != inspectorReturn) {
    // Only unlock while the object is still alive: on inspectorReturn the
    // guard was destroyed and collInfo freed, so touching it would be a
    // use-after-free.
    inspectorUnlockRWLock(&collInfo->guard);
  }
  return res;
}
/*
* Description:
* Initializes a new inspectorCollInfo structure for a collective
* event.
*
* Thread Safety:
* Not thread-safe (allocates and initializes a new collective info
* structure).
*
* Input:
*
* struct inspectorCollInfo **collInfo - pointer to output
* collective info struct.
* ncclProfilerEventDescr_t *eDescr - event descriptor.
*
* Output:
* collInfo is set to the new collective info struct.
*
* Return:
* None.
*/
static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo,
ncclProfilerEventDescr_t *eDescr,
struct inspectorCommInfo *commInfo) {
struct inspectorCollInfo *collInfoPtr
= (struct inspectorCollInfo*)calloc(1, sizeof(struct inspectorCollInfo));
if (collInfoPtr == nullptr) {
WARN("Inspector: Failed to allocate memory for collective info structure");
*collInfo = nullptr;
return;
}
collInfoPtr->type = ncclProfileColl;
collInfoPtr->refCount = 0;
inspectorPluginCollInfoRef(collInfoPtr); //self ref; no locks needed
collInfoPtr->func = eDescr->coll.func;
collInfoPtr->sn = eDescr->coll.seqNumber;
collInfoPtr->nChannels = eDescr->coll.nChannels;
if (collInfoPtr->nChannels > 0) {
inspectorPluginCollInfoRef(collInfoPtr); //extra ref for kernel completion
}
collInfoPtr->tsStartUsec = inspectorGetTime();
collInfoPtr->msgSizeBytes =
ncclTypeSize(inspectorStringToDatatype(eDescr->coll.datatype)) * eDescr->coll.count;
collInfoPtr->commInfo = commInfo;
collInfoPtr->collEvtTrk.sn = 0;
collInfoPtr->collEvtTrk.nChannels = collInfoPtr->nChannels;
inspectorRecordEventTrace(collInfoPtr->collEvtTrk.evntTrace,
NCCL_INSP_EVT_TRK_COLL_START, collInfoPtr);
inspectorLockInit(&collInfoPtr->guard);
*collInfo = collInfoPtr;
}
/*
* Description:
*
* Initializes a new inspectorKernelChInfo structure for a kernel
* channel event.
*
* Thread Safety:
* Not thread-safe (initializes kernel channel info within a
* collective info structure).
*
* Input:
* struct inspectorKernelChInfo **kernelChInfo - pointer to output
* kernel channel info struct.
* ncclProfilerEventDescr_t *eDescr - event descriptor.
*
* Output:
*
* kernelChInfo is set to the new kernel channel info struct.
*
* Return:
* None.
*/
static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo,
ncclProfilerEventDescr_t *eDescr) {
if (eDescr->parentObj) {
uint64_t parentType=*(uint64_t*)eDescr->parentObj;
if (parentType == ncclProfileColl) {
struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj;
if (collInfo && collInfo->type == ncclProfileColl) {
inspectorLockWr(&collInfo->guard);
struct inspectorEventTraceInfo *krnlEvtTrk =
collInfo->collEvtTrk.kernelCh[eDescr->kernelCh.channelId].evntTrace;
inspectorRecordEventTrace(krnlEvtTrk,
NCCL_INSP_EVT_TRK_KERNEL_START,
collInfo);
struct inspectorKernelChInfo *kernelChInfoPtr
= &collInfo->kernelCh[eDescr->kernelCh.channelId];
kernelChInfoPtr->type = ncclProfileKernelCh;
kernelChInfoPtr->channelId = eDescr->kernelCh.channelId;
kernelChInfoPtr->startGpuClk = eDescr->kernelCh.pTimer;
if (kernelChInfoPtr->stopGpuClk == 0) {
inspectorPluginCollInfoRef(collInfo); //Pairs with Record Kernel Stop event
}
kernelChInfoPtr->tsStartUsec = inspectorGetTime();
if (collInfo->nKernelChStarted == 0) {
collInfo->tsStartUsec = kernelChInfoPtr->tsStartUsec;
}
collInfo->nKernelChStarted += 1;
inspectorPluginCollInfoRef(collInfo); //Pairs with Stop Kernel Event
kernelChInfoPtr->collInfo = collInfo;
*kernelChInfo = kernelChInfoPtr;
inspectorUnlockRWLock(&collInfo->guard);
}
}
}
}
/*
 * Description:
 *
 *   Starts a profiling event for the NCCL Inspector plugin.
 *
 * Thread Safety:
 *   Collective events allocate a fresh object; kernel channel events update
 *   the parent collective under its write lock.
 *
 * Input:
 *   void* context - plugin context.
 *   void** eHandle - pointer to event handle output.
 *   ncclProfilerEventDescr_t* eDescr - event descriptor.
 *
 * Output:
 *   *eHandle is the new event object, or nullptr when no event is tracked
 *   (missing context/descriptor, unsupported event type, or failure to
 *   create the event).
 *
 * Return:
 *   ncclResult_t - always ncclSuccess.
 *
 */
__hidden ncclResult_t inspectorPluginStartEvent(void* context,
                                                void** eHandle,
                                                ncclProfilerEventDescr_t* eDescr) {
  if (eHandle == nullptr) {
    return ncclSuccess;
  }
  // Initialise the output before any early return so the caller never sees
  // an indeterminate handle.
  *eHandle = nullptr;
  if (context == nullptr || eDescr == nullptr) {
    INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__);
    return ncclSuccess;
  }
  if (eDescr->type == ncclProfileColl) {
    struct inspectorCollInfo *collEvent = nullptr;
    struct inspectorCommInfo *commInfoCtx = (struct inspectorCommInfo*)context;
    inspectorPluginCollInfoInit(&collEvent, eDescr, commInfoCtx);
    *eHandle = collEvent;
  } else if (eDescr->type == ncclProfileKernelCh) {
    struct inspectorKernelChInfo *kernelChEvent = nullptr;
    inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr);
    *eHandle = kernelChEvent;
  }
  return ncclSuccess;
}
/*
 * Description:
 *
 *   Stops a profiling event for the NCCL Inspector plugin.
 *
 * Thread Safety:
 *
 *   Updates the collective under its write lock and the communicator under
 *   its own write lock.
 *
 * Input:
 *
 *   void *eHandle - event handle returned by inspectorPluginStartEvent.
 *
 * Output:
 *
 *   The stop trace point is recorded and the reference taken at start is
 *   dropped. When the last kernel channel of a collective stops, the
 *   collective's performance data is computed and published into its
 *   communicator (completedCollInfo, dump = true).
 *
 * Return:
 *   ncclResult_t - always ncclSuccess.
 *
 */
__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) {
  if (eHandle == nullptr) {
    INFO(NCCL_INIT,
         "Profiler/Plugin: Event Handle NULL for stop event %s", __func__);
    return ncclSuccess;
  }
  // Every event object starts with a uint64_t type tag.
  uint64_t type = *(uint64_t *)eHandle;
  inspectorResult_t res = inspectorSuccess;
  if (type == ncclProfileColl) {
    struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle;
    // Record collective stop event
    inspectorLockWr(&collInfo->guard);
    inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
                              NCCL_INSP_EVT_TRK_COLL_STOP,
                              collInfo);
    res = inspectorPluginCollInfoDeRef(collInfo);
    if (res == inspectorReturn) {
      // collInfo (and its guard) no longer exist; do not unlock.
      return ncclSuccess;
    }
    inspectorUnlockRWLock(&collInfo->guard);
    return ncclSuccess;
  } else if (type == ncclProfileKernelCh) {
    struct inspectorKernelChInfo *kernelChInfo
      = (struct inspectorKernelChInfo *)eHandle;
    struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
    if (collInfo && collInfo->type == ncclProfileColl) {
      inspectorLockWr(&collInfo->guard);
      struct inspectorEventTraceInfo *krnlEvtTrk =
        collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
      inspectorRecordEventTrace(krnlEvtTrk,
                                NCCL_INSP_EVT_TRK_KERNEL_STOP,
                                collInfo);
      kernelChInfo->tsCompletedUsec = inspectorGetTime();
      collInfo->nKernelChCompleted += 1;
      // Drops the reference taken for this channel at kernel start.
      res = inspectorPluginCollInfoDeRef(collInfo);
      if (res == inspectorReturn) {
        WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh");
        return ncclSuccess;
      }
      if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted)
          && (collInfo->nKernelChCompleted == collInfo->nChannels)) {
        // Last channel finished: snapshot the collective before releasing the
        // kernel-completion reference, which may free collInfo.
        struct inspectorCompletedCollInfo completedColl;
        struct inspectorCommInfo *commInfo = collInfo->commInfo;
        collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec;
        inspectorUpdateCollPerf(&completedColl, collInfo);
        res = inspectorPluginCollInfoDeRef(collInfo);
        if (res != inspectorReturn) {
          inspectorUnlockRWLock(&collInfo->guard);
        }
        if (commInfo != nullptr) {
          inspectorLockWr(&commInfo->guard);
          inspectorComputeCollBw(commInfo,
                                 &completedColl,
                                 completedColl.func);
          memcpy(&commInfo->completedCollInfo,
                 &completedColl,
                 sizeof(struct inspectorCompletedCollInfo));
          commInfo->dump = true;
          inspectorUnlockRWLock(&commInfo->guard);
        }
        return ncclSuccess;
      }
      inspectorUnlockRWLock(&collInfo->guard);
    }
    return ncclSuccess;
  }
  return ncclSuccess;
}
/*
 * Description:
 *
 *   Records a state transition of a profiling event. Only the
 *   ncclProfilerKernelChStop state of kernel channel events is handled;
 *   everything else is ignored.
 *
 * Thread Safety:
 *
 *   Updates the owning collective under its write lock.
 *
 * Input:
 *   void* eHandle - event handle.
 *   ncclProfilerEventState_t eState - event state.
 *   ncclProfilerEventStateArgs_t* eStateArgs - event state arguments.
 *
 * Output:
 *   The channel's stop GPU clock is stored, a RECORD trace point is added,
 *   and, if the channel had a non-zero start clock, one reference on the
 *   collective is dropped.
 *
 * Return:
 *   ncclResult_t - always ncclSuccess.
 *
 */
__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle,
                                                      ncclProfilerEventState_t eState,
                                                      ncclProfilerEventStateArgs_t* eStateArgs) {
  if (eHandle == nullptr || eStateArgs == nullptr) {
    return ncclSuccess;
  }

  // Every event object starts with a uint64_t type tag.
  const uint64_t handleType = *(uint64_t *)eHandle;
  if (handleType != ncclProfileKernelCh || eState != ncclProfilerKernelChStop) {
    return ncclSuccess;
  }

  struct inspectorKernelChInfo *chInfo = (struct inspectorKernelChInfo *)eHandle;
  struct inspectorCollInfo *owner = chInfo->collInfo;
  if (owner == nullptr || owner->type != ncclProfileColl) {
    return ncclSuccess;
  }

  inspectorLockWr(&owner->guard);
  inspectorRecordEventTrace(owner->collEvtTrk.kernelCh[chInfo->channelId].evntTrace,
                            NCCL_INSP_EVT_TRK_KERNEL_RECORD,
                            owner);
  chInfo->stopGpuClk = eStateArgs->kernelCh.pTimer;
  if (chInfo->startGpuClk != 0) {
    const inspectorResult_t res = inspectorPluginCollInfoDeRef(owner);
    if (res == inspectorReturn) {
      // The collective was freed together with its guard; nothing to unlock.
      WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState");
      return ncclSuccess;
    }
  }
  inspectorUnlockRWLock(&owner->guard);
  return ncclSuccess;
}
// Plugin descriptor: the plugin name followed by the init, start-event,
// stop-event, record-event-state and finalize callbacks defined above.
// NOTE(review): the initializer is positional, so the order must match the
// member order of ncclProfiler_t in profiler.h (not visible here) — confirm.
ncclProfiler_t ncclProfiler_v5 = {
"Inspector",
inspectorPluginInit,
inspectorPluginStartEvent,
inspectorPluginStopEvent,
inspectorPluginRecordEventState,
inspectorPluginFinalize,
};
@@ -0,0 +1,496 @@
#include "json.h"
#include <assert.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Map a jsonResult_t code to a static, human-readable name.
// Codes that are not listed yield "unknown json error".
const char* jsonErrorString(jsonResult_t res) {
  static const struct {
    jsonResult_t code;
    const char* text;
  } kNames[] = {
    {jsonSuccess, "jsonSuccess"},
    {jsonFileError, "jsonFileError"},
    {jsonUnknownStateError, "jsonUnknownStateError"},
    {jsonEmptyStateError, "jsonEmptyStateError"},
    {jsonExpectedNonNoneStateError, "jsonExpectedNonNoneStateError"},
    {jsonMemoryError, "jsonMemoryError"},
    {jsonStringOverflowError, "jsonStringOverflowError"},
    {jsonStringBadChar, "jsonStringBadChar"},
    {jsonLockError, "jsonLockError"},
  };
  for (size_t i = 0; i < sizeof(kNames) / sizeof(kNames[0]); ++i) {
    if (kNames[i].code == res) {
      return kNames[i].text;
    }
  }
  return "unknown json error";
}
// State of one JSON output file: the destination stream plus a stack of
// writer states that tracks which container (object/list/key) is currently
// being written.
typedef struct jsonFileOutput {
jsonState_t* states;
size_t state_cap; // Allocated stack capacity
size_t state_n; // # of items in the stack.
FILE* fp;
// Taken and released by jsonLockOutput() / jsonUnlockOutput().
pthread_mutex_t mutex;
} jsonFileOutput;
// Create a JSON writer for `outfile` (opened with mode "w").
// On success *jfo receives the new writer and jsonSuccess is returned.
// On any failure *jfo is set to NULL, everything acquired so far is released,
// and jsonMemoryError / jsonLockError / jsonFileError is returned.
jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) {
  *jfo = NULL;

  jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(jsonFileOutput));
  if (new_jfo == NULL) {
    return jsonMemoryError;
  }
  if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) {
    free(new_jfo);
    return jsonLockError;
  }
  new_jfo->states = NULL;
  new_jfo->state_cap = 0;
  new_jfo->state_n = 0;
  new_jfo->fp = fopen(outfile, "w");
  if (new_jfo->fp == NULL) {
    // The mutex was already initialised; destroy it before freeing its storage.
    pthread_mutex_destroy(&new_jfo->mutex);
    free(new_jfo);
    return jsonFileError;
  }
  *jfo = new_jfo;
  return jsonSuccess;
}
// Write a single newline character to the output file. Always returns
// jsonSuccess.
jsonResult_t jsonNewline(jsonFileOutput* jfo) {
  fputc('\n', jfo->fp);
  return jsonSuccess;
}
// Flush buffered output to the underlying file. The fflush() result is not
// inspected; always returns jsonSuccess.
jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) {
  FILE* stream = jfo->fp;
  (void)fflush(stream);
  return jsonSuccess;
}
// Acquire the writer's mutex. Returns jsonLockError if the lock call fails.
jsonResult_t jsonLockOutput(jsonFileOutput* jfo) {
  const int rc = pthread_mutex_lock(&jfo->mutex);
  return (rc == 0) ? jsonSuccess : jsonLockError;
}
// Release the writer's mutex. Returns jsonLockError if the unlock call fails.
jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) {
  const int rc = pthread_mutex_unlock(&jfo->mutex);
  return (rc == 0) ? jsonSuccess : jsonLockError;
}
// Destroy a JSON writer: destroy its mutex, free the state stack, close the
// file and free the writer itself. All resources are released even when a
// step fails; the first failure is reported (jsonLockError for the mutex,
// jsonFileError for the close). A NULL writer is accepted and ignored.
// Note: no check is made that the writer is in a complete/valid JSON state.
jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) {
  if (jfo == NULL) {
    return jsonSuccess;
  }

  jsonResult_t res = jsonSuccess;
  if (pthread_mutex_destroy(&jfo->mutex) != 0) {
    res = jsonLockError;
  }

  free(jfo->states);  // free(NULL) is a no-op
  jfo->states = NULL;
  jfo->state_cap = 0;
  jfo->state_n = 0;

  if (jfo->fp) {
    // fclose() flushes buffered data, so a failure here can mean lost output.
    if (fclose(jfo->fp) != 0 && res == jsonSuccess) {
      res = jsonFileError;
    }
    jfo->fp = 0;
  }
  free(jfo);
  return res;
}
// Copy one multi-byte (2-4 byte) UTF-8 sequence from `in` to `out`.
// Returns the number of bytes copied, or 0 when the lead byte does not start
// a multi-byte sequence, a continuation byte is malformed, or `out_lim` is
// too small. Continuation bytes are examined in order and checking stops at
// the first bad one, so a NUL terminator inside `in` is never read past.
static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) {
  const unsigned char lead = in[0];
  int seq_len;
  if ((lead & 0xE0) == 0xC0) {
    seq_len = 2;
  } else if ((lead & 0xF0) == 0xE0) {
    seq_len = 3;
  } else if ((lead & 0xF8) == 0xF0) {
    seq_len = 4;
  } else {
    return 0;  // not a valid multi-byte start byte
  }

  for (int k = 1; k < seq_len; ++k) {
    if ((in[k] & 0xC0) != 0x80) {
      return 0;  // expected a 10xxxxxx continuation byte
    }
  }
  if (out_lim < seq_len) {
    return 0;
  }

  memcpy(out, in, (size_t)seq_len);
  return seq_len;
}
// Sanitize/escape the NUL-terminated string 'in' into 'out', which has room
// for 'lim' bytes including the terminating NUL.
//
// '"', '\\' and '/' are prefixed with a backslash; tab and newline become
// "\t" and "\n". Other control characters (<= 0x1F) are rejected. Bytes above
// 0x7F must form a valid 2-4 byte UTF-8 sequence, which is copied verbatim.
//
// Returns jsonSuccess when all of 'in' was copied, jsonStringOverflowError
// when 'out' is too small, and jsonStringBadChar on an unsupported byte.
// Whenever lim > 0, 'out' is left NUL-terminated (holding the part copied so
// far on error).
static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) {
  if (lim <= 0) {
    // No room even for the terminator; do not touch 'out'.
    return jsonStringOverflowError;
  }
  int c = 0;
  while (*in) {
    switch (*in) {
    case '"':
    case '\\':
    case '/':
    case '\t':
    case '\n':
      // Two output bytes plus the terminator must fit: c + 2 <= lim - 1.
      if (c + 2 >= lim) {
        out[c] = 0;
        return jsonStringOverflowError;
      }
      out[c++] = '\\';
      if (*in == '\n') {
        out[c++] = 'n';
      } else if (*in == '\t') {
        out[c++] = 't';
      } else {
        out[c++] = *in;
      }
      ++in;
      break;
    default:
      if (*in <= 0x1F) {
        out[c] = 0;
        return jsonStringBadChar;
      } else if (*in <= 0x7F) {
        // One output byte plus the terminator must fit.
        if (c + 1 >= lim) {
          out[c] = 0;
          return jsonStringOverflowError;
        }
        out[c++] = *in;
        ++in;
      } else {
        // Validate into a scratch buffer first so that "malformed UTF-8" and
        // "does not fit" can be reported as different errors.
        unsigned char seq[4];
        const int utf8len = utf8copy(seq, (int)sizeof(seq), in);
        if (utf8len == 0) {
          out[c] = 0;
          return jsonStringBadChar;
        }
        if (c + utf8len >= lim) {
          out[c] = 0;
          return jsonStringOverflowError;
        }
        for (int i = 0; i < utf8len; ++i) {
          out[c + i] = seq[i];
        }
        c += utf8len;
        in += utf8len;
      }
      break;
    }
  }
  out[c] = 0;
  return jsonSuccess;
}
// Return the larger of two sizes (returns `a` when they are equal).
static size_t max(size_t a, size_t b) {
  return (a < b) ? b : a;
}
// Push state onto the state stack, growing the storage when needed (to at
// least 16 entries, otherwise doubling).
// Because JSON_NONE is a pseudo-state, it cannot be pushed
// (jsonExpectedNonNoneStateError). Returns jsonMemoryError when growing
// fails; in that case the existing stack is left intact and usable.
static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) {
  if (state == JSON_NONE) {
    return jsonExpectedNonNoneStateError;
  }
  if (jfo->state_cap <= (jfo->state_n + 1)) {
    const size_t new_cap = max((size_t)16, jfo->state_cap * 2);
    // Keep the old pointer until realloc succeeds: on failure realloc leaves
    // the original block allocated, and overwriting jfo->states would leak it.
    jsonState_t* grown = (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * new_cap);
    if (grown == NULL) {
      return jsonMemoryError;
    }
    jfo->states = grown;
    jfo->state_cap = new_cap;
  }
  jfo->states[jfo->state_n++] = state;
  return jsonSuccess;
}
// Peek at the state on top of the stack; an empty stack reads as JSON_NONE.
static jsonState_t jsonCurrState(const jsonFileOutput* jfo) {
  const size_t depth = jfo->state_n;
  return (depth == 0) ? JSON_NONE : jfo->states[depth - 1];
}
// Overwrite the state on top of the stack (equivalent to a pop followed by a
// push). JSON_NONE is rejected with jsonExpectedNonNoneStateError and an
// empty stack with jsonEmptyStateError.
static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) {
  if (state == JSON_NONE) {
    return jsonExpectedNonNoneStateError;
  }
  const size_t depth = jfo->state_n;
  if (depth == 0) {
    return jsonEmptyStateError;
  }
  jsonState_t* top = &jfo->states[depth - 1];
  *top = state;
  return jsonSuccess;
}
// Remove and return the state on top of the stack; returns JSON_NONE (and
// changes nothing) when the stack is empty.
static jsonState_t jsonPopState(jsonFileOutput* jfo) {
  if (jfo->state_n > 0) {
    jfo->state_n -= 1;
    return jfo->states[jfo->state_n];
  }
  return JSON_NONE;
}
// Emit a key and its ':' separator, sanitizing the key first.
// This is only acceptable if the top state is an object
// (jsonUnknownStateError otherwise). A ',' separator is emitted if this is
// not the first item of the object.
// Nothing is written and the writer state is left unchanged when the key
// cannot be sanitized; the sanitizer's error is returned in that case.
jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) {
  const jsonState_t top = jsonCurrState(jfo);
  if (top != JSON_OBJECT_EMPTY && top != JSON_OBJECT_SOME) {
    return jsonUnknownStateError;
  }

  // Sanitize before touching the state or the file so that a bad key cannot
  // leave a dangling ',' or a stale "object has items" marker behind.
  unsigned char tmp[2048];
  const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name);
  if (res != jsonSuccess) {
    return res;
  }

  if (top == JSON_OBJECT_EMPTY) {
    jsonReplaceState(jfo, JSON_OBJECT_SOME);
  } else {
    fprintf(jfo->fp, ",");
  }
  fprintf(jfo->fp, "\"%s\":", tmp);
  // Propagate allocation failures from growing the state stack.
  return jsonPushState(jfo, JSON_KEY);
}
// Shared prologue for every value writer.
// A value may appear after a key, at the top level, or inside a list. Inside
// a list that already has items a preceding ',' is emitted; after a key the
// JSON_KEY state is consumed. Any other state is an error.
static jsonResult_t jsonValHelper(jsonFileOutput* jfo) {
  const jsonState_t top = jsonCurrState(jfo);
  if (top == JSON_LIST_EMPTY) {
    jsonReplaceState(jfo, JSON_LIST_SOME);
  } else if (top == JSON_LIST_SOME) {
    fprintf(jfo->fp, ",");
  } else if (top == JSON_KEY) {
    jsonPopState(jfo);
  } else if (top != JSON_NONE) {
    return jsonUnknownStateError;
  }
  return jsonSuccess;
}
// Open an object: run the value prologue, write '{' and push JSON_OBJECT_EMPTY.
jsonResult_t jsonStartObject(jsonFileOutput* jfo) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fprintf(jfo->fp, "{");
    return jsonPushState(jfo, JSON_OBJECT_EMPTY);
  }
  return prep;
}
// Close an object: pop the top state and, if it was an object state, write '}'.
// The state is popped even when it turns out not to be an object, in which
// case jsonUnknownStateError is returned and nothing is written.
jsonResult_t jsonFinishObject(jsonFileOutput* jfo) {
  const jsonState_t closed = jsonPopState(jfo);
  if (closed != JSON_OBJECT_EMPTY && closed != JSON_OBJECT_SOME) {
    return jsonUnknownStateError;
  }
  fprintf(jfo->fp, "}");
  return jsonSuccess;
}
// Open a list: run the value prologue, write '[' and push JSON_LIST_EMPTY.
jsonResult_t jsonStartList(jsonFileOutput* jfo) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fprintf(jfo->fp, "[");
    return jsonPushState(jfo, JSON_LIST_EMPTY);
  }
  return prep;
}
// Close a list: pop the top state and, if it was a list state, write ']'.
// The state is popped even when it turns out not to be a list, in which case
// jsonUnknownStateError is returned and nothing is written.
jsonResult_t jsonFinishList(jsonFileOutput* jfo) {
  const jsonState_t closed = jsonPopState(jfo);
  if (closed != JSON_LIST_EMPTY && closed != JSON_LIST_SOME) {
    return jsonUnknownStateError;
  }
  fprintf(jfo->fp, "]");
  return jsonSuccess;
}
// Write the JSON literal null as a value.
jsonResult_t jsonNull(jsonFileOutput* jfo) {
  const jsonResult_t status = jsonValHelper(jfo);
  if (status == jsonSuccess) {
    fprintf(jfo->fp, "null");
  }
  return status;
}
// Write a (sanitized) string value.
// A NULL `str` is written as the JSON literal null.
// Returns the jsonValHelper / sanitizeJson / jsonNull error when one of them
// fails, jsonSuccess otherwise.
jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) {
  if (str == NULL) {
    // Propagate the result: jsonNull can fail when a value is not allowed in
    // the current state, and that must not be reported as success.
    return jsonNull(jfo);
  }
  const jsonResult_t res = jsonValHelper(jfo);
  if (res != jsonSuccess) {
    return res;
  }
  unsigned char tmp[2048];
  const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str);
  if (san_res != jsonSuccess) {
    return san_res;
  }
  fprintf(jfo->fp, "\"%s\"", tmp);
  return jsonSuccess;
}
// Write a bool as the quoted string "true" or "false" (it goes through
// jsonStr, so it is not emitted as a bare JSON boolean literal).
jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) {
  const char* text = val ? "true" : "false";
  return jsonStr(jfo, text);
}
// Write a signed int value in decimal.
jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) {
  const jsonResult_t status = jsonValHelper(jfo);
  if (status == jsonSuccess) {
    fprintf(jfo->fp, "%d", val);
  }
  return status;
}
// Write an unsigned 32-bit integer value in decimal.
jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) {
  const jsonResult_t status = jsonValHelper(jfo);
  if (status == jsonSuccess) {
    fprintf(jfo->fp, "%u", val);
  }
  return status;
}
// Write an unsigned 64-bit integer value in decimal.
jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) {
  const jsonResult_t res = jsonValHelper(jfo);
  if (res != jsonSuccess) {
    return res;
  }
  // uint64_t is not `unsigned long` on every platform, so "%lu" is not a
  // portable conversion for it. Cast to unsigned long long (at least 64 bits
  // wide) and print with "%llu".
  fprintf(jfo->fp, "%llu", (unsigned long long)val);
  return jsonSuccess;
}
// Write a size_t value in decimal.
jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) {
  const jsonResult_t status = jsonValHelper(jfo);
  if (status == jsonSuccess) {
    fprintf(jfo->fp, "%zu", val);
  }
  return status;
}
// Write a double value.
// Finite values are printed with "%lf" (six digits after the decimal point).
// JSON has no representation for non-finite numbers, so NaN is written as the
// string "nan" and +/-infinity as the strings "inf" / "-inf".
jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) {
  const jsonResult_t res = jsonValHelper(jfo);
  if (res != jsonSuccess) {
    return res;
  }
  // x - x is 0 for every finite x and NaN for NaN and +/-infinity, so this
  // detects non-finite values without needing <math.h>.
  const double zero_if_finite = val - val;
  if (zero_if_finite == zero_if_finite) {
    fprintf(jfo->fp, "%lf", val);
  } else if (val != val) {
    fprintf(jfo->fp, "\"nan\"");
  } else if (val > 0) {
    fprintf(jfo->fp, "\"inf\"");
  } else {
    fprintf(jfo->fp, "\"-inf\"");
  }
  return jsonSuccess;
}
#ifdef DO_JSON_TEST
// Self-test for the JSON writer.
// compile with
// gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test
// run with:
// ./json_test
// if something fails, it will print out the error
// if it all works, print out "output matches reference"

// Evaluate a json* call and abort the test with a message if it fails.
#define JSONCHECK(expr)                                           \
  do {                                                            \
    const jsonResult_t res = (expr);                              \
    if (res != jsonSuccess) {                                     \
      fprintf(stderr, "jsonError: %s\n", jsonErrorString(res));   \
      exit(1);                                                    \
    }                                                             \
  } while (0)

// Writes a small document to test.json, reads it back and compares it with
// the expected text. Returns 0 on a match and 1 otherwise.
int main() {
  // Must mirror exactly what the calls below write (including the '∨').
  const char refstr[] =
    "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ "
    "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}";
  jsonFileOutput* jfo;
  JSONCHECK(jsonInitFileOutput(&jfo, "test.json"));
  JSONCHECK(jsonStartObject(jfo));
  JSONCHECK(jsonKey(jfo, "number"));
  JSONCHECK(jsonInt(jfo, 123));
  JSONCHECK(jsonKey(jfo, "utfstring"));
  JSONCHECK(
    jsonStr(jfo, "∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),"));
  JSONCHECK(jsonKey(jfo, "list"));
  JSONCHECK(jsonStartList(jfo));
  JSONCHECK(jsonBool(jfo, true));
  JSONCHECK(jsonNull(jfo));
  JSONCHECK(jsonUint64(jfo, 9423812381231ULL));
  JSONCHECK(jsonSize_t(jfo, 3123111));
  JSONCHECK(jsonDouble(jfo, 0.69423413));
  JSONCHECK(jsonFinishList(jfo));
  JSONCHECK(jsonFinishObject(jfo));
  JSONCHECK(jsonFinalizeFileOutput(jfo));

  FILE* fp = fopen("test.json", "r");
  if (fp == NULL) {
    fprintf(stderr, "could not open test.json for reading\n");
    return 1;
  }
  // Compare only the characters of the reference text: the terminating NUL of
  // refstr is never written to the file. The buffer is zero-initialised and
  // one byte larger than what is read, so it is always a valid C string.
  const size_t reflen = sizeof(refstr) - 1;
  char buffer[sizeof(refstr)] = {0};
  const size_t nread = fread(buffer, sizeof(char), reflen, fp);
  fclose(fp);
  if (nread == reflen && memcmp(buffer, refstr, reflen) == 0) {
    printf("output matches reference\n");
  } else {
    printf("output %s\nreference %s\n", buffer, refstr);
    return 1;
  }
  return 0;
}
#endif
@@ -0,0 +1,83 @@
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
// Writer states kept on the state stack of a jsonFileOutput.
typedef enum {
  JSON_NONE = 0,          // A pseudo-state meaning that the document is empty
  JSON_KEY = 1,           // a key has been written and its value is pending
  JSON_OBJECT_EMPTY = 2,  // inside an object with no items yet
  JSON_OBJECT_SOME = 3,   // inside an object with at least one item
  JSON_LIST_EMPTY = 4,    // inside a list with no items yet
  JSON_LIST_SOME = 5,     // inside a list with at least one item
} jsonState_t;
// Result codes returned by the json* functions; jsonSuccess is zero.
typedef enum {
  jsonSuccess = 0,
  jsonFileError = 1,
  jsonUnknownStateError = 2,          // operation not allowed in the current writer state
  jsonEmptyStateError = 3,            // the state stack was empty
  jsonExpectedNonNoneStateError = 4,  // JSON_NONE was supplied where a real state is required
  jsonStringOverflowError = 5,
  jsonStringBadChar = 6,
  jsonMemoryError = 7,                // growing the state stack failed
  jsonLockError = 8,
} jsonResult_t;
// Return a printable string for a result code.
const char *jsonErrorString(jsonResult_t res);
// Writer handle. Only forward-declared here, so callers treat it as opaque.
typedef struct jsonFileOutput jsonFileOutput;
jsonResult_t jsonLockOutput(jsonFileOutput *jfo);
jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo);
// Create a writer for `outfile`; the new handle is returned through *jfo.
jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo,
const char *outfile);
jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo);
jsonResult_t jsonNewline(jsonFileOutput *jfo);
jsonResult_t jsonFlushOutput(jsonFileOutput *jfo);
// Emit a key and separator. Sanitize the key.
// This is only acceptable if the top state is an object
// Emit a ',' separator if we aren't the first item.
jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name);
// Start an object
jsonResult_t jsonStartObject(jsonFileOutput *jfo);
// Close an object
jsonResult_t jsonFinishObject(jsonFileOutput *jfo);
// Start a list
jsonResult_t jsonStartList(jsonFileOutput *jfo);
// Close a list
jsonResult_t jsonFinishList(jsonFileOutput *jfo);
// Emit a null value
jsonResult_t jsonNull(jsonFileOutput *jfo);
// Write a (sanitized) string
jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str);
// Write a bool as "true" or "false" strings.
jsonResult_t jsonBool(jsonFileOutput *jfo, bool val);
// Write a signed int value
jsonResult_t jsonInt(jsonFileOutput *jfo, const int val);
// Write an unsigned 32-bit integer value
jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val);
// Write an unsigned 64-bit integer value
jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val);
// Write a size_t value
jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val);
// Write a double value
jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val);
@@ -0,0 +1,73 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */
/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */
/* Data types. Alias names are defined in terms of the canonical name they duplicate. */
typedef enum {
  ncclInt8 = 0,
  ncclChar = ncclInt8,
  ncclUint8 = 1,
  ncclInt32 = 2,
  ncclInt = ncclInt32,
  ncclUint32 = 3,
  ncclInt64 = 4,
  ncclUint64 = 5,
  ncclFloat16 = 6,
  ncclHalf = ncclFloat16,
  ncclFloat32 = 7,
  ncclFloat = ncclFloat32,
  ncclFloat64 = 8,
  ncclDouble = ncclFloat64,
  ncclBfloat16 = 9,
  ncclFloat8e4m3 = 10,
  ncclFloat8e5m2 = 11,
  ncclNumTypes = 12
} ncclDataType_t;
/* Debug log levels, numbered consecutively from 0. */
typedef enum {
  NCCL_LOG_NONE = 0,
  NCCL_LOG_VERSION, /* 1 */
  NCCL_LOG_WARN,    /* 2 */
  NCCL_LOG_INFO,    /* 3 */
  NCCL_LOG_ABORT,   /* 4 */
  NCCL_LOG_TRACE    /* 5 */
} ncclDebugLogLevel;
/* Result codes; ncclSuccess is zero and ncclNumResults counts the codes before it. */
typedef enum {
  ncclSuccess = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError = 2,
  ncclInternalError = 3,
  ncclInvalidArgument = 4,
  ncclInvalidUsage = 5,
  ncclRemoteError = 6,
  ncclInProgress = 7,
  ncclNumResults = 8
} ncclResult_t;
/* Debug log subsystem flags: one bit per subsystem, NCCL_ALL has every bit set. */
typedef enum {
  NCCL_INIT = 1 << 0,
  NCCL_COLL = 1 << 1,
  NCCL_P2P = 1 << 2,
  NCCL_SHM = 1 << 3,
  NCCL_NET = 1 << 4,
  NCCL_GRAPH = 1 << 5,
  NCCL_TUNING = 1 << 6,
  NCCL_ENV = 1 << 7,
  NCCL_ALLOC = 1 << 8,
  NCCL_CALL = 1 << 9,
  NCCL_PROXY = 1 << 10,
  NCCL_NVLS = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG = 1 << 13,
  NCCL_PROFILE = 1 << 14,
  NCCL_RAS = 1 << 15,
  NCCL_INSPECTOR = 1 << 20, // big number to avoid short-term conflicts
  NCCL_ALL = ~0
} ncclDebugLogSubSys;
/* Pointer to a variadic logging function: it receives a log level, a flags word,
 * the source file and line of the call site, and a format string followed by
 * that format's arguments. */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#endif
@@ -0,0 +1,85 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_H_
#define PROFILER_H_
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
// Event type flags, one bit each, so several types can be combined in a mask.
enum {
  ncclProfileGroup = 0x001,        // group event type
  ncclProfileColl = 0x002,         // host collective call event type
  ncclProfileP2p = 0x004,          // host point-to-point call event type
  ncclProfileProxyOp = 0x008,      // proxy operation event type
  ncclProfileProxyStep = 0x010,    // proxy step event type
  ncclProfileProxyCtrl = 0x020,    // proxy control event type
  ncclProfileKernelCh = 0x040,     // kernel channel event type
  ncclProfileNetPlugin = 0x080,    // network plugin-defined, events
  ncclProfileGroupApi = 0x100,     // Group API events
  ncclProfileCollApi = 0x200,      // Collective API events
  ncclProfileP2pApi = 0x400,       // Point-to-Point API events
  ncclProfileKernelLaunch = 0x800, // Kernel launch events
};
// Event states shared by every profiler interface version.
// Listed in numeric order; the values are fixed and must not be renumbered.
typedef enum {
  /* Proxy operation states */
  ncclProfilerProxyOpSendPosted = 0,        // deprecated in v4
  ncclProfilerProxyOpSendRemFifoWait = 1,   // deprecated in v4
  ncclProfilerProxyOpSendTransmitted = 2,   // deprecated in v4
  ncclProfilerProxyOpSendDone = 3,          // deprecated in v4
  ncclProfilerProxyOpRecvPosted = 4,        // deprecated in v4
  ncclProfilerProxyOpRecvReceived = 5,      // deprecated in v4
  ncclProfilerProxyOpRecvTransmitted = 6,   // deprecated in v4
  ncclProfilerProxyOpRecvDone = 7,          // deprecated in v4

  /* Legacy proxy profiler states */
  ncclProfilerProxyStepSendGPUWait = 8,
  ncclProfilerProxyStepSendWait = 9,
  ncclProfilerProxyStepRecvWait = 10,
  ncclProfilerProxyStepRecvFlushWait = 11,
  ncclProfilerProxyStepRecvGPUWait = 12,

  /* Legacy proxy control states */
  ncclProfilerProxyCtrlIdle = 13,
  ncclProfilerProxyCtrlActive = 14,
  ncclProfilerProxyCtrlSleep = 15,
  ncclProfilerProxyCtrlWakeup = 16,
  ncclProfilerProxyCtrlAppend = 17,
  ncclProfilerProxyCtrlAppendEnd = 18,

  /* _v4-suffixed proxy states */
  ncclProfilerProxyOpInProgress_v4 = 19,
  ncclProfilerProxyStepSendPeerWait_v4 = 20,

  /* Network defined events states */
  ncclProfilerNetPluginUpdate = 21,

  /* Kernel event states */
  ncclProfilerKernelChStop = 22,

  /* Group API States */
  ncclProfilerEndGroupApiStart = 23,
  ncclProfilerBeginGroupApiEnd = 24
} ncclProfilerEventState_t;
// Every interface version uses the same event-state enum; the versioned names
// below are plain aliases of ncclProfilerEventState_t.
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
#include "profiler_v5.h"
#include "profiler_v4.h"
#include "profiler_v3.h"
#include "profiler_v2.h"
#include "profiler_v1.h"
#include "profiler_net.h"
// The unversioned type names refer to the v5 interface.
typedef ncclProfiler_v5_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
#endif // end include guard
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_NET_H_
#define PROFILER_NET_H_
// Bit layout shared by the masks and the enum below: the low
// NCCL_PROFILER_NET_VER_BITS bits are selected by NCCL_PROFILER_NET_VER_MASK and
// the bits above them by NCCL_PROFILER_NET_TYPE_MASK (the two masks are exact
// complements when unsigned int is 32 bits wide).
#define NCCL_PROFILER_NET_VER_BITS (16)
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
// Network type identifiers; each value lies entirely within the TYPE mask bits.
typedef enum {
NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS),
NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
} ncclProfilerNetType;
#endif
@@ -0,0 +1,112 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V1_H_
#define PROFILER_V1_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor for version 1 of the profiler plugin interface; it is the
// eDescr argument of ncclProfiler_v1_t::startEvent.
// Three common fields are followed by an anonymous union with one member per
// kind of event (coll, p2p, proxyOp, proxyStep).
// NOTE(review): which union member is valid is presumably selected by `type` - confirm.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
uint8_t func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
uint8_t datatype;
uint32_t op;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
uint8_t algo;
uint8_t proto;
int isCollnet;
int isNvls;
} coll;
struct {
const char* name;
uint64_t commHash;
uint8_t func;
void* buff;
uint8_t datatype;
size_t count;
int peer;
} p2p;
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v1_t;
// Optional arguments passed to ncclProfiler_v1_t::recordEventState alongside
// the state transition; one union member per kind of event that carries them.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Version 1 of the profiler plugin interface: a name plus five entry points.
// Every entry point returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
#endif
@@ -0,0 +1,108 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V2_H_
#define PROFILER_V2_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor for version 2 of the profiler plugin interface; it is the
// eDescr argument of ncclProfiler_v2_t::startEvent.
// Three common fields are followed by an anonymous union with one member per
// kind of event (coll, p2p, proxyOp, proxyStep). In this version func,
// datatype, algo and proto are passed as C strings.
// NOTE(review): which union member is valid is presumably selected by `type` - confirm.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
struct {
const char* name;
uint64_t commHash;
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
} p2p;
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v2_t;
// Optional arguments passed to ncclProfiler_v2_t::recordEventState alongside
// the state transition; one union member per kind of event that carries them.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v2_t;
// Version 2 of the profiler plugin interface: a name plus five entry points.
// Every entry point returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v2_t;
#endif
@@ -0,0 +1,116 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V3_H_
#define PROFILER_V3_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor for version 3 of the profiler plugin interface; it is the
// eDescr argument of ncclProfiler_v3_t::startEvent.
// Three common fields are followed by an anonymous union with one member per
// kind of event (coll, p2p, proxyOp, proxyStep, kernelCh, netPlugin).
// NOTE(review): which union member is valid is presumably selected by `type` - confirm.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nMaxChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
struct {
const char* name;
uint64_t commHash;
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
} p2p;
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
struct {
int step;
} proxyStep;
struct {
uint8_t channelId;
} kernelCh;
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v3_t;
// Optional arguments passed to ncclProfiler_v3_t::recordEventState alongside
// the state transition; one union member per kind of event that carries them.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v3_t;
// Version 3 of the profiler plugin interface: a name plus five entry points.
// Every entry point returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v3_t;
#endif
@@ -0,0 +1,127 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V4_H_
#define PROFILER_V4_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor for version 4 of the profiler plugin interface; it is the
// eDescr argument of ncclProfiler_v4_t::startEvent.
// Three common fields are followed by an anonymous union with one member per
// kind of event (coll, p2p, proxyOp, proxyStep, kernelCh, netPlugin).
// The coll and p2p members carry no communicator name or hash here; in v4
// those are arguments of ncclProfiler_v4_t::init instead.
// NOTE(review): which union member is valid is presumably selected by `type` - confirm.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
} p2p;
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
struct {
int step;
} proxyStep;
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v4_t;
// Optional arguments passed to ncclProfiler_v4_t::recordEventState alongside
// the state transition; one union member per kind of event that carries them
// (proxyStep, proxyCtrl, netPlugin, kernelCh).
typedef union {
struct {
size_t transSize;
} proxyStep;
struct {
int appendedProxyOps;
} proxyCtrl;
struct {
void* data;
} netPlugin;
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v4_t;
// Version 4 of the profiler plugin interface: a name plus five entry points.
// Every entry point returns an ncclResult_t. init additionally receives the
// communicator's name, hash, node/rank counts, rank and a logger callback.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commName : user assigned communicator name
// - commHash : communicator id
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
#endif
@@ -0,0 +1,151 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V5_H_
#define PROFILER_V5_H_
// Event descriptor for version 5 of the profiler plugin interface; it is the
// eDescr argument of ncclProfiler_v5_t::startEvent.
// Three common fields (`type` is 64 bits wide in this version) are followed
// by an anonymous union with one member per kind of event: groupApi, collApi,
// p2pApi, kernelLaunch, coll, p2p, proxyOp, proxyStep, kernelCh, netPlugin.
// NOTE(review): which union member is valid is presumably selected by `type` - confirm.
// NOTE(review): this header includes nothing itself but uses uint64_t, bool,
// size_t and pid_t, so it relies on the including file having brought those
// names into scope - confirm that holds for every includer.
typedef struct {
uint64_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
struct {
bool graphCaptured;
int groupDepth;
} groupApi;
struct {
const char* func;
size_t count;
const char* datatype;
int root;
void* stream;
bool graphCaptured;
} collApi;
struct {
const char* func;
size_t count;
const char* datatype;
void* stream;
bool graphCaptured;
} p2pApi;
struct {
void* stream;
} kernelLaunch;
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
void* parentGroup; // for backward compatibility with v4
} coll;
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
void* parentGroup; // for backward compatibility with v4
} p2p;
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
struct {
int step;
} proxyStep;
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v5_t;
// Optional arguments passed to ncclProfiler_v5_t::recordEventState alongside
// the state transition; one union member per kind of event that carries them
// (proxyStep, proxyCtrl, netPlugin, kernelCh).
typedef union {
struct {
size_t transSize;
} proxyStep;
struct {
int appendedProxyOps;
} proxyCtrl;
struct {
void* data;
} netPlugin;
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v5_t;
// Version 5 of the profiler plugin interface: a name plus five entry points.
// Every entry point returns an ncclResult_t. In init the communicator id is
// the second argument, ahead of the eActivationMask output.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commId : communicator id
// - commName : user assigned communicator name
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v5_t;
#endif
@@ -0,0 +1,21 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Data types. Alias names are defined in terms of the canonical name they duplicate. */
typedef enum {
  ncclInt8 = 0,
  ncclChar = ncclInt8,
  ncclUint8 = 1,
  ncclInt32 = 2,
  ncclInt = ncclInt32,
  ncclUint32 = 3,
  ncclInt64 = 4,
  ncclUint64 = 5,
  ncclFloat16 = 6,
  ncclHalf = ncclFloat16,
  ncclFloat32 = 7,
  ncclFloat = ncclFloat32,
  ncclFloat64 = 8,
  ncclDouble = ncclFloat64,
  ncclBfloat16 = 9,
} ncclDataType_t;
#endif
@@ -0,0 +1,12 @@
#ifndef VERSION_H
#define VERSION_H
#ifdef __cplusplus
extern "C" {
#endif
/* Returns a C string with git version information (going by the name; the
 * definition is not part of this header). Declared with (void) so that C
 * callers get a real prototype: before C23 an empty parameter list in C leaves
 * the arguments unchecked. */
const char* get_git_version_info(void);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif // VERSION_H
Submodule projects/rccl/ext-src/rocSHMEM added at b28a56bd54
+803
Melihat File
@@ -0,0 +1,803 @@
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 9bfd8dcf..4d3f0a08 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -29,6 +29,7 @@
#include "ibvwrap.h"
#include "mlx5/mlx5dvwrap.h"
+#include "ionic/ionicdvwrap.h"
#include "graph/xml.h"
#define MAXSUFFIXSIZE 16
@@ -110,16 +111,38 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
static std::mutex ncclIbMutex;
static int ncclIbRelaxedOrderingEnabled = 0;
+static bool rcclAinicRoce = 0;
+static bool rcclCtsInlineData = 0;
+static bool rcclCtsOffloadEnabled = 0;
+static bool ncclIbUseInline = 0;
+static int ncclIbGdrFlushDisable = 0;
+
+enum ncclIbChannelType {
+ ncclIbChannelTypeCts = 0,
+ ncclIbChannelTypeData = 1,
+ ncclIbChannelTypeMax = 2
+};
+
+struct ncclChannelToUd {
+ int channelId;
+ bool udId;
+ bool udAllocated;
+};
+
+static ncclChannelToUd nccl_channel_ud_map[MAXCHANNELS][ncclIbChannelTypeMax];
+static bool nccl_channel_last_ud[MAX_IB_DEVS][ncclIbChannelTypeMax];
// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator
// rather than once for all communicators. However, the internal plugin implementation
// still assumes the plugin is initialized only once across all communicators. The ref
// counter makes sure the plugin internally initializes only once. When per communicator
// context support is added to the plugin the ref counter can be removed.
static int netRefCount;
#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED"))
+#define NCCL_CTS_QP_SLOT_INVALID 0xFF
+
#define NCCL_IB_SL_DEFAULT 0
#define NCCL_IB_TC_DEFAULT 0
@@ -141,6 +164,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1);
NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
RCCL_PARAM(IbQpsPerP2p, "IB_QPS_PER_P2P", 0);
+NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
+
+// AMD AINIC
+RCCL_PARAM(CtsInlineData, "CTS_INLINE_DATA", -1);
+RCCL_PARAM(CtsOffloadEnabled, "CTS_OFFLOAD_ENABLED", -1);
+
+extern int64_t rcclParamAinicRoce();
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
__atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
@@ -779,6 +809,10 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); }
+ if(wrap_ionicdv_symbols() != ncclSuccess) {
+ WARN("NET/IB : Failed to open ionicdv symbols. Advance features like AINIC UD load balancing will be disabled.");
+ return ncclInternalError;
+ }
// Detect IB cards
int nIbDevs = 0;
@@ -944,6 +978,23 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
+ ncclIbUseInline = ncclParamIbUseInline();
+ ncclIbGdrFlushDisable = ncclParamIbGdrFlushDisable();
+
+ rcclAinicRoce = ((rcclParamAinicRoce() == 1) ? true : false);
+ if (rcclAinicRoce) {
+ // for AINIC, these params are defaulted to enabled unless user forces it to disable(0).
+ rcclCtsInlineData = ((rcclParamCtsInlineData() == 0) ? false : true);
+ rcclCtsOffloadEnabled = ((rcclParamCtsOffloadEnabled() == 0) ? false : true);
+ // for AINIC IbUseInline is enabled by default always
+ ncclIbUseInline = true;
+ // for AINIC GDR flush is disabled by default
+ ncclIbGdrFlushDisable = 1;
+
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : AINIC RoCEv2 optimizations enabled: CTS Inline Data: %s; CTS Offload: %s; "
+ "IB Use Inline: enabled; GDR Flush: disabled", rcclCtsInlineData ? "Enabled": "Disabled",
+ rcclCtsOffloadEnabled ? "Enabled": "Disabled");
+ }
}
exit:
ibContext.trafficClass = config->trafficClass;
@@ -1271,6 +1322,8 @@ struct ncclIbListenComm {
struct ncclIbCommStage stage;
};
+#define MAX_INLINE_DATA_SIZE 24
+
struct alignas(64) ncclIbSendFifo {
uint64_t addr;
uint64_t size;
@@ -1281,10 +1334,21 @@ struct alignas(64) ncclIbSendFifo {
char padding[16];
};
+struct alignas(32) ncclIbSendFifoCtsInline {  // Compact CTS fifo element used in place of ncclIbSendFifo when rcclCtsInlineData is set; packed fields are 23 bytes + 9 padding = 32.
+ uint64_t addr;  // receive buffer address written by ncclIbPostFifo()
+ uint32_t rkeys[1];  // NOTE(review): ncclIbPostFifo() stores rkeys[j] for every j < vProps.ndevs; that is only in-bounds when ndevs == 1 — confirm
+ int size;  // NOTE(review): assigned from a size_t in ncclIbPostFifo(), so large sizes are narrowed — confirm acceptable
+ uint8_t nreqs;  // number of receives in the posting (n in ncclIbPostFifo)
+ uint16_t tag;  // NOTE(review): assigned from an int tag, upper bits are dropped — confirm tags fit in 16 bits
+ uint32_t idx;  // fifoTail+1 truncated to 32 bits (fifoTail is uint64_t)
+ char padding[9];  // pads the packed layout so sizeof is a multiple of 32 (checked by a static_assert elsewhere in this file)
+} __attribute__((packed));
+
struct ncclIbQp {
struct ibv_qp* qp;
int devIndex;
int remDevIdx;
+ int8_t ctsQpSlot;  // fifo slot compared against in ncclIbPostFifo() to decide when to signal; only assigned by ncclIbCreateQp() when rcclAinicRoce is set. NOTE(review): NCCL_CTS_QP_SLOT_INVALID is 0xFF, which does not fit int8_t as a positive value — confirm the intended sentinel
};
struct ncclIbRemSizesFifo {
@@ -1331,6 +1395,7 @@ struct ncclIbSendComm {
struct ncclIbNetCommBase base;
// Start with fifo and ibv structs as they have alignment restrictions
struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ struct ncclIbSendFifoCtsInline fifo_inline[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
// Each dev correlates to a mergedIbDev
@@ -1346,6 +1411,7 @@ struct ncclIbSendComm {
static_assert((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset");
static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
+static_assert((sizeof(struct ncclIbSendFifoCtsInline) % 32) == 0, "ncclIbSendFifoCtsInline element size must be 32-byte multiples");
static_assert((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned");
static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned");
@@ -1360,6 +1426,7 @@ struct ncclIbGpuFlush {
struct ncclIbRemFifo {
struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ struct ncclIbSendFifoCtsInline elems_cts_inline[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
uint64_t fifoTail;
uint64_t addr;
uint32_t flags;
@@ -1415,20 +1482,59 @@ ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) {
return ncclSuccess;
}
-ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
+ int access_flags, void* qp_context, struct ncclIbQp* qp,
+ int channel_id, bool data_qp, int8_t cts_qp_slot) {  // Creates an RC QP on base->pd / base->cq and moves it to INIT; channel_id, data_qp and cts_qp_slot are only consulted when rcclAinicRoce is set.
struct ibv_qp_init_attr qpInitAttr;
+ enum ncclIbChannelType channel_type = (data_qp ? ncclIbChannelTypeData : ncclIbChannelTypeCts);
memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
qpInitAttr.qp_context = qp_context;
qpInitAttr.send_cq = base->cq;
qpInitAttr.recv_cq = base->cq;
qpInitAttr.qp_type = IBV_QPT_RC;
+
+ if (rcclAinicRoce) {
+ if (!nccl_channel_ud_map[channel_id][channel_type].udAllocated) {  // NOTE(review): channel_id indexes a MAXCHANNELS-sized table without a range check here — confirm callers guarantee 0 <= channel_id < MAXCHANNELS
+ bool lud = nccl_channel_last_ud[base->ibDevN][channel_type];  // first QP for this (channel, type): take the device's current UD choice...
+ nccl_channel_ud_map[channel_id][channel_type].udId = lud;
+ nccl_channel_ud_map[channel_id][channel_type].udAllocated = true;
+ nccl_channel_last_ud[base->ibDevN][channel_type] =
+ !(nccl_channel_last_ud[base->ibDevN][channel_type]);  // ...and flip it so the next newly seen channel on this device gets the other one
+ }
+ if (nccl_channel_ud_map[channel_id][channel_type].udId) {
+ wrap_ionicdv_pd_set_udma_mask(base->pd, IONIC_UDMA_MASK_HIGH);  // NOTE(review): return value is ignored, unlike the NCCLCHECK'd wrap_* calls below — confirm intentional
+ } else {
+ wrap_ionicdv_pd_set_udma_mask(base->pd, IONIC_UDMA_MASK_LOW);
+ }
+ qpInitAttr.sq_sig_all |= (1 << 16);  // NOTE(review): bits 16-19 of sq_sig_all are used as flags; their meaning is not visible in this file — presumably interpreted by the ionic provider, confirm
+ if (data_qp) {
+ qpInitAttr.sq_sig_all |= (1 << 17);
+ } else {
+ qpInitAttr.sq_sig_all &= (~(1 << 17));  // no-op after the memset above; bit 17 is already clear
+ }
+ qpInitAttr.sq_sig_all |= (1 << 18);
+
+ if (rcclCtsOffloadEnabled) {
+ qpInitAttr.sq_sig_all |= (1 << 19);
+ } else {
+ qpInitAttr.sq_sig_all &= (~(1 << 19));  // no-op after the memset above; bit 19 is already clear
+ }
+ }
+
// We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
qpInitAttr.cap.max_send_sge = 1;
qpInitAttr.cap.max_recv_sge = 1;
- qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
+ if (rcclCtsInlineData) {
+ qpInitAttr.cap.max_inline_data = MAX_INLINE_DATA_SIZE;  // applied to every QP created here, data QPs included, whenever CTS inline mode is on
+ } else {
+ qpInitAttr.cap.max_inline_data = ncclIbUseInline ? sizeof(struct ncclIbSendFifo) : 0;  // uses the value cached in ncclIbInit instead of re-reading the param
+ }
NCCLCHECK(wrap_ibv_create_qp(&qp->qp, base->pd, &qpInitAttr));
+ if (rcclAinicRoce) {
+ NCCLCHECK(wrap_ionicdv_qp_set_gda(qp->qp, false, true));  // NOTE(review): meaning of the two boolean arguments is not visible here — confirm against the ionicdv wrapper
+ }
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_INIT;
@@ -1438,6 +1544,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p",
ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd);
+ if (rcclAinicRoce) {
+ qp->ctsQpSlot = cts_qp_slot;  // left untouched when AINIC mode is off; only read on the AINIC path of ncclIbPostFifo()
+ }
return ncclSuccess;
}
@@ -1521,7 +1630,7 @@ fail:
goto exit;
}
-ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
ncclResult_t ret = ncclSuccess;
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
struct ncclIbCommStage* stage = &handle->stage;
@@ -1529,8 +1638,13 @@ ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendCo
int ready;
uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED;
int isP2p = 0;
+ int channel_id = 0;
*sendComm = NULL;
+ if (rcclAinicRoce) {
+ channel_id = ((ncclNet_ctxt_t *)sendDevComm)->chId;
+ }
+
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
@@ -1612,7 +1726,7 @@ ib_recv_dev_list:
for (int q = 0; q < comm->base.nqps; q++) {
ncclIbSendCommDev* commDev = comm->devs + devIndex;
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q, channel_id, true, NCCL_CTS_QP_SLOT_INVALID), ret, fail);
comm->base.qps[q].devIndex = devIndex;
meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num;
meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
@@ -1637,7 +1751,11 @@ ib_recv_dev_list:
devInfo->lid = ibDev->portAttr.lid;
devInfo->ibv_dev_index = commDev->base.ibDevN;
// Prepare my fifo
- NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+ if (rcclCtsInlineData) {
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo_inline, sizeof(struct ncclIbSendFifoCtsInline)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+ } else {
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+ }
devInfo->fifoRkey = commDev->fifoMr->rkey;
// Pack local GID info
@@ -1680,7 +1798,11 @@ ib_recv_dev_list:
}
}
config = (ncclNetCommConfig_t*)ctx;
- meta.fifoAddr = (uint64_t)comm->fifo;
+ if (rcclCtsInlineData) {
+ meta.fifoAddr = (uint64_t)comm->fifo_inline;
+ } else {
+ meta.fifoAddr = (uint64_t)comm->fifo;
+ }
meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT;
meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT;
strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
@@ -1825,18 +1947,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
return ncclSuccess;
}
-NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
RCCL_PARAM(IbGdrFlushGpuMemNoRelaxedOrdering, "GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING", 1);
-ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) {
ncclResult_t ret = ncclSuccess;
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
struct ncclIbCommStage* stage = &lComm->stage;
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
int ready;
int link_layer = IBV_LINK_LAYER_UNSPECIFIED;
+ int channel_id = 0;
*recvComm = NULL;
+ if (rcclAinicRoce) {
+ channel_id = ((ncclNet_ctxt_t *) recvDevComm)->chId;
+ }
+
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
@@ -1966,7 +2092,7 @@ ib_recv:
// Local ibDevN
ibDevN = rComm->devs[devIndex].base.ibDevN;
ibDev = ncclIbDevs + ibDevN;
- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp, channel_id, false, q), ret, fail);
qp->devIndex = devIndex;
devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
@@ -1992,16 +2118,22 @@ ib_recv:
useDmaBuf = (ncclIbDmaBufSupport(lComm->dev) == ncclSuccess);
rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || useDmaBuf)
- && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;
+ && (ncclIbGdrFlushDisable == 0)) ? 1 : 0;
for (int i = 0; i < rComm->base.vProps.ndevs; i++) {
rCommDev = rComm->devs + i;
ibDev = ncclIbDevs + rCommDev->base.ibDevN;
// Retain remote fifo info and prepare my RDMA ops
rComm->remFifo.addr = remMeta.fifoAddr;
- NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+ if (rcclCtsInlineData) {
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems_cts_inline,
+ sizeof(struct ncclIbSendFifoCtsInline)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS,
+ IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+ } else {
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+ }
rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
- if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
+ if (ncclIbUseInline) rComm->remFifo.flags = IBV_SEND_INLINE;
// Allocate Flush dummy buffer for GPU Direct RDMA
if (rComm->flushEnabled) {
@@ -2039,7 +2171,7 @@ ib_recv:
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
rCommDev->gpuFlush.sge.length = 1;
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp, channel_id, true, NCCL_CTS_QP_SLOT_INVALID), ret, fail);
struct ncclIbDevInfo devInfo;
devInfo.lid = ibDev->portAttr.lid;
devInfo.link_layer = ibDev->portAttr.link_layer;
@@ -2257,10 +2389,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0);
-ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, bool use_write_op) {
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
- int nreqs = slots[0].nreqs;
+ int nreqs;
+ if (rcclCtsOffloadEnabled) {
+ nreqs = 1;
+ } else {
+ nreqs = slots[0].nreqs;
+ }
if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
uint64_t wr_id = 0ULL;
@@ -2272,7 +2409,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
sge->addr=(uintptr_t)reqs[r]->send.data;
wr->opcode = IBV_WR_RDMA_WRITE;
wr->send_flags = 0;
- wr->wr.rdma.remote_addr = slots[r].addr;
+ if (rcclCtsOffloadEnabled) {
+ wr->wr.rdma.remote_addr = 0xdeadbeef;
+ } else {
+ wr->wr.rdma.remote_addr = slots[r].addr;
+ }
wr->next = wr + 1;
wr_id += (reqs[r] - comm->base.reqs) << (r*8);
#ifdef NCCL_ENABLE_NET_PROFILING
@@ -2283,7 +2424,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
// Write size as immediate data. In the case of multi-send, only write
// 0 or 1 as size to indicate whether there was data sent or received.
uint32_t immData = 0;
- if (nreqs == 1) {
+ if ((nreqs == 1) && (use_write_op == false)) {
immData = reqs[0]->send.size;
} else {
int* sizes = comm->remSizesFifo.elems[slot];
@@ -2293,22 +2434,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
- if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
- // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
- // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
- // completion.
- lastWr++;
- memset(lastWr, 0, sizeof(struct ibv_send_wr));
- if (nreqs > 1) {
- // Write remote sizes Fifo
- lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
- lastWr->num_sge = 1;
- lastWr->sg_list = &comm->remSizesFifo.sge;
+ if (use_write_op == false) {
+ if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
+ // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
+ // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+ // completion.
+ lastWr++;
+ memset(lastWr, 0, sizeof(struct ibv_send_wr));
+ if (nreqs > 1) {
+ // Write remote sizes Fifo
+ lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
+ lastWr->num_sge = 1;
+ lastWr->sg_list = &comm->remSizesFifo.sge;
+ }
}
+ lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ lastWr->imm_data = immData;
}
lastWr->wr_id = wr_id;
- lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
- lastWr->imm_data = immData;
lastWr->next = NULL;
lastWr->send_flags = IBV_SEND_SIGNALED;
@@ -2324,7 +2467,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
//ncclIbAddEvent(reqs[r], devIndex, &comm->devs[devIndex].base);
// Select proper rkey (needed even for 0-size send)
- comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
+ if (rcclCtsOffloadEnabled) {
+ comm->wrs[r].wr.rdma.rkey = 0xbade;
+ } else {
+ comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
+ }
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
@@ -2340,7 +2487,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
}
- if (nreqs > 1) {
+ if ((use_write_op == false) && (nreqs > 1)) {
// Also make sure lastWr writes remote sizes using the right lkey
comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey;
lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex];
@@ -2398,32 +2545,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
+ bool use_write_op = false;
+ if (rcclAinicRoce) {
+ use_write_op = (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) ? true : false;
+ }
// Wait for the receiver to have posted the corresponding receive
int nreqs = 0;
volatile struct ncclIbSendFifo* slots;
+ if (rcclCtsOffloadEnabled) {
+ nreqs = 1;
+ }
+
int slot = (comm->fifoHead) % MAX_REQUESTS;
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
- slots = comm->fifo[slot];
- uint64_t idx = comm->fifoHead+1;
- if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
- nreqs = slots[0].nreqs;
- // Wait until all data has arrived
- for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
- __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
+ if (!rcclCtsOffloadEnabled) {
+ slots = comm->fifo[slot];
+ uint64_t idx = comm->fifoHead+1;
+ if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
+ nreqs = slots[0].nreqs;
+ // Wait until all data has arrived
+ for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
+ __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
+ }
for (int r=0; r<nreqs; r++) {
- if (reqs[r] != NULL || slots[r].tag != tag) continue;
-
- if (size > slots[r].size) size = slots[r].size;
- // Sanity checks
- if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
- char line[SOCKET_NAME_MAXLEN + 1];
- union ncclSocketAddress addr;
- ncclSocketGetAddr(&comm->base.sock, &addr);
- WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
- r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
- return ncclInternalError;
+ if (!rcclCtsOffloadEnabled) {
+ if (reqs[r] != NULL || slots[r].tag != tag) continue;
+
+ if (size > slots[r].size) size = slots[r].size;
+ // Sanity checks
+ if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
+ char line[SOCKET_NAME_MAXLEN + 1];
+ union ncclSocketAddress addr;
+ ncclSocketGetAddr(&comm->base.sock, &addr);
+ WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
+ r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
+ return ncclInternalError;
+ }
+ } else{
+ if (reqs[r] != NULL) continue;
}
struct ncclIbRequest* req;
@@ -2467,10 +2628,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
}
TIME_START(0);
- NCCLCHECK(ncclIbMultiSend(comm, slot));
+ NCCLCHECK(ncclIbMultiSend(comm, slot, use_write_op));
// Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks
- memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+ if (!rcclCtsOffloadEnabled) {
+ memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+ }
memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
comm->fifoHead++;
TIME_STOP(0);
@@ -2483,30 +2646,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
struct ibv_send_wr wr;
+ struct ncclIbSendFifo* localElem = NULL;  // used when rcclCtsInlineData is off
+ struct ncclIbSendFifoCtsInline* localElemCtsInline = NULL;  // used when rcclCtsInlineData is on
+ uint64_t localElemRef;  // NOTE(review): only assigned inside the per-request loop below, so it is uninitialized if n == 0 — confirm n >= 1 is guaranteed
+ int qpIndex = 0;
+ ncclIbQp* ctsQp = NULL;
memset(&wr, 0, sizeof(wr));
int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
req->recv.sizes = comm->sizesFifo[slot];
for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
- struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
+ if (rcclCtsInlineData) {
+ localElemCtsInline = comm->remFifo.elems_cts_inline[slot];
+ } else {
+ localElem = comm->remFifo.elems[slot];
+ }
- // Select the next devIndex (local) and QP to use for posting this CTS message
- // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
- ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex;
- comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
+ if (rcclAinicRoce) {
+ qpIndex = comm->base.qpIndex;  // AINIC: CTS goes out on the QP at the current qpIndex; the index is advanced at the end of this function
+ ctsQp = comm->base.qps + qpIndex;
+ } else {
+ // Select the next devIndex (local) and QP to use for posting this CTS message
+ // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
+ ctsQp = comm->base.qps + comm->base.devIndex;
+ comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
+ }
for (int i=0; i<n; i++) {
- localElem[i].addr = (uint64_t)data[i];
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
+ if (rcclCtsInlineData) {
+ localElemCtsInline[i].addr = (uint64_t)data[i];
+
+ // Send all applicable rkeys
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
+ localElemCtsInline[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;  // NOTE(review): ncclIbSendFifoCtsInline::rkeys has a single entry; j >= 1 writes past it — confirm ndevs == 1 in this mode
+
+ localElemCtsInline[i].nreqs = n;
+ localElemCtsInline[i].size = sizes[i]; // Sanity/Debugging
+ localElemCtsInline[i].tag = tags[i];
+ localElemCtsInline[i].idx = comm->remFifo.fifoTail+1;
+ localElemRef = (uint64_t)localElemCtsInline;
+
+ } else {
+ localElem[i].addr = (uint64_t)data[i];
- // Send all applicable rkeys
- for (int j = 0; j < comm->base.vProps.ndevs; j++)
- localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+ // Send all applicable rkeys
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
+ localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
- localElem[i].nreqs = n;
- localElem[i].size = sizes[i]; // Sanity/Debugging
- localElem[i].tag = tags[i];
- localElem[i].idx = comm->remFifo.fifoTail+1;
+ localElem[i].nreqs = n;
+ localElem[i].size = sizes[i]; // Sanity/Debugging
+ localElem[i].tag = tags[i];
+ localElem[i].idx = comm->remFifo.fifoTail+1;
+ localElemRef = (uint64_t)localElem;
+ }
}
wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo); // NOTE(review): stride is sizeof(ncclIbSendFifo) even when rcclCtsInlineData is set and the sender registered fifo_inline (ncclIbSendFifoCtsInline elements) — confirm the offset is intended
@@ -2514,8 +2707,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey;
// Set the correct sge properties
- comm->devs[ctsQp->devIndex].fifoSge.addr = (uint64_t)localElem;
- comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
+ comm->devs[ctsQp->devIndex].fifoSge.addr = localElemRef;
+ if (rcclCtsInlineData) {
+ comm->devs[ctsQp->devIndex].fifoSge.length = MAX_INLINE_DATA_SIZE;  // NOTE(review): fixed 24-byte length regardless of n — confirm multi-recv (n > 1) is not expected in this mode
+ } else {
+ comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
+ }
wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge;
wr.num_sge = 1;
@@ -2545,7 +2742,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
//
// slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled.
// This works out that each fifo posting QP gets drained
- if (slot == ctsQp->devIndex) {
+ if (rcclAinicRoce) {
+ if (slot == ctsQp->ctsQpSlot) {  // AINIC: signal when the fifo slot equals the slot recorded on the QP at creation (q in ncclIbAccept)
+ wr.send_flags |= IBV_SEND_SIGNALED;
+ wr.wr_id = req - comm->base.reqs;
+ ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
+ }
+ } else if (slot == ctsQp->devIndex) {
wr.send_flags |= IBV_SEND_SIGNALED;
wr.wr_id = req - comm->base.reqs;
ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
@@ -2560,10 +2763,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
comm->remFifo.fifoTail++;
+ if (rcclAinicRoce) {
+ // Select the next qpIndex
+ comm->base.qpIndex = (comm->base.qpIndex+1) % comm->base.nqps;  // advances by one per posting; ncclIbIrecv() leaves qpIndex untouched in AINIC mode so this is the only update
+ }
return ncclSuccess;
}
ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
+ ncclResult_t res = ncclSuccess;
+ bool netOptRecvCompletionEnabled = false;  // set below when AINIC mode is on and the caller passed NCCL_NET_OPTIONAL_RECV_COMPLETION in *request; when set, no receive WRs are posted
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->base.ready == 0) {
WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
@@ -2573,6 +2782,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
+ if (rcclAinicRoce) {
+ if (*request == (void *) NCCL_NET_OPTIONAL_RECV_COMPLETION) {  // *request is read as an input hint here before being overwritten with the request handle at the end
+ netOptRecvCompletionEnabled = true;
+ }
+ }
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
req->type = NCCL_NET_IB_REQ_RECV;
@@ -2586,50 +2800,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
req->devBases[i] = &comm->devs[i].base;
}
- struct ibv_recv_wr wr;
- memset(&wr, 0, sizeof(wr));
- wr.wr_id = req - comm->base.reqs;
- wr.sg_list = NULL;
- wr.num_sge = 0;
+ if (!netOptRecvCompletionEnabled) {
+ struct ibv_recv_wr wr;
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = req - comm->base.reqs;
+ wr.sg_list = NULL;
+ wr.num_sge = 0;
- TIME_START(1);
- // Select either all QPs, or one qp per-device
- const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
+ TIME_START(1);
+ // Select either all QPs, or one qp per-device
+ const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
- // Post recvs
- struct ibv_recv_wr* bad_wr;
- for (int i = 0; i < nqps; i++) {
- struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;
- ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
+ // Post recvs
+ struct ibv_recv_wr* bad_wr;
+ int qpIndex = comm->base.qpIndex;
+ for (int i = 0; i < nqps; i++) {
+ struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;  // NOTE(review): in AINIC mode only the local qpIndex advances below, so every iteration selects the same QP when nqps > 1 — confirm whether `qpIndex` was intended here
+ ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
#ifdef NCCL_ENABLE_NET_PROFILING
- // Start a QP event for every request in the multirecv and every qp
- for (int r = 0; r < n; r++) {
- int nEventHandles = req->pInfo[r].nEventHandles;
- assert(nEventHandles < MAX_QPS_PER_REQ);
- req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
- // Store info for profiler
- int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
- req->pInfo[r].data.type = ncclProfileQp;
- req->pInfo[r].data.qp.device = qp->devIndex;
- req->pInfo[r].data.qp.wr_id = wr.wr_id;
- req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
- NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));
- req->pInfo[r].nEventHandles++;
- }
+ // Start a QP event for every request in the multirecv and every qp
+ for (int r = 0; r < n; r++) {
+ int nEventHandles = req->pInfo[r].nEventHandles;
+ assert(nEventHandles < MAX_QPS_PER_REQ);
+ req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
+ // Store info for profiler
+ int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
+ req->pInfo[r].data.type = ncclProfileQp;
+ req->pInfo[r].data.qp.device = qp->devIndex;
+ req->pInfo[r].data.qp.wr_id = wr.wr_id;
+ req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
+ NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));  // NOTE(review): returns directly on failure, bypassing the err: cleanup used by the calls below — confirm intended
+ req->pInfo[r].nEventHandles++;
+ }
#endif
- NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr));
- comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
- }
+ NCCLCHECKGOTO(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr), res, err);
+ // Don't update comm->base.qpIndex yet, we need to run through this same set of QPs
+ // inside ncclIbPostFifo()
+ if (rcclAinicRoce) {
+ qpIndex = (qpIndex+1)%comm->base.nqps;  // local copy only; comm->base.qpIndex is advanced later by ncclIbPostFifo()
+ } else {
+ comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
+ }
+ }
- TIME_STOP(1);
+ TIME_STOP(1);
+ } // netOptRecvCompletionEnabled = false
// Post to FIFO to notify sender
TIME_START(2);
- NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
+ NCCLCHECKGOTO(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req), res, err);
TIME_STOP(2);
*request = req;
return ncclSuccess;
+err:
+ if (req) {
+ ncclIbFreeRequest(req);  // release the request obtained from ncclIbGetRequest() before reporting the failure
+ }
+ return res;
}
ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
@@ -2698,6 +2926,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
}
#endif
+#define NCCL_CQ_POLL_MAX_EVENT 16
+
ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
*done = 0;
@@ -2731,13 +2961,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
int totalWrDone = 0;
int wrDone = 0;
- struct ibv_wc wcs[4];
+ struct ibv_wc wcs[NCCL_CQ_POLL_MAX_EVENT];
+ int cqMaxPollEvent = 4;
+ if (rcclAinicRoce) {
+ cqMaxPollEvent = NCCL_CQ_POLL_MAX_EVENT;
+ }
for (int i = 0; i < NCCL_IB_MAX_DEVS_PER_NIC; i++) {
TIME_START(3);
// If we expect any completions from this device's CQ
if (r->events[i]) {
- NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, 4, wcs, &wrDone));
+ NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, cqMaxPollEvent,
+ wcs, &wrDone));
totalWrDone += wrDone;
if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
if (wrDone == 0) continue;
@@ -2889,7 +3124,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
}
ncclNet_t ncclNetIb = {
- "IB",
+ "ROCM-IB",
ncclIbInit,
ncclIbDevices,
ncclIbGetProperties,
+1 -1
Melihat File
@@ -179,4 +179,4 @@ When developing new tuner plugins:
- [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/)
- Example plugin implementations in this directory
For questions and support, refer to the NCCL community resources and documentation.
For questions and support, refer to the NCCL community resources and documentation.
@@ -0,0 +1,49 @@
# Compiled shared objects and binaries
*.so
*.o
*.a
*.out
*.exe
*.dll
*.dylib
*.bin
*.elf
# Python cache
__pycache__/
*.pyc
*.pyo
# Build and test artifacts
/build/
*.log
*.tmp
*.swp
# Ignore all CSV files except scripts/sample_performance_data.csv
*.csv
!scripts/sample_performance_data.csv
# Ignore all .conf files except nccl_tuner.conf
*.conf
!nccl_tuner.conf
my_configs
# Ignore test binary
test/test_plugin
# Editor/OS files
.DS_Store
Thumbs.db
# Backup files
*~
*.bak
# Ignore by convention
*.old
*.orig
# Git
.git/
@@ -0,0 +1,26 @@
# Source files of the example tuner plugin (listed explicitly, not globbed)
set(SRC_FILES
  ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
)

# Directory the plugin library is written to. Shared by the library target and
# the clean target below so the two can never drift apart.
set(TUNER_EXAMPLE_OUTPUT_DIR ${CMAKE_BINARY_DIR}/test/unit/plugins)

# Create shared library
add_library(nccl-tuner-example SHARED ${SRC_FILES})

# Set include directories
target_include_directories(nccl-tuner-example PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/nccl
)

# Set output name to match Makefile
set_target_properties(nccl-tuner-example PROPERTIES
  OUTPUT_NAME "nccl-tuner-example"
  PREFIX "lib"
  POSITION_INDEPENDENT_CODE ON
  LIBRARY_OUTPUT_DIRECTORY ${TUNER_EXAMPLE_OUTPUT_DIR}
)

# Add custom target for clean (equivalent to Makefile clean target).
# The path is spelled out instead of using $<TARGET_FILE:...> because that
# generator expression would add a dependency on the library target and
# rebuild the library right before deleting it.
add_custom_target(clean-tuner-lib
  COMMAND ${CMAKE_COMMAND} -E remove -f ${TUNER_EXAMPLE_OUTPUT_DIR}/libnccl-tuner-example.so
  COMMENT "Cleaning libnccl-tuner-example.so"
)
@@ -45,6 +45,40 @@ typedef enum {
#define NCCL_ALGO_PROTO_IGNORE -1.0
#define NCCL_HW_NVLINK 0
#define NCCL_HW_PCI 1
#define NCCL_HW_NET 2
#define NCCL_NUM_HW_LINKS 3
#define NCCL_VOLTA_COMPCAP_IDX 0
#define NCCL_AMPERE_COMPCAP_IDX 1
#define NCCL_HOPPER_COMPCAP_IDX 2
#define NCCL_BLACKWELL_COMPCAP_IDX 3
#define NCCL_NUM_COMPCAPS 4
#define NCCL_TUNING_SCALE_1NODE 0
#define NCCL_TUNING_SCALE_2NODES 1
#define NCCL_TUNING_SCALE_4NODES 2
#define NCCL_NUM_TUNING_SCALES 3
// Description of the NVLink domains of a communicator, passed to the tuner's init().
typedef struct {
int nNvlDomains; // number of NVLink domains
int minRanksPerNvlDomain; // minimum ranks across all NVLink domains
int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains
} ncclNvlDomainInfo_v5_t;
// Tables of tuning-model constants passed to the tuner's init() as an
// input/output parameter (see the init() documentation in this header).
// NOTE(review): units are not stated in this header; the example plugin's
// comments treat latencies as microseconds and bandwidths as GB/s -- confirm.
typedef struct {
// Base latency per [algorithm][protocol].
double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// Latency per [hardware link: NCCL_HW_NVLINK / NCCL_HW_PCI / NCCL_HW_NET][algorithm][protocol].
double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// The four bandwidth tables below are indexed by
// [compute-capability index: NCCL_*_COMPCAP_IDX][tuning scale: NCCL_TUNING_SCALE_*].
double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
} ncclTunerConstants_v5_t;
// API to be implemented by external tuner
typedef struct {
// Name of the tuner
@@ -52,12 +86,17 @@ typedef struct {
// Initializes tuner states.
// Inputs:
// - commId: communicator identifier
// - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
// - nNodes: number of nodes in current communicator.
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
// - nvlDomainInfo: NVL domain information struct
// Outputs:
// - context: tuner context object
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
// Input/Output:
// - constants: tuner constants
ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
@@ -87,11 +126,13 @@ typedef struct {
// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v4_t;
ncclResult_t (*finalize)(void* context);
} ncclTuner_v5_t;
typedef ncclTuner_v4_t ncclTuner_t;
typedef ncclTuner_v5_t ncclTuner_t;
typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t;
typedef ncclTunerConstants_v5_t ncclTunerConstants_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5"
#endif
+31 -5
Melihat File
@@ -51,6 +51,7 @@ typedef struct {
size_t nRanks;
size_t nNodes;
ncclDebugLogger_t logFunction;
ncclNvlDomainInfo_v5_t nvlDomainInfo;
} TunerContext;
// Parse collective type from string
@@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
return ncclSuccess;
}
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
if (NULL != constants) {
// NCCL constants tuning
// Tune NCCL's internal tuning model to improve base algo/proto selection.
// Note: Example numbers are for reference only.
// Actual numbers may vary depending on the hardware and network topology.
// These numbers are not guaranteed to be optimal for all cases.
// Limit the tree bandwidth to 15GB/s
constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0;
// Limit the ring bandwidth to 20GB/s
constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0;
// Set NVLSTree base network latency to 24us
constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0;
}
TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
if (!ctx) return ncclSystemError;
@@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t
ctx->nRanks = nRanks;
ctx->nNodes = nNodes;
ctx->logFunction = logFunction;
if (nvlDomainInfo) {
ctx->nvlDomainInfo = *nvlDomainInfo;
} else {
memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t));
}
if (logFunction) {
logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains",
nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains);
}
// Try to load config file from environment variable or default location
@@ -435,7 +460,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
return ncclSuccess;
}
__hidden ncclResult_t pluginDestroy(void* context) {
__hidden ncclResult_t pluginFinalize(void* context) {
if (context) {
TunerContext* ctx = (TunerContext*)context;
if (ctx->configs) {
@@ -446,11 +471,12 @@ __hidden ncclResult_t pluginDestroy(void* context) {
return ncclSuccess;
}
#define PLUGIN_NAME "Example"
const ncclTuner_v4_t ncclTunerPlugin_v4 = {
const ncclTuner_v5_t ncclTunerPlugin_v5 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
.destroy = pluginDestroy
.finalize = pluginFinalize
};
@@ -0,0 +1,53 @@
# NCCL Tuner Configuration File (CSV Format)
# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
#
# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
# Protocols: ll, ll128, simple
# Channels: number of channels to use, or -1 to keep default
# nNodes: number of nodes to match, or -1 for any number of nodes
# nRanks: number of ranks to match, or -1 for any number of ranks
# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
#
# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
#
#AR 4PPN
allreduce,33554432,4294967296,ring,simple,16,2,8,-1,-1
allreduce,33554432,4294967296,ring,simple,16,4,16,-1,-1
allreduce,67108864,4294967296,ring,simple,16,8,32,-1,-1
#AR 2PPN
allreduce,2097152,4294967296,ring,simple,4,2,4,-1,-1
allreduce,16777216,4294967296,ring,simple,4,4,8,-1,-1
allreduce,33554432,4294967296,ring,simple,4,8,16,-1,-1
#AR 1PPN
allreduce,134217728,4294967296,ring,simple,4,4,4,-1,-1
allreduce,67108864,4294967296,ring,simple,4,8,8,-1,-1
#AG 4PPN
allgather,8388608,4294967296,ring,simple,16,2,8,-1,-1
allgather,16777216,4294967296,ring,simple,16,4,16,-1,-1
allgather,16777216,4294967296,ring,simple,16,8,32,-1,-1
#AG 2PPN
allgather,262144,4294967296,ring,simple,4,2,4,-1,-1
allgather,16777216,4294967296,ring,simple,4,4,8,-1,-1
allgather,33554432,4294967296,ring,simple,4,8,16,-1,-1
#AG 1PPN
allgather,262144,2097152,ring,simple,4,2,2,-1,-1
allgather,262144,8388608,ring,simple,4,4,4,-1,-1
allgather,67108864,4294967296,ring,simple,4,8,8,-1,-1
#RS 4PPN
reducescatter,1048576,4294967296,ring,simple,16,2,8,-1,-1
reducescatter,1048576,4294967296,ring,simple,16,4,16,-1,-1
reducescatter,1048576,4294967296,ring,simple,16,8,32,-1,-1
#RS 2PPN
reducescatter,262144,33554432,ring,simple,4,2,4,-1,-1
reducescatter,262144,4294967296,ring,simple,4,4,8,-1,-1
reducescatter,262144,4294967296,ring,simple,4,8,16,-1,-1
#RS 1PPN
reducescatter,131072,262144,ring,simple,4,2,2,-1,-1
reducescatter,1048576,2097152,ring,simple,4,2,2,-1,-1
reducescatter,131072,4194304,ring,simple,4,4,4,-1,-1
reducescatter,262144,8388608,ring,simple,4,8,8,-1,-1
@@ -98,12 +98,12 @@ int test_plugin_init() {
void* context = NULL;
// Test successful initialization
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
TEST_ASSERT(context != NULL, "Context should be allocated");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
TEST_PASS();
}
@@ -123,11 +123,11 @@ int test_config_parsing_valid() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);
void* context = NULL;
ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_valid.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -144,12 +144,12 @@ int test_config_parsing_invalid() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);
void* context = NULL;
ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
// Should still succeed but with no valid configs loaded
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_invalid.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -165,7 +165,7 @@ int test_collective_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
// Create mock cost table
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -209,7 +209,7 @@ int test_collective_matching() {
TEST_ASSERT(nChannels == 4, "Should set 4 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_match.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -226,7 +226,7 @@ int test_size_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -280,7 +280,7 @@ int test_size_matching() {
TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_size.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -298,7 +298,7 @@ int test_topology_matching() {
// Test with single node setup
void* context1 = NULL;
pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node
pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL); // 8 ranks, 1 node
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -316,11 +316,11 @@ int test_topology_matching() {
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
pluginDestroy(context1);
pluginFinalize(context1);
// Test with 4 nodes, 32 ranks setup
void* context2 = NULL;
pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes
pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL); // 32 ranks, 4 nodes
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
@@ -349,7 +349,7 @@ int test_default_channels() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -369,7 +369,7 @@ int test_default_channels() {
TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_default.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -386,7 +386,7 @@ int test_regbuff_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -437,7 +437,7 @@ int test_regbuff_matching() {
TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_regbuff.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -454,7 +454,7 @@ int test_pipeops_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -504,7 +504,7 @@ int test_pipeops_matching() {
TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_pipeops.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -519,7 +519,7 @@ int test_no_match_fallback() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -543,7 +543,7 @@ int test_no_match_fallback() {
TEST_ASSERT(nChannels == 1, "Should use default channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_fallback.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -593,7 +593,7 @@ int test_large_config() {
// Initialize plugin with large config
void* context = NULL;
ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
TEST_ASSERT(context != NULL, "Context should be allocated");
@@ -652,7 +652,7 @@ int test_large_config() {
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink(large_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
@@ -684,7 +684,7 @@ int test_very_large_config_stress() {
// Test initialization with stress config
void* context = NULL;
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
TunerContext* ctx = (TunerContext*)context;
@@ -705,7 +705,7 @@ int test_very_large_config_stress() {
}
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink(stress_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
@@ -726,7 +726,7 @@ int test_empty_config() {
setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
void* context = NULL;
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
TunerContext* ctx = (TunerContext*)context;
@@ -751,13 +751,134 @@ int test_empty_config() {
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink(empty_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test NVLink domain info handling
int test_nvl_domain_info() {
printf("Testing NVLink domain info handling...\n");
// Test NVLink domain structure with min/max ranks per domain
ncclNvlDomainInfo_v5_t nvl_domain = {
.nNvlDomains = 2, // 2 nodes = 2 domains
.minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck)
.maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity)
};
void* context = NULL;
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed");
// Validate NVLD info structure
TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)");
TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain");
TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain");
// Clean up
pluginFinalize(context);
printf("NVLink domain info test passed!\n");
TEST_PASS();
}
int test_tuner_constants() {
// Initialize constants to -1.0 for testing purposes
ncclTunerConstants_v5_t constants = {
// Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
.baseLatencies = {
{-1.0, -1.0, -1.0}, // NCCL_ALGO_TREE: LL, LL128, Simple
{-1.0, -1.0, -1.0}, // NCCL_ALGO_RING: LL, LL128, Simple
{-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS
{-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS_TREE
{-1.0, -1.0, -1.0} // NCCL_ALGO_PAT
},
// Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
.hwLatencies = {
// NCCL_HW_NVLINK
{
{-1.0, -1.0, -1.0}, // TREE
{-1.0, -1.0, -1.0}, // RING
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NVLS
{-1.0, -1.0, -1.0}, // NVLS_TREE
{-1.0, -1.0, -1.0} // PAT
},
// NCCL_HW_PCI
{
{-1.0, -1.0, -1.0}, // TREE
{-1.0, -1.0, -1.0}, // RING
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NVLS
{-1.0, -1.0, -1.0}, // NVLS_TREE
{-1.0, -1.0, -1.0} // PAT
},
// NCCL_HW_NET
{
{-1.0, -1.0, -1.0}, // TREE
{-1.0, -1.0, -1.0}, // RING
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NVLS
{-1.0, -1.0, -1.0}, // NVLS_TREE
{-1.0, -1.0, -1.0} // PAT
}
},
// LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.llMaxBws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
},
// Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.perChMaxRingLL128Bws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
},
// Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.perChMaxTreeLL128Bws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
},
// Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.perChMaxTreeBws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
}
};
void* context = NULL;
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants);
TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed");
// Test that the constants were set correctly
TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should be 15GB/s");
TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s");
TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us");
// Clean up
pluginFinalize(context);
TEST_PASS();
}
// Test runner function pointer type
typedef int (*TestFunction)(void);
@@ -783,6 +904,8 @@ TestCase test_cases[] = {
{"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
{"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
{"empty-config", test_empty_config, "Empty configuration file handling"},
{"nvl-domain", test_nvl_domain_info, "NVL domain info handling"},
{"constants", test_tuner_constants, "Tuner constants initialization"},
{NULL, NULL, NULL} // End marker
};
@@ -826,6 +949,7 @@ int main(int argc, char* argv[]) {
if (argc == 1) {
// No arguments - run all tests
for (int i = 0; test_cases[i].name != NULL; i++) {
printf("Running test: %s\n", test_cases[i].name);
total++;
passed += test_cases[i].func();
}
+21 -3
Melihat File
@@ -26,7 +26,7 @@ install_dependencies=false
install_library=false
install_prefix="${ROCM_PATH}"
log_trace=false
msccl_kernel_enabled=true
msccl_kernel_enabled=false
mscclpp_enabled=false
enable_mscclpp_clip=false
num_parallel_jobs=$(nproc)
@@ -39,7 +39,9 @@ run_tests_all=false
time_trace=false
force_reduce_pipeline=false
generate_sym_kernels=false
warp_speed_enabled=true # note that this flag will be overridden to false for non MI350/MI300 platforms
quiet_warnings=false
build_rocshmem_support=false
# #################################################
# helper functions
@@ -54,7 +56,7 @@ function display_help()
echo " --debug Build debug library"
echo " --enable_backtrace Build with custom backtrace support"
echo " --disable-colltrace Build without collective trace"
echo " --disable-msccl-kernel Build without MSCCL kernels"
echo " --enable-msccl-kernel Build with MSCCL kernels"
echo " --dump-asm Disassemble code and dump assembly with inline code"
echo " --enable-mscclpp Build with MSCCL++ support"
echo " --enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines"
@@ -81,6 +83,7 @@ function display_help()
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
echo " --generate-sym-kernels Generate symmetric memory kernels"
echo " -q|--quiet-warnings Suppress majority of compiler warnings (not recommended)"
echo " --rocshmem Build with rocSHMEM support"
}
# #################################################
@@ -90,7 +93,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,disable-warp-speed,verbose,rocshmem -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -137,7 +140,9 @@ while true; do
--verbose) build_verbose=true; shift ;;
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
--generate-sym-kernels) generate_sym_kernels=true; shift ;;
--disable-warp-speed) warp_speed_enabled=false; shift ;;
-q | --quiet-warnings) quiet_warnings=true; shift ;;
--rocshmem) build_rocshmem_support=true; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -316,12 +321,25 @@ if [[ "${npkit_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
fi
# Enable WARP_SPEED only on MI350/MI300 platforms
if [[ "${warp_speed_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_WARP_SPEED=ON"
fi
# Suppress Warnings
if [[ "${quiet_warnings}" == true ]]; then
cmake_common_options="${cmake_common_options} -DQUIET_WARNINGS=ON"
fi
# Enable rocSHMEM support
if [[ "${build_rocshmem_support}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=ON"
cmake_common_options="${cmake_common_options} -DROCSHMEM_INSTALL_DIR=${ROCSHMEM_INSTALL_DIR}"
else
cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=OFF"
fi
check_exit_code "$?"
# Enable ninja build for time tracing
+1 -6
Melihat File
@@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
# SM35 is deprecated from CUDA12.0 onwards
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
endif
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
+2 -2
Melihat File
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 27
NCCL_PATCH := 7
NCCL_MINOR := 28
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Melihat File
@@ -10,7 +10,7 @@ build : debian.build txz.build
BUILDDIR ?= $(abspath ../build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := debian txz
TARGETS := debian txz doc
all: ${TARGETS:%=%.build}
prep: ${TARGETS:%=%.prep}
build: ${TARGETS:%=%.build}
@@ -1,4 +1,4 @@
bin/ncclras /usr/bin
include/nccl.h /usr/include
include/* /usr/include
lib/libnccl.so /usr/lib/${pkg:MultiArch}
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
+2 -2
Melihat File
@@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li
# devel
install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/
install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
@@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_bindir}/ncclras
%{_includedir}/nccl.h
%{_includedir}/*
%{_libdir}/libnccl.so
%files static
+1 -1
Melihat File
@@ -22,7 +22,7 @@ prep: $(TXZTARGETS)
build: prep
$(MAKE) -C ../../src clean
@printf "Building source tar.xz package\n"
(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
(cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
@@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix}
NCCL_BUILD=${pkg:Revision}
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
if [ "${SRCTXZ_APITESTS}" = "1" ]; then
NCCLNAME+="-apitest"
fi
tar --exclude build \
# Entries directly under test/ that are kept when the API tests are packaged.
INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk")
# Start empty so a value inherited from the environment cannot leak into the
# tar command line built below.
EXCLUDE_TEST=""
if [ "${SRCTXZ_APITESTS}" = "1" ]; then
    # Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES.
    # A glob is used instead of parsing `ls` output; like `ls`, it skips dotfiles.
    for test_path in "$NCCLDIR"/test/*; do
        # An unmatched glob stays literal; skip it (but keep dangling symlinks,
        # which `ls` would also have listed).
        [ -e "$test_path" ] || [ -L "$test_path" ] || continue
        entry=${test_path##*/}
        if [[ ! " ${INCLUDE_TEST_ENTRIES[*]} " =~ " $entry " ]]; then
            EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry"
        fi
    done
else
    # Exclude the entire test directory
    EXCLUDE_TEST+=" --exclude test"
fi
tar --exclude fortran \
--exclude doc \
--exclude plc \
--exclude build \
--exclude ".git*" \
--exclude share \
--exclude ompi \
--exclude ext-net \
--exclude pkg/srctxz \
--exclude docker \
$EXCLUDE_TEST \
--transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
+180
Melihat File
@@ -0,0 +1,180 @@
# Source files
set(LIBSRCFILES
bootstrap.cc
channel.cc
ce_coll.cc
collectives.cc
debug.cc
enqueue.cc
group.cc
init.cc
init_nvtx.cc
proxy.cc
transport.cc
mnnvl.cc
allocator.cc
sym_kernels.cc
dev_runtime.cc
)
# Add compatibility shim if using static cudart
if(CUDARTLIB STREQUAL "cudart_static")
list(APPEND LIBSRCFILES enhcompat.cc)
endif()
# Configure pkg-config file
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in
${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc
@ONLY
)
# Add files from subdirectories
add_subdirectory(transport)
add_subdirectory(misc)
add_subdirectory(register)
add_subdirectory(graph)
add_subdirectory(plugin)
add_subdirectory(device)
add_subdirectory(nccl_device)
add_subdirectory(ras)
add_subdirectory(scheduler)
add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
# Add all source files
list(APPEND LIBSRCFILES
${TRANSPORT_SOURCES}
${MISC_SOURCES}
${REGISTER_SOURCES}
${GRAPH_SOURCES}
${PLUGIN_SOURCES}
${RAS_SOURCES}
${SYM_SOURCES}
${SCHEDULER_SOURCES}
)
###################### Create a shared NCCL library ############################
add_library(nccl SHARED)
target_sources(nccl PRIVATE ${LIBSRCFILES})
# Include directories
target_include_directories(nccl PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/device
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
${CUDAToolkit_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}/cccl
)
# Generate include/nccl.h in the build tree by substituting the ${nccl:...}
# version placeholders in nccl.h.in.
# DEPENDS on the template so the header is regenerated whenever nccl.h.in
# changes; without it the rule only runs while the output file is missing.
add_custom_command(
  OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h
  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include
  COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g"
  -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g"
  -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g"
  -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g"
  -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g"
  ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in
  BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h
  COMMENT "Generating nccl.h from nccl.h.in"
)

# Target that other targets depend on to make sure nccl.h exists before they compile
add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h)
add_dependencies(nccl nccl_header)
# Set version and output name
set_target_properties(nccl PROPERTIES
VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
SOVERSION ${NCCL_MAJOR}
OUTPUT_NAME "nccl"
PREFIX "lib"
)
# Set CUDA specific flags
set_target_properties(nccl PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
POSITION_INDEPENDENT_CODE ON
)
# Link libraries
target_link_libraries(nccl
PRIVATE
nccl_device
pthread
rt
dl
${CUDAToolkit_LIBRARIES}
${EXTRA_LIBS}
)
# Set output directories for nccl shared library
set_target_properties(nccl PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)
###################### Create a ras binary executable ############################
set(RAS_BINSRCFILES ras/client.cc)
add_executable(ncclras ${RAS_BINSRCFILES})
target_include_directories(ncclras PUBLIC
${CMAKE_BINARY_DIR}/include
${CUDAToolkit_INCLUDE_DIRS}
)
add_dependencies(ncclras nccl_header)
target_link_libraries(ncclras
PRIVATE
pthread
rt
dl
)
# Set output directory for ncclras executable
set_target_properties(ncclras PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
)
###################### Create a static NCCL library ############################
add_library(nccl_static STATIC ${LIBSRCFILES})
# Include directories
target_include_directories(nccl_static PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/device
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
${CUDAToolkit_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}/cccl
)
# Add dependency on nccl_header
add_dependencies(nccl_static nccl_header)
# Link libraries
target_link_libraries(nccl_static
PRIVATE
nccl_device
pthread
rt
dl
${CUDAToolkit_LIBRARIES}
${EXTRA_LIBS}
)
# Set CUDA specific flags
set_target_properties(nccl_static PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
POSITION_INDEPENDENT_CODE ON
)
# Set output directory for nccl_static library
set_target_properties(nccl_static PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)
+17 -3
Melihat File
@@ -7,10 +7,12 @@ include ../makefiles/common.mk
include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h
INCEXPORTS := nccl.h nccl_device.h \
$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h))
LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \
$(wildcard graph/*.cc) \
$(wildcard misc/*.cc) \
$(wildcard transport/*.cc) \
@@ -19,6 +21,8 @@ LIBSRCFILES := \
$(wildcard plugin/net/*.cc) \
$(wildcard plugin/tuner/*.cc) \
$(wildcard plugin/profiler/*.cc) \
$(wildcard nccl_device/*.cc) \
$(wildcard scheduler/*.cc) \
$(filter-out ras/client.cc,$(wildcard ras/*.cc))
BINSRCFILES := ras/client.cc
@@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
mkdir -p $(INCDIR)
install -m 644 $< $@
$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)/nccl_device
install -m 644 $< $@
$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)/nccl_device/impl
install -m 644 $< $@
$(PKGDIR)/%.pc : %.pc
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(PKGDIR)
@@ -149,7 +163,7 @@ install : build
mkdir -p $(PREFIX)/bin
cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/
cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
+332 -62
Melihat File
@@ -7,10 +7,11 @@
#include "comm.h"
#include "transport.h"
#include "group.h"
#include "nvtx.h"
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
ncclResult_t ret = ncclSuccess;
#if ROCM_VERSION >= 70000
@@ -99,7 +100,7 @@ fail:
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
ncclResult_t ncclMemFree_impl(void *ptr) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
ncclResult_t ret = ncclSuccess;
int saveDevice;
@@ -129,70 +130,339 @@ fail:
goto exit;
}
// This is a collective function and should be called by all ranks in the communicator
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
ncclResult_t ret = ncclSuccess;
void* regSymAddr = NULL;
size_t allocSize = size;
size_t granularity;
CUdevice cuDev;
CUmemAllocationProp memprop = {};
CUmemGenericAllocationHandle memHandle;
int bit = 0, cnt = 0;
////////////////////////////////////////////////////////////////////////////////
// ncclSpace:
//
// This datastructure "cuts" the line of non-negative integers into segments
// which alternate between "full" (allocated) and "empty" (not allocated). The
// cuts are sorted ascending. The segment after the last cut must be empty
// (the unallocated frontier). Knowing this we can deduce whether the segment
// ending at cut[i] is full or empty with this formula:
// isFull(i) = (i%2 != ncuts%2)
// aligment must be power of 2 as an input
while (bit < sizeof(size_t) * 8) {
if (alignment & (1L << bit)) cnt++;
if (cnt == 2) {
WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment);
goto fail;
// Put `a` into the empty state by zeroing the whole struct, which leaves the
// cuts pointer null and the count/capacity fields at zero.
void ncclSpaceConstruct(struct ncclSpace* a) {
  memset(a, 0, sizeof(struct ncclSpace));
}
// Release the heap array of cuts owned by `a`. The struct itself is not
// modified, so it must be re-constructed before being used again.
void ncclSpaceDestruct(struct ncclSpace* a) {
  int64_t* ownedCuts = a->cuts;
  free(ownedCuts);
}
static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) {
// Insert space for two cuts in `a->cuts[]` before `index`.
if (a->count + 2 > a->capacity) {
a->capacity *= 2;
if (a->capacity == 0) a->capacity = 16;
int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t));
for (int i=0; i < index; i++) cuts1[i] = a->cuts[i];
for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i];
free(a->cuts);
a->cuts = cuts1;
} else {
for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i];
}
a->cuts[index+0] = lo;
a->cuts[index+1] = hi;
a->count += 2;
// Filter pairs of adjacent repeated values from cuts[]. Since these mark
// boundaries where segments transition between full<->empty, dropping such a
// pair fuses two adjacent segments together. Examples:
// [1,2,3,3,4] -> [1,2,4]
// [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because its a full<->empty transition
// [1,2,3,3,3,3,4] -> [1,2,4]
// Leading zeros don't have to be in pairs, they are always dropped:
// [0,1,2] -> [1,2]
// [0,0,1,2] -> [1,2]
int r = index, w = index; // Read and write cursors.
int64_t prev = r==0 ? 0 : a->cuts[r-1];
while (r < a->count) {
int64_t cur = a->cuts[r++];
a->cuts[w++] = cur;
if (prev == cur) { // Repeated value is an empty segment which can be deleted.
// Erase last two cuts or just one if we're at the start.
w -= w==1 ? 1 : 2;
// Zeros can only occur at the beginning (due to being sorted). We want to
// drop any number of zeros, but only even numbers of other repeated values.
// So set to zero here, which will make prev=0, thus if next value is zero
// it will be dropped but if its not zero then it will need to begin a new
// pair to be dropped.
cur = 0;
}
bit++;
prev = cur;
}
// temporarily align the alignment to NCCL_REC_PAGE_SIZE
ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
memprop.requestedHandleType = ncclCuMemHandleType;
memprop.location.id = cuDev;
CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
ALIGN_SIZE(allocSize, granularity);
CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
ALIGN_SIZE(comm->symAllocHead, alignment);
NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, &regSymAddr), ret, fail);
NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
comm->symAllocHead += allocSize;
*symPtr = regSymAddr;
exit:
return ret;
fail:
*symPtr = NULL;
goto exit;
a->count = w;
}
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
CUmemGenericAllocationHandle handle;
size_t size = 0;
ncclResult_t ret = ncclSuccess;
int saveDev = comm->cudaDev;
CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail);
if (ncclCuMemEnable()) {
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail);
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail);
NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail);
NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail);
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
ncclResult_t ncclSpaceAlloc(
struct ncclSpace* a, int64_t limit, int64_t size, int align,
int64_t* outOffset
) {
// When allocating we try to locate the first empty segment which can hold
// the allocation and move its lower cut upward.
int i = a->count%2; // First empty segment ends at cuts[i]
size_t off;
while (i <= a->count) {
size_t lo = i == 0 ? 0 : a->cuts[i-1];
size_t hi = i == a->count ? limit : a->cuts[i];
off = alignUp(lo, align);
if (off + size <= hi) {
*outOffset = off;
if (i == 0 || off + size == hi) { // Slow path required.
insertSegment(a, i, off, off+size);
} else { // We can just append to the end of a full segment.
a->cuts[i-1] = off + size;
}
return ncclSuccess;
}
i += 2; // Next empty segment
}
exit:
CUDACHECK(cudaSetDevice(saveDev));
return ret;
fail:
goto exit;
WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit);
return ncclInternalError;
}
// Mark the range [offset, offset+size) as empty again.
//
// Returns ncclInternalError (after a WARN) when `offset` is at or above the
// last cut, or when the range is not fully contained in one full segment.
// Otherwise the cuts are updated and ncclSuccess is returned.
ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) {
  const int ncuts = a->count;
  // Anything at or past the last cut belongs to the unallocated frontier.
  if (ncuts == 0 || a->cuts[ncuts-1] <= offset) {
    WARN("No allocation found at offset=0x%lx", (long)offset);
    return ncclInternalError;
  }

  // Linear scan over the cuts that end full segments (every other cut), looking
  // for the first one that lies above `offset`. The check above guarantees the
  // last cut stops the scan.
  int seg = (ncuts%2 == 0) ? 1 : 0; // First full segment ends at cuts[seg]
  for (; a->cuts[seg] <= offset; seg += 2) {}

  const int64_t segLo = (seg == 0) ? 0 : a->cuts[seg-1];
  const int64_t segHi = a->cuts[seg];
  const int64_t end = offset + size;
  if (offset < segLo || segHi < end) {
    WARN("Given size=0x%lx extends beyond allocation.", (long)size);
    return ncclInternalError;
  }

  const bool touchesBottom = (segLo == offset);
  const bool touchesTop = (end == segHi);
  if (seg != 0 && touchesBottom && !touchesTop) {
    // Fast case: shrink the segment from below by raising its lower cut.
    a->cuts[seg-1] = end;
  } else if (!touchesBottom && touchesTop) {
    // Fast case: shrink the segment from above by lowering its upper cut.
    a->cuts[seg] = offset;
  } else {
    // General case (hole in the middle, whole segment, or segment starting at
    // the implicit zero cut): insert two cuts and let insertSegment clean up.
    insertSegment(a, seg, offset, end);
  }
  return ncclSuccess;
}
////////////////////////////////////////////////////////////////////////////////
// ncclShadowPool:
// A single device allocation that is carved into equally sized slots, each of
// which backs one small shadow object.
struct ncclShadowPage { // A contiguous block of (at most) 64 objects
struct ncclShadowPage* next; // Next page in the pool's list of pages that still have free slots
int objSize; // Size in bytes of every slot in this page
uint64_t freeMask; // Bit i is set while slot i is not handed out
void* devObjs; // Base device address of the page; slot i starts at devObjs + i*objSize
};
// Host-side bookkeeping record for one device object; records are kept in the
// pool's hash table, keyed by the device pointer.
struct ncclShadowObject {
struct ncclShadowObject* next; // Next record in the same hash bucket
void* devObj; // Device address of the object (hash key)
void* hostObj; // Host shadow buffer, located inside the same malloc block as this record
struct ncclShadowPage* page; // null if not allocated in page but directly in CUDA mempool.
};
// Put `pool` into its empty state: no page list, no hash table, zero objects
// and hbits == 0. The memPool member is not touched here.
void ncclShadowPoolConstruct(struct ncclShadowPool* pool) {
  pool->pages = nullptr;
  pool->table = nullptr;
  pool->count = 0;
  pool->hbits = 0;
}
// Tear down a shadow pool: free every host record, release all device memory
// (page-backed and directly allocated) on a temporary stream, destroy the CUDA
// memory pool, and return `pool` to the same empty state that
// ncclShadowPoolConstruct() produces.
//
// A pool that never allocated anything (hbits == 0) owns no resources and is
// left untouched. Returns ncclSuccess, or the CUDACHECK error if the temporary
// stream cannot be created (in which case nothing has been released).
ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) {
  if (pool->hbits == 0) return ncclSuccess;

  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  if (pool->count != 0) {
    for (int i=0; i < 1<<pool->hbits; i++) {
      struct ncclShadowObject* obj = pool->table[i];
      while (obj != nullptr) {
        struct ncclShadowPage* page = obj->page;
        if (page != nullptr) {
          if (page->freeMask == 0) { // Put full pages back into page list.
            // Full pages are unlinked from pool->pages, so re-link them here to
            // have them released below. Setting a bit in freeMask ensures the
            // page is re-linked only once even if several objects live in it.
            page->freeMask = 1;
            page->next = pool->pages;
            pool->pages = page;
          }
        } else {
          // Best-effort release of a directly allocated object.
          cudaFreeAsync(obj->devObj, stream);
        }
        struct ncclShadowObject* next = obj->next;
        free(obj);
        obj = next;
      }
    }
  }
  free(pool->table);

  while (pool->pages != nullptr) {
    cudaFreeAsync(pool->pages->devObjs, stream);
    struct ncclShadowPage* next = pool->pages->next;
    free(pool->pages);
    pool->pages = next;
  }

  // Wait for the queued frees before destroying the pool they came from.
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaMemPoolDestroy(pool->memPool);

  // Reset to the constructed state so that a repeated destruct is a no-op and
  // later calls do not touch the freed hash table.
  pool->table = nullptr;
  pool->pages = nullptr;
  pool->count = 0;
  pool->hbits = 0;
  return ncclSuccess;
}
// Map a device pointer to a bucket index in a table of 2^hbits buckets.
// The upper half of the address is folded into the lower half, the result is
// multiplied by a 64-bit odd constant, and the top `hbits` bits are returned.
// `hbits` must be in [1, 64]; a value of 0 would shift by the full width.
static int hashBucket(int hbits, void* devObj) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(devObj);
  const uintptr_t folded = addr ^ (addr >> 32);
  const uintptr_t mixed = folded * 0x9e3779b97f4a7c13;
  return static_cast<uint64_t>(mixed) >> (64 - hbits);
}
// Push `obj` onto the front of the hash-bucket chain selected by its device
// pointer. Uses the pool's current hbits and table; does not change pool->count.
static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) {
  struct ncclShadowObject** head = &pool->table[hashBucket(pool->hbits, obj->devObj)];
  obj->next = *head;
  *head = obj;
}
// Allocate a zero-filled device object of `size` bytes together with a
// zero-filled host shadow buffer of the same size.
//
//   pool        shadow pool; lazily initialized (CUDA mem pool + hash table) on
//               the first non-empty allocation
//   size        object size in bytes; 0 yields null pointers and ncclSuccess
//   outDevObj   optional, receives the device pointer
//   outHostObj  optional, receives the host shadow pointer
//   stream      stream on which device allocation and zeroing are enqueued
//
// Small objects are placed in slots of shared pages; larger ones get their own
// allocation from the CUDA memory pool. Returns ncclSuccess or the CUDACHECK
// error of the failing CUDA call. On a failed page or object allocation no
// partially initialized page is left linked into the pool.
ncclResult_t ncclShadowPoolAlloc(
    struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj,
    cudaStream_t stream
  ) {
  if (size == 0) {
    if (outDevObj) *outDevObj = nullptr;
    if (outHostObj) *outHostObj = nullptr;
    return ncclSuccess;
  }

  int hbits = pool->hbits;
  if (hbits == 0) {
    // First use: create the device memory pool on the current device and a
    // 16-bucket hash table.
    cudaMemPoolProps props = {};
    props.allocType = cudaMemAllocationTypePinned;
    props.handleTypes = cudaMemHandleTypeNone;
    props.location.type = cudaMemLocationTypeDevice;
    // Checked: an unset location.id would silently target device 0.
    CUDACHECK(cudaGetDevice(&props.location.id));
    CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props));

    pool->hbits = hbits = 4;
    pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<hbits);
    for (int i=0; i < 1<<hbits; i++) pool->table[i] = nullptr;
  }

  // Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio.
  if (pool->count+1 > 2<<hbits) {
    struct ncclShadowObject** table0 = pool->table;
    struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1));
    pool->table = table1;
    pool->hbits = hbits+1;
    for (int i1=0; i1 < 2<<hbits; i1++) table1[i1] = nullptr;
    // Re-hash every record into the doubled table.
    for (int i0=0; i0 < 1<<hbits; i0++) {
      struct ncclShadowObject* obj = table0[i0];
      while (obj) {
        struct ncclShadowObject* next = obj->next;
        hashInsert(pool, obj);
        obj = next;
      }
    }
    hbits += 1; // match pool->hbits
    free(table0);
  }

  struct ncclShadowPage* page;
  void *devObj;
  if ((64<<10)/size >= 3) {
    // Page path: round the size up so that slot sizes fall into a limited set
    // of classes, then look for a page of that class with a free slot.
    int shift = std::max<int>(0, (int)log2Down(size) + 1 - 4);
    int pageObjSize = ((size + (1<<shift)-1)>>shift)<<shift;
    struct ncclShadowPage** pagePtr = &pool->pages;
    while (true) {
      page = *pagePtr;
      if (page == nullptr) {
        // No suitable page: create one. The device memory is obtained and
        // zeroed BEFORE the page is linked into the pool, so a CUDA failure
        // cannot leave a page with an invalid devObjs pointer in the list.
        size_t pageSize = std::min<size_t>(64<<10, 64*pageObjSize);
        void* pageDevObjs = nullptr;
        CUDACHECK(cudaMallocFromPoolAsync(&pageDevObjs, pageSize, pool->memPool, stream));
        cudaError_t zeroRes = cudaMemsetAsync(pageDevObjs, 0, pageSize, stream);
        if (zeroRes != cudaSuccess) {
          cudaFreeAsync(pageDevObjs, stream); // best effort, do not leak the page memory
          CUDACHECK(zeroRes);
        }
        page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage));
        page->objSize = pageObjSize;
        page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize);
        page->devObjs = pageDevObjs;
        page->next = pool->pages;
        pool->pages = page;
        // fall through...
      }
      if (page->objSize == pageObjSize) {
        int slot = popFirstOneBit(&page->freeMask);
        devObj = (char*)page->devObjs + slot*pageObjSize;
        if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list.
        break;
      }
      pagePtr = &page->next;
    }
  } else {
    // Large object: allocate it directly from the CUDA memory pool.
    page = nullptr;
    CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream));
    cudaError_t zeroRes = cudaMemsetAsync(devObj, 0, size, stream);
    if (zeroRes != cudaSuccess) {
      cudaFreeAsync(devObj, stream); // best effort, do not leak the object
      CUDACHECK(zeroRes);
    }
  }

  // The host record and its shadow buffer share one malloc block; the buffer
  // starts at the first max_align_t-aligned address after the record.
  struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc(
    sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size
  );
  obj->page = page;
  obj->devObj = devObj;
  obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t));
  memset(obj->hostObj, 0, size);
  hashInsert(pool, obj);
  pool->count += 1;
  if (outDevObj) *outDevObj = devObj;
  if (outHostObj) *outHostObj = obj->hostObj;
  return ncclSuccess;
}
// Release the device object `devObj` and its host shadow record.
//
//   pool    shadow pool the object was allocated from
//   devObj  device pointer returned by ncclShadowPoolAlloc; nullptr is a no-op
//   stream  stream on which a directly allocated object is freed
//
// Returns ncclInternalError (after a WARN) if the pointer is not known to the
// pool, including the case of a pool that has never allocated anything.
// Page-backed objects only give their slot back to the page; directly
// allocated objects are freed with cudaFreeAsync, whose failure is reported
// after the host-side bookkeeping has been completed.
ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) {
  if (devObj == nullptr) return ncclSuccess;

  // An uninitialized pool has no hash table (hbits == 0, table == nullptr);
  // hashing with hbits == 0 would also shift by the full word width.
  if (pool->hbits == 0) {
    WARN("Device object does not exist in shadow pool.");
    return ncclInternalError;
  }

  int b = hashBucket(pool->hbits, devObj);
  struct ncclShadowObject** pobj = &pool->table[b];
  while (true) {
    if (*pobj == nullptr) {
      WARN("Device object does not exist in shadow pool.");
      return ncclInternalError;
    }
    if ((*pobj)->devObj == devObj) break;
    pobj = &(*pobj)->next;
  }
  // Unlink the record from its bucket chain.
  struct ncclShadowObject* obj = *pobj;
  *pobj = obj->next;

  cudaError_t freeRes = cudaSuccess;
  if (obj->page != nullptr) {
    // A full page is not in the pool's page list; put it back now that it is
    // about to have a free slot again.
    if (obj->page->freeMask == 0) {
      obj->page->next = pool->pages;
      pool->pages = obj->page;
    }
    int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize;
    obj->page->freeMask |= uint64_t(1)<<slot;
  } else {
    freeRes = cudaFreeAsync(devObj, stream);
  }

  // The record is already unlinked, so always release it and keep the count in
  // sync, even when the device free failed.
  free(obj);
  pool->count -= 1;
  CUDACHECK(freeRes);
  return ncclSuccess;
}
// Look up the host shadow buffer that belongs to the device object `devObj`.
//
//   pool     shadow pool the object was allocated from
//   devObj   device pointer returned by ncclShadowPoolAlloc; nullptr maps to nullptr
//   hostObj  receives the host shadow pointer on success
//
// Returns ncclInternalError (after a WARN) if the pointer is not known to the
// pool, including the case of a pool that has never allocated anything;
// *hostObj is left unmodified in that case.
ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) {
  if (devObj == nullptr) {
    *hostObj = nullptr;
    return ncclSuccess;
  }

  // An uninitialized pool has no hash table (hbits == 0, table == nullptr);
  // hashing with hbits == 0 would also shift by the full word width.
  if (pool->hbits == 0) {
    WARN("Device object does not exist in shadow pool.");
    return ncclInternalError;
  }

  int b = hashBucket(pool->hbits, devObj);
  struct ncclShadowObject* obj = pool->table[b];
  while (true) {
    if (obj == nullptr) {
      WARN("Device object does not exist in shadow pool.");
      return ncclInternalError;
    }
    if (obj->devObj == devObj) break;
    obj = obj->next;
  }
  *hostObj = obj->hostObj;
  return ncclSuccess;
}
+10 -15
Melihat File
@@ -15,6 +15,7 @@
#include "signals.h" // [RCCL]
#include "param.h"
#include "ras.h"
#include <mutex>
#define BOOTSTRAP_N_CHECK_ABORT 10000
#define BOOTSTRAP_TAG_CONNECT (0x1 << 31)
@@ -86,13 +87,13 @@ struct bootstrapRootArgs {
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
static union ncclSocketAddress bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
static std::mutex bootstrapNetMutex;
NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0);
ncclResult_t bootstrapNetInit() {
if (bootstrapNetInitDone == 0) {
pthread_mutex_lock(&bootstrapNetLock);
std::lock_guard<std::mutex> lock(bootstrapNetMutex);
if (bootstrapNetInitDone == 0) {
const char* env = ncclGetEnv("NCCL_COMM_ID");
int nIfs = 0;
@@ -100,21 +101,18 @@ ncclResult_t bootstrapNetInit() {
union ncclSocketAddress remoteAddr;
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidArgument;
}
NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
&nIfs));
if (nIfs <= 0) {
WARN("NET/Socket : No usable listening interface found");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclSystemError;
}
} else {
NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs));
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidUsage;
}
}
@@ -124,7 +122,6 @@ ncclResult_t bootstrapNetInit() {
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
bootstrapNetInitDone = 1;
}
pthread_mutex_unlock(&bootstrapNetLock);
}
return ncclSuccess;
}
@@ -486,7 +483,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
static int devOOB = -1;
if (devOOB < 0) {
pthread_mutex_lock(&bootstrapNetLock);
std::lock_guard<std::mutex> lock(bootstrapNetMutex);
if (devOOB < 0) {
const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
if (userIfEnv && strlen(userIfEnv) > 0) {
@@ -517,7 +514,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
else
WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidArgument;
}
} else {
@@ -530,13 +526,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
bool hasProp = res == ncclSuccess;
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
}
pthread_mutex_unlock(&bootstrapNetLock);
}
*dev = devOOB;
return ncclSuccess;
}
static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
@@ -544,7 +539,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
do {
NCCLCHECK(checkAbort(abortFlag, &abortCounter));
if (!*sendComm)
NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle));
if (!*recvComm)
NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
} while (!*sendComm || !*recvComm);
@@ -660,7 +655,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
if (ncclParamBootstrapNetEnable()) {
// Create net interface for other ranks to contact me (all gather)
NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
} else {
// create socket for ring neightbor to contact mee
@@ -714,7 +709,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
// accept and connect the ring network
if (ncclParamBootstrapNetEnable()) {
NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
&STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
&STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
} else {
@@ -807,7 +802,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
// create a handle for the others to reach out to me
if (ncclParamBootstrapNetEnable()) {
NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
} else {
// create socket for ring neightbor to contact mee
@@ -826,7 +821,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
if (ncclParamBootstrapNetEnable()) {
NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
&STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
&STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
ret, fail);
+615
Melihat File
@@ -0,0 +1,615 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "register_inline.h"
#include <cuda.h>
#include "rocmwrap.h"
#include "ce_coll.h"
#include "alloc.h"
// Static constant for graph synchronization
static const uint32_t GRAPH_SYNC_VALUE = 1;
// Static constants for intra-batch synchronization to improve CE collective performance with large scale
// Frequency of intra-batch synchronization
static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8;
// Message threshold for intra-batch synchronization
static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = 512*1024*1024;
// Set up the copy-engine (CE) collective state of `comm`.
//
// Allocates one buffer holding two per-rank uint32_t flag arrays ("ready"
// followed by "complete", each padded to a multiple of 16 bytes), registers it
// as a symmetric window, and fills in comm->ceColl with the window, the array
// offsets/pointers and the synchronization defaults.
// Returns the first error reported by any of the steps.
ncclResult_t ncclCeInit(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;
  // Bytes occupied by one flag array (one uint32_t per rank, 16-byte padded).
  const size_t flagArrayBytes = alignUp(comm->nRanks*sizeof(uint32_t), 16);
  size_t ceDevBaseSize = flagArrayBytes * 2;
  uint8_t* ceDevBase;
  ncclWindow_vidmem* ceWinDev;
  ncclWindow_vidmem* ceWinDevHost;
  auto* ce = &comm->ceColl;

  // Ensure symmetric memory runtime is initialized
  NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail);

  // Allocate the flag buffer, register it as a symmetric window and fetch the
  // host shadow of the resulting device window object.
  NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail);
  NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail);
  NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail);

  // The host-side window descriptor is reachable through the winHost field.
  ce->ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost;

  // "Ready" flags sit at the start of the window, "complete" flags right after.
  ce->baseUCSymReadyOffset = 0;
  ce->baseUCSymComplOffset = flagArrayBytes;
  ce->baseUCSymReadyPtr = (uint8_t*)ce->ceSyncWin->userPtr + ce->baseUCSymReadyOffset;
  ce->baseUCSymComplPtr = (uint8_t*)ce->ceSyncWin->userPtr + ce->baseUCSymComplOffset;

  ce->ceSeqNum = 0;
  ce->useCompletePtr = false;
  ce->intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ;
  ce->intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD;
  INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, ce->baseUCSymReadyPtr, ce->baseUCSymComplPtr, ce->ceSeqNum);

exit:
  return ret;
fail:
  goto exit;
}
// Release the CE collective state of `comm`.
//
// Frees every task still queued in comm->ceInitTaskQueue. If the CE flag
// buffer was set up, the window is deregistered and the buffer freed (only
// when a window with a vidmem handle exists), and the CE pointers are cleared.
// On a deregistration or free error the pointers are left as they were and the
// error is returned.
ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;

  // Drain ceInitTaskQueue, freeing each dequeued task.
  while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) {
    free(ncclIntruQueueDequeue(&comm->ceInitTaskQueue));
  }

  // Nothing else to do when CE was never initialized.
  if (comm->ceColl.baseUCSymReadyPtr == NULL) goto exit;

  if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) {
    NCCLCHECKGOTO(ncclCommWindowDeregister(comm, comm->ceColl.ceSyncWin->vidmem), ret, fail);
    NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail);
  }
  comm->ceColl.baseUCSymReadyPtr = NULL;
  comm->ceColl.baseUCSymComplPtr = NULL;
  comm->ceColl.ceSyncWin = NULL;

exit:
  return ret;
fail:
  goto exit;
}
// Report whether the copy-engine path can handle collective `coll`.
//
// True only when the driver version can be queried, is at least 12050, and
// `coll` is one of AllGather, AlltoAll, Scatter or Gather. The `red` and `ty`
// arguments are not consulted.
bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
  int driverVersion;
  if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false;

  // CE is supported in CUDA 12.5 and later
  if (driverVersion < 12050) return false;

  return coll == ncclFuncAllGather
      || coll == ncclFuncAlltoAll
      || coll == ncclFuncScatter
      || coll == ncclFuncGather;
}
// Prepare one synchronization round using the multicast mapping of the CE
// sync window.
//
//   comm         communicator; its CE sequence number is incremented
//   isComplete   selects the "complete" flag array instead of the "ready" one
//   batchParams  array receiving the wait operations
//   opIdx        in/out: next free index in batchParams
//   stream       stream on which our own flag is written
//
// Our own flag is written through the address returned by
// ncclDevrGetLsaTeamPtrMC with a cudaMemcpyAsync; then one wait-for-equal
// operation on the local flag of every other rank is appended to batchParams.
// The value written/awaited is GRAPH_SYNC_VALUE while capturing a graph and
// the new sequence number otherwise.
ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  uint32_t* readyFlags = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* complFlags = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
  // Flag array used for this round.
  uint32_t* flags = isComplete ? complFlags : readyFlags;

  const bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  uint32_t seq = ++comm->ceColl.ceSeqNum;

  // Value to publish and to wait for, and the host location it is copied from.
  const uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : seq;
  const void* src = capturing ? (const void*)&GRAPH_SYNC_VALUE : (const void*)&seq;

  // Byte offset of our own flag inside the sync window.
  void* mcDst;
  size_t selfOffset = (uint8_t*)&flags[comm->rank] - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
  NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, selfOffset, ncclTeamLsa(comm), &mcDst), ret, fail);

  // Write our own ready/complete flag to the multi-cast address
  CUDACHECKGOTO(cudaMemcpyAsync(mcDst, src, sizeof(uint32_t), cudaMemcpyHostToDevice, stream), ret, fail);

  // Queue a local wait on the flag of every rank but ourselves.
  for (int peer = 0; peer < comm->nRanks; ++peer) {
    if (peer != comm->rank) {
      hipStreamBatchMemOpParams& op = batchParams[*opIdx];
      op = {};
      // NOTE(review): the operation field stays at its zero-initialized value;
      // the assignment below is disabled in this port - confirm this is intended.
      // op.waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
      op.waitValue.address = (CUdeviceptr)(void*)&flags[peer];
      op.waitValue.value = waitValue;
      op.waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
      ++(*opIdx);
    }
  }

exit:
  return ret;
fail:
  goto exit;
}
// Prepare one synchronization round without multicast.
//
//   comm         communicator; its CE sequence number is incremented
//   isComplete   selects the "complete" flag array instead of the "ready" one
//   batchParams  array receiving the write and wait operations
//   opIdx        in/out: next free index in batchParams
//
// For every other rank r, a write of our flag value to the address returned by
// ncclDevrGetLsaRankPtr(..., r, ...) for our own slot is appended, followed by
// one wait-for-equal operation on the local flag of every other rank.
// The value is GRAPH_SYNC_VALUE while capturing a graph and the new sequence
// number otherwise.
ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
                            hipStreamBatchMemOpParams* batchParams,
                            size_t* opIdx) {
  ncclResult_t ret = ncclSuccess;
  uint32_t* readyFlags = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* complFlags = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
  // Flag array used for this round.
  uint32_t* flags = isComplete ? complFlags : readyFlags;

  const bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  const uint32_t seq = ++comm->ceColl.ceSeqNum;
  const uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : seq;

  // Write our own ready/complete flag to remote ranks
  for (int peer = 0; peer < comm->nRanks; ++peer) {
    if (peer != comm->rank) {
      void* peerDst;
      // Byte offset of our own flag inside the sync window.
      size_t selfOffset = (uint8_t*)&flags[comm->rank] - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, selfOffset, peer, &peerDst), ret, fail);

      hipStreamBatchMemOpParams& op = batchParams[*opIdx];
      op = {};
      // NOTE(review): operation and flags stay zero-initialized; the
      // assignments below are disabled in this port - confirm this is intended.
      // op.writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
      op.writeValue.address = (CUdeviceptr)peerDst;
      op.writeValue.value = waitValue;
      // op.writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
      ++(*opIdx);
    }
  }

  // Queue a local wait on the flag of every rank but ourselves.
  for (int peer = 0; peer < comm->nRanks; ++peer) {
    if (peer != comm->rank) {
      hipStreamBatchMemOpParams& op = batchParams[*opIdx];
      op = {};
      // op.waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
      op.waitValue.address = (CUdeviceptr)(void*)&flags[peer];
      op.waitValue.value = waitValue;
      op.waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
      ++(*opIdx);
    }
  }

exit:
  return ret;
fail:
  goto exit;
}
// Enqueue one cross-rank synchronization round on `stream`.
//
// Builds a batch of stream memory operations with ncclPrepMCSync (when
// comm->nvlsSupport is set) or ncclPrepUCSync, appends a reset-to-zero write
// for every rank's flag while a graph is being captured, submits the batch
// with hipStreamBatchMemOp, and finally toggles between the "ready" and
// "complete" flag arrays for the next call. The toggle is skipped on error.
// The temporary batch array is always freed before returning.
ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Flag arrays used for synchronization.
  uint32_t* readyFlags = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* complFlags = (uint32_t*)comm->ceColl.baseUCSymComplPtr;

  // Upper bound on the number of operations queued below.
  size_t maxOps = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks;
  size_t numOps = 0;

  hipStreamBatchMemOpParams* ops = nullptr;
  NCCLCHECKGOTO(ncclCalloc(&ops, maxOps), ret, fail);

  // Fill in the publish/wait operations for this round.
  if (comm->nvlsSupport) {
    NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, ops, &numOps, stream), ret, fail);
  } else {
    NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, ops, &numOps), ret, fail);
  }

  // For CUDA graph capture, add reset operation
  if (ncclCudaGraphValid(comm->planner.capturingGraph)) {
    for (int rank = 0; rank < comm->nRanks; rank++) {
      uint32_t* flags = comm->ceColl.useCompletePtr ? complFlags : readyFlags;
      hipStreamBatchMemOpParams& op = ops[numOps];
      op = {};
      // NOTE(review): operation and flags stay zero-initialized; the
      // assignments below are disabled in this port - confirm this is intended.
      // op.writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
      op.writeValue.address = (CUdeviceptr)(void*)&flags[rank];
      op.writeValue.value = 0;
      // op.writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
      numOps++;
    }
  }

  // Execute all memory operations in a single batch
  CUCHECKGOTO(hipStreamBatchMemOp(stream, numOps, ops, 0), ret, fail);

  // Toggle the flag for next call
  comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr;

exit:
  if (ops) free(ops);
  return ret;
fail:
  goto exit;
}
// Initialize `params` for a batch of up to `nRanks` copy operations.
//
// All fields are first reset (null arrays, zero counters, no intra-batch
// sync); then the srcs/dsts/sizes arrays - and, when built against CUDA
// runtime 12.8 or newer, the attrs/attrIdxs arrays - are allocated with
// nRanks entries each via ncclCalloc.
// On an allocation error the arrays allocated so far stay attached to
// `params`; they are not released here.
ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) {
  ncclResult_t ret = ncclSuccess;

  // Start from a completely empty descriptor.
  params->numOps = 0;
  params->intraBatchSync = false;
  params->sizes = nullptr;
  params->dsts = nullptr;
  params->srcs = nullptr;
#if CUDART_VERSION >= 12080
  params->numAttrs = 0;
  params->attrIdxs = nullptr;
  params->attrs = nullptr;
#endif

  // One entry per rank in every array.
  NCCLCHECKGOTO(ncclCalloc(&params->srcs, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->dsts, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->sizes, nRanks), ret, fail);
#if CUDART_VERSION >= 12080
  NCCLCHECKGOTO(ncclCalloc(&params->attrs, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->attrIdxs, nRanks), ret, fail);
#endif

exit:
  return ret;
fail:
  goto exit;
}
// Free the arrays owned by `params`. Null members are fine (free() ignores
// them). The members are not reset, so `params` must be re-initialized before
// it is used or freed again.
void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) {
  free(params->srcs);
  free(params->dsts);
  free(params->sizes);
#if CUDART_VERSION >= 12080
  free(params->attrs);
  free(params->attrIdxs);
#endif
}
// Issues every queued operation as an individual device-to-device
// cudaMemcpyAsync on `stream`. When intra-batch synchronization is requested
// (and the configured frequency is positive), ncclMemOpSync is inserted after
// every `intraBatchSyncFreq` copies, except after the last copy.
static ncclResult_t ncclCeLaunchIndividualCopies(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  const int syncFreq = comm->ceColl.intraBatchSyncFreq;
  // A non-positive frequency would make the modulo below invalid; treat it as "no intra-batch sync".
  const bool syncBetween = params->intraBatchSync && syncFreq > 0;

  for (int i = 0; i < params->numOps; i++) {
    CUDACHECKGOTO(cudaMemcpyAsync(
      (void*)params->dsts[i],
      (void*)params->srcs[i],
      params->sizes[i],
      cudaMemcpyDeviceToDevice,
      stream), ret, fail);
    if (syncBetween && ((i+1) % syncFreq == 0) && ((i+1) < params->numOps)) {
      NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
    }
  }

exit:
  return ret;
fail:
  goto exit;
}

// Launches all operations queued in `params` on `stream`.
//
// Path selection:
//  * During graph capture the batched memcpy API is not used (it is not
//    supported while capturing), so individual copies are issued.
//  * If the code was built against CUDA runtime 12.8+ AND the driver reports
//    12.8+, cudaMemcpyBatchAsync is used, optionally split into sub-batches
//    with ncclMemOpSync between them.
//  * Otherwise individual copies are issued. This includes the case where the
//    driver is new enough but the batched API was not compiled in; previously
//    that combination silently issued no copies at all.
//
// Returns ncclSuccess when nothing is queued or all operations were enqueued,
// otherwise the first error encountered.
ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  // Declared up front so no `goto` crosses an initialization.
  bool capturing = false;
  bool batchApiCompiled = false;
  bool useBatchApi = false;
  int driverVersion = 0;

  // Nothing to do.
  if (params->numOps == 0) {
    return ncclSuccess;
  }

  // Check if we are in a CUDA graph capture
  capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail);

#if CUDART_VERSION >= 12080
  batchApiCompiled = true;
#endif
  // The batched API needs compile-time support, runtime driver support and no active capture.
  useBatchApi = batchApiCompiled && !capturing && driverVersion >= 12080;

  if (!useBatchApi) {
    NCCLCHECKGOTO(ncclCeLaunchIndividualCopies(comm, params, stream), ret, fail);
    goto exit;
  }

#if CUDART_VERSION >= 12080
  {
    // A single attribute entry, referenced from operation index 0, is shared by the batch.
    params->attrs[0] = {};
    params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
    params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
    params->attrIdxs[0] = 0;
    params->numAttrs = 1;

    const int batchSize = comm->ceColl.intraBatchSyncFreq;
    if (params->intraBatchSync && batchSize > 0) {
      // Break into multiple batches with sync between them
      for (int i = 0; i < params->numOps; i += batchSize) {
        int currentBatchSize = (i + batchSize <= params->numOps) ? batchSize : params->numOps - i;
#if CUDART_VERSION >= 13000
        CUDACHECKGOTO(cudaMemcpyBatchAsync(
          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
          params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
#else
        CUDACHECKGOTO(cudaMemcpyBatchAsync(
          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
          params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
#endif
        // Sync after each batch except the last one
        if (i + batchSize < params->numOps) {
          NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
        }
      }
    } else {
      // Use single batch for all operations
#if CUDART_VERSION >= 13000
      CUDACHECKGOTO(cudaMemcpyBatchAsync(
        params->dsts, params->srcs, params->sizes, params->numOps,
        params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
#else
      CUDACHECKGOTO(cudaMemcpyBatchAsync(
        params->dsts, params->srcs, params->sizes, params->numOps,
        params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
#endif
    }
  }
#endif

exit:
  return ret;
fail:
  goto exit;
}
// All-gather built from copy operations queued into a batch.
//
// This rank's send buffer is copied into its own slot of the local receive
// buffer (skipped when the operation is in place) and into the same slot of
// every other rank's receive buffer, resolved through ncclDevrGetLsaRankPtr on
// args->recvWin. A ncclMemOpSync is issued before and after the copies.
//
// Returns ncclSuccess or the first error; the batch arrays are always released.
ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  // Bytes contributed by each rank.
  const size_t bytesPerRank = args->nElts * args->eltSize;
  uint8_t* const localSrc = (uint8_t*)args->sendBuff;
  // Slot belonging to this rank inside the receive buffer.
  uint8_t* const localSlot = (uint8_t*)args->recvBuff + comm->rank * bytesPerRank;
  void* remoteSlot;
  size_t winOffset;
  struct ncclCeBatchOpsParams batchOpsParams = {};
  // Appends one copy of `bytesPerRank` bytes to the batch.
  auto enqueue = [&batchOpsParams, bytesPerRank](void* src, void* dst) {
    batchOpsParams.srcs[batchOpsParams.numOps] = src;
    batchOpsParams.dsts[batchOpsParams.numOps] = dst;
    batchOpsParams.sizes[batchOpsParams.numOps] = bytesPerRank;
    batchOpsParams.numOps++;
  };

  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // Synchronize ranks before any transfer starts.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  // Out-of-place: the local contribution has to land in the local slot too.
  if (localSlot != localSrc) {
    enqueue((void*)localSrc, (void*)localSlot);
  }

  // Push the local contribution to each peer, starting with the next rank.
  for (int step = 1; step < comm->nRanks; step++) {
    int peer = (comm->rank + step) % comm->nRanks;
    winOffset = localSlot - (uint8_t*)args->recvWin->userPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, peer, &remoteSlot), ret, fail);
    enqueue((void*)localSrc, (void*)remoteSlot);
  }

  // Intra-batch sync is only enabled for enough operations carrying enough total bytes.
  batchOpsParams.intraBatchSync = false;
  if (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq) {
    batchOpsParams.intraBatchSync = (bytesPerRank*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
  }

  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Synchronize ranks again once the transfers have been enqueued.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
// All-to-all built from copy operations queued into a batch.
//
// For each destination rank (starting with this rank itself), the chunk at
// index `dstRank` of the send buffer is copied to slot `comm->rank` of that
// rank's receive buffer. The local chunk is copied directly; remote slots are
// resolved through ncclDevrGetLsaRankPtr on args->recvWin. A ncclMemOpSync is
// issued before and after the copies.
//
// Returns ncclSuccess or the first error; the batch arrays are always released.
ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Calculate the size of data each rank sends to every other rank
  const size_t chunkBytes = args->nElts * args->eltSize;
  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
  void* peerRecvBuff;
  size_t offset;
  struct ncclCeBatchOpsParams batchOpsParams = {};

  // The loop below queues exactly one copy per destination rank, so nRanks
  // entries are sufficient (nRanks * nRanks was a quadratic over-allocation).
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // Ensure all ranks are ready before starting transfers
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  // Send one chunk to every rank, including this one
  for (int r = 0; r < comm->nRanks; r++) {
    int dstRank = (comm->rank + r) % comm->nRanks;
    uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
    // Position of this rank's chunk inside a receive buffer
    uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes;
    void* dst = (void*)dstPtr;

    if (dstRank != comm->rank) {
      // Remote copy: translate the local slot address into dstRank's receive buffer
      offset = dstPtr - (uint8_t*)args->recvWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerRecvBuff), ret, fail);
      dst = peerRecvBuff;
    }

    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
    batchOpsParams.dsts[batchOpsParams.numOps] = dst;
    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
    batchOpsParams.numOps++;
  }

  // Check if we need to perform intra-batch synchronization
  batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);

  // Launch the batch operations
  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Ensure all transfers are complete across all ranks
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
// Scatter built from copy operations queued into a batch.
//
// Only the root rank queues copies: its own chunk goes into its receive buffer
// (skipped when in place), and the chunk for each other rank goes into that
// rank's receive buffer, resolved through ncclDevrGetLsaRankPtr on
// args->recvWin. Every rank issues ncclMemOpSync before and after the launch.
//
// Returns ncclSuccess or the first error; the batch arrays are always released.
ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  // Bytes the root delivers to each rank.
  const size_t bytesPerRank = args->nElts * args->eltSize;
  uint8_t* const sendBase = (uint8_t*)args->sendBuff;
  uint8_t* const recvBase = (uint8_t*)args->recvBuff;
  const int root = args->rootRank;
  void* remoteDst;
  size_t winOffset;
  struct ncclCeBatchOpsParams batchOpsParams = {};
  // Appends one copy of `bytesPerRank` bytes to the batch.
  auto enqueue = [&batchOpsParams, bytesPerRank](void* src, void* dst) {
    batchOpsParams.srcs[batchOpsParams.numOps] = src;
    batchOpsParams.dsts[batchOpsParams.numOps] = dst;
    batchOpsParams.sizes[batchOpsParams.numOps] = bytesPerRank;
    batchOpsParams.numOps++;
  };

  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // Synchronize ranks before any transfer starts.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  if (comm->rank == root) {
    // In place means the receive buffer already is the root's chunk of the send buffer.
    const bool inPlace = (recvBase == sendBase + comm->rank * bytesPerRank);

    // Root's own chunk only needs a copy for the out-of-place case.
    if (!inPlace) {
      enqueue((void*)(sendBase + comm->rank * bytesPerRank), (void*)recvBase);
    }

    // Deliver the remaining chunks, starting with the rank after the root.
    for (int step = 1; step < comm->nRanks; step++) {
      int peer = (comm->rank + step) % comm->nRanks;
      uint8_t* chunkSrc = sendBase + peer * bytesPerRank;
      uint8_t* localDst = inPlace ? recvBase + peer * bytesPerRank : recvBase;
      winOffset = localDst - (uint8_t*)args->recvWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, peer, &remoteDst), ret, fail);
      enqueue((void*)chunkSrc, (void*)remoteDst);
    }
  }
  // Ranks other than the root queue nothing; the launch below is then a no-op for them.

  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Synchronize ranks again once the transfers have been enqueued.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
// Gather built from a single queued copy per rank.
//
// The root copies its send buffer into slot `comm->rank` of its receive buffer
// (skipped when already in place). Every other rank copies its send buffer into
// slot `comm->rank` of the root's receive buffer, resolved through
// ncclDevrGetLsaRankPtr on args->recvWin. ncclMemOpSync is issued before and
// after the launch.
//
// Returns ncclSuccess or the first error; the batch arrays are always released.
ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  // Bytes each rank contributes.
  const size_t bytesPerRank = args->nElts * args->eltSize;
  uint8_t* const localSrc = (uint8_t*)args->sendBuff;
  // Slot for this rank, expressed as an address inside the local receive buffer.
  uint8_t* const slotAddr = (uint8_t*)args->recvBuff + comm->rank * bytesPerRank;
  const int root = args->rootRank;
  void* copyDst = nullptr;
  bool needCopy = false;
  size_t winOffset;
  struct ncclCeBatchOpsParams batchOpsParams = {};

  // At most one operation is ever queued by this rank.
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, 1), ret, fail);

  // Synchronize ranks before any transfer starts.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  if (comm->rank != root) {
    // Translate the slot address into the root's receive buffer.
    winOffset = slotAddr - (uint8_t*)args->recvWin->userPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, root, &copyDst), ret, fail);
    needCopy = true;
  } else if (localSrc != slotAddr) {
    // Root, out of place: place own data in its slot.
    copyDst = (void*)slotAddr;
    needCopy = true;
  }

  if (needCopy) {
    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)localSrc;
    batchOpsParams.dsts[batchOpsParams.numOps] = copyDst;
    batchOpsParams.sizes[batchOpsParams.numOps] = bytesPerRank;
    batchOpsParams.numOps++;
  }

  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Synchronize ranks again once the transfers have been enqueued.
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
// Dispatches the copy-engine collective described by plan->ceCollArgs to its
// implementation, using the stream at the head of comm->planner.streams.
//
// Returns the implementation's result, or ncclInvalidUsage when the requested
// function is not one of AllGather, AlltoAll, Scatter or Gather.
ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  ncclResult_t ret = ncclSuccess;
  struct ncclCeCollArgs* ceArgs = plan->ceCollArgs;
  cudaStream_t launchStream = comm->planner.streams->stream;
  const auto func = ceArgs->func;

  if (func == ncclFuncAllGather) {
    NCCLCHECKGOTO(ncclCeAllGather(comm, ceArgs, launchStream), ret, fail);
  } else if (func == ncclFuncAlltoAll) {
    NCCLCHECKGOTO(ncclCeAlltoAll(comm, ceArgs, launchStream), ret, fail);
  } else if (func == ncclFuncScatter) {
    NCCLCHECKGOTO(ncclCeScatter(comm, ceArgs, launchStream), ret, fail);
  } else if (func == ncclFuncGather) {
    NCCLCHECKGOTO(ncclCeGather(comm, ceArgs, launchStream), ret, fail);
  } else {
    // No copy-engine implementation for this function.
    ret = ncclInvalidUsage;
  }

exit:
  return ret;
fail:
  goto exit;
}
+141 -163
Melihat File
@@ -13,16 +13,23 @@
#include "nvtx_payload_schemas.h"
#include "msccl/msccl_lifecycle.h"
#ifdef ENABLE_ROCSHMEM
#include <rocshmem/rocshmem.hpp>
#endif
using namespace rccl;
const char* ncclFuncToString(ncclFunc_t fn) {
switch (fn) {
case ncclFuncAllGather: return "AllGather";
case ncclFuncAllReduce: return "AllReduce";
case ncclFuncAlltoAll: return "AlltoAll";
case ncclFuncBroadcast: return "Broadcast";
case ncclFuncGather: return "Gather";
case ncclFuncRecv: return "Recv";
case ncclFuncReduce: return "Reduce";
case ncclFuncReduceScatter: return "ReduceScatter";
case ncclFuncScatter: return "Scatter";
case ncclFuncSendRecv: return "SendRecv";
case ncclFuncSend: return "Send";
default: return "Invalid";
@@ -81,7 +88,6 @@ const char* ncclProtoToString(int proto) {
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
@@ -91,9 +97,12 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, comm -> rcclUseOneSlice ? ALLGATHER_SLICESTEPS_SINGLE_NODE : ALLGATHER_SLICESTEPS, nullptr };
int nRanks;
int nRanks, rank;
int in_place = 0;
const void* srcBuf;
void* dstBuf;
NCCLCHECK(ncclCommCount(comm, &nRanks));
NCCLCHECK(ncclCommUserRank(comm, &rank));
size_t msgSize = sendcount * ncclTypeSize(datatype) * nRanks;
if (!mscclIsCaller())
@@ -108,21 +117,28 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
}
if (rcclUseAllGatherDirect(comm, msgSize)) {
INFO(NCCL_INIT, "RCCL DIRECT ALLGATHER count = %zu, msgSize = %zu, comm = %p, stream = %p, rank = %d, sendbuff = %p, recvbuff = %p",
sendcount, msgSize, comm, stream, rank, sendbuff, recvbuff);
// use direct allgather
if (sendcount == 0) return ncclSuccess;
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (((char*)sendbuff) == (((char*)recvbuff) + comm->rank * rankOffset)) {
if (sendbuff == (((char*)recvbuff) + rank * rankOffset)) {
srcBuf = ((char*)recvbuff) + rank * rankOffset;
dstBuf = recvbuff;
in_place = 1;
}
} else {
srcBuf = sendbuff;
dstBuf = recvbuff;
}
NCCLCHECK(ncclGroupStart());
for (int r = 0; r < nRanks; r++) {
int peer = (comm->rank + r) % nRanks;
if (in_place && (peer == comm->rank)) {
continue;
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, peer, comm, stream));
NCCLCHECK(ncclRecv(((char*)recvbuff) + peer * rankOffset, sendcount, datatype, peer, comm, stream));
if (r == rank && in_place)
continue;
NCCLCHECK(ncclSend(((char*)srcBuf), sendcount, datatype, r, comm, stream));
NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
@@ -132,10 +148,101 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
}
}
RCCL_PARAM(AlltoAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream);
// AlltoAll entry point: every rank sends `count` elements of `datatype` to each
// rank and receives the same amount from each rank.
//
// Order of decisions:
//  1. Record the call (unless invoked from MSCCL).
//  2. Hand over to MSCCL when it is available and is not already the caller.
//  3. Use the pivot algorithm when the topology, channel count, per-rank size,
//     alignment and ALL_TO_ALL_PIVOT_ENABLE allow it.
//  4. With ENABLE_ROCSHMEM, use the GDA variant for messages up to
//     comm->rocshmemThreshold bytes.
//  5. Otherwise enqueue the regular AlltoAll.
//
// Returns the result of the selected enqueue call.
ncclResult_t ncclAlltoAll_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));

  if (!mscclIsCaller()) // when msccl falls back to
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
  }

  // Bytes exchanged with each peer, and the lowest set bit of that size (its alignment).
  size_t rankOffset = count * ncclTypeSize(datatype);
  size_t rankAlign = rankOffset & ((~rankOffset) + 1);
  struct ncclInfo info;

  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAlltoAllPivotEnable()) {
    info = { ncclFuncAlltoAllPivot, "AlltoAllPivot",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
  } else {
#ifdef ENABLE_ROCSHMEM
    // Total message size across all ranks; only the GDA path needs it.
    size_t msgSize = rankOffset * comm->nRanks;
    if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {
      struct ncclInfo gdaInfo = { ncclFuncAllToAllGda, "AllToAllGda",
        sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
        ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
      return ncclEnqueueCheck(&gdaInfo);
    }
#endif // ENABLE_ROCSHMEM
    info = { ncclFuncAlltoAll, "AlltoAll",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
  }
  return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclAlltoAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
// AlltoAllv entry point: for every rank r, sends sendcounts[r] elements taken
// at element offset sdispls[r] of `sendbuff` and receives recvcounts[r]
// elements at element offset rdispls[r] of `recvbuff`.
//
// The call is recorded (unless invoked from MSCCL), handed to MSCCL when it is
// available, and otherwise implemented as one grouped send/recv pair per rank.
//
// On any failure inside the group, the group is still closed and the Recorder
// skip flag is still cleared before the first error is returned, so this
// thread is not left inside an open group with recording suppressed.
ncclResult_t ncclAlltoAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AlltoAllv, NcclNvtxParamsAlltoAllv,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));

  if (!mscclIsCaller()) // when msccl falls back to
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
  }

  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));

  // Same failure criterion as NCCLCHECK: ncclInProgress is not an error.
  auto failed = [](ncclResult_t res) { return res != ncclSuccess && res != ncclInProgress; };
  ncclResult_t ret = ncclSuccess;
  ncclResult_t res;

  // The individual send/recv calls below must not be recorded on their own.
  if (!mscclIsCaller()) Recorder::instance().skip(true);

  res = ncclGroupStart();
  if (failed(res)) {
    ret = res;
  } else {
    for (int r=0; r<nRanks; r++) {
      res = ncclSend(
          ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
          sendcounts[r],
          datatype,
          r,
          comm,
          stream);
      if (failed(res)) { ret = res; break; }
      res = ncclRecv(
          ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
          recvcounts[r],
          datatype,
          r,
          comm,
          stream);
      if (failed(res)) { ret = res; break; }
    }
    // Always close the group that was opened, even after a failure.
    res = ncclGroupEnd();
    if (ret == ncclSuccess && failed(res)) ret = res;
  }

  if (!mscclIsCaller()) Recorder::instance().skip(false);

  // Reports and returns the first error, if any.
  NCCLCHECK(ret);
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
@@ -186,104 +293,8 @@ ncclResult_t ncclAllReduceWithBias_impl(const void* sendbuff, void* recvbuff, si
return ncclEnqueueCheck(&info);
}
RCCL_PARAM(AllToAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream);
// AllToAll entry point: every rank sends `count` elements of `datatype` to each
// rank and receives the same amount from each rank.
//
// The call is recorded (unless invoked from MSCCL) and handed to MSCCL when it
// is available. Otherwise the pivot algorithm is enqueued when the topology,
// channel count, per-rank size, alignment and ALL_TO_ALL_PIVOT_ENABLE allow
// it; failing that, one grouped send/recv pair per rank is issued.
//
// On any failure inside the group, the group is still closed and the Recorder
// skip flag is still cleared before the first error is returned.
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclComm_t comm, hipStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));

  if (!mscclIsCaller()) // when msccl falls back to
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
  }

  // Bytes exchanged with each peer, and the lowest set bit of that size (its alignment).
  size_t rankOffset = count * ncclTypeSize(datatype);
  size_t rankAlign = rankOffset & ((~rankOffset) + 1);

  // Determine Pivot A2A support now that we know number of channels
  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAllToAllPivotEnable()) {
    struct ncclInfo info = { ncclFuncAllToAllPivot, "AllToAllPivot",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
    return ncclEnqueueCheck(&info);
  } else {
    int nRanks;
    NCCLCHECK(ncclCommCount(comm, &nRanks));
    if (count == 0) return ncclSuccess;

    // Same failure criterion as NCCLCHECK: ncclInProgress is not an error.
    auto failed = [](ncclResult_t res) { return res != ncclSuccess && res != ncclInProgress; };
    ncclResult_t ret = ncclSuccess;
    ncclResult_t res;

    // The individual send/recv calls below must not be recorded on their own.
    if (!mscclIsCaller()) Recorder::instance().skip(true);

    res = ncclGroupStart();
    if (failed(res)) {
      ret = res;
    } else {
      for (int r=0; r<nRanks; r++) {
        res = ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream);
        if (failed(res)) { ret = res; break; }
        res = ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream);
        if (failed(res)) { ret = res; break; }
      }
      // Always close the group that was opened, even after a failure.
      res = ncclGroupEnd();
      if (ret == ncclSuccess && failed(res)) ret = res;
    }

    if (!mscclIsCaller()) Recorder::instance().skip(false);

    // Reports and returns the first error, if any.
    NCCLCHECK(ret);
    return ncclSuccess;
  }
}
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
// AllToAllv entry point: for every rank r, sends sendcounts[r] elements taken
// at element offset sdispls[r] of `sendbuff` and receives recvcounts[r]
// elements at element offset rdispls[r] of `recvbuff`.
//
// The call is recorded (unless invoked from MSCCL), handed to MSCCL when it is
// available, and otherwise implemented as one grouped send/recv pair per rank.
//
// On any failure inside the group, the group is still closed and the Recorder
// skip flag is still cleared before the first error is returned, so this
// thread is not left inside an open group with recording suppressed.
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));

  if (!mscclIsCaller()) // when msccl falls back to
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
  }

  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));

  // Same failure criterion as NCCLCHECK: ncclInProgress is not an error.
  auto failed = [](ncclResult_t res) { return res != ncclSuccess && res != ncclInProgress; };
  ncclResult_t ret = ncclSuccess;
  ncclResult_t res;

  // The individual send/recv calls below must not be recorded on their own.
  if (!mscclIsCaller()) Recorder::instance().skip(true);

  res = ncclGroupStart();
  if (failed(res)) {
    ret = res;
  } else {
    for (int r=0; r<nRanks; r++) {
      res = ncclSend(
          ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
          sendcounts[r],
          datatype,
          r,
          comm,
          stream);
      if (failed(res)) { ret = res; break; }
      res = ncclRecv(
          ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
          recvcounts[r],
          datatype,
          r,
          comm,
          stream);
      if (failed(res)) { ret = res; break; }
    }
    // Always close the group that was opened, even after a failure.
    res = ncclGroupEnd();
    if (ret == ncclSuccess && failed(res)) ret = res;
  }

  if (!mscclIsCaller()) Recorder::instance().skip(false);

  // Reports and returns the first error, if any.
  NCCLCHECK(ret);
  return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
@@ -315,46 +326,32 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, count, datatype, comm, stream, root));
}
if (mscclAvailable(comm) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
count, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (sendcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (!mscclIsCaller()) Recorder::instance().skip(true);
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
if (!mscclIsCaller()) Recorder::instance().skip(false);
return ncclSuccess;
struct ncclInfo info = { ncclFuncGather, "Gather",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
@@ -380,8 +377,6 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
@@ -405,48 +400,32 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, count, datatype, comm, stream, root));
}
if (mscclAvailable(comm) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
count, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
if (recvcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (!mscclIsCaller()) Recorder::instance().skip(true);
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
if (!mscclIsCaller()) Recorder::instance().skip(false);
return ncclSuccess;
struct ncclInfo info = { ncclFuncScatter, "Scatter",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
@@ -472,7 +451,6 @@ ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t da
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
+26
Melihat File
@@ -0,0 +1,26 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
#include "nccl.h"
#include <cstring>
#include "comm.h"
#include "device.h"
#include "archinfo.h"
// Emits the communicator's proxy trace through WARN.
//
// `map` is accepted for interface compatibility; this implementation does not
// write to it. Missing state (null communicator, proxy state or proxy trace)
// is reported with a warning and is not treated as an error.
//
// Always returns ncclSuccess.
__attribute__ ((visibility("default")))
ncclResult_t ncclCommDump(
    const ncclComm_t comm,
    std::unordered_map<std::string, std::string>& map) {
  (void)map;
  if (comm == nullptr) {
    WARN("ncclCommDump comm is null");
    return ncclSuccess;
  }
  // Check the intermediate pointer too before reaching for the trace.
  if (comm->proxyState == nullptr) {
    WARN("ncclCommDump comm->proxyState is null");
    return ncclSuccess;
  }
  if (comm->proxyState->proxyTrace == nullptr) {
    WARN("ncclCommDump comm->proxyState->proxyTrace is null");
    return ncclSuccess;
  }
  WARN("ncclCommDump() ProxyTrace:");
  WARN("%s", comm->proxyState->proxyTrace->dump().c_str());
  return ncclSuccess;
}
+2 -2
Melihat File
@@ -28,7 +28,7 @@ static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
static uint64_t ncclDebugMask = 0;
uint64_t ncclDebugMask = 0;
FILE *ncclDebugFile = stdout;
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
static std::chrono::steady_clock::time_point ncclEpoch;
@@ -419,4 +419,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
va_end(vargs);
pthread_setname_np(thread, threadName);
#endif
}
}
File diff ditekan karena terlalu besar Load Diff
@@ -0,0 +1,60 @@
# Run the generator scripts once during configuration to obtain the lists of
# files they produce. A failing generator aborts configuration instead of
# silently yielding an empty or partial source list.
execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
  OUTPUT_VARIABLE files
  RESULT_VARIABLE generate_result
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if(NOT generate_result EQUAL 0)
  message(FATAL_ERROR "generate.py failed while listing device source files (result: ${generate_result})")
endif()
string(STRIP "${files}" files)
list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/)

execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
  OUTPUT_VARIABLE symmetric_files
  RESULT_VARIABLE symmetric_generate_result
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if(NOT symmetric_generate_result EQUAL 0)
  message(FATAL_ERROR "symmetric/generate.py failed while listing symmetric device source files (result: ${symmetric_generate_result})")
endif()
string(STRIP "${symmetric_files}" symmetric_files)
list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/)

# Create custom commands to regenerate the sources at build time whenever the
# corresponding generator script changes.
add_custom_command(
  OUTPUT ${files}
  BYPRODUCTS ${files}
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  COMMENT "Generating device source files"
)

add_custom_command(
  OUTPUT ${symmetric_files}
  BYPRODUCTS ${symmetric_files}
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  COMMENT "Generating symmetric device source files"
)

# Object library holding the generated sources plus the hand-written ones.
add_library(nccl_device OBJECT
  ${files}
  ${symmetric_files}
  ${CMAKE_CURRENT_SOURCE_DIR}/common.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu
)

# Enable separable compilation and device symbol resolution for this target.
set_target_properties(nccl_device PROPERTIES
  CUDA_SEPARABLE_COMPILATION ON
  CUDA_RESOLVE_DEVICE_SYMBOLS ON
)

# Set include directories for the target
target_include_directories(nccl_device PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_SOURCE_DIR}/src/include
  ${CMAKE_SOURCE_DIR}/src/include/plugin
  ${CMAKE_BINARY_DIR}/include
  ${CUDAToolkit_INCLUDE_DIRS}
  ${CUDAToolkit_INCLUDE_DIRS}/cccl
)

# The nccl_header target must be built before the device sources are compiled.
add_dependencies(nccl_device nccl_header)
+6 -2
Melihat File
@@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device
MANIFEST := $(OBJDIR)/manifest
DEVGLUE_OBJ := $(OBJDIR)/device_glue.o
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin
NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
CXXFLAGS += $(INCFLAGS)
@@ -47,7 +47,11 @@ endif
define COMPILE_SYM
@$(SAY) "Compiling" $2;\
mkdir -p $(dir $1);\
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1
if [[ -n "$3" ]]; then\
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\
else\
touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\
fi
endef
DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
+11 -2
Melihat File
@@ -20,11 +20,20 @@ namespace {
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
const int *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
ssize_t count, partOffset, partCount, chunkCount;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
#endif
ssize_t offset;
ssize_t dataOffset;
int nelem;
@@ -142,7 +151,7 @@ namespace {
#endif
// Final wait/copy.
prims.directRecv(offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
@@ -671,4 +680,4 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
return;
}
}
};
};
+10
Melihat File
@@ -20,8 +20,14 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
int ringIx = ring->index;
const int nranks = ncclShmem.comm.nRanks;
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
@@ -31,7 +37,11 @@ namespace {
ssize_t gridOffset;
ssize_t channelCount;
ssize_t chunkCount;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#endif
const ssize_t loopCount = nranks * chunkCount;
ssize_t offset;
int nelem;
@@ -0,0 +1,33 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "device.h"
#include "collectives.h"
#include "primitives.h"
#ifdef ENABLE_ROCSHMEM
#include <rocshmem/rocshmem.hpp>
// AllToAll "GDA" work runner (RING algorithm, SIMPLE protocol) built on rocSHMEM.
//
// Only block 0 does any work. It performs three steps in order:
//   1. copy work->sendbuff into work->sndbuff,
//   2. run the workgroup-level rocSHMEM char all-to-all from work->sndbuff into
//      work->tempbuff, using work->team and work->size,
//   3. copy work->tempbuff into work->recvbuff.
// Both copies move work->size * rocshmem_n_pes() elements of T.
// NOTE(review): sndbuff/tempbuff are presumably rocSHMEM symmetric buffers - confirm.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllToAllGda, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
    // All blocks other than block 0 have nothing to do.
    if (blockIdx.x != 0) return;

    int peCount = rocshmem::rocshmem_n_pes();

    // Step 1: stage the caller's send data into the buffer handed to rocSHMEM.
    void** userSend = (void **)&work->sendbuff;
    void** stagedSend = (void **)&work->sndbuff;
    reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
        tid, nThreads, 0, nullptr, false, 1, userSend, 1, stagedSend,
        (work->size*peCount));

    // Step 2: exchange the data between PEs.
    rocshmem::rocshmem_char_alltoall_wg(work->team, ((char*)work->tempbuff), ((char*)work->sndbuff), work->size);

    // Step 3: hand the exchanged data back to the caller's receive buffer.
    void** stagedRecv = (void **)&work->tempbuff;
    void** userRecv = (void **)&work->recvbuff;
    reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
        tid, nThreads, 0, nullptr, false, 1, stagedRecv, 1, userRecv,
        (work->size*peCount));
  }
};
#endif
+1 -1
Melihat File
@@ -75,7 +75,7 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
struct RunWorkColl<ncclFuncAlltoAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
runRing<T, RedOp, Proto>(tid, nThreads, work);
+9
Melihat File
@@ -19,7 +19,12 @@ namespace {
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
const int root = work->root;
@@ -27,7 +32,11 @@ namespace {
ssize_t chunkCount;
ssize_t channelCount;
ssize_t gridOffset;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#endif
size_t offset;
int nelem;
int workNthreads;
+12 -12
Melihat File
@@ -17,24 +17,24 @@ struct RunWorkNop {
__device__ void run() {}
};
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&argsStorage.args);
}
#ifdef ENABLE_COLLTRACE
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&argsStorage.args);
}
#endif
+101 -28
Melihat File
@@ -27,17 +27,30 @@
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
#define __trace_hwreg()
#define __trace_hwreg() \
collTrace->data_0 = 0;
#else
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
{ int32_t hwid; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwid)); \
collTrace->data_0 = hwid >> 4; }
#endif
#if defined(__gfx942__) || defined(__gfx950__)
#define __trace_xccid() \
{ int32_t xccId; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xccId)); \
collTrace->xccId = xccId; }
#else
#define __trace_xccid() \
collTrace->xccId = 0;
#endif
#ifdef ENABLE_COLLTRACE
#define INC_COLL_TRACE \
uint32_t pos = __hip_atomic_fetch_add(&ncclShmem.collTraceTail->tail, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WORKGROUP)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->tid = threadIdx.x; \
collTrace->channelId = ncclShmem.channelId;
// TODO: switch to atomicInc after llvm crash is fixed
@@ -46,7 +59,8 @@
#define traceKernelLaunch(launch_type, ix) { \
INC_COLL_TRACE \
collTrace->funcIndex = ncclShmem.funcId; \
__trace_hwreg()\
__trace_hwreg() \
__trace_xccid() \
collTrace->batchIx = ix; \
if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
@@ -63,7 +77,7 @@
collTrace->p2p.recvRegistered = p2pWork->recvNetReg; \
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
__hip_atomic_store(&collTrace->type, (launch_type) | ncclCollTraceP2pElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
collTrace->coll.nWarps = collWork->nWarps; \
@@ -71,7 +85,7 @@
collTrace->coll.bid = ncclShmem.channelId - collWork->channelLo; \
collTrace->coll.root = collWork->root; \
collTrace->opCount = collWork->opCount; \
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
__hip_atomic_store(&collTrace->type, (launch_type) | ncclCollTraceCollElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
} \
}
#define traceKernelEnd(end_type) { \
@@ -81,11 +95,11 @@
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
collTrace->type = (end_type) | ncclCollTraceP2pElemType; \
__hip_atomic_store(&collTrace->type, (end_type) | ncclCollTraceP2pElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
collTrace->opCount = collWork->opCount; \
collTrace->type = (end_type) | ncclCollTraceCollElemType; \
__hip_atomic_store(&collTrace->type, (end_type) | ncclCollTraceCollElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
} \
}
#define traceData(data2, data4, data8_0, data8_1) { \
@@ -94,12 +108,12 @@
collTrace->data_0 = data4; \
collTrace->opCount = data8_0; \
collTrace->data_1 = data8_1; \
collTrace->type = ncclCollTraceDataType; \
__hip_atomic_store(&collTrace->type, ncclCollTraceDataType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
}
#define traceAbort(){\
INC_COLL_TRACE\
collTrace->funcIndex = ncclShmem.funcId;\
collTrace->type = ncclCollTraceAbortType;\
__hip_atomic_store(&collTrace->type, ncclCollTraceAbortType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
}
#else
#define traceKernelLaunch(launch_type, batchIx)
@@ -136,9 +150,13 @@ struct ncclShmemData {
struct ncclDevKernelArgs args;
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclKernelComm comm;
alignas(16) struct ncclDevChannel channel;
#ifdef ENABLE_WARP_SPEED
int warpComm;
alignas(16) struct ncclDevChannel warpChannel[NCCL_MAX_GROUPS];
int warpChannelId[NCCL_MAX_GROUPS];
#endif
int batchIx, nextBatchIx;
enum ncclDevWorkType workType;
uint8_t directMode;
@@ -284,10 +302,10 @@ __device__ __forceinline__ void loadWorkBatchToShmem(
if (WARP_SIZE == 64) {
if (uint64_t(batch.offsetBitset) & (1ull<<lane)) {
int nWorksBelow = __popc(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
int nWorksBelow = __popcll(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
fnsOfBitset[nWorksBelow] = lane;
}
nWorks = __popc(uint64_t(batch.offsetBitset));
nWorks = __popcll(uint64_t(batch.offsetBitset));
} else {
// WARP_SIZE == 32
if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
@@ -442,10 +460,17 @@ struct RunWorkBatch {
if (work->nWarps != workPrev->nWarps) __syncthreads();
}
int subtn = work->nWarps*WARP_SIZE;
#ifdef ENABLE_WARP_SPEED
if (tid < subtn) {
if(ncclShmem.warpComm == 0 || Algo != NCCL_ALGO_RING) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
else if (ncclShmem.warpChannelId[tid / WARP_SIZE] >= 0) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid % WARP_SIZE, WARP_SIZE, work);
}
#else
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
#endif
}
}
};
@@ -477,7 +502,7 @@ __device__ __forceinline__ void profiler(int action) {
ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
}
ncclShmem.channel.workCounter += ncclShmem.nWorks;
if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
}
}
}
@@ -489,7 +514,12 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
int x = tid;
int total = 0, y;
int num = MAXCHANNELS/64 > 0 ? MAXCHANNELS/64 : 1;
#ifdef ENABLE_WARP_SPEED
int warpCount = tn / WARP_SIZE;
int localWarpId = tid / WARP_SIZE;
int globalWarpId = (warpCount * blockIdx.x) + localWarpId;
int laneId = tid % WARP_SIZE;
#endif
// Copy kernel args to shmem and then only read those. Otherwise the compiler
// will end up putting the args into thread local stack which is very wasteful.
if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
@@ -549,7 +579,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
/* set abort flag to 0 */
if (tid == 0) {
ncclShmem.aborted = 0;
ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
}
// Use first 2 warps to load comm and channel, and remaining load work batch.
@@ -557,14 +587,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
case 0:
{ void* dst = &ncclShmem.comm;
void* src = ncclShmem.args.comm;
int bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
int bytes = sizeof(ncclKernelComm);
static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn.");
copyToShmem16(tid, dst, src, bytes);
} break;
case 1:
{ // Get address of channel without incurring indirect load from ncclDevComm::channels
{ // Get address of channel without incurring indirect load from ncclKernelComm::channels
void* dst = &ncclShmem.channel;
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
int bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
@@ -583,9 +613,52 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
ncclShmem.collTrace = args->comm->collTrace + COLLTRACE_NUM_ITEMS*ncclShmem.channelId;
ncclShmem.collTraceTail = args->comm->collTraceTail + ncclShmem.channelId;
}
#endif
#ifdef ENABLE_WARP_SPEED
if(tid == 0) {
ncclShmem.warpComm = args->comm->warpLevelComm;
}
#endif
__syncthreads(); // publish shmem
#ifdef ENABLE_WARP_SPEED
// Determine per-warp channel assignment for WarpSpeed enablement
total = 0;
if(ncclShmem.warpComm == 1) { // If warpComm is enabled, assign warps to channels that have the corresponding channel mask enabled
ncclShmem.warpChannelId[localWarpId] = -1;
__syncthreads();
for (int i = 0; i < num; i++) {
if (args->channelMask.masks[i] & (1ull<<laneId)) {
y = __popcll(args->channelMask.masks[i] & ((1ull<<laneId)-1));
y = total + y;
if (globalWarpId == y) {
ncclShmem.warpChannelId[localWarpId] = laneId + total;
break;
}
}
total = total + __popcll(args->channelMask.masks[i]);
}
__syncthreads();
if(ncclShmem.warpChannelId[localWarpId] >= 0) {
void* dst = &ncclShmem.warpChannel[localWarpId];
void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
int bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
// assert((tid-localWarpId*WARP_SIZE) >= 0 && (tid-localWarpId*WARP_SIZE) < WARP_SIZE);
copyToShmem16(tid-localWarpId*WARP_SIZE, dst, src, bytes);
}
} else { // If warpComm is disabled, all warps use the same channel as the block
if(laneId == 0) {
ncclShmem.warpChannelId[localWarpId] = ncclShmem.channelId;
}
// Use all threads in the warp to copy the channel data in parallel
void* dst = &ncclShmem.warpChannel[localWarpId];
void* src = &ncclShmem.channel;
int bytes = sizeof(ncclDevChannel);
copyToShmem16(laneId, dst, src, bytes);
}
__syncthreads();
#endif
#ifdef ENABLE_PROFILING
if (tid == 0) {
ncclShmem.prof.count = 0;
@@ -648,17 +721,17 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
#endif
}
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
#ifdef ENABLE_COLLTRACE
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
#endif
#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {}
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {}
#ifdef USE_INDIRECT_FUNCTION_CALL
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
+24 -13
Melihat File
@@ -3,9 +3,10 @@ import os
import sys
import subprocess
from dataclasses import dataclass
import shutil
# Order of colls, redops, tys, protos, algos must match src/include/device.h
all_colls = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AllToAllPivot"]
all_colls = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AlltoAllPivot", "AllToAllGda"]
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
all_protos = ["LL","LL128","SIMPLE"]
@@ -24,8 +25,11 @@ gensrc = sys.argv[1]
if os.path.exists(gensrc):
for name in os.listdir(gensrc):
os.remove(os.path.join(gensrc, name))
#os.truncate(os.path.join(gensrc, name), 0)
path = os.path.join(gensrc, name)
if os.path.isfile(path):
os.remove(path)
elif os.path.isdir(path):
shutil.rmtree(path)
else:
os.makedirs(gensrc)
@@ -64,7 +68,7 @@ else:
# make ONLY_FUNCS="AllReduce RING SIMPLE * *|ReduceScatter RING LL * f32"
# --- or ---
# make ONLY_FUNCS="AllReduce RING SIMPLE|ReduceScatter RING LL * f32"
# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AllToAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AlltoAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
# Paste all non-None arguments together with `sep`.
def paste(sep, *args):
@@ -79,14 +83,15 @@ func_pattern = sys.argv[6:7]
if func_pattern and func_pattern[0]:
func_pattern = func_pattern[0]
else:
func_pattern = "AllGather|AllReduce|AllToAllPivot|Broadcast|Reduce|ReduceScatter|SendRecv"
func_pattern = "AllGather|AllReduce|AlltoAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"
################################################################################
algos_of_coll = {
"AllGather": ["RING", "PAT"],
"AllReduce": ["RING", "TREE"],
"AllToAllPivot": ["RING"],
"AlltoAllPivot": ["RING"],
"AllToAllGda": ["RING"],
"Broadcast": ["RING"],
"Reduce": ["RING"],
"ReduceScatter": ["RING", "PAT"],
@@ -96,7 +101,8 @@ algos_of_coll = {
protos_of_coll = {
"AllGather": all_protos,
"AllReduce": all_protos,
"AllToAllPivot": ["SIMPLE"],
"AlltoAllPivot": ["SIMPLE"],
"AllToAllGda": ["SIMPLE"],
"Broadcast": all_protos,
"Reduce": all_protos,
"ReduceScatter": all_protos,
@@ -106,7 +112,8 @@ protos_of_coll = {
redops_of_coll = {
"AllGather": ["Sum"],
"AllReduce": all_redops,
"AllToAllPivot": ["Sum"],
"AlltoAllPivot": ["Sum"],
"AllToAllGda": ["Sum"],
"Broadcast": ["Sum"],
"Reduce": all_redops,
"ReduceScatter": all_redops,
@@ -116,7 +123,8 @@ redops_of_coll = {
tys_of_coll = {
"AllGather": ["i8"],
"AllReduce": all_tys,
"AllToAllPivot": ["i8"],
"AlltoAllPivot": ["i8"],
"AllToAllGda": ["i8"],
"Broadcast": ["i8"],
"Reduce": all_tys,
"ReduceScatter": all_tys,
@@ -126,7 +134,8 @@ tys_of_coll = {
acc_of_coll = {
"AllGather": ["0"],
"AllReduce": all_accs,
"AllToAllPivot": ["0"],
"AlltoAllPivot": ["0"],
"AllToAllGda": ["0"],
"Broadcast": ["0"],
"Reduce": ["0"],
"ReduceScatter": ["0"],
@@ -136,7 +145,8 @@ acc_of_coll = {
pipelines_of_coll = {
"AllGather": ["0"],
"AllReduce": all_pipelines,
"AllToAllPivot": ["0"],
"AlltoAllPivot": ["0"],
"AllToAllGda": ["0"],
"Broadcast": ["0"],
"Reduce": all_pipelines,
"ReduceScatter": all_pipelines,
@@ -147,7 +157,8 @@ pipelined_types = ["bf16"]
coll_camel_to_lower = {
"AllGather": "all_gather",
"AllReduce": "all_reduce",
"AllToAllPivot": "alltoall_pivot",
"AlltoAllPivot": "alltoall_pivot",
"AllToAllGda": "alltoall_gda",
"Broadcast": "broadcast",
"Reduce": "reduce",
"ReduceScatter": "reduce_scatter",
@@ -503,7 +514,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
)
if fn.coll == "Broadcast":
key = ((coll_idx & 0x3F) | ((proto_idx & 0x3F) << 8))
if fn.coll in ["SendRecv", "AllToAllPivot"]:
if fn.coll in ["SendRecv", "AlltoAllPivot", "AllToAllGda"]:
key = ((coll_idx & 0x3F))
out(f' {{{key}, {fn_id}}}, {comment}\n')
@@ -93,7 +93,7 @@ __device__ __forceinline__ static void mscclReduce(int c, int numReductions, int
template<typename T, typename RedOp, typename Proto, bool fullOps>
__device__ __forceinline__ void mscclRunInterpreter(
struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int nthreads = MSCCL_MAX_NTHREADS;
@@ -120,12 +120,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
case 0:
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
bytes = sizeof(ncclKernelComm);
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
// Get address of channel without incurring indirect load from ncclKernelComm::channels
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
src = &((ncclKernelCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
break;
case 2:
@@ -146,6 +146,9 @@ __device__ __forceinline__ void mscclRunInterpreter(
}
if (bytes) copyToShmem8(tid%WARP_SIZE, dst, src, bytes);
}
#ifdef ENABLE_WARP_SPEED
ncclShmem.warpComm = 0;
#endif
__syncthreads(); // publish shmem
#if defined(ENABLE_NPKIT)
@@ -369,13 +372,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
}
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL, fullOps>(comm, algo, work); \
} \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL128, fullOps>(comm, algo, work); \
} \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoSimple<MSCCL_CHUNKSTEPS/MSCCL_SLICESTEPS, MSCCL_SLICESTEPS, 0, 2>, fullOps>(comm, algo, work); \
}
+4
Melihat File
@@ -654,7 +654,11 @@ public:
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), threadsPerBlock(blockDim.x),
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
#ifdef ENABLE_WARP_SPEED
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[threadIdx.x / WARP_SIZE];
#else
auto *channel = &ncclShmem.channel;
#endif
barriers = &ncclShmem.groups[group].barrier;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
@@ -579,7 +579,11 @@ public:
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), /*compiler warnings*/
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)),
warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group), threadsPerBlock(blockDim.x){
#ifdef ENABLE_WARP_SPEED
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[warpInBlock];
#else
auto *channel = &ncclShmem.channel;
#endif
barriers = &ncclShmem.groups[group].barrier;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
+21 -6
Melihat File
@@ -502,14 +502,22 @@ private:
public:
static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) {
#ifdef ENABLE_WARP_SPEED
ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
#else
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
#endif
peerPtr->send[connIndex].step += steps;
st_relaxed_sys_global(peerPtr->send[connIndex].tail, peerPtr->send[connIndex].step);
}
static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) {
int spins = 0;
#ifdef ENABLE_WARP_SPEED
ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
#else
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
#endif
peerPtr->recv[connIndex].step += steps;
st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
@@ -770,13 +778,20 @@ public:
struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
):
tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
#ifdef ENABLE_WARP_SPEED
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(ncclShmem.warpComm? tidInBlock / WARP_SIZE : group), threadsPerBlock(blockDim.x){
#else
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group), threadsPerBlock(blockDim.x){
#endif
barriers = &ncclShmem.groups[group].barrier;
// PAT uses the same barrier for each group
barriers_pat = &ncclShmem.barrier_pat;
this->nworkers = nthreads;
#ifdef ENABLE_WARP_SPEED
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[tidInBlock/WARP_SIZE];
#else
auto *channel = &ncclShmem.channel;
#endif
int peer = -1;
flags = 0;
index = -1;
@@ -831,9 +846,9 @@ public:
}
// coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(channel->peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
// coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(channel->peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
// if (barrierAny(flags & NetDeviceUnpack)) {
// flags |= AnyNetDeviceUnpack;
@@ -861,7 +876,7 @@ public:
// Load recv peer
int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid;
struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv;
struct ncclConnInfo* conn = peer->conn = channel->peers[recvPeer]->recv+connIndexRecv;
peer->step = conn->step;
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
peer->stepCache = loadStepValue(peer->tailPtr = conn->tail);
@@ -871,7 +886,7 @@ public:
// Load send peer
int sendPeer = mode == primsModePatAg ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
peer = ((struct ncclPatPeer*)sendPeers)+tid;
conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend;
conn = peer->conn = channel->peers[sendPeer]->send+connIndexSend;
peer->step = conn->step;
peer->connFifo = conn->connFifo;
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
+9
Melihat File
@@ -16,7 +16,12 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
const int nranks = ncclShmem.comm.nRanks;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->userRanks[nranks-1];
@@ -24,7 +29,11 @@ namespace {
size_t chunkCount;
size_t channelCount;
size_t gridOffset;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
#endif
size_t offset;
int nelem;
+1 -1
Melihat File
@@ -414,7 +414,7 @@ SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __h
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
#elif ROCM_VERSION < 60000
#else
SPECIALIZE_REDUCE(FuncSum, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) + (float)(y)))
SPECIALIZE_REDUCE(FuncProd, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) * (float)(y)))
SPECIALIZE_REDUCE(FuncMinMax, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)(fn.isMinNotMax ? fminf((float)(x), (float)(y)) : fmaxf((float)(x), (float)(y))))
@@ -16,14 +16,23 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
int const *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
size_t count;
size_t gridOffset;
size_t channelCount;
size_t chunkCount;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
#endif
size_t offset;
size_t dataOffset;
uint32_t nelem;
@@ -1,35 +1,36 @@
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "symmetric.h"
#include "sym_kernels.h"
#include "symmetric/kernel.h"
#include "symmetric/primitives.h"
template<int BytePerPack, int UnrollPacks, int UnrollPeers>
static __device__ void bcastDeep(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
char* inputHere, char* outputRank0, bool inPlace, int nIters
ncclSymkArgsHandler const& handler, int tn, int t,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
ncclSymPtr<char> input, ncclSymPtr<char> output, bool inPlace, int nIters
) {
using Pack = BytePack<BytePerPack>;
int wn = tn/WARP_SIZE;
int w = t/WARP_SIZE;
int lane = t%WARP_SIZE;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack tmp[UnrollPacks];
nIters -= w;
if (0 < nIters) {
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp[u] = inpHere[u*WARP_SIZE];
tmp[u] = inpPacks[u*WARP_SIZE];
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
if (0 < nIters) {
while (true) {
@@ -47,21 +48,21 @@ static __device__ void bcastDeep(
if (partial && dr == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u];
outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u];
}
if (++r == nRanks) r = 0;
}
}
}
inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
nIters -= wn;
if (nIters <= 0) break;
// Load data for next iteration.
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp[u] = inpHere[u*WARP_SIZE];
tmp[u] = inpPacks[u*WARP_SIZE];
}
}
}
@@ -69,18 +70,17 @@ static __device__ void bcastDeep(
template<int UnrollPeers, typename T>
static __device__ void bcastEnds(
ncclSymPrims& prim, int tn, int t,
T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
ncclSymkArgsHandler const& handler, int tn, int t,
ncclSymPtr<T> input, ncclSymPtr<T> output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
) {
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
BytePack<sizeof(T)>* inpHere = (BytePack<sizeof(T)>*)inputHere;
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
BytePack<sizeof(T)>* inpPacks = (BytePack<sizeof(T)>*)input.localPtr();
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
#pragma unroll 1
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i;
BytePack<sizeof(T)> tmp = inpHere[elt];
BytePack<sizeof(T)> tmp = inpPacks[elt];
int dr = inPlace ? 1 : 0;
int r = rank + dr;
if (r == nRanks) r = 0;
@@ -88,14 +88,14 @@ static __device__ void bcastEnds(
for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) {
#pragma unroll UnrollPeers
for (int u=0; u < UnrollPeers; u++) {
*add4G(outRank0+elt, r*stride4G) = tmp;
outPacks.lsaPtr(r)[elt] = tmp;
if (++r == nRanks) r = 0;
}
}
#pragma unroll UnrollPeers
for (int u=0; u < UnrollPeers; u++) {
if (dr+u == nRanks) break;
*add4G(outRank0+elt, r*stride4G) = tmp;
outPacks.lsaPtr(r)[elt] = tmp;
if (++r == nRanks) r = 0;
}
}
@@ -103,95 +103,95 @@ static __device__ void bcastEnds(
template<typename T>
static __device__ void bcast(
ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
bool inPlace = (input == output);
// Move to rank=0
output = prim.peerPtr(0, output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
uint32_t nPreBytes = (128u - inputUptr)%128u;
uint32_t nPreBytes = (16 - input.offset)%16;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t cursor = nPreBytes;
constexpr int MinWarpPerBlock = 4;
if ((inputUptr-outputUptr)%16 == 0) {
constexpr int BytePerPack = 16, UnrollPacks = 1, UnrollPeers = 1;
if ((input.offset - output.offset)%16 == 0) {
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
bcastDeep<BytePerPack, UnrollPacks, UnrollPeers>(
prim, tn, t, waitNeeded,
(char*)input + cursor, (char*)output + cursor, inPlace,
chunks*MinWarpPerBlock
handler, tn, t, waitNeeded, bar,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
inPlace, chunks*MinWarpPerBlock
);
cursor = cursorAfter;
waitNeeded = false;
}
}
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
constexpr int BytePerPack = 4, UnrollPacks = 1, UnrollPeers = 1;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>(
prim, tn, t, waitNeeded,
(char*)input + cursor, (char*)output + cursor, inPlace,
chunks*MinWarpPerBlock
handler, tn, t, waitNeeded, bar,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
inPlace, chunks*MinWarpPerBlock
);
cursor = cursorAfter;
waitNeeded = false;
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
constexpr int UnrollPeers = 8;
size_t nSufElts = (nBytes-cursor)/sizeof(T);
bcastEnds<UnrollPeers>(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
bcastEnds<UnrollPeers>(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
}
__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
int const& rank = prim.rank;
__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
};
int const& rank = handler.comm.rank;
// Threads numbered over rank.
int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int btn = prim.nBlocks*blockDim.x;
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bool waitNeeded = true;
handler.forEachWork<char>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<char> input, ncclSymPtr<char> output) {
// Threads numbered over rank.
int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int btn = nBlocks*blockDim.x;
bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts);
bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
waitNeeded = false;
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<typename T>
static __device__ void bcastMultimem(
ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts
ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
// Move output to multimem
output = prim.multimemPtr(output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
uint32_t nPreBytes = (16-inputUptr)%16;
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input.localPtr());
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output.multimemPtr(handler.comm.lsaMultimem));
uint32_t nPreBytes = (16 - input.offset)%16;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t nSufBytes;
@@ -230,51 +230,52 @@ static __device__ void bcastMultimem(
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
BytePack<sizeof(T)> val = *reinterpret_cast<BytePack<sizeof(T)>*>(inputUptr + cursor);
multimem_st_global(outputUptr + cursor, val);
cursor += tn*sizeof(T);
}
}
__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
int const& rank = prim.rank;
__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar(
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
);
int const& rank = handler.comm.rank;
char* input = args->input;
char* output = args->output;
size_t bytes = args->nElts;
// Round robin memory to blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = prim.nBlocks*blockDim.x;
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
handler.forEachWork<char>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<char> input, ncclSymPtr<char> output) {
// Round robin memory to blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = nBlocks*blockDim.x;
bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes);
bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts);
}
);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<typename EltType>
static __device__ void allgather_LL_body(
ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
) {
using Pack = BytePack<8>;
constexpr int EltPerPack = 8/sizeof(EltType);
ncclCoopCta cta;
int rank = prim.rank;
int nRanks = prim.nRanks;
constexpr int tn = ncclSymMaxThreads;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
int t = threadIdx.x;
constexpr int tn = ncclSymkMaxThreads;
#pragma unroll 1
while (0 < nElts) {
int nIterPacks = min(nPacks, tn);
if (t < nIterPacks) {
Pack x = loadPack<Pack>(input, t*EltPerPack, nElts);
prim.bcastLL(/*slot=*/nIterPacks*rank + t, x);
lla2a.bcast(/*slot=*/nIterPacks*rank + t, x);
}
int tn_div_nPacks = tn/nIterPacks;
@@ -287,7 +288,7 @@ static __device__ void allgather_LL_body(
#pragma unroll 1
for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) {
Pack got[Unroll];
prim.template recvLL<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
lla2a.template recvUnrolled<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
#pragma unroll
for (int u=0; u < Unroll; u++) {
storePack<Pack>(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
@@ -302,7 +303,7 @@ static __device__ void allgather_LL_body(
if (i + n*tn < nRanks*nIterPacks) n += 1;
if (n != 0) {
Pack got[Unroll];
prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got);
lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got);
#pragma unroll
for (int u=0; u < Unroll; u++) {
if (u != 0 && u == n) break;
@@ -316,7 +317,7 @@ static __device__ void allgather_LL_body(
// The non-unrolled but "obviously correct" implementation for reference.
#pragma unroll 1
for (int i = t; i < nRanks*nIterPacks; i += tn) {
Pack got = prim.template recvLL<Pack>(i);
Pack got = lla2a.template recv<Pack>(i);
storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
peer += tn_div_nPacks;
pack += tn_mod_nPacks;
@@ -324,7 +325,7 @@ static __device__ void allgather_LL_body(
}
#endif
prim.endLL(cta);
lla2a.endEpoch(ncclCoopCta());
input += tn*EltPerPack;
output += tn*EltPerPack;
@@ -333,38 +334,41 @@ static __device__ void allgather_LL_body(
}
}
static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
ncclSymkArgsHandler handler{args};
ncclLLA2ASession<ncclCoopCta> lla2a(
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
);
using Pack = BytePack<8>;
constexpr int BytePerPack = 8;
int nElts = args->nElts;
int nPacks = divUp(nElts, BytePerPack);
uint32_t nPackPerBlock, nPackModBlock;
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
int nBlockPacks = blockPackEnd - blockPackBegin;
int nBlockElts = nElts - blockPackBegin*BytePerPack;
nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack);
char* blockInput = args->input + blockPackBegin*BytePerPack;
char* blockOutput = args->output + blockPackBegin*BytePerPack;
handler.singleWork<char>(
[&]__device__(int nElts, int nAllElts,
ncclSymPtr<char> input, ncclSymPtr<char> output) {
int nPacks = divUp(nElts, BytePerPack);
uint32_t lowBits = args->nElts;
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
if (__builtin_expect(lowBits%8 == 0, true)) {
// NOTE: Specializing for 8-byte alignment in one case helps at size=65K: 8.9us vs 5.6us
allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8);
} else {
allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts);
}
char* blockInput = input.localPtr();
char* blockOutput = output.localPtr();
uint32_t lowBits = nElts;
lowBits |= (uintptr_t)blockInput;
lowBits |= (uintptr_t)blockOutput;
if (__builtin_expect(lowBits%8 == 0, true)) {
// NOTE: Specializing for 8-byte alignment in one case helps at size=65K: 8.9us vs 5.6us
allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput,
nElts/8, nPacks, nAllElts/8);
} else {
allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts);
}
}
);
}
__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) {
ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false);
__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false);
}
__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) {
ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true);
__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true);
}
@@ -1,38 +1,41 @@
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
#include "symmetric.h"
#include "sym_kernels.h"
#include "nccl_device.h"
#include "symmetric/kernel.h"
#include "symmetric/primitives.h"
template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
static __device__ __forceinline__ void allreduceDeep(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
Red red, char* inputRank0, char* outputRank0, int32_t nIters
ncclSymkArgsHandler const& handler, int tn, int t,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
) {
using Pack = BytePack<BytePerPack>;
using Acc = typename Red::EltType;
using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;
ncclTeam world = ncclTeamWorld(handler.comm);
int wn = tn/WARP_SIZE;
int w = t/WARP_SIZE;
int lane = t%WARP_SIZE;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack acc0[UnrollPacks];
nIters -= w;
if (0 < nIters) {
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
if (0 < nIters) {
while (true) {
@@ -42,7 +45,7 @@ static __device__ __forceinline__ void allreduceDeep(
{ Pack tmp1[UnrollPacks];
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
}
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
@@ -67,7 +70,7 @@ static __device__ __forceinline__ void allreduceDeep(
if (partial && ur!=0 && dr+ur == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
}
if (++r == nRanks) r = 0;
}
@@ -98,22 +101,22 @@ static __device__ __forceinline__ void allreduceDeep(
if (partial && dr == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u];
outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u];
}
if (++r == nRanks) r = 0;
}
}
}
inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
nIters -= wn;
if (nIters <= 0) break;
// Load data for next iteration.
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
}
}
}
@@ -121,21 +124,23 @@ static __device__ __forceinline__ void allreduceDeep(
template<int UnrollPeers, typename Red, typename T>
static __device__ __forceinline__ void allreduceEnds(
ncclSymPrims& prim, int tn, int t, Red red,
T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts
ncclSymkArgsHandler const& handler, int tn, int t, Red red,
ncclSymPtr<T> input, ncclSymPtr<T> output,
size_t nElts, uint32_t nPreElts, size_t nSufElts
) {
using Acc = typename Red::EltType;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
ncclTeam world = ncclTeamWorld(handler.comm);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
#pragma unroll 1
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
BytePack<sizeof(Acc)> acc1;
BytePack<sizeof(T)> tmp[UnrollPeers];
int dr = 1;
@@ -154,7 +159,7 @@ static __device__ __forceinline__ void allreduceEnds(
#pragma unroll
for (int u=0; u < UnrollPeers-partial; u++) {
if (partial && u!=0 && dr+u == nRanks) break;
tmp[u] = *add4G(inpRank0+elt, r*stride4G);
tmp[u] = inpPacks.peerPtr(world, r)[elt];
r += 1;
if (r == nRanks) r = 0;
}
@@ -182,7 +187,7 @@ static __device__ __forceinline__ void allreduceEnds(
#pragma unroll
for (int u=0; u < UnrollPeers-partial; u++) {
if (partial && dr+u == nRanks) break;
*add4G(outRank0+elt, r*stride4G) = acc0;
outPacks.peerPtr(world, r)[elt] = acc0;
r += 1;
if (r == nRanks) r = 0;
}
@@ -193,35 +198,33 @@ static __device__ __forceinline__ void allreduceEnds(
template<typename Red, typename T>
static __device__ void allreduce(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
Red red, T* input, T* output, size_t nElts
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
int nRanks = prim.nRanks;
int nBlocks = prim.nBlocks;
// Move to rank=0
input = prim.peerPtr(0, input);
output = prim.peerPtr(0, output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
int const& nRanks = handler.comm.nRanks;
int const& nRanks_rcp32 = handler.nRanks_rcp32;
size_t nBytes = nElts*sizeof(T);
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);
uint32_t nPreBytes = (16u - inputUptr)%16u;
uint32_t nPreBytes = (16u - input.offset)%16u;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t cursor = nPreBytes;
constexpr int MinWarpPerBlock = 4;
if ((inputUptr-outputUptr)%16 == 0) {
if ((input.offset - output.offset)%16 == 0) {
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
allreduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
prim, tn, t, waitNeeded, red,
(char*)input + cursor, (char*)output + cursor,
handler, tn, t, waitNeeded, bar, red,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
chunks*MinWarpPerBlock
);
cursor = cursorAfter;
@@ -229,16 +232,17 @@ static __device__ void allreduce(
}
}
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
prim, tn, t, waitNeeded, red,
(char*)input + cursor, (char*)output + cursor,
handler, tn, t, waitNeeded, bar, red,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
chunks*MinWarpPerBlock
);
cursor = cursorAfter;
@@ -246,46 +250,51 @@ static __device__ void allreduce(
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
constexpr int UnrollPeers = 8;
size_t nSufElts = (nBytes-cursor)/sizeof(T);
allreduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
allreduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
int /*const&*/ rank = prim.rank;
int /*const&*/ nRanks = prim.nRanks;
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
};
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
rank, nRanks,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int gtn = nRanks*prim.nBlocks*blockDim.x;
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts);
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bool waitNeeded = true;
handler.forEachWork<T>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<T> input, ncclSymPtr<T> output) {
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
rank, nRanks,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int gtn = nRanks*nBlocks*blockDim.x;
allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts);
waitNeeded = false;
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<typename Red, typename T>
static __device__ void allreduceMultimem(
ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
int tn, int t, Red red, T* input, T* output, size_t nElts
) {
// Move to multimem
input = prim.multimemPtr(input);
output = prim.multimemPtr(output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
@@ -330,106 +339,132 @@ static __device__ void allreduceMultimem(
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
multimem_st_global(outputUptr + cursor, val);
cursor += tn*sizeof(T);
}
}
// All-reduce entry point that hands the element-wise work to allreduceMultimem
// and brackets it with barriers taken by the whole ncclCoopCta() group.
//
// NOTE(review): this span is a diff rendering with the +/- markers stripped, so
// two revisions of the function are interleaved line by line: one taking
// `ncclSymDevArgs const*` and driving an `ncclSymPrims prim`, and one taking
// `ncclSymkDevWorkArgs const*` and driving an `ncclSymkArgsHandler handler` plus
// an `ncclLsaBarrierSession bar`. Presumably the `prim`-based lines are the
// removed side and the `handler`-based lines the added side — confirm against
// the actual file before editing.
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
// Barrier session indexed by blockIdx.x and constructed with multimem enabled.
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
};
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.rank, prim.nRanks,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
// Total number of participating threads: ranks x blocks x threads per block.
int gtn = prim.nRanks*prim.nBlocks*blockDim.x;
// The reduction functor is instantiated on the accumulator type selected with nvls=true.
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg);
// Entry barrier with neither release nor acquire semantics requested.
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
auto const& multimem = handler.comm.lsaMultimem;
allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts);
// Entry barrier of the handler-based revision, using relaxed ordering.
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
// Exit barrier of the prim-based revision: arrive with release, wait without acquire.
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
// Run the all-reduce once per work item supplied by the handler; the callback
// receives that item's block index, block count, element counts and buffers.
handler.forEachWork<T>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<T> input, ncclSymPtr<T> output) {
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
rank, nRanks,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
// Total number of participating threads for this work item.
int gtn = nRanks*nBlocks*blockDim.x;
// Both buffers are passed as multimem pointers derived from the comm's lsaMultimem handle.
allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts);
}
);
// Exit barrier of the handler-based revision, using release ordering.
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
// Shared implementation behind the AGxLL_R and AGxLLMC_R all-reduce entry
// points. Each participating thread loads one 8-byte Pack of local input,
// broadcasts it into a slot derived from its rank and thread index, reduces the
// slots belonging to its thread index, and stores the result to local output.
// `multimem` is forwarded to the primitive/session constructor.
//
// NOTE(review): this span is a diff rendering with the +/- markers stripped, so
// two revisions are interleaved: one built on `ncclSymPrims prim`
// (bcastLL / recvReduceLL / endLL) and one built on `ncclLLA2ASession lla2a`
// (bcast / recvReduce / endEpoch) inside a `handler.singleWork` callback.
// Several declarations therefore appear twice (input, output, cta, t, tn,
// packAligned). Presumably the `prim`-based lines are the removed side —
// confirm against the actual file before editing.
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
int /*const&*/ rank = prim.rank;
using Acc = typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type;
Red<Acc> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
ncclSymkArgsHandler handler{args};
// LL all-to-all session for this CTA, indexed by blockIdx.x and sized with ncclSymkMaxThreads.
ncclLLA2ASession<ncclCoopCta> lla2a(
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A,
blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
using Acc = typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type;
Red<Acc> red(handler.devWork->redOpArg);
// Data moves in 8-byte packs; AccPack holds the same number of elements widened to Acc.
using Pack = BytePack<8>;
using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
constexpr int EltPerPack = 8/sizeof(T);
int nElts = args->nElts;
int nPacks = divUp(nElts, EltPerPack);
// The pack-wise path is usable when T is at least 8-byte aligned, or when the
// byte count and both buffer addresses are all multiples of 8.
bool packAligned = 8 <= alignof(T) || (
args->nElts*sizeof(T) |
(uint32_t)reinterpret_cast<uintptr_t>(args->input) |
(uint32_t)reinterpret_cast<uintptr_t>(args->output)
)%8 == 0;
handler.singleWork<T>(
[&]__device__(int nElts, int nAllElts,
ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
int nPacks = divUp(nElts, EltPerPack);
// Split nPacks across prim.nBlocks (assuming idivmodFast32 yields quotient and
// remainder): every block takes nPackPerBlock packs and the first
// nPackModBlock blocks take one extra.
uint32_t nPackPerBlock, nPackModBlock;
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
int begin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
T* input = (T*)inputPtr.localPtr();
T* output = (T*)outputPtr.localPtr();
// Re-base the counts onto this block's slice and clamp nElts to the slice length.
nPacks = end - begin;
nElts -= begin*EltPerPack;
nElts = min(nElts, nPacks*EltPerPack);
T* input = (T*)args->input + begin*EltPerPack;
T* output = (T*)args->output + begin*EltPerPack;
// Same alignment test as above, evaluated on the callback's local pointers and element count.
bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0;
ncclCoopCta cta;
int t = threadIdx.x;
int tn = ncclSymMaxThreads;
ncclCoopCta cta;
int t = threadIdx.x;
int tn = ncclSymkMaxThreads;
if (__builtin_expect(packAligned, true)) {
#pragma unroll 1
while (0 < nPacks) {
if (t < nPacks) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
storePack((Pack*)output, t, nPacks, out);
// Aligned path: each loop iteration handles up to tn packs, one per thread.
if (__builtin_expect(packAligned, true)) {
#pragma unroll 1
while (0 < nPacks) {
// Only threads that own a pack in this iteration load/broadcast/reduce/store.
if (t < nPacks) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
// Slot layout per the argument labels: this rank writes slot nIterPacks*rank + t,
// and thread t reduces nRanks slots starting at t with stride nIterPacks.
lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
return applyCast<T, Acc>(x);
},
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
return applyReduce(red, a, b);
}
);
// Narrow the accumulated result back to T before writing it out.
storePack((Pack*)output, t, nPacks, applyCast<Acc, T>(out));
}
// Reached by every thread, including those that skipped the guarded section above.
lla2a.endEpoch(cta);
// Advance both buffers and the remaining count by one iteration's worth of packs.
input += tn*EltPerPack;
output += tn*EltPerPack;
nPacks -= tn;
}
} else {
// Unaligned fallback: index by element (t*EltPerPack) instead of by Pack, and
// pass the remaining element count to loadPack/storePack — presumably so
// they can bound a partial pack at the tail; confirm against their definitions.
#pragma unroll 1
while (0 < nElts) {
if (t*EltPerPack < nElts) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
return applyCast<T, Acc>(x);
},
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
return applyReduce(red, a, b);
}
);
storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(out));
}
// Reached by every thread, including those that skipped the guarded section above.
lla2a.endEpoch(cta);
// Both the element count and the pack count are kept in step here.
input += tn*EltPerPack;
output += tn*EltPerPack;
nElts -= tn*EltPerPack;
nPacks -= tn;
}
}
}
prim.endLL(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
nPacks -= tn;
}
} else {
#pragma unroll 1
while (0 < nElts) {
if (t*EltPerPack < nElts) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
storePack(output, t*EltPerPack, nElts, out);
}
prim.endLL(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
nElts -= tn*EltPerPack;
nPacks -= tn;
}
}
);
}
// AGxLL_R all-reduce entry point: forwards `args` to the shared
// AllReduce_AGxLL_R_impl with multimem disabled.
// NOTE(review): this span is a diff rendering with the +/- markers stripped, so
// two revisions of the wrapper (ncclSymRun_* taking ncclSymDevArgs and
// ncclSymkRun_* taking ncclSymkDevWorkArgs) are shown back to back under a
// single template header; only one signature/body pair exists in the real file.
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) {
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
}
// AGxLLMC_R all-reduce entry point: forwards `args` to the shared
// AllReduce_AGxLL_R_impl with multimem enabled.
// NOTE(review): this span is a diff rendering with the +/- markers stripped, so
// two revisions of the wrapper (ncclSymRun_* taking ncclSymDevArgs and
// ncclSymkRun_* taking ncclSymkDevWorkArgs) are shown back to back under a
// single template header; only one signature/body pair exists in the real file.
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) {
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
}

Some files were not shown because too many files have changed in this diff Show More