Add 'projects/rccl/' from commit '1f2f9f33bac3e8ecfd84c69af6063d7352c362fc'

git-subtree-dir: projects/rccl
git-subtree-mainline: 3fd8a0d393
git-subtree-split: 1f2f9f33ba
Cette révision appartient à :
Ameya Keshava Mallya
2025-12-11 20:46:05 +00:00
révision 42d84317cf
682 fichiers modifiés avec 293338 ajouts et 0 suppressions
+71
Voir le fichier
@@ -0,0 +1,71 @@
# Scheduled (nightly) RCCL pipeline: builds RCCL with tests enabled through the
# shared CMake step template from ROCm/ROCm, runs rccl-UnitTests, then runs
# every pytest script named in the pytestList parameter.
resources:
  repositories:
    - repository: pipelines_repo
      type: github
      endpoint: ROCm
      name: ROCm/ROCm

variables:
  - group: common
  - template: /.azuredevops/variables-global.yml@pipelines_repo
  - name: pytestFolder
    value: '.azuredevops/tests/pytest'

parameters:
  # Base names (without the .py suffix) of the pytest scripts to run.
  - name: pytestList
    type: object
    default:
      - HelloWorld

# CI and PR triggers are disabled; only the schedules below start a run.
trigger: none
pr: none

schedules:
  # The winter months are written as two ascending ranges. A reversed range
  # such as 11-3 is not guaranteed to wrap around the year end, which could
  # leave December-February without a nightly run.
  - cron: "0 5 * 1-3,11-12 *"  # 11 PM CST (November - March)
    displayName: "Nightly Build (CST)"
    branches:
      include:
        - develop
    always: false
  - cron: "0 4 * 4-10 *"  # 11 PM CDT (April - October)
    displayName: "Nightly Build (CDT)"
    branches:
      include:
        - develop
    always: false

jobs:
  - job: rccl
    timeoutInMinutes: 180
    pool: rocm-ci_rccl_pool
    workspace:
      clean: all
    steps:
      - task: DeleteFiles@1
        inputs:
          Contents: '**/*'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
        parameters:
          submoduleBehaviour: recursive
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
        parameters:
          installEnabled: false
          printDiskSpace: false
          extraBuildFlags: >-
            -DCMAKE_BUILD_TYPE=Release
            -DBUILD_TESTS=ON
            -GNinja
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
        parameters:
          componentName: rccl
          testDir: $(Build.SourcesDirectory)/build/test
          testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
          testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
      # One Bash step per pytest script; a failing script does not fail the job.
      - ${{ each pytestScript in parameters.pytestList }}:
          - task: Bash@3
            displayName: Test ${{ pytestScript }}
            continueOnError: true
            inputs:
              targetType: inline
              workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
              script: pytest ${{ pytestScript }}.py
+77
Voir le fichier
@@ -0,0 +1,77 @@
# Pull-request pipeline for RCCL: triggered by non-draft PRs targeting develop
# (documentation-only paths excluded). The single stage holds a deployment job
# bound to the `rccl` environment plus the build-and-test job.
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
- name: pytestFolder
value: '.azuredevops/tests/pytest'
# Base names (without the .py suffix) of the pytest scripts run at the end.
parameters:
- name: pytestList
type: object
default:
- HelloWorld
# Branch pushes do not start this pipeline; only the PR trigger below does.
trigger: none
pr:
autoCancel: true
branches:
include:
- develop
paths:
exclude:
- .github
- .jenkins
- docs
- '*.md'
- LICENSE.txt
- NOTICES.txt
drafts: false
stages:
- stage: rcclStage
displayName: 'RCCL develop PR'
jobs:
# NOTE(review): presumably the `rccl` environment carries an approval check
# that gates the run — confirm in the Azure DevOps environment settings.
- deployment: rccl_pr_approval
displayName: "CI Run Requires Approval"
environment: rccl
- job: rccl
timeoutInMinutes: 180
pool: rocm-ci_rccl_pool
workspace:
clean: all
steps:
- task: DeleteFiles@1
inputs:
Contents: '**/*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
parameters:
submoduleBehaviour: recursive
# Unlike the nightly pipeline, the PR build pins GPU_TARGETS to gfx942.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
parameters:
installEnabled: false
printDiskSpace: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DBUILD_TESTS=ON
-DGPU_TARGETS=gfx942
-GNinja
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
parameters:
componentName: rccl
testDir: $(Build.SourcesDirectory)/build/test
testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
# One Bash step per pytest script; continueOnError keeps a failure non-fatal.
- ${{ each pytestScript in parameters.pytestList }}:
- task: Bash@3
displayName: Test ${{ pytestScript }}
continueOnError: true
inputs:
targetType: inline
workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
script: pytest ${{ pytestScript }}.py
+44
Voir le fichier
@@ -0,0 +1,44 @@
# Scheduled (nightly) RCCL pipeline for the Slurm pool: checks out the sources
# and delegates build and test to the local step templates under templates/.
resources:
  repositories:
    - repository: pipelines_repo
      type: github
      endpoint: ROCm
      name: ROCm/ROCm

variables:
  - group: common
  - template: /.azuredevops/variables-global.yml@pipelines_repo

# CI and PR triggers are disabled; only the schedules below start a run.
trigger: none
pr: none

schedules:
  # The winter months are written as two ascending ranges. A reversed range
  # such as 11-3 is not guaranteed to wrap around the year end, which could
  # leave December-February without a nightly run.
  - cron: "0 5 * 1-3,11-12 *"  # 11 PM CST (November - March)
    displayName: "Nightly Build (CST)"
    branches:
      include:
        - develop
    always: false
  - cron: "0 4 * 4-10 *"  # 11 PM CDT (April - October)
    displayName: "Nightly Build (CDT)"
    branches:
      include:
        - develop
    always: false

jobs:
  - job: rccl
    timeoutInMinutes: 240
    pool: rocm-ci_rccl_slurm_pool
    workspace:
      clean: all
    steps:
      - task: DeleteFiles@1
        inputs:
          Contents: '**/*'
      - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
        parameters:
          submoduleBehaviour: recursive
      # Local templates: each submits a Slurm batch script and waits for it.
      - template: templates/build.yml
      - template: templates/test_rccl-UnitTests.yml
      - template: templates/test_rccl-tests.yml
+49
Voir le fichier
@@ -0,0 +1,49 @@
# Pull-request pipeline for RCCL on the Slurm pool: triggered by non-draft PRs
# targeting develop (documentation-only paths excluded). Build and test are
# delegated to the local step templates under templates/.
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
# Branch pushes do not start this pipeline; only the PR trigger below does.
trigger: none
pr:
autoCancel: true
branches:
include:
- develop
paths:
exclude:
- .github
- .jenkins
- docs
- '*.md'
- LICENSE.txt
- NOTICES.txt
drafts: false
stages:
- stage: rcclStage
displayName: 'RCCL develop PR'
jobs:
# NOTE(review): presumably the `rccl` environment carries an approval check
# that gates the run — confirm in the Azure DevOps environment settings.
- deployment: rccl_pr_approval
displayName: "CI Run Requires Approval"
environment: rccl
- job: rccl
timeoutInMinutes: 240
pool: rocm-ci_rccl_slurm_pool
workspace:
clean: all
steps:
- task: DeleteFiles@1
inputs:
Contents: '**/*'
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
parameters:
submoduleBehaviour: recursive
# Local templates, executed in order: build, unit tests, rccl-tests.
- template: templates/build.yml
- template: templates/test_rccl-UnitTests.yml
- template: templates/test_rccl-tests.yml
+73
Voir le fichier
@@ -0,0 +1,73 @@
# Entry pipeline that forwards to the shared rccl component template in
# ROCm/ROCm. Runs on pushes and non-draft PRs for develop and mainline, with
# documentation and dot-prefixed YAML paths excluded.
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
parameters:
# Git refs used for the two repository resources declared below.
- name: pipelinesRepoRef
type: string
default: refs/heads/develop
- name: systemsRepoRef
type: string
default: refs/heads/develop
# NOTE(review): the default points at rocprofiler-sdk rather than rccl —
# presumably intentional for the component template; confirm.
- name: systemsSparseCheckoutDir
type: string
default: 'projects/rocprofiler-sdk'
- name: triggerDownstreamJobs
type: boolean
default: true
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
ref: ${{ parameters.pipelinesRepoRef }}
- repository: systems_repo
type: github
endpoint: ROCm
name: ROCm/rocm-systems
ref: ${{ parameters.systemsRepoRef }}
# batch: true is set on the push trigger; the path filters match the PR trigger.
trigger:
batch: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
- LICENSE.txt
- NOTICES.txt
pr:
autoCancel: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
- LICENSE.txt
- NOTICES.txt
drafts: false
stages:
- stage: rccl
jobs:
# All job definitions come from the component template; only parameters are
# supplied here.
- template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo
parameters:
sparseCheckoutDir: ''
systemsRepo: systems_repo
systemsSparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
triggerDownstreamJobs: ${{ parameters.triggerDownstreamJobs }}
+54
Voir le fichier
@@ -0,0 +1,54 @@
#!/bin/bash
#SBATCH --job-name=rccl-build
#SBATCH --output=rccl-build-%j.out
#SBATCH --error=rccl-build-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt
# Slurm batch script that builds and installs RCCL, builds the RCCL Replayer
# tool, then clones, builds and installs rccl-tests. stdout and stderr share one
# log file. Reads BINARIES_DIR, ROCM_PATH and MPI_HOME from the environment.
# Host name with the domain removed, keeping the third dash-separated field onward.
short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"
source /etc/profile.d/lmod.sh
module load rocm/6.4.1
# Setup local binary path
export PATH="$HOME/.local/bin:$PATH"
mkdir -p "$HOME/.local/bin"
# Install Ninja if not already available
if ! command -v ninja &>/dev/null; then
echo "Ninja not found. Installing locally..."
wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
chmod +x "$HOME/.local/bin/ninja"
fi
echo "Using Ninja at: $(which ninja)"
ninja --version
# Define GPU target
export GPU_TARGETS="gfx942"
# Work from the directory sbatch was invoked in (falls back to the current one).
cd "${SLURM_SUBMIT_DIR:-$PWD}"
## Building RCCL
mkdir -p build
cd build
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
cmake --build .
cmake --build . --target install
# Building RCCL Replayer
# Try tools/RcclReplayer first, then a sibling RcclReplayer directory.
cd ../tools/RcclReplayer 2>/dev/null || cd ../RcclReplayer
RCCL_DIR="../../build" ROCM_DIR="$ROCM_PATH" MPI_DIR="$MPI_HOME" make
cd "${SLURM_SUBMIT_DIR:-$PWD}"
## Building RCCL-Tests
# rccl-tests is configured against the RCCL install prefix and MPI_HOME.
git clone https://github.com/ROCm/rccl-tests
cd rccl-tests
mkdir -p build
cd build
cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DROCM_PATH="$ROCM_PATH" ..
cmake --build .
cmake --build . --target install
+16
Voir le fichier
@@ -0,0 +1,16 @@
#!/bin/bash
#SBATCH --job-name=rccl-UnitTests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=180
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt
# Slurm batch script that runs the installed rccl-UnitTests binary and writes a
# gtest XML report into PIPELINE_WORKSPACE. Reads BINARIES_DIR and
# PIPELINE_WORKSPACE from the environment.
# Host name with the domain removed, keeping the third dash-separated field onward.
short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"
source /etc/profile.d/lmod.sh
module load rocm/6.4.1
cd "$BINARIES_DIR/bin"
# The installed lib directory is put first on LD_LIBRARY_PATH for this run only.
LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml --gtest_color=yes
+62
Voir le fichier
@@ -0,0 +1,62 @@
#!/bin/bash
#SBATCH --job-name=rccl-tests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that runs the rccl-tests performance binaries on a single
# node (8 ranks) for every collective/datatype combination listed below. Each
# run writes a .log and a .json file under
# ${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs.
# Reads BINARIES_DIR, PIPELINE_WORKSPACE and MPI_HOME from the environment.

# Host name with the domain removed, keeping the third dash-separated field onward.
short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.1

# Create the results tree before entering it; the previous order attempted the
# cd first, which fails when TestResults does not exist yet.
export WORKDIR=${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs
mkdir -p "${WORKDIR}"
cd "${PIPELINE_WORKSPACE}/TestResults" || exit 1

export PATH="$BINARIES_DIR/bin:$PATH"
export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"

### create hostlist
#nodelist=($(scontrol show hostnames))
#echo "SLURM nodes:"
#echo ${nodelist[@]}
#echo ""
#
#hosts_8ppn=()
#for node in "${nodelist[@]}"
#do
# hosts_8ppn+=("${node}:8")
#done
#echo ${hosts_8ppn[@]}

### Run multi- and single-node RCCL-Tests
## Run single-node RCCL-Tests
for n in 1
do
    # 8 ranks per node.
    total=$((n*8))
    #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
    for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
    do
        for dtype in float bfloat16 half fp8_e5m2
        do
            out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
            #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
            cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
            # The first tee truncates the log; the later ones append to it.
            echo "Running ${coll}" 2>&1 | tee ${out_filename}
            echo "Run cmd: ${cmd}" 2>&1 | tee -a ${out_filename}
            eval ${cmd} 2>&1 | tee -a ${out_filename}
            sleep 2
        done
    done
done

## To add
### Summarize results
### Convert to junit
+86
Voir le fichier
@@ -0,0 +1,86 @@
# Step template: submits .azuredevops/slurm/build.sh through sbatch, waits for
# the Slurm job to finish, then verifies a small subset of installed files to
# determine pass/fail. A second step always prints the Slurm log.
parameters:
  - name: expectedInstallFiles
    type: object
    default:
      - bin/rccl-UnitTests
      - include/rccl/rccl.h
      - lib/cmake/rccl/rccl-config.cmake
      - lib/librccl.so
      - share/doc/rccl/LICENSE.txt
      - share/rccl/msccl-algorithms
      - share/rccl/msccl-unit-test-algorithms

steps:
  - task: Bash@3
    displayName: Build Job
    env:
      BINARIES_DIR: $(Build.BinariesDirectory)
    inputs:
      targetType: inline
      script: |
        echo "##[section]Starting build job..."
        rm -rf $(Build.BinariesDirectory)/*

        echo "Submitting build job..."
        mkdir -p $(Build.BinariesDirectory)
        BUILD_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/build.sh)
        # Without a job id the squeue/sacct polling below cannot track anything.
        if [ -z "$BUILD_JOB_ID" ]; then
          echo "##vso[task.logissue type=error]sbatch did not return a job id"
          exit 1
        fi
        echo "Submitted build job: $BUILD_JOB_ID"
        echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"

        echo "Waiting for build job to start..."
        while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
          echo "##[section]Build job $BUILD_JOB_ID is still running..."
          sleep 60
        done

        echo "Waiting for final status via sacct..."
        LOOP_COUNT=0
        MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
        # Remembers a failed terminal state so the step cannot end green.
        JOB_FAILED=0
        while true; do
          STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
          echo "##[section]Build job state: $STATE"
          if [[ "$STATE" == "COMPLETED" ]]; then
            break
          elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
            echo "##vso[task.logissue type=error]Build failed with state $STATE"
            JOB_FAILED=1
            break
          fi
          sleep 60
          LOOP_COUNT=$((LOOP_COUNT + 1))
          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
            echo "Time limit reached while waiting for final status."
            exit 1 # Exit with an error code if time limit is reached
          fi
        done

        # The file check still runs after a failed job so the log shows what is missing.
        echo "Checking for expected installed files..."
        MISSING_FILES=0
        expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
        i=1
        total=$(echo "$expectedFiles" | wc -w)
        while [ $i -le $total ]; do
          relpath=$(echo "$expectedFiles" | cut -d ' ' -f"$i")
          fullpath="$BINARIES_DIR/$relpath"
          if [ ! -e "$fullpath" ]; then
            echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
            MISSING_FILES=1
          fi
          i=$((i + 1))
        done
        if [ "$MISSING_FILES" -eq 1 ]; then
          echo "One or more expected files are missing from the install directory."
          exit 1
        else
          echo "All expected files are present in the install directory."
        fi

        # A job that ended FAILED/CANCELLED/TIMEOUT fails the step even when
        # the checked subset of files happens to be present.
        if [ "$JOB_FAILED" -eq 1 ]; then
          echo "Build job $BUILD_JOB_ID ended in state $STATE."
          exit 1
        fi
  - task: Bash@3
    displayName: Build Logs
    condition: always()
    inputs:
      targetType: inline
      script: |
        cat rccl-build-${BUILD_JOB_ID}.out || echo "No log found"
+69
Voir le fichier
@@ -0,0 +1,69 @@
# Step template: submits .azuredevops/slurm/test_rccl-UnitTests.sh through
# sbatch, waits for the Slurm job to finish, inspects the gtest XML report for
# failures, prints the Slurm log and publishes the report as JUnit results.
steps:
  - task: Bash@3
    displayName: RCCL UnitTests
    env:
      BINARIES_DIR: $(Build.BinariesDirectory)
      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
    inputs:
      targetType: inline
      script: |
        echo "Submitting test job..."
        TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh)
        # Without a job id the squeue/sacct polling below cannot track anything.
        if [ -z "$TEST_JOB_ID" ]; then
          echo "##vso[task.logissue type=error]sbatch did not return a job id"
          exit 1
        fi
        echo "Submitted test job: $TEST_JOB_ID"
        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"

        echo "Waiting for test job to start..."
        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
          echo "##[section]Test job $TEST_JOB_ID is still running..."
          sleep 60
        done

        echo "Waiting for final status via sacct..."
        LOOP_COUNT=0
        MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
        # Remembers a failed terminal state so the step cannot end green.
        JOB_FAILED=0
        while true; do
          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
          echo "##[section]Test job state: $STATE"
          if [[ "$STATE" == "COMPLETED" ]]; then
            break
          elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
            echo "Test failed with state $STATE"
            JOB_FAILED=1
            break
          fi
          sleep 60
          LOOP_COUNT=$((LOOP_COUNT + 1))
          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
            echo "Time limit reached while waiting for final status."
            exit 1 # Exit with an error code if time limit is reached
          fi
        done

        # The XML check still runs after a failed job so test failures are reported.
        echo "Checking test result XML for failures..."
        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
        if [ -z "$TEST_XML" ]; then
          # TEST_XML is empty in this branch, so the file name is spelled out.
          echo "##vso[task.logissue type=error]No rccl-UnitTests_output.xml file found"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi
        if grep -q 'failures="[^0]' "$TEST_XML"; then
          echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        else
          echo "No test failures detected."
        fi

        # A job that ended FAILED/CANCELLED/TIMEOUT fails the step even when
        # the XML report lists no failures.
        if [ "$JOB_FAILED" -eq 1 ]; then
          echo "##vso[task.logissue type=error]Test job $TEST_JOB_ID ended in state $STATE"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi
  - task: Bash@3
    displayName: Test Logs
    condition: always()
    inputs:
      targetType: inline
      script: |
        cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found"
  - task: PublishTestResults@2
    displayName: 'Publish Results'
    condition: succeededOrFailed()
    inputs:
      searchFolder: $(Pipeline.Workspace)
      testResultsFormat: JUnit
      testResultsFiles: '**/rccl-UnitTests_output.xml'
+77
Voir le fichier
@@ -0,0 +1,77 @@
# Step template: submits .azuredevops/slurm/test_rccl-tests.sh through sbatch,
# waits for the Slurm job to finish, checks that JSON result files were
# produced, and prints the Slurm log.
steps:
  - task: Bash@3
    displayName: RCCL-Tests
    env:
      BINARIES_DIR: $(Build.BinariesDirectory)
      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
    inputs:
      targetType: inline
      script: |
        echo "Submitting test job..."
        TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh)
        # Without a job id the squeue/sacct polling below cannot track anything.
        if [ -z "$TEST_JOB_ID" ]; then
          echo "##vso[task.logissue type=error]sbatch did not return a job id"
          exit 1
        fi
        echo "Submitted test job: $TEST_JOB_ID"
        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"

        echo "Waiting for test job to start..."
        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
          echo "##[section]Test job $TEST_JOB_ID is still running..."
          sleep 60
        done

        echo "Waiting for final status via sacct..."
        LOOP_COUNT=0
        MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
        # Remembers a failed terminal state so the step cannot end green.
        JOB_FAILED=0
        while true; do
          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
          echo "##[section]Test job state: $STATE"
          if [[ "$STATE" == "COMPLETED" ]]; then
            break
          elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
            echo "Test failed with state $STATE"
            JOB_FAILED=1
            break
          fi
          sleep 60
          LOOP_COUNT=$((LOOP_COUNT + 1))
          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
            echo "Time limit reached while waiting for final status."
            exit 1 # Exit with an error code if time limit is reached
          fi
        done

        echo "Checking test result json for failures..."
        TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
        if [ -z "$TEST_JSON" ]; then
          # TEST_JSON is empty in this branch, so the pattern is spelled out.
          echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi

        # JSON files from earlier collectives may exist even when the job was
        # cut short, so the recorded job state decides the final result.
        if [ "$JOB_FAILED" -eq 1 ]; then
          echo "##vso[task.logissue type=error]Test job $TEST_JOB_ID ended in state $STATE"
          echo "##vso[task.complete result=Failed;]DONE"
          exit 1
        fi

        #echo "Checking test result XML for failures..."
        #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
        #if [ -z "$TEST_XML" ]; then
        #  echo "##vso[task.logissue type=error]No rccl-tests_output.xml file found"
        #  echo "##vso[task.complete result=Failed;]DONE"
        #  exit 1
        #fi
        #if grep -q 'failures="[^0]' "$TEST_XML"; then
        #  echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
        #  echo "##vso[task.complete result=Failed;]DONE"
        #  exit 1
        #else
        #  echo "No test failures detected."
        #fi
  - task: Bash@3
    displayName: Test Logs
    condition: always()
    inputs:
      targetType: inline
      script: |
        cat rccl-tests-${TEST_JOB_ID}.out || echo "No log found"
  # - task: PublishTestResults@2
  #   displayName: 'Publish Results'
  #   condition: succeededOrFailed()
  #   inputs:
  #     searchFolder: $(Pipeline.Workspace)
  #     testResultsFormat: JUnit
  #     testResultsFiles: '**/rccl-tests_output.xml'
+5
Voir le fichier
@@ -0,0 +1,5 @@
import pytest
def test_HelloWorld():
    """Placeholder pytest case: a fixed greeting must equal the expected text."""
    expected = "Hello, World!"
    greeting = "Hello, World!"
    assert greeting == expected
+139
Voir le fichier
@@ -0,0 +1,139 @@
# Style file for MLSE Libraries based on the modified rocBLAS style
# Layout: the options before the first '---' carry no Language key; the two
# sections after it are tagged Language: Json and Language: Cpp.
# Common settings
BasedOnStyle: WebKit
TabWidth: 4
IndentWidth: 4
UseTab: Never
ColumnLimit: 100
UseCRLF: false
# Other languages (JavaScript, Proto) have no dedicated section in this file
---
# JSON formatting is switched off entirely.
Language: Json
DisableFormat: true
---
Language: Cpp
# http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code
# int formatted_code;
# // clang-format off
# void unformatted_code ;
# // clang-format on
# void formatted_code_again;
DisableFormat: false
Standard: Cpp11
AccessModifierOffset: -4
AlignAfterOpenBracket: true
AlignArrayOfStructures: Right
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: false
BinPackParameters: false
BitFieldColonSpacing: Both
# Configure each individual brace in BraceWrapping
BreakBeforeBraces: Custom
# Control of individual brace wrapping cases
BraceWrapping:
AfterCaseLabel: true
AfterClass: true
AfterControlStatement: Always
AfterEnum: true
AfterExternBlock: false
AfterFunction: true
AfterNamespace: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
BeforeLambdaBody: true
BeforeWhile: true
IndentBraces: false
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: All
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
BreakStringLiterals: true
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: false
DerivePointerAlignment: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: Always
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
# No project-specific foreach/if macros are registered.
ForEachMacros: []
IfMacros: []
IncludeBlocks: Preserve
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentPPDirectives: BeforeHash
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: true
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
PPIndentWidth: -1
PackConstructorInitializers: NextLine
# Penalty weights; the excess-character penalty is far larger than all others
# in this file.
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
QualifierAlignment: Leave
ReferenceAlignment: Pointer
ReflowComments: false
ShortNamespaceLines: 0
SortIncludes: CaseSensitive
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: false
SpaceAroundPointerQualifiers: Default
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: Never
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInCStyleCastParentheses: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInParentheses: false
SpacesInSquareBrackets: false
---
Externe Fichier exécutable
+9
Voir le fichier
@@ -0,0 +1,9 @@
# Default owners for every path not matched by a later rule.
* @ROCm/rccl-reviewers
# Documentation files
docs/ @ROCm/rocm-documentation
*.md @ROCm/rocm-documentation
*.rst @ROCm/rocm-documentation
.readthedocs.yaml @ROCm/rocm-documentation
# The API trace header has its own owning team.
src/include/api_trace.h @ROCm/ROCM-DevTools-Team
+23
Voir le fichier
@@ -0,0 +1,23 @@
## Details
___Do not mention proprietary info or link to internal work items in this PR.___
**Work item:** _"Internal", or link to GitHub issue (if applicable)._
**What were the changes?**
_One sentence describing the work done._
**Why were the changes made?**
_Explain the motivation behind the work. Provide any publicly-available historical context._
**How was the outcome achieved?**
_Technical details behind the work. Explain any publicly-available hardware peculiarities._
**Additional Documentation:**
_What else should the reviewer know?_
## Approval Checklist
___Do not approve until these items are satisfied.___
- [ ] Verify the CHANGELOG has been updated if any of the following apply:
  - there are NCCL API version changes,
  - the changes impact library users, and/or
  - the changes impact any other ROCm library.
+17
Voir le fichier
@@ -0,0 +1,17 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
# Daily pip updates for the manifests under /docs/sphinx, capped at 10 open PRs.
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
labels:
- "dependencies"
- "ci:docs-only"
# NOTE(review): verify that Dependabot still honors `reviewers`; CODEOWNERS
# may be the supported way to assign reviewers now.
reviewers:
- "samjwu"
+134
Voir le fichier
@@ -0,0 +1,134 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import fnmatch
import json
import os
from pathlib import Path
import subprocess
import sys
from typing import Iterable, Optional, Mapping
def gha_set_output(vars: Mapping[str, str | Path]):
"""Sets values in a step's output parameters.
This appends to the file located at the $GITHUB_OUTPUT environment variable.
See
* https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
* https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
"""
print(f"Setting github output:\n{vars}")
step_output_file = os.getenv("GITHUB_OUTPUT")
if not step_output_file:
print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
return
with open(step_output_file, "a") as f:
f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
    """Returns the paths of modified files relative to the base reference.

    Runs ``git diff --name-only <base_ref>`` with a 60 second limit.

    Args:
        base_ref: Git revision to diff the working tree against.

    Returns:
        The changed paths, one per output line, or ``None`` when git did not
        finish within the time limit.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    try:
        completed = subprocess.run(
            ["git", "diff", "--name-only", base_ref],
            stdout=subprocess.PIPE,
            check=True,
            text=True,
            timeout=60,
        )
    # subprocess.run signals an expired timeout with TimeoutExpired, which is
    # not a subclass of the builtin TimeoutError.
    except subprocess.TimeoutExpired:
        print(
            "Computing modified files timed out. Not using PR diff to determine"
            " jobs to run.",
            file=sys.stderr,
        )
        return None
    return completed.stdout.splitlines()
# File-name patterns (relative to .github/workflows/) of the workflows that
# count as CI-related.
GITHUB_WORKFLOWS_CI_PATTERNS = [
    "therock*.yml",
]


def is_path_workflow_file_related_to_ci(path: str) -> bool:
    """Returns true if ``path`` matches a CI workflow pattern under .github/workflows/."""
    for workflow_pattern in GITHUB_WORKFLOWS_CI_PATTERNS:
        if fnmatch.fnmatch(path, ".github/workflows/" + workflow_pattern):
            return True
    return False
def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if at least one path is a CI-related workflow file.

    ``None`` is treated like an empty collection and yields false.
    """
    if paths is not None:
        for candidate in paths:
            if is_path_workflow_file_related_to_ci(candidate):
                return True
    return False
# Changes limited to paths matching these patterns cannot affect build or test
# workflows. When every path touched by a commit/PR matches one of them, the
# related jobs can be skipped.
SKIPPABLE_PATH_PATTERNS = [
    "docs/*",
    "*.gitignore",
    "*.md",
    "*LICENSE*",
    "*NOTICES*",
    '.github/CODEOWNERS',
    '.github/*.md',
    '.github/dependabot.yml',
    '.azuredevops*',
]


def is_path_skippable(path: str) -> bool:
    """Determines if a given relative path to a file matches any skippable patterns."""
    for skip_pattern in SKIPPABLE_PATH_PATTERNS:
        if fnmatch.fnmatch(path, skip_pattern):
            return True
    return False
def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if at least one path is not in the skippable set.

    ``None`` is treated like an empty collection and yields false.
    """
    if paths is not None:
        for candidate in paths:
            if not is_path_skippable(candidate):
                return True
    return False
def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if CI workflows should run given a list of modified paths.

    Paths under ``.github/workflows`` enable CI only when they match a
    CI-related workflow pattern; every other path enables CI unless it matches
    a skippable pattern.

    Args:
        paths: Modified paths, or ``None`` when they could not be determined.

    Returns:
        True when TheRock CI jobs should run, False otherwise.
    """
    if paths is None:
        print("No files were modified, skipping TheRock CI jobs")
        return False

    # Materialize once: ``paths`` is only promised to be an Iterable, so a
    # second pass over a one-shot iterator would silently see nothing.
    paths_set = set(paths)
    github_workflows_paths = {
        p for p in paths_set if p.startswith(".github/workflows")
    }
    other_paths = paths_set - github_workflows_paths

    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)

    print("should_ci_run_given_modified_paths findings:")
    print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")

    if related_to_ci:
        print("Enabling build jobs since a related workflow file was modified")
        return True
    elif contains_other_non_skippable_files:
        print("Enabling TheRock CI jobs since a non-skippable path was modified")
        return True
    else:
        print(
            "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
        )
        return False
def main(args):
    """Decides whether TheRock CI should run and publishes it as a step output.

    Args:
        args: Mapping with a ``base_ref`` entry naming the git revision to
            diff against.
    """
    base_ref = args.get("base_ref")
    modified_paths = get_modified_paths(base_ref)
    # get_modified_paths returns None when the diff could not be computed;
    # slicing None would raise TypeError before any output is written.
    if modified_paths is None:
        print("modified_paths (max 200): unavailable")
    else:
        print("modified_paths (max 200):", modified_paths[:200])
    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
    output = {
        'enable_therock_ci': json.dumps(enable_jobs)
    }
    gha_set_output(output)


if __name__ == "__main__":
    # BASE_REF comes from the workflow environment; default to the first parent.
    args = {}
    args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
    main(args)
+147
Voir le fichier
@@ -0,0 +1,147 @@
# Reusable workflow (workflow_call): builds TheRock with the rccl and
# rccl-tests checkouts for one GPU family, uploads the build results, then
# calls the single-node or multi-node test workflow depending on
# amdgpu_families.
name: TheRock CI Linux
on:
workflow_call:
inputs:
amdgpu_families:
type: string
artifact_group:
type: string
extra_cmake_options:
type: string
permissions:
contents: read
jobs:
# Build job: runs inside the pinned TheRock manylinux build container.
therock-build-linux:
name: Build Linux Packages
runs-on: azure-linux-scale-rocm
permissions:
id-token: write
container:
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:1f1ce0ab151146c7f86ee4345be74c42d8ca83200d9d26843e8a71df01ecad4e
options: -v /runner/config:/home/awsconfig/
env:
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
TEATIME_FORCE_INTERACTIVE: 0
AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
CACHE_DIR: ${{ github.workspace }}/.container-cache
# The ccache.conf will be written by setup_ccache.py before this gets used.
CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf
steps:
# TheRock is checked out at a pinned commit into the workspace root; rccl and
# rccl-tests go into sub-directories of the same workspace.
- name: Checkout TheRock repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
- name: Checkout rccl repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rccl"
path: rccl
- name: Checkout rccl-tests repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rccl-tests"
path: rccl-tests
- name: Install python deps
run: |
pip install -r requirements.txt
# safe.directory must be set before Runner Health Status
- name: Adjust git config
run: |
git config --global --add safe.directory $PWD
git config fetch.parallel 10
- name: Setup ccache
run: |
./build_tools/setup_ccache.py \
--config-preset "github-oss-presubmit" \
--dir "$(dirname $CCACHE_CONFIGPATH)" \
--local-path "$CACHE_DIR/ccache"
- name: Runner health status
run: |
./build_tools/health_status.py
- name: Fetch sources
run: |
./build_tools/fetch_sources.py --jobs 12
- name: Configure Projects
env:
amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
package_version: ADHOCBUILD
extra_cmake_options: ${{ inputs.extra_cmake_options }}
BUILD_DIR: build
run: |
python3 build_tools/github_actions/build_configure.py
- name: Build therock-dist
run: cmake --build build
- name: Build therock-archives
run: cmake --build build --target therock-archives
- name: Report
#if: ${{ !cancelled() }}
run: |
echo "Full SDK du:"
echo "------------"
du -h -d 1 build/dist/rocm
echo "Artifact Archives:"
echo "------------------"
ls -lh build/artifacts/*.tar.xz
echo "Artifacts:"
echo "----------"
du -h -d 1 build/artifacts
echo "CCache Stats:"
echo "-------------"
ccache -s -v
tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log
- name: Configure AWS Credentials for non-forked repos
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
with:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Post Build Upload
if: always()
run: |
python3 build_tools/github_actions/post_build_upload.py \
--run-id ${{ github.run_id }} \
--artifact-group ${{ env.AMDGPU_FAMILIES }} \
--build-dir build \
--upload
# Multi-node tests run only for the gfx950-dcgpu family.
therock-test-linux-multi-node:
name: "Test multi-node"
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
permissions:
contents: read
id-token: write
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-multi-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
artifact_group: ${{ inputs.artifact_group }}
test_runs_on: nova-linux-slurm-scale-runner
artifact_run_id: ${{ github.run_id }}
# Single-node tests run only for the gfx94X-dcgpu family.
therock-test-linux-single-node:
name: "Test single-node"
if: ${{ inputs.amdgpu_families == 'gfx94X-dcgpu' }}
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-single-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
artifact_group: ${{ inputs.artifact_group }}
test_runs_on: linux-mi325-1gpu-ossci-rocm-frac
artifact_run_id: ${{ github.run_id }}
+91
Voir le fichier
@@ -0,0 +1,91 @@
# CI entry point for RCCL.
#
# 1. `setup` runs a configuration script that decides whether TheRock CI is
#    enabled for this event.
# 2. `therock-ci-linux` calls the reusable Linux workflow once per GPU family.
# 3. `therock_ci_summary` collapses the results into a single pass/fail job.
name: TheRock CI for rccl

on:
  push:
    branches:
      - develop
  pull_request:
    types:
      - labeled
      - opened
      - synchronize
  workflow_dispatch:

permissions:
  contents: read

concurrency:
  # A PR number if a pull request and otherwise the commit hash. This cancels
  # queued and in-progress runs for the same PR (presubmit) or commit
  # (postsubmit). The workflow name is prepended to avoid conflicts between
  # different workflows.
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
  cancel-in-progress: true

jobs:
  setup:
    runs-on: ubuntu-24.04
    env:
      # The commit being checked out is the merge commit for a PR. Its first
      # parent will be the tip of the base branch.
      BASE_REF: HEAD^
    outputs:
      # 'true' enables the build/test job below; any other value skips it.
      enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
    steps:
      - name: "Checking out repository"
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          # We need the parent commit to do a diff
          fetch-depth: 2
      - name: "Configuring CI options"
        id: configure
        run: python .github/scripts/therock_configure_ci.py

  therock-ci-linux:
    name: TheRock CI Linux
    needs: setup
    if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
    permissions:
      contents: read
      id-token: write
    strategy:
      # Let the remaining GPU family finish even if one of them fails.
      fail-fast: false
      matrix:
        amdgpu_family: [gfx94X-dcgpu, gfx950-dcgpu]
    uses: ./.github/workflows/therock-ci-linux.yml
    secrets: inherit
    with:
      amdgpu_families: ${{ matrix.amdgpu_family }}
      artifact_group: ${{ matrix.amdgpu_family }}
      # Folded scalar: the flags below are joined with single spaces.
      extra_cmake_options: >
        -DTHEROCK_ENABLE_ALL=OFF
        -DTHEROCK_BUILD_TESTING=ON
        -DTHEROCK_BUNDLE_SYSDEPS=ON
        -DTHEROCK_ENABLE_COMM_LIBS=ON
        -DTHEROCK_ENABLE_ROCPROFV3=ON
        -DTHEROCK_USE_EXTERNAL_RCCL=ON
        -DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
        -DTHEROCK_RCCL_SOURCE_DIR=./rccl
        -DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
        -DTHEROCK_ENABLE_MPI=ON

  therock_ci_summary:
    name: TheRock CI Summary
    # Always run, so a result is reported even when upstream jobs fail or are
    # skipped.
    if: always()
    needs:
      - setup
      - therock-ci-linux
    runs-on: ubuntu-24.04
    steps:
      - name: Output failed jobs
        env:
          # Hand the JSON to the script through the environment instead of
          # splicing the expression into the shell text: a quote character in
          # any job output would otherwise terminate the shell string early.
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          printf '%s\n' "${NEEDS_JSON}"
          # Every needed job whose result is neither "success" nor "skipped"
          # counts as failed.
          FAILED_JOBS="$(printf '%s\n' "${NEEDS_JSON}" \
            | jq --raw-output \
            'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
          )"
          if [[ "${FAILED_JOBS}" != "" ]]; then
            echo "The following jobs failed: ${FAILED_JOBS}"
            exit 1
          fi
+96
Voir le fichier
@@ -0,0 +1,96 @@
# Runs the multi-node RCCL tests against artifacts from an earlier build run.
# Can be called from another workflow or dispatched manually; both entry
# points take the same four string inputs:
#   amdgpu_families  - GPU family; the test step runs only for 'gfx950-dcgpu'
#   artifact_group   - forwarded to the test-environment setup action
#   test_runs_on     - runner label for the test job
#   artifact_run_id  - exported to the job as ARTIFACT_RUN_ID
name: TheRock Test Packages multi-node

on:
  workflow_dispatch:
    inputs:
      amdgpu_families:
        type: string
      artifact_group:
        type: string
      test_runs_on:
        type: string
      artifact_run_id:
        type: string
  workflow_call:
    inputs:
      amdgpu_families:
        type: string
      artifact_group:
        type: string
      test_runs_on:
        type: string
      artifact_run_id:
        type: string

permissions:
  contents: read
  id-token: write

jobs:
  test_rccl_multi_node:
    name: "Test multi-node"
    runs-on: ${{ inputs.test_runs_on }}
    permissions:
      contents: read
      id-token: write
    defaults:
      run:
        shell: bash
    env:
      ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id }}
      VENV_DIR: ${{ github.workspace }}/.venv
      # NOTE(review): this and the other /home/arravikum paths below are
      # user-specific locations on the runner - confirm they are intended.
      OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
      THEROCK_BIN_DIR: ./build/bin
    steps:
      # Check out ROCm/TheRock at a pinned commit; the local action and the
      # build_tools scripts used below are resolved inside that checkout.
      - name: Checkout Repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: ROCm/TheRock
          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit

      - name: Run setup test environment workflow
        uses: ./.github/actions/setup_test_environment
        with:
          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
          VENV_DIR: ${{ env.VENV_DIR }}
          FETCH_ARTIFACT_ARGS: '--rccl --tests'
          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}

      # Multi-node RCCL tests run through Slurm on the mi350x cluster. salloc
      # holds 4 nodes for as long as the quoted command block runs and
      # releases them automatically once it completes.
      - name: Test gfx950
        if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
        run: |
          salloc -N 4 -p meta64 -t 04:00:00 --exclusive bash -c "
          source /home/arravikum/TheRock/.venv/bin/activate &&
          cd /home/arravikum/cvs &&
          python input/setup.py &&
          pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
          --cluster_file ./input/cluster.json \
          --config_file ./input/mi350_config.json \
          --log-file=/tmp/rccl_log.log \
          --html=/home/arravikum/cvs/test_reports/ci_test_report.html \
          --capture=tee-sys \
          --self-contained-html"

      # Runs even after a test failure, but not for pull requests coming from
      # a fork.
      - name: Configure AWS Credentials for non-forked repos
        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
        with:
          aws-region: us-east-2
          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external

      # Run the report upload script regardless of the test outcome.
      # NOTE(review): the log destination is hard-coded to gfx950-dcgpu while
      # --amdgpu-family follows the workflow input - confirm this is intended.
      - name: Post test report upload
        if: always()
        working-directory: ${{ github.workspace }}
        run: |
          export PYTHONPATH="${PYTHONPATH}:${{ github.workspace }}/build_tools"
          python3 build_tools/github_actions/upload_test_report_script.py \
          --run-id "${{ github.run_id }}" \
          --amdgpu-family "${{ inputs.amdgpu_families }}" \
          --report-path "/home/arravikum/cvs/test_reports" \
          --log-destination "/logs/gfx950-dcgpu" \
          --index-file-name "index_rccl_test_report.html"
+74
Voir le fichier
@@ -0,0 +1,74 @@
# Runs the single-node RCCL tests inside a container against artifacts from
# an earlier build run. Can be called from another workflow or dispatched
# manually; both entry points take the same four string inputs:
#   amdgpu_families  - forwarded to the test-environment setup action
#   artifact_group   - forwarded to the test-environment setup action
#   test_runs_on     - runner label for the test job
#   artifact_run_id  - exported to the job as ARTIFACT_RUN_ID
name: TheRock Test Packages single-node

on:
  workflow_dispatch:
    inputs:
      amdgpu_families:
        type: string
      artifact_group:
        type: string
      test_runs_on:
        type: string
      artifact_run_id:
        type: string
  workflow_call:
    inputs:
      amdgpu_families:
        type: string
      artifact_group:
        type: string
      test_runs_on:
        type: string
      artifact_run_id:
        type: string

permissions:
  contents: read

jobs:
  test_rccl_single_node:
    name: "Test single-node"
    runs-on: ${{ inputs.test_runs_on }}
    container:
      # Image pinned by digest.
      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
      # Folded into one space-separated option string: host IPC namespace,
      # the /dev/kfd and /dev/dri devices, the `video` group and GID 110, and
      # an env file read from the runner host.
      options: >-
        --ipc host
        --group-add video
        --device /dev/kfd
        --device /dev/dri
        --group-add 110
        --env-file /etc/podinfo/gha-gpu-isolation-settings
    defaults:
      run:
        shell: bash
    env:
      ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id }}
      VENV_DIR: ${{ github.workspace }}/.venv
      OUTPUT_ARTIFACTS_DIR: ./build
      THEROCK_BIN_DIR: ./build/bin
    steps:
      # Check out ROCm/TheRock at a pinned commit; the local action and the
      # test script used below are resolved inside that checkout.
      - name: Checkout Repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: ROCm/TheRock
          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit

      - name: Run setup test environment workflow
        uses: ./.github/actions/setup_test_environment
        with:
          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
          VENV_DIR: ${{ env.VENV_DIR }}
          FETCH_ARTIFACT_ARGS: '--rccl --tests'
          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}

      - name: Test
        timeout-minutes: 15
        # Currently, TheRock CI in RCCL always builds with MPI support enabled,
        # which causes the RCCL correctness tests to fail on the mi325 runners
        # because they do not have MPI pre-installed.
        # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
        run: |
          pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
          --log-cli-level=info \
          -k "not test_rccl_correctness_tests"
+8
Voir le fichier
@@ -0,0 +1,8 @@
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
# Files ending in .gcov, in any directory
*.gcov
# The coverage/ directory next to this .gitignore only (the leading slash anchors the pattern)
/coverage/
# Directories named build/ or ext/ at any depth
build/
ext/
# Visual Studio Code
.vscode
+10
Voir le fichier
@@ -0,0 +1,10 @@
# Third-party sources tracked as git submodules under ext-src/.
# Both entries share two settings:
#   ignore = dirty   uncommitted changes inside the submodule work tree are
#                    not reported by `git status` / `git diff` in this repo
#   shallow = true   a shallow (depth 1) clone is recommended when the
#                    submodule is cloned
[submodule "ext-src/mscclpp"]
path = ext-src/mscclpp
url = https://github.com/microsoft/mscclpp.git
ignore = dirty
shallow = true
[submodule "ext-src/json"]
path = ext-src/json
url = https://github.com/nlohmann/json.git
ignore = dirty
shallow = true
+18
Voir le fichier
@@ -0,0 +1,18 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2

# Build environment: OS image and Python version. The version is quoted so
# that it stays the string "3.10" instead of being read as the number 3.1.
build:
  os: ubuntu-22.04
  tools:
    python: "3.10"

# Sphinx configuration file used for the documentation build.
sphinx:
  configuration: docs/conf.py

# Additional downloadable formats.
formats:
  - htmlzip
  - pdf
  - epub

# Python requirements installed before the build.
python:
  install:
    - requirements: docs/sphinx/requirements.txt
+370
Voir le fichier
@@ -0,0 +1,370 @@
# Changelog for RCCL
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
### Changed
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
### Changed
* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes.
### Resolved Issues
* Fixed crash when using the librccl-profiler plugin with the all-to-all collective after the 2.27 update.
## RCCL 2.27.7 for ROCm 7.1.0
### Added
* Added `RCCL_IB_QPS_PER_P2P` to set the number of QPs per connection for P2P operations. When set (≥1), P2P operations (Send/Recv) use `RCCL_IB_QPS_PER_P2P`, while other collective operations continue to use `NCCL_IB_QPS_PER_CONNECTION`. When not set, `NCCL_IB_QPS_PER_CONNECTION` applies to all operations.
* Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks.
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold.
### Changed
* The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script.
* Compatibility with NCCL 2.27.7.
### Optimized
* Enabled and optimized batched P2P operations to improve small message performance for AllToAll and AllGather.
* Optimized channel count selection to improve efficiency for small to medium message sizes in ReduceScatter.
* Changed code inlining to improve latency for small message sizes for AllReduce, AllGather, and ReduceScatter.
### Known issues
* Symmetric memory kernels are currently disabled due to ongoing CUMEM enablement work.
* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
## RCCL 2.26.6 for ROCm 7.0.0
### Resolved issues
* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
* Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.
* Fixed a suboptimal algorithmic switching point for AllReduce on MI300X.
* Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes.
* Fixed broken functionality within the LL protocol on gfx950 by disabling inlining of LLGenericOp kernels.
### Added
* Added new GPU target `gfx950`.
* Added support for `unroll=1` in device-code generation to improve performance.
* Set a default of 112 channels for a single node with `8 * gfx950`.
* Enabled LL128 protocol on `gfx950`.
* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS`, or `NCCL_MAX_NCHANNELS` are set.
* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
* Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled.
* Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap `bf16` arithmetic and bridge the gap between `fp32` performance and `bf16` for both `gfx942` and `gfx950`. Pipelining has been made tunable via `rcclSetPipelining`, similar to algorithms/protocols so that regression is avoided in certain message sizes.
* Added a direct allgather algorithm. This is enabled by default for multi-node if there are 16 nodes or fewer. The message size threshold is 4MB.
* Added `RCCL_OVERRIDE_PROTO` and `RCCL_OVERRIDE_ALGO` to allow direct replacement of protocol and algorithm choices. Unlike `NCCL_PROTO` and `NCCL_ALGO`, which re-run the model across enabled combinations and may not guarantee the intended override, these new options enforce the specified selections explicitly.
### Changed
* Compatibility with NCCL 2.23.4.
* Compatibility with NCCL 2.24.3.
* Compatibility with NCCL 2.25.1.
* Compatibility with NCCL 2.26.6.
### Optimized
* Improved the performance of the `FP8` Sum operation by upcasting to `FP16`.
### Known Issues
* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
## RCCL 2.22.3 for ROCm 6.4.2
### Added
* Added support for the LL128 protocol on gfx942.
## RCCL 2.22.3 for ROCm 6.4.1
### Resolved issues
* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode.
* Fixed IntraNet performance.
* Fixed an issue where, in rare circumstances, the application could stop responding due to a proxy thread synchronization issue.
### Known issues
* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault.
The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
* Within the RCCL-UnitTests test suite, failures occur in tests ending with the `ManagedMem` and `ManagedMemGraph` suffixes. These failures only affect the test results and do not affect the RCCL component itself. This issue will be resolved in the next major release.
## RCCL 2.22.3 for ROCm 6.4.0
### Added
* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters.
* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`.
* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`).
### Changed
* Compatibility with NCCL 2.22.3
* Added support for the rail-optimized tree algorithm for the MI300 series. This feature requires the use of all eight GPUs within
each node. It limits NIC traffic to use only GPUs of the same index across nodes and should not impact performance
on non-rail-optimized network topologies. The original method of building trees can be enabled by setting the
environment variable `RCCL_DISABLE_RAIL_TREES=1`.
* Additional debug information about how the trees are built can be logged to the GRAPH logging subsys by setting
`RCCL_OUTPUT_TREES=1`.
* Added documentation about the NPS4 and CPX partition modes performance benefits on the MI300X.
## RCCL 2.21.5 for ROCm 6.3.1
### Added
### Changed
* Enhanced user documentation
### Resolved issues
* Corrected user help strings in `install.sh`
## RCCL 2.21.5 for ROCm 6.3.0
### Added
* MSCCL++ integration for AllReduce and AllGather on gfx942
* Performance collection to rccl_replayer
* Tuner Plugin example for MI300
* Tuning table for large number of nodes
* Support for amdclang++
* Allow NIC ID remapping using `NCCL_RINGS_REMAP` environment variable
### Changed
* Compatibility with NCCL 2.21.5
* Increased channel count for MI300X multi-node
* Enabled MSCCL for single-process multi-threaded contexts
* Enabled gfx12
* Enabled CPX mode for MI300X
* Enabled tracing with rocprof
* Improved version reporting
* Enabled GDRDMA for Linux kernel 6.4.0+
### Resolved issues
* Fixed model matching with PXN enable
## RCCL 2.20.5 for ROCm 6.2.1
### Fixed
- GDR support flag now set with DMABUF
### Known issues
- On systems running Linux kernel 6.8.0, such as Ubuntu 24.04, Direct Memory Access (DMA) transfers between the GPU and NIC are disabled, which impacts multi-node RCCL performance.
- This issue was reproduced with RCCL 2.20.5 (ROCm 6.2.0 and 6.2.1) on systems with Broadcom Thor-2 NICs and affects other systems with RoCE networks using Linux 6.8.0 or newer.
- Older RCCL versions are also impacted.
- This issue will be addressed in a future ROCm release.
## RCCL 2.20.5 for ROCm 6.2.0
### Changed
- Compatibility with NCCL 2.20.5
- Compatibility with NCCL 2.19.4
- Performance tuning for some collective operations on MI300
- Enabled NVTX code in RCCL
- Replaced rccl_bfloat16 with hip_bfloat16
- NPKit updates:
- Warm-up iteration removal is no longer performed by default and must now be opted into
- Doubled the size of buffers to accommodate for more channels
- Modified rings to be rail-optimized topology friendly
- Replaced ROCmSoftwarePlatform links with ROCm links
### Added
- Support for fp8 and rccl_bfloat8
- Support for using HIP contiguous memory
- Implemented ROC-TX for host-side profiling
- Enabled static build
- Added new rome model
- Added fp16 and fp8 cases to unit tests
- New unit test for main kernel stack size
- New -n option for topo_expl to override # of nodes
- Improved debug messages of memory allocations
### Fixed
- Bug when configuring RCCL for only LL128 protocol
- Scratch memory allocation after API change for MSCCL
## RCCL 2.18.6 for ROCm 6.1.0
### Changed
- Compatibility with NCCL 2.18.6
## RCCL 2.18.3 for ROCm 6.0.0
### Changed
- Compatibility with NCCL 2.18.3
## RCCL 2.17.1-1 for ROCm 5.7.0
### Changed
- Compatibility with NCCL 2.17.1-1
- Performance tuning for some collective operations
### Added
- Minor improvements to MSCCL codepath
- NCCL_NCHANNELS_PER_PEER support
- Improved compilation performance
- Support for gfx94x
### Fixed
- Potential race-condition during ncclSocketClose()
## RCCL 2.16.2 for ROCm 5.6.0
### Changed
- Compatibility with NCCL 2.16.2
### Fixed
- Remove workaround and use indirect function call
## RCCL 2.15.5 for ROCm 5.5.0
### Changed
- Compatibility with NCCL 2.15.5
- Unit test executable renamed to rccl-UnitTests
### Added
- HW-topology aware binary tree implementation
- Experimental support for MSCCL
- New unit tests for hipGraph support
- NPKit integration
### Fixed
- rocm-smi ID conversion
- Support for HIP_VISIBLE_DEVICES for unit tests
- Support for p2p transfers to non (HIP) visible devices
### Removed
- Removed TransferBench from tools. Exists in standalone repo: https://github.com/ROCm/TransferBench
## RCCL-2.13.4 for ROCm 5.4.0
### Changed
- Compatibility with NCCL 2.13.4
- Improvements to RCCL when running with hipGraphs
- RCCL_ENABLE_HIPGRAPH environment variable is no longer necessary to enable hipGraph support
- Minor latency improvements
### Fixed
- Resolved potential memory access error due to asynchronous memset
## RCCL-2.12.10 for ROCm 5.3.0
### Changed
- Improvements to LL128 algorithms
### Added
- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
- Integrating with NPKit (https://github.com/microsoft/NPKit) profiling code
## RCCL-2.12.10 for ROCm 5.2.3
### Added
- Compatibility with NCCL 2.12.10
- Packages for test and benchmark executables on all supported OSes using CPack.
- Adding custom signal handler - opt-in with RCCL_ENABLE_SIGNALHANDLER=1
- Additional details provided if Binary File Descriptor library (BFD) is pre-installed
- Adding support for reusing ports in NET/IB channels
- Opt-in with NCCL_IB_SOCK_CLIENT_PORT_REUSE=1 and NCCL_IB_SOCK_SERVER_PORT_REUSE=1
- When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
(e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
to resolve the massive port usage issue
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
### Removed
- Removed experimental clique-based kernels
## RCCL-2.11.4 for ROCm 5.2.0
### Changed
- Unit testing framework rework
- Minor bug fixes
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.11.4 for ROCm 5.1.0
### Added
- Compatibility with NCCL 2.11.4
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.10.3 for ROCm 5.0.0
### Added
- Compatibility with NCCL 2.10.3
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.9.9 for ROCm 4.5.0
### Changed
- Packaging split into a runtime package called rccl and a development package called rccl-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future ROCm release.
### Added
- Compatibility with NCCL 2.9.9
### Known issues
- Managed memory is not currently supported for clique-based kernels
## [RCCL-2.8.4 for ROCm 4.3.0]
### Added
- Ability to select the number of channels to use for clique-based all reduce (RCCL_CLIQUE_ALLREDUCE_NCHANNELS). This can be adjusted to tune for performance when computation kernels are being executed in parallel.
### Optimizations
- Additional tuning for clique-based kernel AllReduce performance (still requires opt in with RCCL_ENABLE_CLIQUE=1)
- Modification of default values for number of channels / byte limits for clique-based all reduce based on device architecture
### Changed
- Replaced RCCL_FORCE_ENABLE_CLIQUE with RCCL_CLIQUE_IGNORE_TOPO
- Clique-based kernels can now be enabled on topologies where all active GPUs are XGMI-connected
- Topologies not normally supported by clique-based kernels require RCCL_CLIQUE_IGNORE_TOPO=1
### Fixed
- Install script '-r' flag invoked alone no longer incorrectly deletes any existing builds.
### Known issues
- Managed memory is not currently supported for clique-based kernels
## [RCCL-2.8.4 for ROCm 4.2.0]
### Added
- Compatibility with NCCL 2.8.4
### Optimizations
- Additional tuning for clique-based kernels
- Enabling GPU direct RDMA read from GPU
- Fixing potential memory leak issue when re-creating multiple communicators within same process
- Improved topology detection
### Known issues
- None
## [RCCL-2.7.8 for ROCm 4.1.0]
### Added
- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
- Clique-based kernels may offer better performance for smaller input sizes
- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
### Optimizations
- Performance improvements for Rome-based systems
### Known issues
- Clique-based kernels are currently experimental and have not been fully tested on all topologies. By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
- Clique-based kernels may hang if there are differences between environment variables set across ranks.
- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
## [RCCL-2.7.8 for ROCm 3.9.0]
### Added
- Adding support for alltoallv RCCL kernel
### Optimizations
- Modifications to topology based on XGMI links
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.8.0]
### Added
- Support for static library builds
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.7.0]
### Added
- Updated to RCCL API version of 2.7.6
- Added gather, scatter and all-to-all collectives
## [RCCL-2.7.0 for ROCm 3.6.0]
### Added
- Updated to RCCL API version of 2.6.4
## [RCCL-2.7.0 for ROCm 3.5.0]
### Added
- Compatibility with NCCL 2.6
- Network interface improvements with API v3
### Optimizations
- Fixing issues and build time improvements for hip-clang
- Network topology detection
- Improved CPU type detection
- Infiniband adaptive routing support
### Changed
- Switched to hip-clang as default compiler
### Deprecated
- Deprecated hcc build
+1378
Voir le fichier
Fichier diff supprimé car celui-ci est trop grand Voir la Diff
+84
Voir le fichier
@@ -0,0 +1,84 @@
arrayIndexThenCheck:src/bootstrap.cc:304
arrayIndexThenCheck:src/debug.cc:88
arrayIndexThenCheck:src/graph/search.cc:844
arrayIndexThenCheck:src/graph/search.cc:916
arrayIndexThenCheck:src/graph/search.cc:927
clarifyCalculation:src/graph/topo.cc:702
clarifyCalculation:src/graph/topo.cc:720
clarifyCondition:src/enqueue.cc:416
funcArgNamesDifferent:src/graph/topo.cc:135
funcArgNamesDifferent:src/graph/topo.h:144
nullPointerRedundantCheck:src/misc/utils.cc:102
nullPointerRedundantCheck:src/misc/utils.cc:109
nullPointerRedundantCheck:src/proxy.cc:143
nullPointerRedundantCheck:src/proxy.cc:144
nullPointerRedundantCheck:src/proxy.cc:147
nullPointerRedundantCheck:src/proxy.cc:148
nullPointerRedundantCheck:src/proxy.cc:149
nullPointerRedundantCheck:src/proxy.cc:150
nullPointerRedundantCheck:src/proxy.cc:151
nullPointerRedundantCheck:src/proxy.cc:155
nullPointerRedundantCheck:src/proxy.cc:159
nullPointerRedundantCheck:src/proxy.cc:160
nullPointerRedundantCheck:src/proxy.cc:161
nullPointerRedundantCheck:src/proxy.cc:163
nullPointerRedundantCheck:src/proxy.cc:165
nullPointerRedundantCheck:src/proxy.cc:167
nullPointerRedundantCheck:src/proxy.cc:168
nullPointerRedundantCheck:src/proxy.cc:340
nullPointerRedundantCheck:src/proxy.cc:342
nullPointerRedundantCheck:src/proxy.cc:93
nullPointerRedundantCheck:src/proxy.cc:94
redundantAssignment:src/proxy.cc:161
redundantAssignment:src/proxy.cc:163
redundantCopy:src/graph/rings.cc:16
redundantCopy:src/graph/rings.cc:17
terminateStrncpy:src/misc/utils.cc:99
terminateStrncpy:src/transport/net_socket.cc:245
unreachableCode:src/transport/net.cc:555
unreadVariable:src/graph/tuning.cc:109
unreadVariable:src/graph/tuning.cc:110
unreadVariable:src/graph/tuning.cc:113
unusedFunction:src/graph/topo.cc:37
unusedFunction:src/graph/topo.cc:836
unusedFunction:src/misc/gdrwrap.cc:109
unusedFunction:src/misc/gdrwrap.cc:117
unusedFunction:src/misc/gdrwrap.cc:130
unusedFunction:src/misc/gdrwrap.cc:144
unusedFunction:src/misc/gdrwrap.cc:158
unusedFunction:src/misc/gdrwrap.cc:172
unusedFunction:src/misc/gdrwrap.cc:186
unusedFunction:src/misc/gdrwrap.cc:200
unusedFunction:src/misc/gdrwrap.cc:209
unusedFunction:src/misc/gdrwrap.cc:218
unusedFunction:src/misc/gdrwrap.cc:232
unusedFunction:src/misc/gdrwrap.cc:52
unusedFunction:src/misc/ibvwrap.cc:203
unusedFunction:src/misc/ibvwrap.cc:239
unusedFunction:src/misc/ibvwrap.cc:255
unusedFunction:src/misc/nvmlwrap.cc:112
unusedFunction:src/misc/nvmlwrap_stub.cc:31
unusedFunction:src/misc/nvmlwrap_stub.cc:35
unusedFunction:src/transport.cc:71
unusedLabel:src/bootstrap.cc:349
unusedLabel:src/clique/ShmObject.h:112
unusedLabel:src/clique/ShmObject.h:204
unusedLabel:src/enqueue.cc:108
unusedLabel:src/enqueue.cc:1093
unusedLabel:src/enqueue.cc:989
unusedLabel:src/init.cc:1189
unusedLabel:src/init.cc:1240
unusedLabel:src/init.cc:1267
unusedLabel:src/transport.cc:238
unusedStructMember:src/graph/xml.cc:410
unusedStructMember:src/graph/xml.cc:411
unusedStructMember:src/graph/xml.cc:412
unusedStructMember:src/graph/xml.cc:428
unusedStructMember:src/graph/xml.cc:431
unusedStructMember:src/graph/xml.cc:432
unusedStructMember:src/graph/xml.cc:435
unusedStructMember:src/graph/xml.cc:437
variableScope:src/graph/search.cc:494
variableScope:src/init.cc:240
variableScope:src/transport/net_ib.cc:117
variableScope:src/transport/net_socket.cc:431
+46
Voir le fichier
@@ -0,0 +1,46 @@
Attributions
Contains contributions from NVIDIA.
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.
+31
Voir le fichier
@@ -0,0 +1,31 @@
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Top-level driver makefile: the actual build and packaging work is delegated
# to the makefiles in src/ and pkg/ through the `src.%` and `pkg.%` pattern
# rules further down.
# NOTE(review): `all` is declared phony but no `all` target is defined in this
# file as shown - confirm whether that is intentional.
.PHONY : all clean
# `default` is the first ordinary target, so a bare `make` builds src.
default : src.build
install : src.install
# Build output directory. `?=` only assigns when BUILDDIR is not already
# defined, so it can be overridden by the caller. ABSBUILDDIR is the absolute
# form handed to the sub-makes.
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg
# `clean` depends on src.clean and pkg.clean.
clean: ${TARGETS:%=%.clean}
test.build: src.build
# `lic` depends on a copy of each license file inside $(BUILDDIR).
LICENSE_FILES := LICENSE.txt
LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
lic: $(LICENSE_TARGETS)
# Copy a top-level .txt file into the build directory, creating the directory
# first if necessary.
${BUILDDIR}/%.txt: %.txt
@printf "Copying %-35s > %s\n" $< $@
mkdir -p ${BUILDDIR}
cp $< $@
# `make src.<goal>` runs `<goal>` in src/ with the absolute build directory.
src.%:
${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
# `make pkg.<goal>` does the same in pkg/.
pkg.%:
${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
# The packaging prep goals additionally require the license copy.
pkg.debian.prep: lic
pkg.txz.prep: lic
+128
Voir le fichier
@@ -0,0 +1,128 @@
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl v2.27.3-1 (BSD3)
Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.
_______________________________________________________________
Dependencies on NPKit (MIT License)
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
_______________________________________________________________
Dependencies on MSCCL++ (MIT License)
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
See:
https://github.com/microsoft/mscclpp
for more information and license details.
_______________________________________________________________
Dependencies on Latency Profiler (MIT License)
Copyright (c) Meta Platforms, Inc. and affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
See:
src/include/latency_profiler
src/misc/latency_profiler
+147
Voir le fichier
@@ -0,0 +1,147 @@
# RCCL
ROCm Communication Collectives Library
[![RCCL](https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/status%2Frccl?repoName=ROCm%2Frccl&branchName=develop)](https://dev.azure.com/ROCm-CI/ROCm-CI/_build/latest?definitionId=107&repoName=ROCm%2Frccl&branchName=develop)
[![TheRock CI](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml/badge.svg?branch=develop&event=push)](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml)
> **Note:** The published documentation is available at [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/index.html) in an organized easy-to-read format that includes a table of contents and search functionality. The documentation source files reside in the [rccl/docs](https://github.com/ROCm/rccl/tree/develop/docs) folder in this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
## Introduction
RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
## Requirements
1. ROCm supported GPUs
2. ROCm stack installed on the system (HIP runtime & HIP-Clang)
## Quickstart RCCL Build
RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
For ROCm installation instructions, see https://github.com/ROCm/ROCm.
The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL.
### To build the library using the install script:
```shell
./install.sh
```
For more info on build options/flags when using the install script, use `./install.sh --help`
```shell
./install.sh --help
RCCL build & installation helper script
Options:
--address-sanitizer Build with address sanitizer enabled
-c|--enable-code-coverage Enable code coverage
-d|--dependencies Install RCCL dependencies
--debug Build debug library
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--disable-msccl-kernel Build without MSCCL kernels
--enable-mscclpp Build with MSCCL++ support
--enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
--disable-roctx Build without ROCTX logging
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
-h|--help Prints this help message
-i|--install Install RCCL library (see --prefix argument below)
-j|--jobs Specify how many parallel compilation jobs to run ($nproc by default)
-l|--local_gpu_only Only compile for local GPU architecture
--amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
--no_clean Don't delete files if they already exist
--npkit-enable Compile with npkit enabled
--log-trace Build with log trace enabled (i.e. NCCL_DEBUG=TRACE)
--openmp-test-enable Enable OpenMP in rccl unit tests
-p|--package_build Build RCCL package
--prefix Specify custom directory to install RCCL to (default: `/opt/rocm`)
--run_tests_all Run all rccl unit tests (must be built already)
-r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
--static Build RCCL as a static library instead of shared library
-t|--tests_build Build rccl unit tests, but do not run
--time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system)
--verbose Show compile commands
```
By default, RCCL builds for all GPU targets defined in `DEFAULT_GPUS` in `CMakeLists.txt`. To target specific GPU(s), and potentially reduce build time, use `--amdgpu_targets` as a `;` separated string listing GPU(s) to target.
## Manual build
### To build the library using CMake:
```shell
$ git clone --recursive https://github.com/ROCm/rccl.git
$ cd rccl
$ mkdir build
$ cd build
$ cmake ..
$ make -j 16 # Or some other suitable number of parallel jobs
```
If you have already cloned, you can checkout the external submodules manually.
```shell
$ git submodule update --init --recursive --depth=1
```
You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example:
```shell
$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release ..
```
Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
### To build and install the RCCL package:
Assuming you have already cloned this repository and built the library as shown in the previous section:
```shell
$ cd rccl/build
$ make package
$ sudo dpkg -i *.deb
```
RCCL package install requires sudo/root access because it installs under `/opt/rocm/`. This is an optional step as RCCL can instead be used directly by including the path containing `librccl.so`.
## Docker build
Refer to [docker/README.md](docker/README.md "docker/README.md")
## Tests
There are rccl unit tests implemented with the Googletest framework in RCCL. The rccl unit tests require Googletest 1.11 or higher to build and execute properly (installed with the -d option to install.sh).
To invoke the rccl unit tests, go to the build folder, then the test subfolder, and execute the appropriate rccl unit test executable(s).
rccl unit test names are now of the format:
CollectiveCall.[Type of test]
Filtering of rccl unit tests should be done with environment variables and by passing the `--gtest_filter` command line flag, for example:
```shell
UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
```
will run only AllReduce correctness tests with the bfloat16 datatype. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests for more information on how to form more advanced filters.
There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCm/rccl-tests.
See the rccl-tests README for more information on how to build and run those tests.
## Library and API Documentation
Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation.
### How to build documentation
Run the steps below to build documentation locally.
```shell
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+40
Voir le fichier
@@ -0,0 +1,40 @@
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile,
# while ROCMChecks gives a warning if this variable is modified manually without a target.
# We now choose to disable ROCMChecks for this one case.
# Switch read by the rocm_check_toolchain_var() override below; OFF means the
# ROCm toolchain-variable check is forwarded as usual.
set(DISABLE_ROCM_CHECK OFF)
# Override of rocm_check_toolchain_var(): forwards its four arguments to
# _rocm_check_toolchain_var() unless DISABLE_ROCM_CHECK is set.
# NOTE(review): this presumably relies on CMake exposing the previous
# definition of an overridden command under an underscore-prefixed name, so
# the original command must already be defined when this file is included -
# confirm the include order.
function(rocm_check_toolchain_var var access value list_file)
if(NOT DISABLE_ROCM_CHECK)
_rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
endif()
endfunction()
# Override of CHECK_SYMBOL_EXISTS(): sets DISABLE_ROCM_CHECK to ON around the
# call to _check_symbol_exists() and back to OFF afterwards. Being a macro, the
# set() calls act on the caller's scope. All arguments are forwarded via ARGN.
macro(CHECK_SYMBOL_EXISTS)
set(DISABLE_ROCM_CHECK ON)
_check_symbol_exists(${ARGN})
set(DISABLE_ROCM_CHECK OFF)
endmacro()
+192
Voir le fichier
@@ -0,0 +1,192 @@
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)
include(FetchContent)
# Look for an installed GoogleTest first, unless the caller asked for
# dependencies to be installed from source.
if(NOT INSTALL_DEPENDENCIES)
  find_package(GTest 1.11)
endif()

# Build GoogleTest from source when tests are requested and no suitable install
# was found, or whenever INSTALL_DEPENDENCIES is set. The parentheses only make
# CMake's left-to-right AND/OR evaluation explicit; the condition is unchanged.
if((NOT GTest_FOUND AND BUILD_TESTS) OR INSTALL_DEPENDENCIES)
  if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")
    # Original note: "hip-clang cannot compile googlebenchmark for some reason".
    # NOTE(review): the override is passed to the googletest build below -
    # confirm it is still required.
    set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++")
  endif()

  # unset(GTEST_INCLUDE_DIR CACHE)
  # unset(GTEST_INCLUDE_DIRS CACHE)
  message(STATUS "GTest not found. Downloading and building GTest.")

  # Download, build and install googletest library
  set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "")
  download_project(PROJ googletest
    GIT_REPOSITORY https://github.com/google/googletest.git
    GIT_TAG release-1.12.0
    INSTALL_DIR ${GTEST_ROOT}
    CMAKE_ARGS -DBUILD_GTEST=ON -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${COMPILER_OVERRIDE} -DBUILD_SHARED_LIBS=OFF
    LOG_DOWNLOAD TRUE
    LOG_CONFIGURE TRUE
    LOG_BUILD TRUE
    LOG_INSTALL TRUE
    UPDATE_DISCONNECTED TRUE
  )

  set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")
  # NOTE(review): googletest is installed under ${GTEST_ROOT} (the gtest
  # directory); confirm that a gmock/include directory really exists.
  set(GMOCK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gmock/include CACHE PATH "")

  # The static libraries land in lib or lib64 depending on the platform.
  if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest_main.a CACHE PATH "")
    set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock_main.a CACHE PATH "")
  elseif(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest_main.a CACHE PATH "")
    set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock_main.a CACHE PATH "")
  else()
    # FATAL_ERROR stops processing, so nothing placed after it in this branch
    # would ever run (the former find_package() calls here were dead code).
    message(FATAL_ERROR "Cannot find gtest library installation path.")
  endif()
elseif(GTest_FOUND AND BUILD_TESTS)
  # An installed GoogleTest was found: link against its imported targets.
  set(GTEST_BOTH_LIBRARIES "GTest::gtest;GTest::gtest_main")
  set(GMOCK_BOTH_LIBRARIES "GTest::gmock;GTest::gmock_main")
endif()
# Find or download/install rocm-cmake project.
# An installed ROCM CMake package (>= 0.7.3) under /opt/rocm is preferred;
# otherwise the source archive is downloaded, unpacked, configured and
# installed into ${PROJECT_EXTERN_DIR}/rocm-cmake.
set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
find_package(ROCM 0.7.3 QUIET CONFIG PATHS /opt/rocm)
if(NOT ROCM_FOUND)
  set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
  file(
    DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
    ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
  )
  # Element 0 of the STATUS list is the numeric result (0 means success).
  list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
  if(rocm_cmake_download_error_code)
    message(FATAL_ERROR "Error: downloading "
      "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
      "error_code: ${rocm_cmake_download_error_code} "
      "log: ${rocm_cmake_download_log} "
    )
  endif()

  execute_process(
    COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_unpack_error_code
  )
  # Stop right away if unpacking failed: the configure/install steps below
  # operate on the unpacked tree. The archive lives in PROJECT_EXTERN_DIR,
  # which is the path reported here.
  if(rocm_cmake_unpack_error_code)
    message(FATAL_ERROR "Error: unpacking ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
  endif()

  execute_process(COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag})
  execute_process(COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})

  find_package(ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake)
endif()

# Force libraries to be installed into "lib", overriding any cached value.
set(CMAKE_INSTALL_LIBDIR lib CACHE STRING "Define install directory for libraries" FORCE)
# fmt: use an installed package when one is available, otherwise fetch a
# pinned revision with FetchContent.
find_package(fmt QUIET)
if(fmt_FOUND)
  message(STATUS "Using system fmt")
  # Report where the header-only target takes its headers from.
  get_target_property(FMT_INCLUDE_DIRS fmt::fmt-header-only INTERFACE_INCLUDE_DIRECTORIES)
  message(STATUS "fmt include directories: ${FMT_INCLUDE_DIRS}")
else()
  # FMT_INSTALL is set to OFF before fmt is pulled into the build.
  set(FMT_INSTALL OFF)
  message(STATUS "fmt not found, fetching from source...")
  FetchContent_Declare(
    fmt
    GIT_REPOSITORY https://github.com/fmtlib/fmt
    GIT_TAG e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1
  )
  FetchContent_MakeAvailable(fmt)
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#
# rocm_local_targets(<out-var>)
#   Runs rocm_agent_enumerator and stores the de-duplicated list of reported
#   agents, excluding "gfx000", in <out-var> in the caller's scope. <out-var>
#   stays "NOTFOUND" when the tool is missing, exits with a non-zero status,
#   or reports nothing usable.
function(rocm_local_targets VARIABLE)
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)

  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
  if(_rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
    return()
  endif()

  execute_process(
    COMMAND "${_rocm_agent_enumerator}"
    RESULT_VARIABLE _found_agents
    OUTPUT_VARIABLE _rocm_agents
    ERROR_QUIET
  )
  if(NOT _found_agents EQUAL 0)
    return()
  endif()

  # One agent per output line: turn the output into a CMake list.
  string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")

  unset(result)
  foreach(agent IN LISTS _rocm_agents)
    if(NOT agent STREQUAL "gfx000")
      list(APPEND result "${agent}")
    endif()
  endforeach()

  if(result)
    list(REMOVE_DUPLICATES result)
    set(${VARIABLE} "${result}" PARENT_SCOPE)
  endif()
endfunction()
# Check whether FILE shares its file name with an entry already in the
# "source" list named by FILE_LIST.
# NOTE: This is due to compiler bug '--save-temps' and can be removed when fix available
#
#   FILE_LIST - name of the list variable holding the files collected so far
#   FILE      - path of the candidate file
#
# On a name clash HIP_FILE is set in the caller's scope to
# "<dir>/<name>_tmp<ext>"; otherwise HIP_FILE is left untouched. The list
# itself is never modified.
function(add_file_unique FILE_LIST FILE)
  get_filename_component(FILE_NAME "${FILE}" NAME)

  # Iterate over whatever is in the list so far
  foreach(curr_file IN LISTS ${FILE_LIST})
    # Quoted so that entries containing spaces (or empty entries) stay a
    # single argument.
    get_filename_component(curr_file_name "${curr_file}" NAME)

    # Check if duplicate. Both operands are quoted so they are compared as
    # strings and an empty name cannot break the if() syntax.
    if("${FILE_NAME}" STREQUAL "${curr_file_name}")
      get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
      get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
      get_filename_component(FILE_EXT "${FILE}" EXT)

      # Construct a new file name by adding _tmp
      set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)

      # The result depends only on FILE, so further matches change nothing.
      break()
    endif()
  endforeach()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
+14
Voir le fichier
@@ -0,0 +1,14 @@
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
# Template for the throw-away project that download_project() configures and
# builds in order to fetch a dependency with ExternalProject.
#
# The version is given as a <min>...<max> range: 2.8.2 stays the minimum, and
# newer CMake releases, which reject policy versions older than 3.5, apply
# policies up to 3.10 instead. CMake older than 3.12 ignores the "...3.10" part.
cmake_minimum_required(VERSION 2.8.2...3.10)

project(${DL_ARGS_PROJ}-download NONE)

include(ExternalProject)
# Every argument download_project() did not consume itself is forwarded
# unchanged to ExternalProject_Add().
ExternalProject_Add(${DL_ARGS_PROJ}-download
  ${DL_ARGS_UNPARSED_ARGUMENTS}
  SOURCE_DIR "${DL_ARGS_SOURCE_DIR}"
  BUILD_IN_SOURCE TRUE
  TEST_COMMAND ""
)
+170
Voir le fichier
@@ -0,0 +1,170 @@
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
#
# MODULE: DownloadProject
#
# PROVIDES:
# download_project( PROJ projectName
# [PREFIX prefixDir]
# [DOWNLOAD_DIR downloadDir]
# [SOURCE_DIR srcDir]
# [BINARY_DIR binDir]
# [QUIET]
# ...
# )
#
# Provides the ability to download and unpack a tarball, zip file, git repository,
# etc. at configure time (i.e. when the cmake command is run). How the downloaded
# and unpacked contents are used is up to the caller, but the motivating case is
# to download source code which can then be included directly in the build with
# add_subdirectory() after the call to download_project(). Source and build
# directories are set up with this in mind.
#
# The PROJ argument is required. The projectName value will be used to construct
# the following variables upon exit (obviously replace projectName with its actual
# value):
#
# projectName_SOURCE_DIR
# projectName_BINARY_DIR
#
# The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically
# need to be provided. They can be specified if you want the downloaded source
# and build directories to be located in a specific place. The contents of
# projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the
# locations used whether you provide SOURCE_DIR/BINARY_DIR or not.
#
# The DOWNLOAD_DIR argument does not normally need to be set. It controls the
# location of the temporary CMake build used to perform the download.
#
# The PREFIX argument can be provided to change the base location of the default
# values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments
# are provided, then PREFIX will have no effect. The default value for PREFIX is
# CMAKE_BINARY_DIR.
#
# The QUIET option can be given if you do not want to show the output associated
# with downloading the specified project.
#
# In addition to the above, any other options are passed through unmodified to
# ExternalProject_Add() to perform the actual download, patch and update steps.
#
# Only those ExternalProject_Add() arguments which relate to downloading, patching
# and updating of the project sources are intended to be used. Also note that at
# least one set of download-related arguments are required.
#
# If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
# prevent a check at the remote end for changes every time CMake is run
# after the first successful download. See the documentation of the ExternalProject
# module for more information. It is likely you will want to use this option if it
# is available to you. Note, however, that the ExternalProject implementation contains
# bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
# using the URL download method or when specifying a SOURCE_DIR with no download
# method. Fixes for these have been created, the last of which is scheduled for
# inclusion in CMake 3.8.0. Details can be found here:
#
# https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
# https://gitlab.kitware.com/cmake/cmake/issues/16428
#
# If you experience build errors related to the update step, consider avoiding
# the use of UPDATE_DISCONNECTED.
#
# EXAMPLE USAGE:
#
# include(DownloadProject)
# download_project(PROJ googletest
# GIT_REPOSITORY https://github.com/google/googletest.git
# GIT_TAG master
# UPDATE_DISCONNECTED 1
# QUIET
# )
#
# add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
#
#========================================================================================
# Directory of this module, recorded at include time so download_project()
# can find its CMakeLists template no matter where it is called from.
set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")

include(CMakeParseArguments)

# download_project(PROJ <name> [PREFIX <dir>] [DOWNLOAD_DIR <dir>]
#                  [SOURCE_DIR <dir>] [BINARY_DIR <dir>] [QUIET] ...)
#
# Generates a small helper project from the template next to this module,
# then configures and builds it at configure time. Sets <name>_SOURCE_DIR and
# <name>_BINARY_DIR in the caller's scope and aborts with FATAL_ERROR if
# either the configure step or the build step fails.
function(download_project)
  cmake_parse_arguments(DL_ARGS
    "QUIET"
    "PROJ;PREFIX;DOWNLOAD_DIR;SOURCE_DIR;BINARY_DIR"
    ""
    ${ARGN})

  # Optional extra execute_process() keyword that silences the sub-build.
  set(_dl_quiet_arg "")
  if(DL_ARGS_QUIET)
    set(_dl_quiet_arg "OUTPUT_QUIET")
  else()
    message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
  endif()

  # Base directory for the default download/source/binary locations.
  # It must be absolute; a relative PREFIX is resolved against the current
  # binary directory.
  if(DL_ARGS_PREFIX)
    get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
                           BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
  else()
    set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
  endif()

  # Fill in whichever locations the caller left unspecified.
  if(NOT DL_ARGS_DOWNLOAD_DIR)
    set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
  endif()
  if(NOT DL_ARGS_SOURCE_DIR)
    set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
  endif()
  if(NOT DL_ARGS_BINARY_DIR)
    set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
  endif()

  # Tell the caller where the source and build directories ended up.
  set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
  set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)

  # Drop any stale cache from the helper project before configuring it.
  # CLion's handling of multiple configurations can copy a CMakeCache.txt
  # with hard-coded paths into place, which makes the nested configure fail.
  # The cache is regenerated by the configure step below, and since this
  # only runs at configure time it causes no extra builds or downloads.
  file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")

  # Write the helper project, then configure it with the same generator and
  # make program as the main project (the latter may not be on the PATH).
  # Repeating these steps on an already downloaded project changes nothing.
  configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
                 "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")
  execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
                          -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
                          .
                  RESULT_VARIABLE _dl_status
                  ${_dl_quiet_arg}
                  WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
  )
  if(_dl_status)
    message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${_dl_status}")
  endif()

  # Build the helper project; this is what actually performs the download.
  execute_process(COMMAND ${CMAKE_COMMAND} --build . -j16
                  RESULT_VARIABLE _dl_status
                  ${_dl_quiet_arg}
                  WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
  )
  if(_dl_status)
    message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${_dl_status}")
  endif()
endfunction()
+39
Voir le fichier
@@ -0,0 +1,39 @@
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for the InfiniBand verbs library.
#
# Optional hint variables:
#   IBVERBS_INCLUDE_DIR, IBVERBS_LIB_DIR, IBVERBS_ROOT_DIR
# Result variables:
#   IBVERBS_INCLUDE_DIRS - directory containing infiniband/verbs.h
#   IBVERBS_LIBRARIES    - the ibverbs library
#   The found state is set by find_package_handle_standard_args(IBVerbs ...).
find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
  ${IBVERBS_INCLUDE_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/include)

find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
  ${IBVERBS_LIB_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)

# Hide the cache entries created by find_path()/find_library() above. The
# include entry is IBVERBS_INCLUDE_DIRS; IBVERBS_INCLUDE_DIR is only a hint.
mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
+36
Voir le fichier
@@ -0,0 +1,36 @@
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for the MSCCL++ NCCL library.
#
# Hint variable:
#   MSCCLPP_ROOT - prefix whose include/ and lib/ subdirectories are searched
# Result variables:
#   MSCCLPP_INCLUDE_DIRS - directory containing mscclpp/gpu.hpp
#   MSCCLPP_LIBRARIES    - the mscclpp_nccl library
#   The found state is set by find_package_handle_standard_args(mscclpp_nccl ...).
find_path(MSCCLPP_INCLUDE_DIRS NAMES mscclpp/gpu.hpp HINTS ${MSCCLPP_ROOT}/include)
find_library(MSCCLPP_LIBRARIES NAMES mscclpp_nccl HINTS ${MSCCLPP_ROOT}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
  mscclpp_nccl
  DEFAULT_MSG
  MSCCLPP_INCLUDE_DIRS
  MSCCLPP_LIBRARIES
)

# Keep both search results out of the default cache view.
mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
+229
Voir le fichier
@@ -0,0 +1,229 @@
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

# Optional MSCCL++ integration.
#
# When ENABLE_MSCCLPP is set this block:
#   1. makes sure the mscclpp / json sources under ext-src/ are checked out,
#   2. applies the local patch series from ext-src/*.patch to the mscclpp tree,
#   3. configures/builds/installs mscclpp into MSCCLPP_ROOT via download_project(),
#   4. reverts the patch series (in the opposite order) so the source tree is left as checked out,
#   5. renames symbols in the static library with objcopy and exposes the result
#      as the imported target `mscclpp_nccl`.
if(ENABLE_MSCCLPP)
  # Try to find the mscclpp install
  set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")
  # Portable equivalent of `mkdir -p`: creates the directory and any missing parents.
  file(MAKE_DIRECTORY "${MSCCLPP_ROOT}")

  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
  find_package(mscclpp_nccl)

  #if(NOT mscclpp_nccl_FOUND)
    # Ensure the source code is checked out
    set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
    set(JSON_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json CACHE PATH "")

    if((NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt) OR (NOT EXISTS ${JSON_SOURCE}/CMakeLists.txt))
      message(STATUS "Checking out external code")
      execute_process(
        COMMAND git submodule update --init --recursive
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
      )
    endif()

    # Patch series applied to the mscclpp sources before building.
    # ORDER MATTERS: patches are applied top to bottom and reverted bottom to top.
    set(_mscclpp_patches
      cpx
      read-allred
      mscclpp_ibv_access_relaxed_ordering
      mem-reg
      non-multiple-128-fix
      bf16-tuning
      reg-fix
      no-cache
      device-flag
      remove-clip
      disable-executor
      disable-format-checks
    )

    foreach(_mscclpp_patch IN LISTS _mscclpp_patches)
      execute_process(
        COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/${_mscclpp_patch}.patch
        WORKING_DIRECTORY ${MSCCLPP_SOURCE}
        RESULT_VARIABLE _mscclpp_patch_result
        ERROR_VARIABLE _mscclpp_patch_error
        ERROR_STRIP_TRAILING_WHITESPACE
      )
      # Best effort, as before: a failed patch does not abort configuration,
      # but it is no longer silent.
      if(NOT _mscclpp_patch_result EQUAL 0)
        message(WARNING
          "Could not apply ext-src/${_mscclpp_patch}.patch in ${MSCCLPP_SOURCE} "
          "(result: ${_mscclpp_patch_result}). stderr: ${_mscclpp_patch_error}")
      endif()
    endforeach()

    # Forward a selected set of cache/normal variables to the mscclpp configure step.
    set(CMAKE_INHERITED_ARGS "")
    set(CMAKE_ARGS_LIST "CMAKE_PREFIX_PATH;CMAKE_INSTALL_RPATH_USE_LINK_PATH;HIP_COMPILER")
    foreach(arg IN LISTS CMAKE_ARGS_LIST)
      if(DEFINED ${arg})
        string(REPLACE ";" "%" ARG_VALUE "${${arg}}") # Replace ; with new list separator symbol % to avoid CMake errors
        string(STRIP "${ARG_VALUE}" ARG_VALUE) # Eliminate whitespace, reducing to empty string if necessary

        # Only add a cmake argument if it has a value
        if("${ARG_VALUE}" STREQUAL "")
          continue()
        endif()
        string(APPEND CMAKE_INHERITED_ARGS "-D${arg}=\"${ARG_VALUE}\" ")
      endif()
    endforeach()

    # Default GPU targets unless the user supplied MSCCLPP_GPU_TARGETS as a cache entry.
    if(NOT DEFINED CACHE{MSCCLPP_GPU_TARGETS})
      message(STATUS "Building MSCCL++ only for supported variants: gfx942;gfx950")
      set(MSCCLPP_GPU_TARGETS "gfx942;gfx950")
      if(BUILD_ADDRESS_SANITIZER)
        set(MSCCLPP_GPU_TARGETS "gfx942:xnack+;gfx950:xnack+")
      endif()
    else()
      message(STATUS "Building MSCCL++ for ${MSCCLPP_GPU_TARGETS}")
    endif()
    # Use % as the list separator (matches LIST_SEPARATOR below) so the list survives argument passing.
    string(REPLACE ";" "%" MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}")

    download_project(PROJ                mscclpp_nccl
                     #GIT_REPOSITORY      https://github.com/microsoft/mscclpp.git
                     #GIT_TAG             4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
                     INSTALL_DIR         ${MSCCLPP_ROOT}
                     LIST_SEPARATOR      %
                     CMAKE_ARGS          "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}"
                                         -DMSCCLPP_BYPASS_GPU_CHECK=ON
                                         -DMSCCLPP_USE_ROCM=ON
                                         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
                                         -DMSCCLPP_BUILD_APPS_NCCL=ON
                                         -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF
                                         -DMSCCLPP_BUILD_TESTS=OFF
                                         -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP}
                                         -DMSCCLPP_ENABLE_EXECUTOR=${ENABLE_MSCCLPP_EXECUTOR}
                                         -DMSCCLPP_ENABLE_FORMAT_CHECKS=${ENABLE_MSCCLPP_FORMAT_CHECKS}
                                         -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
                                         -DCMAKE_VERBOSE_MAKEFILE=1
                                         "${CMAKE_INHERITED_ARGS}"
                                         -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
                     LOG_DOWNLOAD        FALSE
                     LOG_CONFIGURE       FALSE
                     LOG_BUILD           FALSE
                     LOG_INSTALL         FALSE
                     UPDATE_DISCONNECTED TRUE
                     SOURCE_DIR          ${MSCCLPP_SOURCE}
    )

    find_package(mscclpp_nccl REQUIRED)

    # Revert the patch series in the opposite order it was applied.
    set(_mscclpp_patches_reversed ${_mscclpp_patches})
    list(REVERSE _mscclpp_patches_reversed)
    foreach(_mscclpp_patch IN LISTS _mscclpp_patches_reversed)
      execute_process(
        COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/${_mscclpp_patch}.patch
        WORKING_DIRECTORY ${MSCCLPP_SOURCE}
        RESULT_VARIABLE _mscclpp_patch_result
        ERROR_VARIABLE _mscclpp_patch_error
        ERROR_STRIP_TRAILING_WHITESPACE
      )
      if(NOT _mscclpp_patch_result EQUAL 0)
        message(WARNING
          "Could not revert ext-src/${_mscclpp_patch}.patch in ${MSCCLPP_SOURCE} "
          "(result: ${_mscclpp_patch_result}). stderr: ${_mscclpp_patch_error}")
      endif()
    endforeach()
  #endif()

  # Rename symbols per mscclpp_nccl_syms.txt while copying the static archive into the build tree.
  execute_process(COMMAND objcopy
                  --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
                  "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a"
                  "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a"
  )
  add_library(mscclpp_nccl STATIC IMPORTED)
  set_target_properties(mscclpp_nccl PROPERTIES IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a)
endif()
+24
Voir le fichier
@@ -0,0 +1,24 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
cmake_minimum_required(VERSION 3.16)

message("Building rccl RAS client executable")

# The RAS client is built from the hipified copy of client.cc in the build tree.
add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc")

target_include_directories(rcclras
  PRIVATE
    ${PROJECT_BINARY_DIR}/include
    ${HIPIFY_DIR}/src
    ${HIPIFY_DIR}/src/include
)

# Libraries needed regardless of how rccl itself is built.
target_link_libraries(rcclras PRIVATE hip::host dl)

if(NOT BUILD_SHARED_LIBS)
  # Static rccl: make sure rccl is built first, then link it (and the HIP runtime) by raw linker flags.
  add_dependencies(rcclras rccl)
  target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
else()
  # Shared rccl: link against the rccl target directly.
  target_link_libraries(rcclras PRIVATE rccl hip::device)
endif()

rocm_install(TARGETS rcclras)
+27
Voir le fichier
@@ -0,0 +1,27 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Fault-injection helper: rewrites one file in place.
#
# Usage: <this script> <path-to-file>
#   $1 - path of the file to process.
#
# Only paths matching the regex below are touched: the path must contain
# "/src/device/" followed somewhere later by ".h". The regex is unanchored,
# so extensions that merely start with ".h" also match.
HIP_FILE=$1
if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
# Append a call to insert_random_delay_per_warp() after the first
# __syncthreads() on each line (no /g flag, so one substitution per line).
sed -i "s/__syncthreads()/__syncthreads(); insert_random_delay_per_warp()/" "$HIP_FILE"
echo "Added fault injection to $HIP_FILE"
fi
+42
Voir le fichier
@@ -0,0 +1,42 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Source-rewriting helper: threads the extra template parameters
# USE_ACC, COLL_UNROLL and Pipeline through one file, editing it in place.
#
# Usage: <this script> <path-to-file>
#   $1 - path of the file to process.
#
# Only paths containing "/src/device/" followed later by ".h" are touched.
# NOTE: the substitutions below are order-dependent; later patterns match text
# produced by earlier ones (e.g. "..., USE_ACC, COLL_UNROLL"). Do not reorder.
HIP_FILE=$1
if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
# Template declarations: append "int USE_ACC, int COLL_UNROLL, int Pipeline" after
# "typename T, typename RedOp[, typename Proto]", keeping any trailing
# ", bool isNetOffload..." part after the new parameters.
perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?)(, bool isNetOffload.*?)?>/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
# Same as above, but also accepts an optional ", int RCCLMetadata" parameter.
perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?(?:, int RCCLMetadata)?)(, bool isNetOffload.*?)?>/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
# ProtoSimple<a, b> instantiations with exactly two arguments gain USE_ACC, COLL_UNROLL.
perl -pi -e 's/(ProtoSimple<[^,]*?,[^,]+?)>/\1, USE_ACC, COLL_UNROLL>/g' "$HIP_FILE"
# runRing<T...>( calls: insert USE_ACC, COLL_UNROLL before an optional trailing true/false argument.
perl -pi -e 's/(runRing<T.*?)((, (true|false))?>\()/\1, USE_ACC, COLL_UNROLL\2/g' "$HIP_FILE"
# runTreeUpDown / runTreeSplit calls: append USE_ACC, COLL_UNROLL (first match per line only).
perl -pi -e 's/(runTreeUpDown<T.*?)>\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
perl -pi -e 's/(runTreeSplit<T.*?)>\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
# Calls using ProtoLL / ProtoLL128 get a literal 0 as the final (Pipeline) argument.
perl -pi -e 's/(runTreeSplit<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
perl -pi -e 's/(runTreeUpDown<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
perl -pi -e 's/(runRing<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
perl -pi -e 's/(runRing<T, RedOp, (ProtoLL|ProtoLL128), (RCCL_ONE_NODE_RING_SIMPLE|RCCL_METADATA_EMPTY), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
# Calls using the generic Proto parameter forward the Pipeline template parameter instead.
perl -pi -e 's/(runRing<T, RedOp, Proto, (RCCL_ONE_NODE_RING_SIMPLE|RCCL_METADATA_EMPTY), USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
perl -pi -e 's/(runRing<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
perl -pi -e 's/(runTreeSplit<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
perl -pi -e 's/(runTreeUpDown<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
# RunWorkBatch specializations and RunWorkColl<...> uses (five comma-separated arguments)
# get all three new arguments appended.
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
sed -i "s/\\(RunWorkColl<[^,]*,[^,]*,[^,]*,[^,]*,[^>]*\\)>/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
fi
+81
Voir le fichier
@@ -0,0 +1,81 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Lists the code objects embedded in librccl.so with roc-obj-ls and extracts
# those built for gfx90a / gfx942 / gfx950 with roc-obj-extract.
#
# Every failure is reported as a WARNING only, so a failure here does not stop
# the surrounding build.
#
# EXTRACT_TIMEOUT (cache, seconds) bounds each roc-obj-* invocation.
set(EXTRACT_TIMEOUT 5 CACHE STRING "Timeout in seconds for roc-obj-* calls")

# execute_process() reports a timeout through RESULT_VARIABLE as a descriptive
# string (e.g. "Process terminated due to timeout") rather than an exit code,
# so timeouts are detected by pattern instead of an exact "TIMEOUT" comparison.
set(_timeout_result_regex "[Tt]imeout|TIMEOUT")

## List the objects for each gfx architecture
execute_process(
  COMMAND roc-obj-ls librccl.so
  RESULT_VARIABLE list_result
  OUTPUT_VARIABLE cmd_output
  ERROR_VARIABLE cmd_error
  OUTPUT_STRIP_TRAILING_WHITESPACE
  ERROR_STRIP_TRAILING_WHITESPACE
  TIMEOUT ${EXTRACT_TIMEOUT}
)

if(list_result EQUAL 0)
  ## Convert cmd output to list of lines
  string(REGEX REPLACE "\n$" "" cmd_output "${cmd_output}")
  string(REPLACE "\n" ";" cmd_output "${cmd_output}")

  ## Extract file paths for the selected gfx archs
  # Start from an empty list so a pre-existing file_paths variable cannot leak in.
  set(file_paths "")
  foreach(line ${cmd_output})
    if(line MATCHES "(gfx90a|gfx942|gfx950)")
      # Keep everything from "file://" to the end of the line (the object URI).
      string(REGEX MATCH "file://(.*)" file_match "${line}")
      if(file_match)
        list(APPEND file_paths ${file_match})
      endif()
    endif()
  endforeach()

  ## Extract objects from files
  foreach(file ${file_paths})
    execute_process(
      COMMAND roc-obj-extract ${file}
      RESULT_VARIABLE extraction_result
      ERROR_VARIABLE extraction_error
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_STRIP_TRAILING_WHITESPACE
      TIMEOUT ${EXTRACT_TIMEOUT}
    )
    if(extraction_result MATCHES "${_timeout_result_regex}")
      message(
        WARNING
        "[Timeout] Extraction of '${file}' did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${extraction_error}.
Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc."
      )
    elseif(NOT extraction_result EQUAL 0)
      message(
        WARNING
        "[Error ${extraction_result}] Could not extract objects from '${file}'. stderr: ${extraction_error}"
      )
    endif()
  endforeach()
elseif(list_result MATCHES "${_timeout_result_regex}")
  message(
    WARNING
    "[Timeout] roc-obj-ls did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${cmd_error}.
Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc"
  )
else()
  ## We don't want to stop building unit-tests if this command fails.
  message(WARNING "[Error ${list_result}] roc-obj-ls failed. stderr: ${cmd_error}")
endif()
+73
Voir le fichier
@@ -0,0 +1,73 @@
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Generates ${RCCL_BINARY_DIR}/git_version.cpp, a one-line C++ source defining
#   const char *rcclGitHash = "<branch>:<short-hash>[+]";
# ("Unknown " when no git information is available). The file is rewritten only
# when its content would change.
#
# Inputs (both optional):
#   RCCL_SOURCE_DIR - directory in which the git commands run
#   RCCL_BINARY_DIR - directory that receives git_version.cpp
# Attempt to collect the latest git hash
# Use RCCL_SOURCE_DIR if passed, otherwise fallback to CMAKE_CURRENT_SOURCE_DIR
if(NOT DEFINED RCCL_SOURCE_DIR)
set(RCCL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
endif()
if(NOT DEFINED RCCL_BINARY_DIR)
set(RCCL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
endif()
# NOTE: execute_process() does not go through a shell, so the single quotes in
# format:'%h' are passed to git literally and end up in GIT_REV (e.g. 'abc1234').
# The SUBSTRING call further down relies on this to skip the leading quote.
execute_process(COMMAND git log --pretty=format:'%h' -n 1
WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
OUTPUT_VARIABLE GIT_REV
ERROR_QUIET)
# Check if git information was found
if ("${GIT_REV}" STREQUAL "")
set(CURR_GIT_VERSION "const char *rcclGitHash =\"Unknown \";")
else()
# Check for changes (denote with a '+') after hash
execute_process(
COMMAND bash -c "git diff --quiet --exit-code || echo +"
WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
OUTPUT_VARIABLE GIT_DIFF)
# Collect branch information
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
OUTPUT_VARIABLE GIT_BRANCH)
string(STRIP "${GIT_REV}" GIT_REV)
# Skip the literal leading quote (index 0) and keep the next 7 characters of the hash.
string(SUBSTRING "${GIT_REV}" 1 7 GIT_REV)
string(STRIP "${GIT_DIFF}" GIT_DIFF)
string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
set(CURR_GIT_VERSION "const char *rcclGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
endif()
# Compare file with older git version file (git_version.cpp)
if (EXISTS ${RCCL_BINARY_DIR}/git_version.cpp)
#MESSAGE(STATUS "Found ${RCCL_BINARY_DIR}/git_version.cpp")
file(READ ${RCCL_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
#message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
#message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
# Only touch the file when the content differs, so its timestamp stays stable otherwise.
if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
message(STATUS "Updating git_version.cpp")
file(WRITE ${RCCL_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
else()
message(STATUS "No changes to git_version.cpp required")
endif()
else()
# Create git_version.cpp if it doesn't exist yet
file(WRITE ${RCCL_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
endif()
+124
Voir le fichier
@@ -0,0 +1,124 @@
## base docker image
## (override with --build-arg ROCM_IMAGE_NAME=... / ROCM_IMAGE_TAG=...)
ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04
ARG ROCM_IMAGE_TAG=latest
FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"

## rccl repo
ARG RCCL_REPO=https://github.com/ROCm/rccl
ARG RCCL_BRANCH=develop

## rccl-tests repo
ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests
ARG RCCL_TESTS_BRANCH=develop

## AMD GPU Targets
ARG GPU_TARGETS=gfx942

## creating scratch space
## (key=value form; the whitespace-separated `ENV key value` form is legacy syntax)
ENV WORKDIR=/workspace
RUN mkdir -p ${WORKDIR}
WORKDIR ${WORKDIR}
## install dependencies
## (single layer: update, install, then drop the apt lists to keep the image small;
##  build-essential was previously listed twice and now appears once)
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    git \
    make \
    rocm-cmake \
    ninja-build \
    gfortran \
    build-essential \
    libomp5 \
    libomp-dev \
    libbfd-dev \
    libboost-all-dev \
    libnuma1 \
    libnuma-dev \
    libpthread-stubs0-dev \
    libzstd-dev \
    lcov \
    zip \
    zlib1g-dev \
    wget \
    pkg-config \
    unzip \
    chrpath \
    doxygen \
    lshw \
    libssl-dev \
    curl \
    libncursesw5-dev \
    xz-utils \
    liblzma-dev \
    python3-pip \
    python3-setuptools \
    python3-venv \
    python3-dev \
    python3-tk \
    python3-yaml \
    vim \
    less \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
## Install CMake 3.28.0 from the upstream self-extracting installer into /usr
RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \
&& chmod +x cmake-3.28.0-linux-x86_64.sh \
&& bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \
&& rm cmake-3.28.0-linux-x86_64.sh
## Set ROCm path
ENV ROCM_PATH=/opt/rocm
## Install UCX
## (1.16.0 release tarball, out-of-tree build configured with ROCm support; sources removed afterwards)
ENV UCX_INSTALL_PREFIX=/opt/ucx
RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \
&& mkdir -p ucx \
&& tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \
&& cd ucx \
&& mkdir build \
&& cd build \
&& ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \
&& make -j16 install \
&& cd ../.. \
&& rm -rf ucx ucx-1.16.0.tar.gz
## Install OpenMPI
## (4.1.6 release tarball, built against the UCX installed above; OSHMEM and Fortran bindings disabled)
ENV MPI_INSTALL_PREFIX=/opt/ompi
RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \
&& mkdir -p ompi4 \
&& tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \
&& cd ompi4 \
&& mkdir build \
&& cd build \
&& ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \
&& make -j16 install \
&& cd ../.. \
&& rm -rf ompi4 openmpi-4.1.6.tar.gz
## building RCCL
## (clones RCCL_REPO at RCCL_BRANCH with submodules and runs its install.sh for GPU_TARGETS)
ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl/install
RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" \
&& cd ./rccl \
&& ./install.sh --amdgpu_targets=${GPU_TARGETS} --prefix=${RCCL_INSTALL_PREFIX}
## building RCCL-Tests
## (MPI-enabled Release build that finds RCCL and OpenMPI through CMAKE_PREFIX_PATH)
RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \
&& cd ./rccl-tests \
&& mkdir build \
&& cd build \
&& cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DCMAKE_PREFIX_PATH="${RCCL_INSTALL_PREFIX};${MPI_INSTALL_PREFIX}" -DGPU_TARGETS=${GPU_TARGETS} .. \
&& make -j16
## set environment variables
ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}"
## NOTE(review): this adds the RCCL install prefix itself rather than a lib/ subdirectory;
## confirm where install.sh places librccl.so under the prefix.
ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"
ENV UCX_WARN_UNUSED_ENV_VARS=n
## allow mpirun to be started as root inside the container
ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
ENV NCCL_DEBUG=VERSION
+42
Voir le fichier
@@ -0,0 +1,42 @@
# Using RCCL/RCCL-Tests in a docker environment
## Docker build
Assuming you have docker installed on your system:
### To build the docker image :
By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch), targeting `gfx942` GPUs.
```shell
$ docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
```
The base docker image, rccl repo, rccl-tests repo, and GPU targets can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image for the MI250 GPU:
```shell
$ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull .
```
### To start an interactive docker container on a system with AMD GPUs :
```shell
$ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash
```
### To run rccl-tests (all\_reduce\_perf) on 8 AMD GPUs (inside the docker container) :
If using ROCm 6.3.x or earlier
```shell
$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
```
If using ROCm 6.4.0 or later
```shell
$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
```
For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests.
## Copyright
All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+5
Voir le fichier
@@ -0,0 +1,5 @@
_build/
_doxygen/
doxygen/html
doxygen/xml
sphinx/_toc.yml
+18
Voir le fichier
@@ -0,0 +1,18 @@
.. meta::
:description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
:keywords: RCCL, ROCm, library, API
.. _api-library:
***********
API library
***********
RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
Operations
==========
.. doxygenindex::
+165
Voir le fichier
@@ -0,0 +1,165 @@
.. meta::
:description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
:keywords: RCCL, ROCm, library, API, reference, environment variable, environment
.. _env-variables:
********************************************************************
RCCL environment variables
********************************************************************
This section describes the most important RCCL environment variables,
which are grouped by functionality.
Configuration and setup
========================
The configuration and setup environment variables for RCCL are collected
in the following table.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``NCCL_CONF_FILE``
| Specifies the path to the RCCL configuration file.
- | String path to configuration file
| Default: ``~/.rccl.conf`` or ``/etc/rccl.conf``
* - | ``NCCL_HOSTID``
| Sets the host identifier for multi-node communication.
- | String value for host identification
| Used for host hash generation
Logging and debugging
=====================
The logging and debugging environment variables for RCCL are collected
in the following table.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``RCCL_LOG_LEVEL``
| Controls RCCL logging verbosity.
- | Integer value (default: ``1``)
| Higher values increase logging detail
* - | ``NCCL_DEBUG_SUBSYS``
| Controls which subsystems generate debug output.
- | Comma-separated list of subsystems (e.g., ``INIT,COLL``)
| Prefix with ``^`` to invert selection
Algorithm and protocol control
==============================
The algorithm and protocol control environment variables for RCCL are
collected in the following table.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``NCCL_ALGO``
| Forces specific algorithm selection for collectives.
- | Algorithm name string
| Used to override automatic algorithm selection
* - | ``NCCL_PROTO``
| Forces specific protocol selection for communication.
- | Protocol name string
| Used to override automatic protocol selection
Network and topology
====================
The network and topology environment variables for RCCL are collected
in the following table.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``NCCL_IB_HCA``
| Specifies InfiniBand device:port to use.
- | Device specification string
| Prefix with ``^`` for exclusion, ``=`` for exact match
* - | ``NCCL_IB_GID_INDEX``
| Defines the Global ID index used in RoCE mode.
- | Integer value (default: ``-1``)
| See InfiniBand ``show_gids`` command for valid values
* - | ``NCCL_SOCKET_IFNAME``
| Specifies which IP interfaces to use for communication.
- | Interface prefix string or list
| Multiple prefixes separated by ``,``
| Prefix with ``^`` for exclusion, ``=`` for exact match
| Example: ``eth`` (all eth interfaces), ``=eth0`` (exact match)
* - | ``NCCL_SOCKET_FAMILY``
| Forces IPv4/IPv6 interface selection.
- | ``AF_INET``: Force IPv4
| ``AF_INET6``: Force IPv6
| Unset: Use first available
* - | ``NCCL_NET_MERGE_LEVEL``
| Controls network device merging behavior.
- | Integer value specifying merge level
| Default: ``PATH_PORT``
* - | ``NCCL_NET_FORCE_MERGE``
| Forces merging of network devices.
- | String specifying forced merge configuration
* - | ``NCCL_RINGS``
| Defines custom ring topology.
- | Ring topology specification string
| Overrides automatic topology detection
* - | ``RCCL_TREES``
| Defines custom tree topology.
- | Tree topology specification string
| Alternative to ring topology
* - | ``NCCL_RINGS_REMAP``
| Controls ring remapping for specific topologies.
- | Remapping specification string
| Used with Rome 4P2H topology
Development and testing (advanced)
==================================
The development and testing environment variables for RCCL are
collected in the following table. These variables are primarily
intended for debugging and development purposes.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``CUDA_LAUNCH_BLOCKING``
| Controls CUDA kernel launch blocking behavior.
- | ``0``: Non-blocking launches
| ``1`` or non-zero: Blocking launches
* - | ``NCCL_COMM_ID``
| Enables multi-process mode in test applications.
- | Any non-empty value enables multi-process mode
| Used with test executables for distributed testing
+114
Voir le fichier
@@ -0,0 +1,114 @@
.. meta::
:description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
:keywords: RCCL, ROCm, library, API
.. _library-specification:
============================
RCCL library specification
============================
This document provides details of the API library.
Communicator functions
----------------------
.. doxygenfunction:: ncclGetUniqueId
.. doxygenfunction:: ncclCommInitRank
.. doxygenfunction:: ncclCommInitAll
.. doxygenfunction:: ncclCommDestroy
.. doxygenfunction:: ncclCommAbort
.. doxygenfunction:: ncclCommCount
.. doxygenfunction:: ncclCommCuDevice
.. doxygenfunction:: ncclCommUserRank
Collective communication operations
-----------------------------------
Collective communication operations must be called separately for each communicator in a communicator clique.
They return when operations have been enqueued on the HIP stream.
Since they may perform inter-CPU synchronization, each call has to be made from a different thread or process, or must use group semantics (see below).
.. doxygenfunction:: ncclReduce
.. doxygenfunction:: ncclBcast
.. doxygenfunction:: ncclBroadcast
.. doxygenfunction:: ncclAllReduce
.. doxygenfunction:: ncclReduceScatter
.. doxygenfunction:: ncclAllGather
.. doxygenfunction:: ncclSend
.. doxygenfunction:: ncclRecv
.. doxygenfunction:: ncclGather
.. doxygenfunction:: ncclScatter
.. doxygenfunction:: ncclAllToAll
Group semantics
---------------
When managing multiple GPUs from a single thread, and since NCCL collective
calls may perform inter-CPU synchronization, we need to "group" calls for
different ranks/devices into a single call.
Grouping NCCL calls as being part of the same collective operation is done
using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
collective calls until the ncclGroupEnd call, which will wait for all calls
to be complete. Note that for collective communication, ncclGroupEnd only
guarantees that the operations are enqueued on the streams, not that
the operation is effectively done.
Both collective communication and ncclCommInitRank can be used in conjunction
with ncclGroupStart/ncclGroupEnd.
.. doxygenfunction:: ncclGroupStart
.. doxygenfunction:: ncclGroupEnd
Library functions
-----------------
.. doxygenfunction:: ncclGetVersion
.. doxygenfunction:: ncclGetErrorString
Types
-----
There are a few data structures that are internal to the library. The pointer types to these
structures are given below. The user would need to use these types to create handles and pass them
between different library functions.
.. doxygentypedef:: ncclComm_t
.. doxygenstruct:: ncclUniqueId
Enumerations
------------
This section provides all the enumerations used.
.. doxygenenum:: ncclResult_t
.. doxygenenum:: ncclRedOp_t
.. _rccl-supported-data-types:
.. doxygenenum:: ncclDataType_t
+47
Voir le fichier
@@ -0,0 +1,47 @@
.. meta::
:description: RCCL attributions information
:keywords: RCCL, ROCm, library, API, attributions
.. toctree::
:maxdepth: 4
:caption: Attributions
Attributions
============
Contains contributions from NVIDIA.
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
For more information and license details, see `https://github.com/NVIDIA/NVTX <https://github.com/NVIDIA/NVTX>`_
+36
Voir le fichier
@@ -0,0 +1,36 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import subprocess

from rocm_docs import ROCmDocs

name = "RCCL"

# Shell commands that extract each version component from ../makefiles/version.mk.
# `[^0-9]*` skips everything between the variable name and the first digit, so the
# capture group receives the complete number. A greedy `.*` in that position would
# swallow the leading digits and leave only the trailing digit(s) for the group
# (for example, a patch level of 10 would be reported as 0).
get_major = r'sed -n -e "s/^NCCL_MAJOR[^0-9]*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'
get_minor = r'sed -n -e "s/^NCCL_MINOR[^0-9]*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'
get_patch = r'sed -n -e "s/^NCCL_PATCH[^0-9]*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'

# Each command prints the matching number on stdout; getoutput() returns it as a string.
major = subprocess.getoutput(get_major)
minor = subprocess.getoutput(get_minor)
patch = subprocess.getoutput(get_patch)
version_number = f"{major}.{minor}.{patch}"

# for PDF output on Read the Docs
project = f"{name} Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number

external_toc_path = "./sphinx/_toc.yml"

# Build the shared ROCm docs configuration and generate the Doxygen XML that the
# API reference pages pull from.
docs_core = ROCmDocs(f"{name} {version_number} Documentation")
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.setup()

external_projects_current_project = "rccl"

# Copy every Sphinx setting named in ROCmDocs.SPHINX_VARS from docs_core into this
# module's global namespace, where Sphinx reads its configuration.
for sphinx_var in ROCmDocs.SPHINX_VARS:
    globals()[sphinx_var] = getattr(docs_core, sphinx_var)
Fichier binaire non affiché.

Après

Largeur:  |  Hauteur:  |  Taille: 114 KiB

Fichier binaire non affiché.

Après

Largeur:  |  Hauteur:  |  Taille: 107 KiB

Fichier diff supprimé car celui-ci est trop grand Voir la Diff
+281
Voir le fichier
@@ -0,0 +1,281 @@
.. meta::
:description: Usage tips for the RCCL library of collective communication primitives
:keywords: RCCL, ROCm, library, API, peer-to-peer, transport
.. _rccl-usage-tips:
*****************************************
RCCL usage tips
*****************************************
This topic describes some of the more common RCCL extensions, such as NPKit and MSCCL, and provides tips on how to
configure and customize the application.
NPKit
=====
RCCL integrates `NPKit <https://github.com/microsoft/npkit>`_, a profiler framework that
enables the collection of fine-grained trace events in RCCL components, especially in giant collective GPU kernels.
See the `NPKit sample workflow for RCCL <https://github.com/microsoft/NPKit/tree/main/rccl_samples>`_ for
a fully-automated usage example. It also provides useful templates for the following manual instructions.
To manually build RCCL with NPKit enabled, pass ``-DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_...(other NPKit compile-time switches)"`` to the ``cmake`` command.
All NPKit compile-time switches are declared in the RCCL code base as macros with the prefix ``ENABLE_NPKIT_``.
These switches control the information that is collected.
.. note::
NPKit only supports the collection of non-overlapped events on the GPU.
The ``-DNPKIT_FLAGS`` settings must follow this rule.
To manually run RCCL with NPKit enabled, set the environment variable ``NPKIT_DUMP_DIR``
to the NPKit event dump directory. NPKit only supports one GPU per process.
To manually analyze the NPKit dump results, use `npkit_trace_generator.py <https://github.com/microsoft/NPKit/blob/main/rccl_samples/npkit_trace_generator.py>`_.
MSCCL/MSCCL++
=============
RCCL integrates `MSCCL <https://github.com/microsoft/msccl>`_ and `MSCCL++ <https://github.com/microsoft/mscclpp>`_ to
leverage these highly efficient GPU-GPU communication primitives for collective operations.
Microsoft Corporation collaborated with AMD for this project.
MSCCL uses XMLs for different collective algorithms on different architectures.
RCCL collectives can leverage these algorithms after the user provides the corresponding XML.
The XML files contain sequences of send-recv and reduction operations for the kernel to run.
MSCCL is enabled by default on the AMD Instinct™ MI300X accelerator. On other platforms, users might have to enable it
using the setting ``RCCL_MSCCL_FORCE_ENABLE=1``. By default, MSCCL is only used if every rank belongs
to a unique process. To disable this restriction for multi-threaded or single-threaded configurations,
use the setting ``RCCL_MSCCL_ENABLE_SINGLE_PROCESS=1``.
RCCL allreduce and allgather collectives can leverage the efficient MSCCL++ communication kernels
for certain message sizes. MSCCL++ support is available whenever MSCCL support is available.
To run an RCCL workload with MSCCL++ support, set the following RCCL environment variable:
.. code-block:: shell
RCCL_MSCCLPP_ENABLE=1
To set the message size threshold for using MSCCL++, use the environment variable ``RCCL_MSCCLPP_THRESHOLD``,
which has a default value of 1MB. After ``RCCL_MSCCLPP_THRESHOLD`` has been set,
RCCL invokes MSCCL++ kernels for all message sizes less than or equal to the specified threshold.
The following restrictions apply when using MSCCL++. If these restrictions are not met,
operations fall back to using MSCCL or RCCL.
* The message size must be a non-zero multiple of 32 bytes
* It does not support ``hipMallocManaged`` buffers
* Allreduce only supports the ``float16``, ``int32``, ``uint32``, ``float32``, and ``bfloat16`` data types
* Allreduce only supports the sum operation
Enabling peer-to-peer transport
===============================
To enable peer-to-peer access on machines with PCIe-connected GPUs,
set the HSA environment variable as follows:
.. code-block:: shell
HSA_FORCE_FINE_GRAIN_PCIE=1
This feature requires GPUs that support peer-to-peer access along with
proper large BAR addressing support.
Ignoring CPU affinity with multi-node
=====================================
Depending on the job launcher and the requirements of your workload, performance as the communication workload scales
can be improved by setting ``NCCL_IGNORE_CPU_AFFINITY``. This allows the RCCL communication library to
ignore the job's supplied CPU affinity and use the GPU affinity only.
.. code-block:: shell
NCCL_IGNORE_CPU_AFFINITY=1
For general usage, this environment variable is not set so it doesn't interfere with the user or launcher
supplied preferences.
Improving performance on the MI300X
===================================
This section outlines ways to improve RCCL performance on MI300X systems,
including guidelines for systems with fewer than eight GPUs and the most efficient
GPU partition modes.
Configuration with fewer than eight GPUs
----------------------------------------
On a system with eight MI300X accelerators, each pair of accelerators is
connected with dedicated Infinity Fabric™ links in a fully connected topology.
For collective operations, this can achieve good performance when all eight
accelerators (and all Infinity Fabric links) are used. When fewer than eight
GPUs are used, however, only a fraction of the potential bandwidth on the
system can be achieved. If your workload warrants using fewer than
eight MI300X accelerators on a system, you can set the run-time variable
``NCCL_MIN_NCHANNELS`` to increase the number of channels. For example:
.. code-block:: shell
export NCCL_MIN_NCHANNELS=32
Increasing the number of channels can benefit performance, but it also increases
GPU utilization for collective operations.
Additionally, RCCL pre-defines a higher number of channels when only two or four
accelerators are in use on an 8\*MI300X system. In this situation, RCCL uses 32
channels with two MI300X accelerators and 24 channels for four MI300X
accelerators.
.. _nps4_cpx_mi300_rccl:
NPS4 and CPX partition modes
----------------------------
The term compute partitioning modes, or Modular Chiplet Platform (MCP), refers to the
logical partitioning of XCDs into devices in the ROCm stack. The names are
derived from the number of logical partitions that are created out of the eight
XCDs. In the default mode, SPX (Single Partition X-celerator), all eight XCDs are
viewed as a single logical compute element, meaning that the :doc:`amd-smi <amdsmi:index>`
utility will show a single MI300X device. In CPX (Core Partitioned X-celerator)
mode, each XCD appears as a separate logical GPU, for example, as eight separate
GPUs in :doc:`amd-smi <amdsmi:index>` per MI300X. CPX mode can be viewed as
having explicit scheduling privileges for each individual compute element (XCD).
While compute partitioning modes change the space on which you can assign work
to compute units, the memory partitioning modes (known as Non-Uniform Memory
Access (NUMA) Per Socket (NPS)) change the number of NUMA domains that a device
exposes. In other words, it changes the number of HBM stacks which are
accessible to a compute unit, and therefore the size of its memory space. However,
for the MI300X, the number of memory partitions must be less than or equal to
the number of compute partitions. NPS4 (viewing pairs of HBM stacks as a
disparate element), for example, is only enabled when in CPX mode (viewing each
XCD as a disparate element).
- Compute partition modes
- In SPX mode, workgroups launched to the device are distributed
round-robin to the XCDs in the device, meaning that the programmer cannot
have explicit control over which XCD a workgroup is assigned to.
- In CPX mode, workgroups are launched to a single XCD, meaning the
programmer has explicit control over work placement onto the XCDs.
- Memory partition modes
- In NPS1 mode (compatible with CPX and SPX), the entire memory is accessible
to all XCDs.
- In NPS4 mode (compatible with CPX), each quadrant of the memory is
directly visible to the logical devices in its quadrant. An XCD can still
access all portions of memory through multi-GPU programming techniques.
The MI300 CPX mode can be accessed using the following :doc:`amdsmi:index`
commands.
.. code-block:: shell
amd-smi set --gpu all --compute-partition CPX
amd-smi set --gpu all --memory-partition NPS4
RCCL performance with CPX and NPS4
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To run RCCL allreduce on 64 GPUs with CPX+NPS4 mode on the MI300X, use this
example:
.. code-block:: shell
mpirun -np 64 --bind-to numa rccl-tests/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1
To run RCCL allreduce on 8 GPUs in the same OAM with CPX+NPS4 mode on the
MI300X, use this example:
.. code-block:: shell
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
mpirun -np 8 --bind-to numa rccl-tests/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1
RCCL delivers improved allreduce performance in CPX mode for TP=8 (8 GPUs in
the same OAM) on the MI300X.
.. code-block:: shell
export HIP_FORCE_DEV_KERNARG=1
export RCCL_MSCCLPP_THRESHOLD=1073741824
export MSCCLPP_READ_ALLRED=1
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
mpirun -np 8 --bind-to numa rccl-tests/build/all_reduce_perf -b 32 -e 1G -f 2 -g 1 -G 2 -w 20 -n 50
Here are the benchmark results for in-place (where the output buffer is used as
the input buffer) and out-of-place allreduce bus bandwidth.
.. figure:: ../data/how-to/rccl-usage-tips/in-place_allreduce.png
:alt: In-place allreduce benchmark results
:align: center
.. figure:: ../data/how-to/rccl-usage-tips/out-of-place_allreduce.png
:alt: Out-of-place allreduce benchmark results
:align: center
A significant performance improvement is achievable with optimized CPX mode,
which peaks at ~340 GB/s with a single OAM. The difference in bus bandwidth
between the unoptimized and optimized modes increases as the buffer size grows.
Using RCCL and CPX in PyTorch
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The PyTorch all_reduce benchmark is used to reproduce the performance reported
by RCCL-Tests with the RCCL and CPX optimizations.
.. note::
To use RCCL with CPX mode in PyTorch, check the RCCL version used by PyTorch.
For a virtualenv with a .whl-based PyTorch setup (such as nightly/rocm6.2),
this would be in
``<path-to-your-venv>/lib/<python-version>/site-packages/torch/lib/librccl.so``
This is the version of RCCL that is packaged as part of ROCm version 6.2.
RCCL for CPX mode was enabled in ROCm 6.3.0. To use the CPX features, replace
the existing ``librccl.so`` with one from ROCm 6.3.0 or newer or from a local
build of the RCCL develop branch.
To test the effects of RCCL on PyTorch, the `stas00 all reduce benchmark <https://github.com/stas00/ml-engineering/blob/master/network/benchmarks/all_reduce_bench.py>`_
was used. The following command is used to run a single OAM allreduce
benchmark:
.. code-block:: shell
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -u -m torch.distributed.run --nproc_per_node=8 --rdzv_endpoint localhost:6000 --rdzv_backend c10d all_reduce_bench.py
For better performance, the ``HIP_FORCE_DEV_KERNARG``, ``RCCL_MSCCLPP_THRESHOLD``,
and ``TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK`` environment variables are
set during the benchmark in the following manner:
.. code-block:: shell
export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=1
export HIP_FORCE_DEV_KERNARG=1
export RCCL_MSCCLPP_THRESHOLD=$((2*1024*1024*1024))
export MSCCLPP_READ_ALLRED=1
export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -u -m torch.distributed.run --nproc_per_node=8 --rdzv_endpoint localhost:6000 --rdzv_backend c10d all_reduce_bench.py
The default allreduce PyTorch benchmark peak bus bandwidth performance is
~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a
single OAM peaks at ~315 GB/s.
Context tracking on GPUs
----------------------------------------
Context tracking is disabled by default for optimal performance. However, enabling context tracking can significantly improve performance
in certain scenarios. To enable context tracking, set the following environment variable:
.. code-block:: shell
export RCCL_ENABLE_CONTEXT_TRACKING=1
+249
Voir le fichier
@@ -0,0 +1,249 @@
.. meta::
:description: A guide to troubleshooting the RCCL library of multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
:keywords: RCCL, ROCm, library, API, debug
.. _troubleshooting-rccl:
*********************
Troubleshooting RCCL
*********************
This topic explains the steps to troubleshoot functional and performance issues with RCCL.
While debugging, collect the output from the commands in this guide. This data
can be used as supporting information when submitting an issue report to AMD.
.. _debugging-system-info:
Collecting system information
=============================
Collect this information about the ROCm version, GPU/accelerator, platform, and configuration.
* Verify the ROCm version. This might be a release version or a
mainline or staging version. Use this command to display the version:
.. code:: shell
cat /opt/rocm/.info/version
Run the following command and collect the output:
.. code:: shell
rocm_agent_enumerator
Also, collect the name of the GPU or accelerator:
.. code:: shell
rocminfo
* Run these ``rocm-smi`` commands to display the system topology.
.. code:: shell
rocm-smi
rocm-smi --showtopo
rocm-smi --showdriverversion
* Determine the values of the ``PATH`` and ``LD_LIBRARY_PATH`` environment variables.
.. code:: shell
echo $PATH
echo $LD_LIBRARY_PATH
* Collect the HIP configuration.
.. code:: shell
/opt/rocm/bin/hipconfig --full
* Verify the network settings and setup. Use the ``ibv_devinfo`` command
to display information about the available RDMA devices and determine
whether they are installed and functioning properly. Run ``rdma link``
to print a summary of the network links.
.. code:: shell
ibv_devinfo
rdma link
Isolating the issue
-------------------
The problem might be a general issue or specific to the architecture or system.
To narrow down the issue, collect information about the GPU or accelerator and other
details about the platform and system. Some issues to consider include:
* Is ROCm running on:
* A bare-metal setup
* In a Docker container (determine the name of the Docker image)
* In an SR-IOV virtualized environment
* Some combination of these configurations
* Is the problem only seen on a specific GPU architecture?
* Is it only seen on a specific system type?
* Is it happening on a single node or multinode setup?
* Use the following troubleshooting techniques to attempt to isolate the issue.
* Build or run the develop branch version of RCCL and see if the problem persists.
* Try an earlier RCCL version (minor or major).
* If you recently changed the ROCm runtime configuration, AMD Kernel-mode GPU Driver (KMD), or compiler,
rerun the test with the previous configuration.
.. _collecting-rccl-info:
Collecting RCCL information
=============================
Collect the following information about the RCCL installation and configuration.
* Run the ``ldd`` command to list any dynamic dependencies for RCCL.
.. code:: shell
ldd <specify-path-to-librccl.so>
* Determine the RCCL version. This might be the pre-packaged component in
``/opt/rocm/lib`` or a version that was built from source. To verify the RCCL version,
enter the following command, then run either rccl-tests or an e2e application.
.. code:: shell
export NCCL_DEBUG=VERSION
* Run rccl-tests and collect the results. For information on how to build and run rccl-tests, see the
`rccl-tests GitHub <https://github.com/ROCm/rccl-tests/blob/develop/README.md>`_.
* Collect the RCCL logging information. Enable the debug logs,
then run rccl-tests or any e2e workload to collect the logs. Use the
following command to enable the logs.
.. code:: shell
export NCCL_DEBUG=INFO
.. _use-rccl-replayer:
Using the RCCL Replayer
------------------------
The RCCL Replayer is a debugging tool designed to analyze and replay the collective logs obtained from RCCL runs.
It can be helpful when trying to reproduce problems, because it uses dummy data and doesn't have any dependencies
on non-RCCL calls. For more information,
see `RCCL Replayer GitHub documentation <https://github.com/ROCm/rccl/tree/develop/tools/RcclReplayer>`_.
You must build the RCCL Replayer before you can use it. To build it, run these commands. Ensure ``MPI_DIR`` is set to
the path where MPI is installed.
.. code:: shell
cd rccl/tools/RcclReplayer
MPI_DIR=/path/to/mpi make
To use the RCCL Replayer, follow these steps:
#. Collect the per-rank logs from the RCCL run by adding the following environment variables.
This prevents any race conditions that might cause ranks to interrupt the output from other ranks.
.. code:: shell
NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=COLL NCCL_DEBUG_FILE=some_name_here.%h.%p.log
#. Combine all the logs into a single file. This will become the input to the RCCL Replayer.
.. code:: shell
cat some_name_here.*.log > some_name_here.log
#. Run the RCCL Replayer using the following command. Replace ``<numProcesses>`` with the number of MPI processes to
run, ``</path/to/logfile>`` with the path to the collective log file generated during
the RCCL runs, and ``<numGpusPerMpiRank>`` with the number of GPUs per MPI rank used in the application.
.. code:: shell
mpirun -np <numProcesses> ./rcclReplayer </path/to/logfile> <numGpusPerMpiRank>
In a multi-node application environment, you can replay the collective logs on multiple nodes
using the following command:
.. code:: shell
mpirun --hostfile <path/to/hostfile.txt> -np <numProcesses> ./rcclReplayer </path/to/logfile> <numGpusPerMpiRank>
.. note::
Depending on the MPI library you're using, you might need to modify the ``mpirun`` command.
.. _analyze-performance-info:
Analyzing performance issues
=============================
If the problem involves performance issues in an end-to-end (e2e) workload, try the following
microbenchmarks and collect the results. Follow the instructions in the subsequent sections
to run these benchmarks and provide the results to the support team.
* TransferBench
* RCCL Unit Tests
* rccl-tests
Collect the TransferBench data
---------------------------------
TransferBench allows you to benchmark simultaneous copies between
user-specified devices. For more information,
see the :doc:`TransferBench documentation <transferbench:index>`.
To collect the TransferBench data, follow these steps:
#. Clone the TransferBench Git repository.
.. code:: shell
git clone https://github.com/ROCm/TransferBench.git
#. Change to the new directory and build the component.
.. code:: shell
cd TransferBench
make
#. Run the TransferBench utility with the following parameters and save the results.
.. code:: shell
USE_FINE_GRAIN=1 GFX_UNROLL=2 ./TransferBench a2a 64M 8
Collect the RCCL microbenchmark data
-------------------------------------
To use the RCCL tests to collect the RCCL benchmark data, follow these steps:
#. Disable NUMA auto-balancing using the following command:
.. code:: shell
sudo sysctl kernel.numa_balancing=0
Run the following command to verify the setting. The expected output is ``0``.
.. code:: shell
cat /proc/sys/kernel/numa_balancing
#. Build MPI, RCCL, and rccl-tests. To download and install MPI, see either
`OpenMPI <https://www.open-mpi.org/software/ompi/v5.0/>`_ or `MPICH <https://www.mpich.org/>`_.
To learn how to build and run rccl-tests, see the `rccl-tests GitHub <https://github.com/ROCm/rccl-tests/blob/develop/README.md>`_.
#. Run rccl-tests with MPI and collect the performance numbers.
RCCL and NCCL comparisons
=============================
If you are also using NVIDIA hardware or NCCL and notice a performance gap between the two systems,
collect the system and performance data on the NVIDIA platform.
Provide both sets of data to the support team.
+333
Voir le fichier
@@ -0,0 +1,333 @@
.. meta::
:description: How to use the NCCL Net API
:keywords: RCCL, ROCm, library, API, NCCL Net, plugin
.. _using-nccl:
*****************************
Using the NCCL Net plugin API
*****************************
NCCL provides a way to use external plugins to let NCCL run on many network types. This
topic describes the NCCL Net plugin API and explains how to implement a network plugin for NCCL.
Plugins implement the NCCL network API and decouple NCCL binary builds, which are built against a
particular version of the GPU stack (such as NVIDIA CUDA), from the network code, which is built against a
particular version of the networking stack. Using this method, you can easily integrate any CUDA version
with any network stack version.
NCCL network plugins are packaged as a shared library called ``librccl-net.so``. The shared library
contains one or more implementations of the NCCL Net API in the form of versioned structs,
which are filled with pointers to all required functions.
Plugin architecture
===================
When NCCL is initialized, it searches for a ``librccl-net.so`` library and dynamically loads it,
then searches for symbols inside the library.
The ``NCCL_NET_PLUGIN`` environment variable allows multiple plugins to coexist. If it's set, NCCL
looks for a library named ``librccl-net-${NCCL_NET_PLUGIN}.so``. It is therefore
recommended that you name the library according to that pattern, with a symlink pointing from ``librccl-net.so``
to ``librccl-net-${NCCL_NET_PLUGIN}.so``. This lets users select the correct plugin
if there are multiple plugins in the path.
Struct versioning
-----------------
After a library is found, NCCL looks for a symbol named ``ncclNet_vX``, with ``X`` increasing
over time. This versioning pattern ensures that the plugin and the NCCL core are compatible.
Plugins are encouraged to provide a number of these symbols, implementing many versions
of the NCCL Net API. This is so the same plugin can be compiled for and support a wide range of NCCL
versions.
Conversely, and to ease transition, NCCL can choose to support different plugin versions. It can look
for the latest ``ncclNet`` struct version but also search for older versions, so that older plugins
still work.
In-network collective operations (collNet)
----------------------------------------------
In addition to the ``ncclNet`` structure, network plugins can provide a ``collNet`` structure which
implements any supported in-network collective operations. This is an optional
structure provided by the network plugin,
but its versioning is tied to the ``ncclNet`` structure and many functions are common between the two to
ease implementation. The ``collNet`` structure can be used by the NCCL ``collNet``
algorithm to accelerate inter-node reductions in allReduce.
Header management
------------------
To help users effortlessly build plugins, plugins should copy the ``ncclNet_vX`` definitions
they support to their list of internal includes. An example is shown in ``ext-net/example/``, which stores
all headers in the ``nccl/`` directory and provides thin layers to implement old versions on top
of newer ones.
The ``nccl/`` directory is populated with ``net_vX.h`` files, which extract all relevant definitions
from the old API versions. It also provides error codes in ``err.h``.
API (v6)
=========
Here is the main ``ncclNet_v6`` struct. Each function is explained in later sections.
.. code:: c
// Version 6 of the NCCL Net plugin interface: a struct of function pointers
// that a network plugin fills in with its implementation.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support: registration variant that additionally takes an offset and
   a file descriptor (fd), and a size_t size instead of an int. */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
// Deregister memory: takes the comm and the mhandle to unregister.
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
// Close and free a listen comm object
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
Error codes
-----------
All plugin functions use NCCL error codes as their return value. ``ncclSuccess`` should be returned upon
success. Otherwise, plugins can return one of the following codes:
* ``ncclSystemError`` is the most common error for network plugins. It should be returned when a call to the Linux kernel or a system library fails. This typically includes all network and hardware errors.
* ``ncclInternalError`` is returned when the NCCL core code is using the network plugin in an incorrect way, for example, allocating more requests than it should or passing an invalid argument in API calls.
* ``ncclInvalidUsage`` should be returned when the error is most likely due to user error. This can include misconfiguration, but also size mismatches.
* ``ncclInvalidArgument`` should not typically be used by plugins because arguments should be checked by the NCCL core layer.
* ``ncclUnhandledCudaError`` is returned when an error is received from NVIDIA CUDA. Network plugins should not need to rely on CUDA, so this error should not be common.
Operational overview
--------------------
NCCL first calls the ``init`` function, queries the number of network devices with the
``devices`` function, and retrieves the properties from each network device using ``getProperties``.
To establish a connection between two network devices, NCCL first calls ``listen`` on the
receiving side. It passes the returned handle to the sender side of the connection, and uses it to call ``connect``.
Finally, ``accept`` is called on the receiving side to finalize the establishment of the connection.
After the connection is established, communication is performed using the functions ``isend``,
``irecv``, and ``test``. Prior to calling ``isend`` or ``irecv``, NCCL calls the ``regMr`` function on
all buffers to allow RDMA NICs to prepare the buffers. ``deregMr`` is used to unregister buffers.
In certain conditions, ``iflush`` is called after a ``receive`` call completes to allow the network
plugin to flush data and ensure the GPU processes the newly written data.
To close the connections, NCCL calls ``closeListen`` to close the object returned by ``listen``,
``closeSend`` to close the object returned by ``connect``, and ``closeRecv`` to close the object returned
by ``accept``.
API Functions
-------------
The NCCL Net plugin API provides the following interface for initialization, connection management, and
communications.
Initialization
^^^^^^^^^^^^^^
* ``name`` - The ``name`` field should point to a character string with the name of the network plugin. This name is used for all logging, especially when ``NCCL_DEBUG=INFO`` is set.
.. note::
Setting ``NCCL_NET=<plugin name>`` ensures a specific network implementation is used, with
a matching ``name``. This is not to be confused with ``NCCL_NET_PLUGIN`` which defines a suffix for the
``librccl-net.so`` library name to load.
* ``init`` - As soon as NCCL finds the plugin and the correct ``ncclNet`` symbol, it calls the ``init`` function. This allows the plugin to discover network devices and ensure they are usable.
If the ``init`` function does not return ``ncclSuccess``, then NCCL does not use the plugin and falls back to internal ones.
To allow the plugin logs to seamlessly integrate into the NCCL logs, NCCL provides a logging function to ``init``. This function is typically used to allow ``INFO`` and ``WARN`` macros within the plugin code by adding the following definitions:
.. code:: c
#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
* ``devices`` - After the plugin is initialized, NCCL queries the number of devices available.
This should not be zero. Otherwise, NCCL initialization will fail. If no device is present or usable, the ``init`` function should not return ``ncclSuccess``.
* ``getProperties`` - Right after retrieving the number of devices, NCCL queries the properties for each available network device.
These properties are necessary when multiple adapters are present to ensure NCCL uses each adapter in the optimal way.
* The ``name`` is only used for logging.
* The ``pciPath`` is the base for all topology detection and should point to the PCI device directory
in ``/sys``. This is typically the directory pointed to by ``/sys/class/net/eth0/device`` or
``/sys/class/infiniband/mlx5_0/device``. If the network interface is virtual, then ``pciPath`` should
be ``NULL``.
* The ``guid`` field is used to determine whether network adapters are connected to multiple PCI
endpoints. For normal cases, this is set to the device number. If multiple network devices have
the same ``guid``, then NCCL understands them to be sharing the same network port to the fabric. In this case,
it will not use the port multiple times.
* The ``ptrSupport`` field indicates whether or not CUDA pointers are supported. If so, it should be
set to ``NCCL_PTR_HOST|NCCL_PTR_CUDA``. Otherwise, it should be set to ``NCCL_PTR_HOST``. If the plugin
supports ``dmabuf``, it should set ``ptrSupport`` to ``NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF`` and
provide a ``regMrDmaBuf`` function.
* The ``regIsGlobal`` field allows NCCL to register buffers in advance, for example, using a loopback connection.
Later, it also lets NCCL expect that a subsequent registration on a buffer from a previous registration
will happen nearly immediately, because the buffer is already known by the network adapter. A typical
implementation maintains a registration cache, with the call to ``ncclCommRegister`` creating the
initial entry in the cache using ``regMr()`` on a loopback connection. Any later call to the NCCL
system can call ``regMr()`` again on the real connection, with the real buffer (which could be at a
different offset within the original buffer, with a smaller size, for example). It
could then call ``deregMr()`` immediately afterwards.
The ``ncclCommDeregister`` call should issue the final call to ``deregMr()`` and effectively remove the mapping
on the network adapter.
* The ``speed`` field indicates the speed of the network port in Mbps (10^6 bits per second).
This ensures proper optimization of flows within the node.
* The ``port`` field indicates the port number. This is important for topology detection and
flow optimization within the node when a NIC with a single PCI connection is connected to the fabric through multiple ports.
* The ``latency`` field indicates the network latency in microseconds. This can be useful to
improve the NCCL tuning and ensure NCCL switches from tree to ring at the correct size.
* The ``maxComms`` field indicates the maximum number of connections that can be created.
* The ``maxRecvs`` field indicates the maximum number for grouped receive operations (see grouped receive).
Connection establishment
^^^^^^^^^^^^^^^^^^^^^^^^
Connections are used in a unidirectional manner, with a sender side and a receiver
side.
* ``listen`` - To create a connection, NCCL calls ``listen`` on the receiver side.
This function accepts a device number as an input argument and returns a local ``listenComm`` object and a ``handle``
to pass to the other side of the connection, so that the sender can connect to the receiver.
The ``handle`` is a buffer of size ``NCCL_NET_HANDLE_MAXSIZE`` and is provided by NCCL.
This call should never block, but unlike ``connect`` and ``accept``, ``listenComm`` should never be ``NULL``
if the call succeeds.
* ``connect`` - NCCL uses its bootstrap infrastructure to provide the ``handle`` to the sender side,
then calls ``connect`` on the sender side on a given device index ``dev`` and provides the ``handle``.
``connect`` should not block either. If the connection cannot be established immediately, it should set ``sendComm`` to ``NULL`` and return ``ncclSuccess``.
In that case, NCCL will keep calling ``connect`` again until it succeeds.
* ``accept`` - To finalize the connection, the receiver side calls ``accept`` on the ``listenComm`` object
previously returned by the ``listen`` call. If the sender did not connect yet, ``accept`` should not block.
It should return ``ncclSuccess``, setting ``recvComm`` to ``NULL``. NCCL will keep calling ``accept``
again until it succeeds.
* ``closeListen`` / ``closeSend`` / ``closeRecv`` - When a ``listenComm``, ``sendComm``, or ``recvComm`` object is no longer
needed, NCCL calls ``closeListen``, ``closeSend``, or ``closeRecv`` to free the associated resources.
Communication
^^^^^^^^^^^^^
Communication is handled using the asynchronous send and receive operations: ``isend``, ``irecv``, and ``test``.
To support RDMA capabilities, buffer registration and flush functions are provided.
To keep track of asynchronous send, receive, and flush operations, requests are returned to NCCL,
then queried using ``test``. Each ``sendComm`` or ``recvComm`` must be able to handle
``NCCL_NET_MAX_REQUESTS`` requests in parallel.
.. note::
This value should be multiplied by the multi-receive capability of the plugin for the sender
side, so the plugin can effectively have ``NCCL_NET_MAX_REQUESTS`` multi-receive operations happening
in parallel. If ``maxRecvs`` is 8 and ``NCCL_NET_MAX_REQUESTS`` is 8, then each
``sendComm`` must be able to handle up to 64 (8x8) concurrent ``isend`` operations.
* ``regMr`` - Prior to sending or receiving data, NCCL calls ``regMr`` with any buffers later used for communication.
It provides a ``sendComm`` or ``recvComm`` object for the ``comm`` argument,
the buffer pointer ``data``, the ``size``, and the ``type``. The type is either ``NCCL_PTR_HOST`` or ``NCCL_PTR_CUDA`` if
the network supports CUDA pointers.
The network plugin can use the output argument ``mhandle`` to store any reference to the memory registration, because
``mhandle`` is passed back to the plugin in all ``isend``, ``irecv``, ``iflush``, and ``deregMr`` calls.
* ``regMrDmaBuf`` - If the plugin has set the ``NCCL_PTR_DMABUF`` property in ``ptrSupport``,
NCCL uses ``regMrDmaBuf`` instead of ``regMr``. If the property was not set, ``regMrDmaBuf`` can be set to ``NULL``.
* ``deregMr`` - When buffers are no longer used for communication, NCCL calls ``deregMr`` to let the plugin
free resources. This function is used to deregister handles returned by ``regMr`` and ``regMrDmaBuf``.
* ``isend`` - Data is sent through the connection using ``isend``, passing the ``sendComm`` object previously created
by ``connect``, the buffer described by ``data``, ``size``, and ``mhandle``. A ``tag`` must
be used if the network supports multi-receive operations (see ``irecv``) to distinguish between different send requests
matching the same multi-receive. Otherwise it can be set to ``0``.
The ``isend`` operation returns a handle in the ``request`` argument for further calls to ``test``.
If the ``isend`` operation cannot be initiated, ``request`` is set to ``NULL``. NCCL will call ``isend`` again later.
* ``irecv`` - To receive data, NCCL calls ``irecv`` with the ``recvComm`` returned by ``accept``.
The argument ``n`` configures NCCL for multi-receive, to allow grouping of multiple sends
through a single network connection. Each buffer can be described by the ``data``, ``sizes``, and ``mhandles`` arrays.
``tags`` specify a tag for each receive so that each of the ``n`` independent ``isend`` operations is received
into the right buffer.
If all receive operations can be initiated, ``irecv`` returns a handle in the ``request`` pointer. Otherwise,
it sets the pointer to ``NULL``. In the case of multi-receive, all ``n`` receive operations are handled by a single request handle.
The sizes provided to ``irecv`` can (and will) be larger than the size of the ``isend`` operation.
However, it is an error if the receive size is smaller than the send size.
.. note::
For a given connection, send and receive operations should always match in the order they were
posted. Tags provided for receive operations are only used to assign a given send operation to one
of the buffers of the first (multi-)receive operation in the queue, not to allow for out-of-order tag
matching on any receive operation posted.
* ``test`` - After an ``isend`` or ``irecv`` operation is initiated, NCCL calls ``test`` on the request handles until
the operation completes. When that happens, ``done`` is set to ``1`` and ``sizes`` is set to the real size sent or received.
For a receive operation, this size can be lower than the size passed to ``irecv``.
In the case of a multi-receive, all receives are considered as part of a single operation, the goal
being to allow aggregation. Therefore, they share a single request and a single ``done`` status. However,
they can have different sizes, so if ``done`` is non-zero, the ``sizes`` array should contain the ``n`` sizes
corresponding to the buffers passed to ``irecv``.
After ``test`` returns ``1`` in ``done``, the request handle can be freed. This means that NCCL will never
call ``test`` again on that request, unless it is reallocated by another call to ``isend`` or ``irecv``.
* ``iflush`` - After a receive operation completes, if the operation was targeting GPU memory and received
a non-zero number of bytes, NCCL calls ``iflush``. This lets the network flush any buffer to ensure
the GPU can read it immediately without seeing stale data. This flush operation is decoupled from
the ``test`` code to improve the latency of ``LL*`` protocols, because those are capable of determining
when data is valid or not.
``iflush`` returns a request which must be queried using ``test`` until it completes.
+135
Voir le fichier
@@ -0,0 +1,135 @@
.. meta::
:description: How to use the RCCL Tuner plugin API
:keywords: RCCL, ROCm, library, API, Tuner, plugin
.. _using-rccl-tuner-plugin:
*******************************
Using the RCCL Tuner plugin API
*******************************
An external plugin enables users to hand-tailor the selection of an algorithm,
protocol, and number of channels (thread blocks) based on an input configuration specifying the
message size, number of nodes and GPUs, and link types (for instance, PCIe, XGMI, or NET).
One advantage of this plugin is that each user can create and maintain their own hand-tailored tuner
without relying on RCCL to develop and maintain it. This topic describes the API required to implement
an external tuner plugin for RCCL.
The following usage notes are relevant when using the RCCL Tuner plugin API:
* The API allows partial outputs: tuners can set only the algorithm and protocol and let RCCL set the remaining fields,
such as the number of channels.
* If ``getCollInfo()`` fails, RCCL uses its default internal mechanisms to determine the best collective configuration.
* ``getCollInfo`` is called for each collective invocation per communicator, so special care
must be taken to avoid introducing excessive latency.
* The supported RCCL algorithms are ``NCCL_ALGO_TREE`` and ``NCCL_ALGO_RING``.
* The supported RCCL protocols are ``NCCL_PROTO_SIMPLE``, ``NCCL_PROTO_LL`` and ``NCCL_PROTO_LL128``.
* Until support is present for network collectives, use the example in the ``pluginGetCollInfo`` API implementation
to ignore other algorithms as follows:
.. code-block:: cpp
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1) continue;
.. note::
The `example plugin <https://github.com/ROCm/rccl/blob/develop/ext-tuner/example/plugin.c>`_
uses math models to approximate the bandwidth and latency of the available selection of algorithms and protocols
and select the one with the lowest calculated latency. It is customized for the AMD Instinct MI300 accelerators and RoCEv2 networks
on a limited number of nodes. This example, which is intended for demonstration purposes only, is not meant to be inclusive of all potential AMD GPUs and network configurations.
API description
================
To build a custom tuner, implement the ``ncclTuner_v1_t`` structure.
Structure: ncclTuner_v1_t
---------------------------
**Fields**
* ``name``
* **Type**: ``const char*``
* **Description**: The name of the tuner, which can be used for logging purposes when ``NCCL_DEBUG=info`` and ``NCCL_DEBUG_SUBSYS=tune`` are set.
**Functions**
* ``init`` (called upon communicator initialization with ``ncclCommInitRank``)
Initializes the tuner states. Each communicator initializes its tuner. ``nNodes`` x ``nRanks`` = the total number of GPUs participating in the collective communication.
* **Parameters**:
* ``nRanks`` (``size_t``): The number of devices (GPUs).
* ``nNodes`` (``size_t``): The number of operating system nodes (physical nodes or VMs).
* ``logFunction`` (``ncclDebugLogger_t``): A log function for certain debugging info.
* **Return**:
* **Type**: ``ncclResult_t``
* **Description**: The result of the initialization.
* ``getCollInfo`` (called for each collective call per communicator)
Retrieves information about the collective algorithm, protocol, and number of channels for the given input parameters.
* **Parameters**:
* ``collType`` (``ncclFunc_t``): The collective type, for example, ``allreduce``, ``allgather``, etc.
* ``nBytes`` (``size_t``): The size of the collective in bytes.
* ``collNetSupport`` (``int``): Whether ``collNet`` supports this type.
* ``nvlsSupport`` (``int``): Whether NVLink SHARP supports this type.
* ``numPipeOps`` (``int``): The number of operations in the group.
* **Outputs**:
* ``algorithm`` (``int*``): The selected algorithm to be used for the given collective.
* ``protocol`` (``int*``): The selected protocol to be used for the given collective.
* ``nChannels`` (``int*``): The number of channels (and SMs) to be used.
* **Description**:
If ``getCollInfo()`` does not return ``ncclSuccess``, RCCL falls back to its default tuning for the given collective.
The tuner is allowed to leave fields unset, in which case RCCL automatically sets those fields.
* **Return**:
* **Type**: ``ncclResult_t``
* **Description**: The result of the operation.
* ``destroy`` (called upon communicator finalization with ``ncclCommFinalize``)
Terminates the plugin and cleans up any resources allocated by the tuner.
* **Return**:
* **Type**: ``ncclResult_t``
* **Description**: The result of the cleanup process.
Build and usage instructions
============================
To use the external plugin, implement the desired algorithm and protocol selection technique using the API described above.
As a reference, the `following example <https://github.com/ROCm/rccl/blob/develop/ext-tuner/example/plugin.c>`_ is based on the
MI300 tuning table by default.
Building and using the example libnccl-tuner.so file
-----------------------------------------------------
#. Build the ``libnccl-tuner.so`` file following `the example Makefile <https://github.com/ROCm/rccl/blob/develop/ext-tuner/example/Makefile>`_.
.. code-block:: shell
cd $RCCL_HOME/ext-tuner/example/
make
#. Tell RCCL to use the custom ``libnccl-tuner.so`` file by setting the following environment variable
to the file path:
.. code-block:: shell
export NCCL_TUNER_PLUGIN=$RCCL_HOME/ext-tuner/example/libnccl-tuner.so
+50
Voir le fichier
@@ -0,0 +1,50 @@
.. meta::
:description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
:keywords: RCCL, ROCm, library, API
.. _index:
******************
RCCL documentation
******************
The ROCm Communication Collectives Library (RCCL) is a stand-alone library
that provides multi-GPU and multi-node collective communication primitives
optimized for AMD GPUs. It uses PCIe and xGMI high-speed interconnects.
To learn more, see :doc:`what-is-rccl`.
The RCCL public repository is located at `<https://github.com/ROCm/rccl>`_.
.. grid:: 2
:gutter: 3
.. grid-item-card:: Install
* :doc:`Installing RCCL using the install script <./install/installation>`
* :doc:`Running RCCL using Docker <./install/docker-install>`
* :doc:`Building and installing RCCL from source code <./install/building-installing>`
.. grid-item-card:: How to
* :doc:`Using the RCCL Tuner plugin <./how-to/using-rccl-tuner-plugin-api>`
* :doc:`Using the NCCL Net plugin <./how-to/using-nccl>`
* :doc:`Troubleshoot RCCL <./how-to/troubleshooting-rccl>`
* :doc:`RCCL usage tips <./how-to/rccl-usage-tips>`
.. grid-item-card:: Examples
* `RCCL Tuner plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-tuner/example>`_
* `NCCL Net plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-net/example>`_
.. grid-item-card:: API reference
* :ref:`Library specification<library-specification>`
* :ref:`api-library`
* :ref:`Environment variables<env-variables>`
To contribute to the documentation, see
`Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
You can find licensing information on the
`Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
+103
Voir le fichier
@@ -0,0 +1,103 @@
.. meta::
:description: Information on how to build the RCCL library from source code
:keywords: RCCL, ROCm, library, API, build, install
.. _building-from-source:
*********************************************
Building and installing RCCL from source code
*********************************************
To build RCCL directly from the source code, follow these steps. This guide also includes
instructions explaining how to test the build.
For information on using the quick start install script to build RCCL, see :doc:`installation`.
Requirements
============
The following prerequisites are required to build RCCL:
1. ROCm-supported GPUs
2. The ROCm stack installed on the system, including the :doc:`HIP runtime <hip:index>` and the HIP-Clang compiler.
Building the library using CMake:
---------------------------------
To build the library from source, follow these steps:
.. code-block:: shell
git clone --recursive https://github.com/ROCm/rccl.git
cd rccl
mkdir build
cd build
cmake ..
make -j 16 # Or some other suitable number of parallel jobs
If you have already cloned the repository, you can check out the external submodules manually.
.. code-block:: shell
git submodule update --init --recursive --depth=1
You can substitute a different installation path by providing the path as a parameter
to ``CMAKE_INSTALL_PREFIX``, for example:
.. code-block:: shell
cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release ..
.. note::
Ensure ROCm CMake is installed using the command ``apt install rocm-cmake``. By default,
CMake builds the component in debug mode unless ``-DCMAKE_BUILD_TYPE`` is specified.
Building the RCCL package and install package:
----------------------------------------------
After you have cloned the repository and built the library as described in the previous section,
use this command to build the package:
.. code-block:: shell
cd rccl/build
make package
sudo dpkg -i *.deb
.. note::
The RCCL package install process requires ``sudo`` or root access because it creates a directory
named ``rccl`` in ``/opt/rocm/``. This is an optional step. RCCL can be used directly by including the path containing ``librccl.so``.
Testing RCCL
============
The RCCL unit tests are implemented using the Googletest framework in RCCL. These unit tests require Googletest 1.10
or higher to build and run (this dependency can be installed using the ``-d`` option for ``install.sh``).
To run the RCCL unit tests, go to the ``build`` folder and the ``test`` subfolder,
then run the appropriate RCCL unit test executables.
The RCCL unit test names follow this format:
.. code-block:: shell
CollectiveCall.[Type of test]
Filtering of the RCCL unit tests can be done using environment variables
and by passing the ``--gtest_filter`` command line flag:
.. code-block:: shell
UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
This command runs only the ``AllReduce`` correctness tests with the ``bfloat16`` datatype and the ``prod`` reduction operation.
A list of the available environment variables for filtering appears at the top of every run.
See the `Googletest documentation <https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests>`_
for more information on how to form advanced filters.
There are also other performance and error-checking tests for RCCL. They are maintained separately at `<https://github.com/ROCm/rccl-tests>`_.
.. note::
For more information on how to build and run rccl-tests, see the `rccl-tests README file <https://github.com/ROCm/rccl-tests/blob/develop/README.md>`_.
+52
Voir le fichier
@@ -0,0 +1,52 @@
.. meta::
:description: Instructions on how to install the RCCL library for collective communication primitives using Docker
:keywords: RCCL, ROCm, library, API, install, Docker
.. _install-docker:
*****************************************
Running RCCL using Docker
*****************************************
To use Docker to run RCCL, Docker must already be installed on the system.
To build the Docker image and run the container, follow these steps.
#. Build the Docker image
By default, the Dockerfile uses ``docker.io/rocm/dev-ubuntu-22.04:latest`` as the base Docker image.
It then installs RCCL and rccl-tests (in both cases, it uses the version from the ``develop`` branch).
Use this command to build the Docker image:
.. code-block:: shell
docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
The base Docker image, rccl repository, rccl-tests repository, and GPU targets can be modified
by using ``--build-arg`` in the ``docker build`` command above. For example, to use a different base Docker image for the MI250 GPU,
use this command:
.. code-block:: shell
docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull .
#. Launch an interactive Docker container on a system with AMD GPUs:
.. code-block:: shell
docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash
To run, for example, the ``all_reduce_perf`` test from rccl-tests on 8 AMD GPUs from inside the Docker container, use this command
for ROCm 6.4.1 or earlier:
.. code-block:: shell
mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
For ROCm 6.4.2 or later, use this command:
.. code-block:: shell
mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
For more information on the rccl-tests options, see the `Usage guidelines <https://github.com/ROCm/rccl-tests#usage>`_ in the GitHub repository.
+85
Voir le fichier
@@ -0,0 +1,85 @@
.. meta::
:description: Instructions on how to install the RCCL library for collective communication primitives using the quick start install script
:keywords: RCCL, ROCm, library, API, install
.. _install:
*****************************************
Installing RCCL using the install script
*****************************************
To quickly install RCCL using the install script, follow these steps.
For instructions on building RCCL from the source code, see :doc:`building-installing`.
For additional tips, see :doc:`../how-to/rccl-usage-tips`.
Requirements
============
The following prerequisites are required to use RCCL:
1. ROCm-supported GPUs
2. The ROCm stack must be installed on the system, including the :doc:`HIP runtime <hip:index>` and the HIP-Clang compiler.
Quick start RCCL build
======================
RCCL directly depends on the HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
For ROCm installation instructions, see the :doc:`package manager installation guide <rocm-install-on-linux:install/install-methods/package-manager-index>`.
Use the `install.sh helper script <https://github.com/ROCm/rccl/blob/develop/install.sh>`_,
located in the root directory of the RCCL repository,
to build and install RCCL with a single command. It uses hard-coded configurations that can be specified directly
when using cmake. However, it's a great way to get started quickly and provides an
example of how to build and install RCCL.
Building the library using the install script:
----------------------------------------------
To build the library using the install script, use this command:
.. code-block:: shell
./install.sh
For more information on the build options and flags for the install script, run the following command:
.. code-block:: shell
./install.sh --help
The RCCL build and installation helper script options are as follows:
.. code-block:: shell
--address-sanitizer Build with address sanitizer enabled
-d|--dependencies Install RCCL dependencies
--debug Build debug library
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--disable-msccl-kernel Build without MSCCL kernels
--enable-mscclpp Build with MSCCL++ support
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
-h|--help Prints this help message
-i|--install Install RCCL library (see --prefix argument below)
-j|--jobs Specify how many parallel compilation jobs to run ($nproc by default)
-l|--local_gpu_only Only compile for local GPU architecture
--amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
--no_clean Don't delete files if they already exist
--npkit-enable Compile with npkit enabled
--openmp-test-enable Enable OpenMP in rccl unit tests
--roctx-enable Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program)
-p|--package_build Build RCCL package
--prefix Specify custom directory to install RCCL to (default: `/opt/rocm`)
--rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility
--run_tests_all Run all rccl unit tests (must be built already)
-r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
--static Build RCCL as a static library instead of shared library
-t|--tests_build Build rccl unit tests, but do not run
--time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system)
--verbose Show compile commands
.. tip::
By default, the RCCL install script builds all the GPU targets that are defined in ``DEFAULT_GPUS`` in `CMakeLists.txt <https://github.com/ROCm/rccl/blob/develop/CMakeLists.txt>`_.
To target specific GPUs and potentially reduce the build time, use ``--amdgpu_targets`` along with
a semicolon (``;``) separated string list of the GPU targets.
+8
Voir le fichier
@@ -0,0 +1,8 @@
.. meta::
:description: RCCL licensing information
:keywords: RCCL, ROCm, library, API, license
License
=======
.. include:: ../LICENSE.txt
+45
Voir le fichier
@@ -0,0 +1,45 @@
root: index
subtrees:
- entries:
- file: what-is-rccl.rst
title: What is RCCL?
- caption: Install
entries:
- file: install/installation
title: Installation guide
- file: install/docker-install
title: Running RCCL using Docker
- file: install/building-installing
title: Building and installing from source
- caption: How to
entries:
- file: how-to/using-rccl-tuner-plugin-api
title: Using the RCCL Tuner plugin
- file: how-to/using-nccl
title: Using the NCCL Net plugin
- file: how-to/troubleshooting-rccl
title: Troubleshoot RCCL
- file: how-to/rccl-usage-tips
- caption: Examples
entries:
- url: https://github.com/ROCm/rccl/tree/develop/ext-tuner/example
title: RCCL Tuner plugin examples
- url: https://github.com/ROCm/rccl/tree/develop/ext-net/example
title: NCCL Net plugin examples
- caption: API reference
entries:
- file: api-reference/library-specification
title: Library specification
- file: api-reference/api-library
- file: api-reference/env-variables
title: Environment variables
- caption: About
entries:
- file: license
- file: attributions
+1
Voir le fichier
@@ -0,0 +1 @@
rocm-docs-core==1.26.0
+277
Voir le fichier
@@ -0,0 +1,277 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
#
accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==0.7.16
# via sphinx
asttokens==3.0.0
# via stack-data
attrs==24.3.0
# via
# jsonschema
# jupyter-cache
# referencing
babel==2.15.0
# via
# pydata-sphinx-theme
# sphinx
beautifulsoup4==4.12.3
# via pydata-sphinx-theme
breathe==4.35.0
# via rocm-docs-core
certifi==2024.7.4
# via requests
cffi==1.16.0
# via
# cryptography
# pynacl
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# jupyter-cache
# sphinx-external-toc
comm==0.2.2
# via ipykernel
cryptography==44.0.1
# via pyjwt
debugpy==1.8.12
# via ipykernel
decorator==5.1.1
# via ipython
deprecated==1.2.14
# via pygithub
docutils==0.21.2
# via
# breathe
# myst-parser
# pydata-sphinx-theme
# sphinx
exceptiongroup==1.2.2
# via ipython
executing==2.1.0
# via stack-data
fastjsonschema==2.19.1
# via
# nbformat
# rocm-docs-core
gitdb==4.0.11
# via gitpython
gitpython==3.1.43
# via rocm-docs-core
greenlet==3.1.1
# via sqlalchemy
idna==3.7
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==8.6.1
# via
# jupyter-cache
# myst-nb
ipykernel==6.29.5
# via myst-nb
ipython==8.31.0
# via
# ipykernel
# myst-nb
jedi==0.19.2
# via ipython
jinja2==3.1.6
# via
# myst-parser
# sphinx
jsonschema==4.23.0
# via nbformat
jsonschema-specifications==2024.10.1
# via jsonschema
jupyter-cache==1.0.1
# via myst-nb
jupyter-client==8.6.3
# via
# ipykernel
# nbclient
jupyter-core==5.7.2
# via
# ipykernel
# jupyter-client
# nbclient
# nbformat
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.5
# via jinja2
matplotlib-inline==0.1.7
# via
# ipykernel
# ipython
mdit-py-plugins==0.4.1
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==1.1.2
# via rocm-docs-core
myst-parser==3.0.1
# via myst-nb
nbclient==0.10.2
# via
# jupyter-cache
# myst-nb
nbformat==5.10.4
# via
# jupyter-cache
# myst-nb
# nbclient
nest-asyncio==1.6.0
# via ipykernel
packaging==24.0
# via
# ipykernel
# sphinx
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
platformdirs==4.3.6
# via jupyter-core
prompt-toolkit==3.0.50
# via ipython
psutil==6.1.1
# via ipykernel
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.16.1
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.3.0
# via rocm-docs-core
pygments==2.18.0
# via
# accessible-pygments
# ipython
# pydata-sphinx-theme
# sphinx
pyjwt[crypto]==2.8.0
# via pygithub
pynacl==1.5.0
# via pygithub
python-dateutil==2.9.0.post0
# via jupyter-client
pyyaml==6.0.1
# via
# jupyter-cache
# myst-nb
# myst-parser
# rocm-docs-core
# sphinx-external-toc
pyzmq==26.2.0
# via
# ipykernel
# jupyter-client
referencing==0.36.1
# via
# jsonschema
# jsonschema-specifications
requests==2.32.4
# via
# pygithub
# sphinx
rocm-docs-core==1.26.0
# via -r requirements.in
rpds-py==0.22.3
# via
# jsonschema
# referencing
six==1.17.0
# via python-dateutil
smmap==5.0.1
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.5
# via beautifulsoup4
sphinx==7.3.7
# via
# breathe
# myst-nb
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
# sphinx-book-theme
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
sphinx-book-theme==1.1.2
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
sphinx-design==0.6.0
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-notfound-page==1.0.2
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.8
# via sphinx
sphinxcontrib-devhelp==1.0.6
# via sphinx
sphinxcontrib-htmlhelp==2.0.5
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==1.0.7
# via sphinx
sphinxcontrib-serializinghtml==1.1.10
# via sphinx
sqlalchemy==2.0.37
# via jupyter-cache
stack-data==0.6.3
# via ipython
tabulate==0.9.0
# via jupyter-cache
tomli==2.0.1
# via sphinx
tornado==6.5.1
# via
# ipykernel
# jupyter-client
traitlets==5.14.3
# via
# comm
# ipykernel
# ipython
# jupyter-client
# jupyter-core
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.12.0
# via
# ipython
# myst-nb
# pydata-sphinx-theme
# pygithub
# referencing
# sqlalchemy
urllib3==2.5.0
# via
# pygithub
# requests
wcwidth==0.2.13
# via prompt-toolkit
wrapt==1.16.0
# via deprecated
zipp==3.21.0
# via importlib-metadata
+31
Voir le fichier
@@ -0,0 +1,31 @@
.. meta::
:description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
:keywords: RCCL, ROCm, library, API
.. _what-is:
******************
What is RCCL?
******************
The ROCm Communication Collectives Library (RCCL) includes multi-GPU and
multi-node collective communication primitives optimized for AMD GPUs.
It implements routines such as ``all-reduce``, ``all-gather``, ``reduce``,
``broadcast``, ``reduce-scatter``, ``gather``, ``scatter``, ``all-to-allv``,
and ``all-to-all``, as well as direct point-to-point (GPU-to-GPU) send
and receive operations. It is optimized to achieve high bandwidth
on platforms using PCIe and xGMI and networking using InfiniBand Verbs or TCP/IP
sockets. RCCL supports an arbitrary number of GPUs installed in a single node
or multiple nodes and can be used in either
single- or multi-process (for example, MPI) applications.
The collective operations are implemented using ring and tree algorithms and have been optimized
for throughput and latency by leveraging topology awareness, high-speed interconnects,
and RDMA-based collectives. For best performance, small operations can be either
batched into larger operations or aggregated through the API.
RCCL uses PCIe and xGMI high-speed interconnects for intra-node communication
as well as InfiniBand, RoCE, and TCP/IP for inter-node communication.
It supports an arbitrary number of GPUs installed in a single-node or
multi-node platform and can easily integrate into
single- or multi-process (for example, MPI) applications.
+419
Voir le fichier
@@ -0,0 +1,419 @@
# NCCL Net Plugin Documentation
This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL.
# Overview
To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins
implement the NCCL network API, and decouple NCCL binary builds which are built against a
particular version of the GPU stack (i.e. CUDA) from the network code which is built against a
particular version of the networking stack. That way, we can easily integrate any CUDA version
with any network stack version.
NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library
contains one or more implementations of the NCCL NET API, in the form of versioned structs,
filled with pointers to all required functions.
# Plugin architecture
## Plugin name and supporting multiple network plugins
When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it,
then look for symbols inside the library.
The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore
advised to name the library following that pattern, with a symlink pointing `libnccl-net.so`
to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path,
setting `NCCL_NET_PLUGIN` will allow users to select the right plugin.
## Struct versioning
Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing
over time. The versioning ensures that the plugin and the NCCL core are compatible.
Plugins are encouraged to provide multiple of those symbols, implementing multiple versions
of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL
versions.
Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
for the latest ncclNet struct version, but also looking for older ones so that older plugins
would still work.
## In-network collective operations, a.k.a. collNet
In addition to the ncclNet structure, network plugins can provide a collNet structure which
implements in-network collective operations, if supported. That can be used by the NCCL collNet
algorithm to accelerate inter-node reductions in allReduce.
The collNet struct is a different, optional struct provided by the network plugin, but its
versioning is tied to the ncclNet struct and many functions are common between the two to
ease the implementation.
## Headers management
To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions
they support to their internal includes. An example is shown in `ext-net/example/` where we keep
all headers in the `nccl/` directory and provide thin layers to implement old versions on top
of newer ones.
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v10)
Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
```
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
// what index this new vNIC exists at
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
} ncclNet_t;
```
## Error codes
All plugin functions use NCCL error codes as return values. `ncclSuccess` should be returned upon
success.
Otherwise, plugins can return one of the following:
- `ncclSystemError` is the most common error for network plugins, when a call to the linux kernel
or a system library fails. This typically includes all network/hardware errors.
- `ncclInternalError` is returned when the NCCL core code is using the network plugin in an
incorrect way, for example allocating more requests than it should, or passing an invalid argument
to calls.
- `ncclInvalidUsage` should be returned when the error is most likely a user error. This can
include misconfiguration, but also sizes mismatch.
- `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by
the NCCL core layer.
- `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should
not need to rely on CUDA, this should not be common.
## Operation overview
NCCL will call the `init` function first, then query the number of network devices with the
`devices` function, getting each network device properties with `getProperties`.
If NCCL wishes to initialize virtual devices, used in NIC fusion currently, it can call `makeVDevice`
specifying a list of physical devices (the original devices listed from `devices`) it wishes to
merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to null.
To establish a connection between two network devices, NCCL will first call `listen` on the
receiving side, pass the returned handle to the sender side of the connection, and call `connect`
with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
establishment.
`connect` and `accept` can receive an optional `netDevComm` pointer from the caller, if the caller
wishes to make use of device networking. This parameter may be ignored by the plugin if it does
not support device-side networking.
Once the connection is established, communication will be done using the functions `isend`,
`irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
In certain conditions, `iflush` will be called after a receive call completes to allow the network
plugin to flush data and ensure the GPU will observe the newly written data.
To close the connections NCCL will call `closeListen` to close the object returned by `listen`,
`closeSend` to close the object returned by `connect` and `closeRecv` to close the object returned
by `accept`.
## API Functions
### Initialization
`name`
The `name` field should point to a character string with the name of the network plugin. This will
be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
Note: setting `NCCL_NET=<plugin name>` will ensure a specific network implementation is used, with
a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the
`libnccl-net.so` library name to load.
`init`
As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function.
This will allow the plugin to discover network devices and make sure they are usable. If the
`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
internal ones.
To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
the plugin code adding the following definitions:
```
#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
```
The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and
record its own events with the NCCL profiler plugin.
`devices`
Once the plugin is initialized, NCCL will query the number of devices available. It should not
be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init`
function should not return `ncclSuccess`.
`getProperties`
Right after getting the number of devices, NCCL will query properties for each available network
device. These properties are critical when multiple adapters are present to ensure NCCL uses each
adapter in the most optimized way.
The `name` is only used for logging.
The `pciPath` is the base for all topology detection and should point to the PCI device directory
in /sys. This is typically the directory pointed by `/sys/class/net/eth0/device` or
`/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should
be `NULL`.
The `guid` field is used to determine when network adapters are connected to multiple PCI
endpoints. For normal cases, it can be set to the device number. If multiple network devices have
the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence
it will not use the port multiple times.
The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be
set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin
supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
provide a `regMrDmaBuf` function.
The `regIsGlobal` field allows NCCL to register buffers in advance using e.g. a loopback connection
and later on, expect that another registration on a buffer contained within a previous registration
will be nearly immediate, as the buffer is already known by the network adapter. A typical
implementation would maintain a registration cache; the call to ncclCommRegister will create the
initial entry in the cache using regMr() on a loopback connection. Any later call to NCCL
operations will call regMr() again on the real connection, with the real buffer (could be at a
different offset within the original buffer, with a smaller size, etc), then deregMr() right after.
The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping
on the network adapter.
The `forceFlush` field can request the NCCL core to call flush for all transfers. By default,
flushes are only called when the GPU architecture or PCI topology would not guarantee correct
PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the
completion paths use different PCI links and therefore need a call to flush() to guarantee
ordering.
The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
important to ensure proper optimization of flows within the node.
The `port` field indicates the port number. This is important again for topology detection and flow
optimization within the node when a NIC with a single PCI connection is connected to the fabric
with multiple ports.
The `latency` field indicates the network latency in microseconds. This can be useful to improve
the NCCL tuning and make sure NCCL switches from tree to ring at the right size.
The `maxComms` field indicates the maximum number of connections we can create.
The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
receive).
The `netDeviceType` indicates which type of device networking this plugin supports. The current supported
options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`.
The `netDeviceVersion` indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as a part of NCCL core's device code.
The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for
point-to-point and collective calls. This will tell the NCCL core to cut large operations into
multiple smaller chunks if needed.
`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device.
### Connection establishment
Connections are used in a unidirectional manner. There is therefore a sender side and a receiver
side.
`listen`
To create a connection, NCCL will start by calling `listen` on the receiver side. This function
takes a device number as input argument, and should return a local `listenComm` object, and a
`handle` to pass to the other side, so that the sender side can connect to the receiver.
The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
This call should never block, but contrary to `connect` and `accept`, `listenComm` should never
be `NULL` if the call succeeds.
`connect`
NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call
`connect` on the sender side on a given device index `dev`, providing the `handle`. `connect`
should not block either, and instead set `sendComm` to `NULL` and return `ncclSuccess`. In that
case, NCCL will call `connect` again until it succeeds.
`accept`
To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by
the `listen` call previously. If the sender did not connect yet, `accept` should not block. It
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
succeeds.
The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
This field can be used by the network plugin to specify the QoS level of the connection. By default,
`trafficClass` is set to -1 but can be configured by the application during communicator initialization
to select a plugin-supported QoS level.
`closeListen`/`closeSend`/`closeRecv`
Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
`closeListen`/`closeSend`/`closeRecv` to free the associated resources.
### Communication
Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`.
To support RDMA capabilities, buffer registration and flush functions are provided.
To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL,
then queried with `test`. Each `sendComm` or `recvComm` must be able to handle
`NCCL_NET_MAX_REQUESTS` requests in parallel.
Note: That value should be multiplied by the multi-receive capability of the plugin for the sender
side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening
in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each
`sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.
`regMr`
Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for
communication. It will provide a `sendComm` or `recvComm` as `comm` argument, then the buffer
pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network
supports CUDA pointers.
The network plugin can use the output argument `mhandle` to keep any reference to that memory
registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and
`deregMr` calls.
`regMrDmaBuf`
If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf`
instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.
`deregMr`
When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin
free resources. This function is used to deregister handles returned by both `regMr` and
`regMrDmaBuf`.
`isend`
Data will be sent through the connection using `isend`, passing the `sendComm` previously
created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be
used if the network supports multi-receive operations (see `irecv`) to distinguish between
different sends matching the same multi-receive. Otherwise it can be set to 0.
The `isend` operation returns a handle in the `request` argument for further calls to `test`. If
the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
`isend` again later.
The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin
to support network defined events.
`irecv`
To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
`n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a
single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles`
arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend`
operations is received into the right buffer.
If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer,
otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are
handled by a single request handle.
The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
The contrary (receive size being lower than the send size) is an error, however.
NCCL sets request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using
LL or LL128 protocols. In these cases, NCCL polls on flag embedded in data to detect completion
of irecv and is resilient to redundant network writes. This allows the plugin to optimize request
completions on such irecvs (for example, complete the request immediately). The plugin is still
expected to set a valid request pointer on return which NCCL can poll to check for completion.
The `pHandles` argument allows NCCL to pass an array of opaque handles that can be used by the
network plugin to support network defined events.
Note: for a given connection, send/receive operations should always match in the order they were
posted. Tags provided for receive operations are only used to assign a given send operation to one
of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
matching on any receive operation posted.
`test`
After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles
until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the
real size sent or received, the latter being potentially lower than the size passed to `irecv`.
In the case of a multi-receive, all receives will be considered as done as a single operation (the
goal being to allow aggregation), hence they share a single request and a single `done` status.
However, they can have different sizes, so when `done` is non-zero, the `sizes` array should
contain the `n` sizes corresponding to the buffers passed to `irecv`.
Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never
call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`).
`iflush`
After a receive operation completes, if the operation was targeting GPU memory and received a
non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure
the GPU can read it right after without seeing stale data. This flush operation is decoupled from
the `test` code to improve latency of `LL*` protocols, as those are capable of determining when
data is valid or not.
`iflush` returns a request which needs to be queried with `test` until it completes.
+22
Voir le fichier
@@ -0,0 +1,22 @@
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Build the example net plugin when `make` is invoked without a goal.
# .DEFAULT_GOAL is a special variable and must be assigned; written as a rule
# (`.DEFAULT_GOAL: build`) it only declares a target with that name and does
# not select the default goal.
.DEFAULT_GOAL := build

# `build` and `clean` name actions, not files.
.PHONY: build clean

include ../../makefiles/common.mk
SRCDIR ?= $(abspath ../..)
BUILDDIR ?= .
NCCLDIR := $(BUILDDIR)

# Every C file in this directory is linked into the plugin.
SRC_FILES := $(wildcard *.c)

build: ${BUILDDIR}/libnccl-net-example.so

# -Inccl puts the nccl/ header directory on the include path.
${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES}
	@printf "Compiling %-35s > %s\n" $< $@
	@mkdir -p ${BUILDDIR}
	$(CC) -Inccl -fPIC -shared -o $@ $^

clean:
	rm -f ${BUILDDIR}/libnccl-net-example.so
+21
Voir le fichier
@@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
#include <stdint.h>
/* Log levels accepted as the first argument of the logger callback. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_ERROR   = 1,
  NCCL_LOG_VERSION = 2,
  NCCL_LOG_WARN    = 3,
  NCCL_LOG_INFO    = 4,
  NCCL_LOG_ABORT   = 5,
  NCCL_LOG_TRACE   = 6
} ncclDebugLogLevel;

/* Subsystem flags for the logger callback. Each named subsystem is a distinct
 * bit; NCCL_ALL has every bit set. */
typedef enum {
  NCCL_INIT      = 1,
  NCCL_COLL      = 2,
  NCCL_P2P       = 4,
  NCCL_SHM       = 8,
  NCCL_NET       = 16,
  NCCL_GRAPH     = 32,
  NCCL_TUNING    = 64,
  NCCL_ENV       = 128,
  NCCL_ALLOC     = 256,
  NCCL_CALL      = 512,
  NCCL_PROXY     = 1024,
  NCCL_NVLS      = 2048,
  NCCL_BOOTSTRAP = 4096,
  NCCL_REG       = 8192,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

/* printf-style logging callback: level, subsystem flags, source location
 * (file and line), then a format string followed by its arguments. */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level,
                                  unsigned long flags,
                                  const char *file,
                                  int line,
                                  const char *fmt, ...);
/* Event kinds for the profiler callback's `type` argument
 * (NOTE(review): presumed from the names — confirm against the callback's users). */
enum {
  ncclProfilerNetEventStart         = 0,
  ncclProfilerNetEventStop          = 1,
  ncclProfilerNetEventUpdate        = 2,
  ncclProfilerNetEventUpdateAndStop = 3
};
/* Profiler callback type.
 * NOTE(review): ncclResult_t is not declared in this header; it presumably has
 * to be declared by an earlier include before this header is used — confirm. */
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle,
                                               int type,
                                               void* phandle,
                                               int64_t pluginId,
                                               void* extData);
#endif
+17
Voir le fichier
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
/* Result codes returned by plugin functions. */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;
#endif
+41
Voir le fichier
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_H_
#define NET_H_
#include <stdint.h>
#include <stdlib.h>
#include "err.h"
#include "net_device.h"
#include "common.h"
// Size in bytes of the connection handle buffer.
#define NCCL_NET_HANDLE_MAXSIZE 128
// 1024^4 bytes. The trailing L makes the product a long.
// NOTE(review): the value needs a 64-bit long to be representable — confirm for all target ABIs.
#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
// NOTE(review): presumably a marker value for the irecv request pointer — confirm against the API documentation.
#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
// Pointer-type flags. Each is a distinct bit, so they can be combined with |.
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_PTR_DMABUF 0x4
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32
#include "net_v10.h"
#include "net_v9.h"
#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
#include "net_v5.h"
#include "net_v4.h"
#include "net_v3.h"
#include "net_v2.h"
// Unversioned names are aliases for the v10 definitions.
typedef ncclNet_v10_t ncclNet_t;
typedef ncclNetProperties_v10_t ncclNetProperties_t;
typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
#endif // end include guard
+32
Voir le fichier
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_DEVICE_H_
#define NET_DEVICE_H_
// NOTE(review): presumably the netDeviceVersion value used when no valid device-offload version applies — confirm.
#define NCCL_NET_DEVICE_INVALID_VERSION 0x0
// NOTE(review): presumably a size in bytes — confirm against the users of this macro.
#define NCCL_NET_MTU_SIZE 4096
// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
// Network offload type reported by a plugin (see netDeviceType in the
// properties and device-handle structs).
typedef enum {
  NCCL_NET_DEVICE_HOST   = 0,
  NCCL_NET_DEVICE_UNPACK = 1
} ncclNetDeviceType;
// Device-offload handle exchanged through connect()/accept() (the sendDevComm /
// recvDevComm parameters of ncclNet_v7_t and later).
typedef struct {
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
void* handle; // Opaque pointer; NOTE(review): ownership/lifetime not specified here
size_t size; // NOTE(review): presumably the size of the object behind `handle` - confirm
int needsProxyProgress; // NOTE(review): presumably a boolean flag - confirm semantics
} ncclNetDeviceHandle_v7_t;
// The handle layout has not changed since v7: every later version, and the
// unversioned name, is a plain alias of the v7 struct.
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
#endif
+101
Voir le fichier
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V10_H_
#define NET_V10_H_
// Capacity of the devs[] array below.
#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
// Description of a virtual NIC, as passed to makeVDevice() and embedded in
// ncclNetProperties_v10_t.
typedef struct {
int ndevs; // NOTE(review): presumably the number of valid entries in devs[] - confirm
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; // NOTE(review): presumably device indices - confirm
} ncclNetVDeviceProps_v10_t;
// Sentinel traffic-class value; NOTE(review): presumably means "no traffic class
// requested" for trafficClass below - confirm.
#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
// Per-connection configuration passed to ncclNet_v10_t::connect().
typedef struct {
// Plugin-specific TC value
int trafficClass;
} ncclNetCommConfig_v10_t;
// Device properties filled in by ncclNet_v10_t::getProperties().
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int forceFlush; // Force a flush on receives
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
ncclNetVDeviceProps_v10_t vProps; // Virtual-device description (ndevs / devs[])
size_t maxP2pBytes; // Max transfer size for point-to-point operations
size_t maxCollBytes; // Max transfer size for collective operations
} ncclNetProperties_v10_t;
// Version 10 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. Member order and signatures
// define the binary layout and must not be rearranged.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
// Receives both a logging function and a profiler callback.
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
// `config` carries the per-connection settings (ncclNetCommConfig_v10_t).
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
// NOTE(review): `phandle` is undocumented here; presumably a profiler handle - confirm.
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
// Takes n buffers described by the parallel arrays data/sizes/tags/mhandles/phandles.
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// Note: sizes is int* here, whereas isend/irecv use size_t.
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
// what index this new vNIC exists at
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
} ncclNet_v10_t;
#endif // end include guard
+50
Voir le fichier
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V2_H_
#define NET_V2_H_
// Version 2 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. This version exposes device
// information through separate pciPath()/ptrSupport() calls and has a
// synchronous flush(). Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Return the device path in /sys. NCCL will call free on this path.
ncclResult_t (*pciPath)(int dev, char** path);
// Return whether this device supports host pointers and/or CUDA pointers
// as data from the current GPU. Supported types should be composed with
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// `type` is the pointer type of `data`; `size` is an int in this version.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// May return request == NULL if the call cannot be performed (or would block)
// NOTE(review): the "Type" sentence above has no matching parameter in this
// signature; it presumably refers to the type given at regMr() time - confirm.
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// Unlike iflush() in later versions, this call takes no request out-parameter.
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v2_t;
#endif // end include guard
+50
Voir le fichier
@@ -0,0 +1,50 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V3_H_
#define NET_V3_H_
// v3-specific request limit (the unversioned NCCL_NET_MAX_REQUESTS is 32).
#define NCCL_NET_MAX_REQUESTS_V3 16
// v3 reuses the v4 properties layout; net_v4.h must therefore be included
// before this header (net.h does so).
typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
// Version 3 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. Device information is
// obtained through a single getProperties() call; flush() is synchronous.
// Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// This call takes no request out-parameter (compare iflush() in v4 and later).
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v3_t;
#endif // end include guard
+61
Voir le fichier
@@ -0,0 +1,61 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V4_H_
#define NET_V4_H_
// v4-specific handle size limit (the unversioned NCCL_NET_HANDLE_MAXSIZE is 128).
#define NCCL_NET_HANDLE_MAXSIZE_V4 64
// Device properties filled in by getProperties() for v4 (and, via alias, v3).
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
int speed; // Port speed in Mbps.
int port; // Port number.
int maxComms; // Maximum number of comms we can create
} ncclNetProperties_v4_t;
// v4 struct for backwards compatibility
// A name plus a table of function pointers, each returning an ncclResult_t
// status. The flush operation is asynchronous here (iflush() yields a request).
// Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
// NOTE(review): this header also defines NCCL_NET_HANDLE_MAXSIZE_V4 (64);
// confirm which limit actually applies to v4 plugins.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// Returns a request through the `request` out-parameter, like isend/irecv.
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
#endif // end include guard
+54
Voir le fichier
@@ -0,0 +1,54 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V5_H_
#define NET_V5_H_
// v5 reuses the v6 properties layout; net_v6.h must therefore be included
// before this header (net.h does so).
typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
// Version 5 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. isend() carries a tag, and
// irecv()/iflush() operate on n buffers described by parallel arrays.
// Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v5_t;
#endif // end include guard
+68
Voir le fichier
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V6_H_
#define NET_V6_H_
// Device properties filled in by getProperties() for v6 (and, via alias, v5).
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v6_t;
// Version 6 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. Includes regMrDmaBuf() for
// DMA-BUF based registration. Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Note: size is int here but size_t in regMrDmaBuf() below.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
#endif // end include guard
+75
Voir le fichier
@@ -0,0 +1,75 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V7_H_
#define NET_V7_H_
// Device properties filled in by ncclNet_v7_t::getProperties(). Includes the
// network offload type and version fields.
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v7_t;
// Version 7 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. connect()/accept() carry a
// device-handle out-parameter, and getDeviceMr()/irecvConsumed() are present.
// Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// Note: size is int here but size_t in regMrDmaBuf() below.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v7_t;
#endif // end include guard
+79
Voir le fichier
@@ -0,0 +1,79 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V8_H_
#define NET_V8_H_
// Device properties filled in by ncclNet_v8_t::getProperties(). Includes the
// regIsGlobal flag in addition to the offload type/version fields.
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
} ncclNetProperties_v8_t;
// Version 8 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. regMr() takes a size_t size
// in this version, while isend/irecv sizes remain int.
// Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
} ncclNet_v8_t;
#endif // end include guard
+93
Voir le fichier
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V9_H_
#define NET_V9_H_
// Capacity of the devs[] array below.
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
// Description of a virtual NIC, as passed to makeVDevice() and embedded in
// ncclNetProperties_v9_t.
typedef struct {
int ndevs; // NOTE(review): presumably the number of valid entries in devs[] - confirm
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; // NOTE(review): presumably device indices - confirm
} ncclNetVDeviceProps_v9_t;
// Device properties filled in by ncclNet_v9_t::getProperties().
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int forceFlush; // Force a flush on receives
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
ncclNetVDeviceProps_v9_t vProps; // Virtual-device description (ndevs / devs[])
size_t maxP2pBytes; // Max transfer size for point-to-point operations
size_t maxCollBytes; // Max transfer size for collective operations
} ncclNetProperties_v9_t;
// Version 9 of the network plugin interface: a name plus a table of function
// pointers, each returning an ncclResult_t status. isend()/irecv() use size_t
// sizes, and makeVDevice() is available for virtual NICs.
// Member order defines the binary layout.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// Note: sizes is int* here, whereas isend/irecv use size_t.
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
// what index this new vNIC exists at
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
} ncclNet_v9_t;
#endif // end include guard
+23
Voir le fichier
@@ -0,0 +1,23 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Data types */
/* Element data types. Several enumerators are aliases that share the
 * numeric value of the enumerator listed just before them. */
typedef enum {
  ncclInt8       = 0,
  ncclChar       = 0,   /* alias of ncclInt8 */
  ncclUint8      = 1,
  ncclInt32      = 2,
  ncclInt        = 2,   /* alias of ncclInt32 */
  ncclUint32     = 3,
  ncclInt64      = 4,
  ncclUint64     = 5,
  ncclFloat16    = 6,
  ncclHalf       = 6,   /* alias of ncclFloat16 */
  ncclFloat32    = 7,
  ncclFloat      = 7,   /* alias of ncclFloat32 */
  ncclFloat64    = 8,
  ncclDouble     = 8,   /* alias of ncclFloat64 */
  ncclBfloat16   = 9,
  ncclFloat8e4m3 = 10,
  ncclFloat8e5m2 = 11,
} ncclDataType_t;
#endif
+418
Voir le fichier
@@ -0,0 +1,418 @@
/*************************************************************************
* Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "net.h"
#define __hidden __attribute__ ((visibility("hidden")))
#define NCCL_PLUGIN_MAX_RECVS 1
int max_requests = NCCL_NET_MAX_REQUESTS;
// Initialization hook. Ignores both callbacks and reports success.
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
// Device enumeration. Writes 0 to *ndev and reports success.
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
// Not implemented in this example: always returns ncclInternalError and leaves *path untouched.
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
// Not implemented in this example: always returns ncclInternalError and leaves *supportedTypes untouched.
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
// Fill *props with the example plugin's default device description.
// The values below are defaults; leave them alone when unsure.
// `dev` is only recorded as the single physical device behind the vNIC.
// Always returns ncclSuccess.
__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
  // ---- Identity and topology ----
  props->name = "Example";
  // Set to the device's sysfs PCI path for proper topology detection,
  // e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
  props->pciPath = NULL;
  // guid is only used to detect NICs with multiple PCI attachments;
  // port is the port number used in conjunction with it.
  props->guid = 0;
  props->port = 0;

  // ---- Memory registration and data visibility ----
  // Add NCCL_PTR_CUDA when GPU Direct RDMA is supported and regMr accepts CUDA pointers.
  props->ptrSupport = NCCL_PTR_HOST;
  // Set to 1 if regMr has a fast registration cache. With 0, user buffer
  // registration may be disabled.
  props->regIsGlobal = 0;
  // Force a flush after each receive. Needed when the control path and the
  // data path reach the GPU through different paths.
  props->forceFlush = 0;

  // ---- Performance hints ----
  // Link speed in *Mbps*: 100000 means 100G.
  props->speed = 100000;
  // Custom latency, to help tuning when latency is high. 0 selects the default NCCL values.
  props->latency = 0;

  // ---- Capacity limits ----
  // Maximum number of comm objects that can be created.
  props->maxComms = 1024*1024;
  // Maximum number of receive operations accepted by one irecv() call.
  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
  // Largest transfer sizes the plugin can handle.
  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;

  // ---- Coupling with NCCL network device-side code ----
  props->netDeviceType = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;

  // ---- Virtual device description ----
  // Tells NCCL core whether this is a virtual device fusing several physical
  // devices; here it is exactly one device, `dev` itself.
  props->vProps.ndevs = 1;
  props->vProps.devs[0] = dev;

  return ncclSuccess;
}
// Transport entry points of the example plugin.
// None of them is implemented: each one ignores its arguments, leaves every
// output parameter untouched, and returns ncclInternalError.

// Connection setup.
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
// Memory registration (plain and DMA-BUF) and deregistration.
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
// Data path: send, receive, flush and request completion test.
// isend/irecv take an extra `phandle`/`phandles` argument that the older-version shims pass as NULL.
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; }
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
// Teardown of send, recv and listen comm objects.
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
// Device-offload and virtual-NIC hooks.
__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
#define PLUGIN_NAME "Plugin"
// v10 dispatch table. Every entry points directly at the native plugin
// functions defined above; no compatibility shim is involved.
const ncclNet_v10_t ncclNetPlugin_v10 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
.getProperties = pluginGetProperties,
.listen = pluginListen,
.connect = pluginConnect,
.accept = pluginAccept,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice,
};
// v9 compatibility shims: adapt the v9 signatures to the native functions.

// v9 init has no profiler callback; forwards NULL for it.
__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
return pluginInit(logFunction, NULL);
}
// Forwards to pluginGetProperties by casting the v9 struct pointer to the current type.
// NOTE(review): this relies on the two property structs having a compatible layout — confirm against the headers.
__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
return pluginGetProperties(dev, (ncclNetProperties_t*)props);
}
// v9 connect has no config argument; forwards NULL for it.
__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
}
// v9 isend has no phandle argument; forwards NULL for it.
__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request);
}
// v9 irecv has no phandles argument; forwards NULL for it.
__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
}
// Not implemented: always returns ncclInternalError (does not forward to pluginMakeVDevice).
__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; }
// v9 dispatch table. Uses the _v9 shims for init, getProperties, connect,
// isend, irecv and makeVDevice; all other entries are the native functions.
const ncclNet_v9_t ncclNetPlugin_v9 = {
.name = PLUGIN_NAME,
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v9,
.listen = pluginListen,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v9,
.irecv = pluginIrecv_v9,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice_v9,
};
// v8 properties shim: query the current-version properties, then copy the
// fields that exist in the v8 struct. On failure *props_v8 is left untouched
// and the error from pluginGetProperties is returned.
__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) {
  ncclNetProperties_t latest;
  const ncclResult_t status = pluginGetProperties(dev, &latest);
  if (status == ncclSuccess) {
    props_v8->name             = latest.name;
    props_v8->pciPath          = latest.pciPath;
    props_v8->guid             = latest.guid;
    props_v8->ptrSupport       = latest.ptrSupport;
    props_v8->regIsGlobal      = latest.regIsGlobal;
    props_v8->speed            = latest.speed;
    props_v8->latency          = latest.latency;
    props_v8->port             = latest.port;
    props_v8->maxComms         = latest.maxComms;
    props_v8->maxRecvs         = latest.maxRecvs;
    props_v8->netDeviceType    = latest.netDeviceType;
    props_v8->netDeviceVersion = latest.netDeviceVersion;
  }
  return status;
}
// v8 isend shim: the size arrives as an int and there is no profiler handle,
// so the size is passed through as-is and NULL is forwarded for phandle.
__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
  void* const noProfilerHandle = NULL;
  return pluginIsend(sendComm, data, size, tag, mhandle, noProfilerHandle, request);
}
// v8 irecv shim: widens the per-receive int sizes to size_t and forwards to
// pluginIrecv with a NULL phandles argument.
//
// The widened sizes live in a fixed array of NCCL_PLUGIN_MAX_RECVS entries,
// so a request with more receives than that (or a negative count) is rejected
// with ncclInternalError instead of writing past the array. The caller's `n`
// is forwarded unchanged so the count always matches the converted sizes.
__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
  size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
  if (n < 0 || n > NCCL_PLUGIN_MAX_RECVS) return ncclInternalError;
  for (int i = 0; i < n; i++) sizesOut[i] = sizes[i];
  return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, NULL, request);
}
// v8 dispatch table. Reuses the v9 init/connect shims, adds the v8
// getProperties/isend/irecv shims, and has no makeVDevice entry.
const ncclNet_v8_t ncclNetPlugin_v8 = {
.name = PLUGIN_NAME,
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v8,
.listen = pluginListen,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v8,
.irecv = pluginIrecv_v8,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
};
// v7 properties shim: same as the v8 shim minus regIsGlobal, which is not
// copied here. Returns the error from pluginGetProperties without touching
// *props_v7 when the query fails.
__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
  ncclNetProperties_t current;
  ncclResult_t rc = pluginGetProperties(dev, &current);
  if (rc != ncclSuccess) {
    return rc;
  }

  // Identity / topology
  props_v7->name = current.name;
  props_v7->pciPath = current.pciPath;
  props_v7->guid = current.guid;
  props_v7->port = current.port;
  // Capabilities and performance hints
  props_v7->ptrSupport = current.ptrSupport;
  props_v7->speed = current.speed;
  props_v7->latency = current.latency;
  // Limits
  props_v7->maxComms = current.maxComms;
  props_v7->maxRecvs = current.maxRecvs;
  // Device-side coupling
  props_v7->netDeviceType = current.netDeviceType;
  props_v7->netDeviceVersion = current.netDeviceVersion;

  return ncclSuccess;
}
// v7 regMr shim: the size arrives as an int and is converted to the size_t
// expected by pluginRegMr.
__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
  const size_t widenedSize = size;
  return pluginRegMr(collComm, data, widenedSize, type, mhandle);
}
// v7 dispatch table. Differs from the v8 table by using the v7
// getProperties and regMr shims.
const ncclNet_v7_t ncclNetPlugin_v7 = {
.name = PLUGIN_NAME,
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v7,
.listen = pluginListen,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr_v7,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v8,
.irecv = pluginIrecv_v8,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
};
// v6 properties shim: copies only the fields present in the v6 struct
// (no regIsGlobal and no netDevice* fields). *props_v6 is written only when
// pluginGetProperties succeeds; otherwise its error code is returned.
__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
  ncclNetProperties_t src;
  const ncclResult_t status = pluginGetProperties(dev, &src);
  if (status == ncclSuccess) {
    props_v6->name       = src.name;
    props_v6->pciPath    = src.pciPath;
    props_v6->guid       = src.guid;
    props_v6->ptrSupport = src.ptrSupport;
    props_v6->speed      = src.speed;
    props_v6->latency    = src.latency;
    props_v6->port       = src.port;
    props_v6->maxComms   = src.maxComms;
    props_v6->maxRecvs   = src.maxRecvs;
  }
  return status;
}
// v6 connect/accept have no device-handle output parameter.
// Both are unimplemented stubs: they return ncclInternalError and do not forward to pluginConnect/pluginAccept.
__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { return ncclInternalError; }
// v6 dispatch table. Uses the v6 getProperties/connect/accept functions and
// has no getDeviceMr / irecvConsumed entries.
const ncclNet_v6_t ncclNetPlugin_v6 = {
.name = PLUGIN_NAME,
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr_v7,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v8,
.irecv = pluginIrecv_v8,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen
};
/* v5 Compat */
// v5 dispatch table. Same entries as the v6 table except that there is no
// regMrDmaBuf entry.
const ncclNet_v5_t ncclNetPlugin_v5 = {
.name = PLUGIN_NAME,
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v8,
.irecv = pluginIrecv_v8,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v4 Compat */
// v4 properties shim: copies the seven fields carried by the v4 struct
// (no latency, maxRecvs, regIsGlobal or netDevice* fields). On failure the
// error from pluginGetProperties is returned and *props_v4 is not modified.
static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
  ncclNetProperties_t full;
  const ncclResult_t rc = pluginGetProperties(dev, &full);
  if (rc == ncclSuccess) {
    props_v4->name       = full.name;
    props_v4->pciPath    = full.pciPath;
    props_v4->guid       = full.guid;
    props_v4->ptrSupport = full.ptrSupport;
    props_v4->speed      = full.speed;
    props_v4->port       = full.port;
    props_v4->maxComms   = full.maxComms;
  }
  return rc;
}
// v4 isend shim: v4 has no tag argument, so tag 0 is supplied to the v8 shim.
static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
  const int untagged = 0;
  return pluginIsend_v8(sendComm, data, size, untagged, mhandle, request);
}
// v4 irecv shim: wraps the single buffer/size/mhandle as one-element arrays
// (by taking their addresses) and posts one receive with tag 0 through the
// v8 shim.
static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
  int tags[1] = { 0 };
  return pluginIrecv_v8(recvComm, 1, &data, &size, tags, &mhandle, request);
}
// v4 iflush shim: flushes a single buffer by passing the addresses of the
// scalar arguments as one-element arrays to pluginIflush.
static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
  const int count = 1;
  return pluginIflush(recvComm, count, &data, &size, &mhandle, request);
}
// v4 connect shim: v4 connect is blocking, so keep calling pluginConnect
// until it either fails or produces a send comm.
//
// The scratch device handle is named devHandle so it does not shadow the
// `handle` parameter: the caller's connect handle must be the one forwarded
// to pluginConnect. No config is available in v4, so NULL is passed for it,
// and the device handle returned by the plugin is discarded.
static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
  ncclResult_t ret;
  do {
    ncclNetDeviceHandle_v7_t* devHandle = NULL;
    ret = pluginConnect(dev, NULL, handle, sendComm, &devHandle);
  } while (ret == ncclSuccess && *sendComm == NULL);
  return ret;
}
// v4 accept shim: retries pluginAccept until it reports an error or yields a
// non-NULL recv comm, then returns that call's result. The device handle
// produced by pluginAccept is discarded.
static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
  for (;;) {
    ncclNetDeviceHandle_v7_t* devHandle = NULL;
    const ncclResult_t status = pluginAccept(listenComm, recvComm, &devHandle);
    if (status != ncclSuccess || *recvComm != NULL) return status;
  }
}
// v4 dispatch table. Uses the v4 shims for getProperties, connect, accept,
// isend, irecv and iflush, and the v7 regMr shim.
const ncclNet_v4_t ncclNetPlugin_v4 = {
.name = PLUGIN_NAME,
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v4,
.listen = pluginListen,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
.iflush = pluginIflush_v4,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v3 Compat */
// v3 blocking flush: starts an asynchronous flush through the v4 shim and
// polls pluginTest until the request completes or an error is returned.
//
// `req` starts as NULL so it is never read uninitialized. If the flush
// succeeds without producing a request there is nothing to poll, and the
// success code is returned directly.
// NOTE(review): treating a NULL request as "already complete" is an
// assumption about the plugin contract — confirm.
static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) {
  void* req = NULL;
  ncclResult_t ret = pluginIflush_v4(recvComm, data, size, mhandle, &req);
  int done = 0;
  while (ret == ncclSuccess && req != NULL && done == 0) {
    ret = pluginTest(req, &done, NULL);
  }
  return ret;
}
// v3 init shim: switches the global max_requests to the v3 limit, then runs
// the regular init with no profiler callback.
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
  ncclResult_t status;
  max_requests = NCCL_NET_MAX_REQUESTS_V3;
  status = pluginInit(logFunction, NULL);
  return status;
}
#include <string.h>
// v3 listen shim: lets the plugin fill a full-size handle, then copies the
// first NCCL_NET_HANDLE_MAXSIZE_V4 bytes into the caller's buffer.
//
// The scratch handle is zero-initialized so the bytes handed back to the
// caller are well defined even when pluginListen fails or fills only part of
// the handle. The copy still happens regardless of the result, and the result
// of pluginListen is returned unchanged.
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = { 0 };
  ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return ret;
}
// v3 connect shim: copies the caller's handle (NCCL_NET_HANDLE_MAXSIZE_V4
// bytes) into a full-size buffer and connects through the v4 shim.
//
// The buffer is zero-initialized so the bytes beyond the copied prefix are
// well defined rather than indeterminate stack contents.
static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = { 0 };
  memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return pluginConnect_v4(dev, &pluginHandle, sendComm);
}
// v3 dispatch table. Uses the v3 init/listen/connect shims, the blocking
// pluginFlush in the `flush` slot, and v4 shims for the remaining adapted
// entries.
const ncclNet_v3_t ncclNetPlugin_v3 = {
.name = PLUGIN_NAME,
.init = pluginInit_v3,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v4,
.listen = pluginListen_v3,
.connect = pluginConnect_v3,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
.flush = pluginFlush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
/* v2 Compat */
// v2 dispatch table. Has pciPath/ptrSupport entries instead of getProperties,
// and uses the native pluginListen with the v4 connect shim (not the v3
// listen/connect shims).
const ncclNet_v2_t ncclNetPlugin_v2 = {
.name = PLUGIN_NAME,
.init = pluginInit_v3,
.devices = pluginDevices,
.pciPath = pluginPciPath,
.ptrSupport = pluginPtrSupport,
.listen = pluginListen,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
.deregMr = pluginDeregMr,
.isend = pluginIsend_v4,
.irecv = pluginIrecv_v4,
.flush = pluginFlush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
};
+22
Voir le fichier
@@ -0,0 +1,22 @@
# Builds the network plugin shared library from the nccl-fastsocket sources.
# CUDA_HOME can be overridden from the environment or the command line.
CUDA_HOME?=/usr/local/cuda
INC:=-I$(CUDA_HOME)/include
PLUGIN_SO:=libnccl-net.so

# These targets name actions rather than files; declaring them phony keeps
# them working even if a file called "default", "install" or "clean" exists.
.PHONY: default install clean

default: $(PLUGIN_SO)

$(PLUGIN_SO): nccl-fastsocket/*.cc
	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

# Fetch the plugin sources when they are not present yet.
nccl-fastsocket/*.cc:
	git clone https://github.com/google/nccl-fastsocket.git

# Installs the library under $(BUILDDIR)/lib.
install: $(BUILDDIR)/lib/$(PLUGIN_SO)

$(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
	@printf "Grabbing %-35s > %s\n" $< $@
	mkdir -p $(BUILDDIR)/lib
	install -m 644 $< $@

# Removes the built library and the cloned sources.
clean:
	rm -f $(PLUGIN_SO)
	rm -Rf nccl-fastsocket
+461
Voir le fichier
@@ -0,0 +1,461 @@
# NCCL Profiler Plugin Documentation
This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
# Overview
To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
use it for debugging and analysis.
Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library
called `librccl-profiler.so`. That shared library contains one or more implementations of the
NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
functions.
# Plugin architecture
## Plugin name and supporting multiple profiler plugins
When NCCL is initialized, it will look for a `librccl-profiler.so` library and dynamically load
it, then look for symbols inside the library.
The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
will look for a library with a name of `librccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
advised to name the library following that pattern, with a symlink pointing `librccl-profiler.so`
to `librccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `librccl-profiler.so` library.
## Struct versioning
Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
over time. The versioning ensures that the plugin and the NCCL core are compatible.
Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
would still work.
## Headers management
To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
they support to their internal includes. An example is shown in `ext-profiler/example` where we
keep all headers in the `nccl/` directory and provide thin layers to implement old version on top
of newer ones.
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v4)
Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
```
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commName : user assigned communicator name
// - commHash : communicator id
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
```
## Error codes
As a rule of thumb, profiler generated errors should not be propagated to NCCL and alter its normal
functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
The only exception is `init` that can return an error so that NCCL can disable the plugin.
## Operation overview
NCCL will call the `init` function first for every new communicator that is initialized. The profiler
returns an opaque context handle that is used to isolate profiler instances across communicators.
Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
These are used to start, stop and update events in the profiler, respectively.
## API Functions
### Initialization
#### name
The `name` field should point to a character string with the name of the profiler plugin. This will
be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
#### init
As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
This allows the plugin to initialize its internal context, used during profiling of NCCL events.
If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
#### finalize
When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
up resources.
### Profiling
#### startEvent
When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
context, previously created by `init`, an event descriptor of type `ncclProfilerEventDescr_t` and
returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
later in the document.
The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
is the `ncclProfilerEventDescr_t` struct.
```
typedef struct {
uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
int rank; // rank that generated the event
union {
struct { // collective events metadata
uint64_t seqNumber; // sequence number of this collective operation in the communicator
const char* func; // string containing name of the collective
void const* sendBuff; // address of send buffer
void* recvBuff; // address of recv buffer
size_t count; // data count
int root; // root rank
const char* datatype; // string containing the name of the datatype
uint8_t nChannels; // number of channels for this collective
uint8_t nWarps; // number of GPU warps for this collective
const char* algo; // string containing name of the algorithm for this collective
const char* proto; // string containing name of the protocol for this collective
} coll;
struct { // point-to-point events metadata
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer; // peer rank for this point-to-point
uint8_t nChannels; // number of channels for this p2p
} p2p;
struct { // proxyOp events metadata
pid_t pid; // process id that generated the associated `ncclProxyOp` object
uint8_t channelId; // id of the channel used by the associated `ncclProxyOp` object
int peer; // peer rank
int nSteps; // number of network transfers/steps required by the `ncclProxyOp`
int chunkSize; // chunk size for this `ncclProxyOp`
int isSend; // type of network operation
} proxyOp;
struct { // proxyStep events metadata
int step; // individual step in `ncclProxyOp`
} proxyStep;
struct {
uint8_t channelId; // id of the channel used by the kernel
uint64_t ptimer; // kernel supplied timestamp
} kernelCh;
struct {
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
void* data; // pointer to network plugin defined event
} netPlugin;
};
} ncclProfilerEventDescr_v4_t;
```
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
`ncclProfileNetPlugin`.
#### stopEvent
`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
has been stopped the handle can no longer be used with other profiler calls. Using the event
handle after `stopEvent` is undefined behavior.
#### recordEventState
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
The state of these events can be updated, along with event attributes, using `recordEventState`.
These events can go through several states during their lifecycle.
The list of supported states for the updatable events is reported below.
```
typedef enum {
// ncclProfileProxyOp event states
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
ncclProfilerProxyOpInProgress_v4 = 19,// state marks transition of proxy op to progress
// ncclProfileProxyStep event states
ncclProfilerProxyStepSendGPUWait = 8, // state marks the waiting of send data from GPU for given network transfer/step
ncclProfilerProxyStepSendPeerWait_v4 = 20,// state marks the waiting of recv clear to send credits for given network transfer/step
ncclProfilerProxyStepSendWait = 9, // state marks the waiting of send data from network for given network transfer/step
ncclProfilerProxyStepRecvWait = 10,// state marks the waiting of recv data from network for given network transfer/step
ncclProfilerProxyStepRecvFlushWait = 11,// state marks the waiting of recv data flush to GPU for given network transfer/step
ncclProfilerProxyStepRecvGPUWait = 12,// state marks the waiting of recv data consumption from GPU for given network transfer/step
// ncclProfileProxyCtrl event states
ncclProfilerProxyCtrlIdle = 13,// state marks proxy progress thread idle
ncclProfilerProxyCtrlActive = 14,// state marks proxy progress thread active
ncclProfilerProxyCtrlSleep = 15,// state marks proxy progress thread sleeping
ncclProfilerProxyCtrlWakeup = 16,// state marks proxy progress thread waking up
ncclProfilerProxyCtrlAppend = 17,// state marks append of new network work item begin
ncclProfilerProxyCtrlAppendEnd = 18,// state marks append of new network work item end
// ncclProfileNetPlugin event states
ncclProfilerNetPluginUpdate = 21,// state marks update of network defined event
// ncclProfileKernelCh event states
ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update
} ncclProfilerEventState_v4_t;
```
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
provide a summary of the activity of the proxy progress thread for that channel. Most of the
states for this event were duplicated with `ncclProfileProxyStep` events. Therefore, starting
with version 4 of the profiler interface these states have been deprecated. The same level of
information can still be obtained through the `ncclProfileProxyStep` events.
`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
network requests for the GPU kernel. ProxyStep events describe individual network transfers in
the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
network requests for the GPU kernel. This includes everything else that the proxy thread might be
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
processes work items for the enqueued NCCL operations.
`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
network defined event definition using the plugin id in the event descriptor. The plugin identifier
is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
unused and available for future extensions.
A network IB plugin can use this infrastructure to define a QP event as:
```C
#define NCCL_PROFILER_NET_IB_VER 1
enum {
ncclProfileQp = (1 << 0),
};
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct {
int device; // network device id
uint64_t wr_id; // work request id
int opcode; // ibv opcode
int qpNum; // QP number
size_t length; // work request data length
} qp;
};
} ncclProfilerNetIbDescr_v1_t;
```
The network event infrastructure is network agnostic. A different network socket plugin can
use it to define a socket event as:
```C
#define NCCL_PROFILER_NET_SOCKET_VER 1
enum {
ncclProfileSocket = (1 << 0),
};
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct {
int fd;
int op;
size_t length;
} sock;
};
} ncclProfilerNetSockDescr_v1_t;
```
The network plugin creates an event (descriptor) and passes it to the profiler callback,
along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
event descriptor, attaches the network plugin defined event as external data, and calls
the profiler `startEvent` function.
```C
ncclResult_t isend(..., void* phandle, ...) {
...
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
ncclProfilerNetIbDescr_v1_t eDescr = { };
eDescr.type = ncclProfileQp;
eDescr.qp = { ... };
ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
...
}
```
State transitions for the events described can also come with event attribute updates. For this
reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.
```
typedef union {
struct { // attributes to update for ncclProfileProxyStep events
size_t transSize; // transfer size field for this proxy step
} proxyStep;
struct { // attributes to update for ncclProfileProxyCtrl events
int appendedProxyOps; // number of appended proxy ops thus far
} proxyCtrl;
struct { // attributes to update for ncclProfileNetPlugin events
void* data; // network plugin opaque update data field
} netPlugin;
struct { // attribute to update for ncclProfileKernelCh events
uint64_t pTimer; // timestamp provided by the NCCL kernel
} kernelCh;
} ncclProfilerEventStateArgs_v4_t;
```
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
### Event hierarchy
NCCL core events (reported above) are organized into a hierarchy as reported below:
```
Group event
|
+- Collective event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
|
+- Point-to-point event
|
+- ProxyOp event
| |
| +- ProxyStep event
| |
| +- NetPlugin event
|
+- KernelCh event
ProxyCtrl event
```
# Profiler instrumentation and logging
## Profiling of collective and p2p operations
The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
collective and point-to-point operations, as well as proxy, kernel and network activity. Due to the asynchronous nature
of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit
precisely. For example, without both proxy and/or kernel activity it is impossible for the profiler to
figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
the profiler that the collective has been enqueued. The profiler can leverage proxy and/or kernel event information, if
these are enabled, to estimate when the collective ends. For example, the profiler can look at the `stopEvent`
call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This
can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent`
increment and decrement the reference counter, respectively.
## PXN
PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
generated the operation. When this happens, the event hierarchy reported above breaks. Because the
profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy
thread originating the operation. To prevent the profiler instance in the remote proxy address space from
dereferencing a pointer from another address space, the event descriptor includes the PID of the originator.
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
parent event.
# Known Limitations
In intra-node communication, or whenever a rank does not have any network activity for which proxy events
are unavailable, the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
collective. However, this time only represents the launch time of the collective and not the actual
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
With version 3 of the profiler interface network activity is no longer required to do intra-node profiling.
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
delayed, a similar loss of accuracy can be encountered.
To mitigate this effect, with version 4 of the profiler NCCL uses a per-channel ring buffer of 64 elements.
Every counter is complemented by two timestamps (ptimers) supplied by the NCCL kernel (one for start and one
for stop of the operation in the kernel). NCCL propagates these timestamps to the profiler plugin so that it can
convert them to the CPU time domain.
+22
Voir le fichier
@@ -0,0 +1,22 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# `.DEFAULT_GOAL` is a special *variable*; writing it as `.DEFAULT_GOAL: build`
# only declares a rule for a target of that name. Assign it instead so that a
# bare `make` always builds the plugin, whatever targets common.mk defines.
.DEFAULT_GOAL := build

# `build` and `clean` do not produce files with those names.
.PHONY: build clean

include ../../makefiles/common.mk

SRCDIR ?= $(abspath ../..)
BUILDDIR ?= .
NCCLDIR := $(BUILDDIR)

SRC_FILES := $(wildcard *.c)

build: ${BUILDDIR}/librccl-profiler.so

# Compile and link every C source in this directory into one shared object.
${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
	@printf "Compiling  %-35s > %s\n" $< $@
	@mkdir -p ${BUILDDIR}
	$(CC) -Inccl -fPIC -shared -o $@ $^

clean:
	rm -f ${BUILDDIR}/librccl-profiler.so
+239
Voir le fichier
@@ -0,0 +1,239 @@
# NCCL Example Profiler Plugin Usage
This page describes how to use the NCCL example profiler plugin
# Overview
The example profiler plugin implements the NCCL profiler plugin API introduced in NCCL v2.23. The API
defines a set of events and data structures that NCCL uses to share event information with profiler
plugins. The user can control what events are instrumented by NCCL and when traces collected by the
profiler should be dumped through environment variables, as described in the rest of the document.
The user can also control other profiler parameters that alter its behavior. For example, users can
change the size of the event window the profiler keeps track of.
## Building the profiler plugin
To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
You can override `NCCL_HOME` to where the NCCL installation is on your system.
## Using the profiler plugin
1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set the `NCCL_PROFILER_PLUGIN`,
as documented in `ext-profiler/README.md`.
2. Set `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
default, all collectives and send/recv operations will be traced. For more details about the event
representation used by the profiler refer to `ext-profiler/README.md`.
As an example, setting:
`NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
enables the profiling of the group, the collective and the proxy op events. The same events can be
expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
is that the profiler can easily correlate events that belong to the same NCCL operation and present
them accordingly.
3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
event format (more precisely, using asynchronous events).
4. If you set the dump file variable, type chrome://tracing on your chromium browser search bar and
open the created dump file to visualize the traces.
# Changing the profiler memory pool sizes
The example profiler uses separate memory pools for different types of events. The size of these memory
pools (i.e., the # events) determines the number of events that the profiler can keep track of at the
same time. When NCCL requests a new event (e.g., collective event) to profile a `ncclAllReduce`
operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used the
profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
event handle. The profiler has a total of 5 memory pools.
The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
generated by remote proxies. A list of pools and their size is reported below:
- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
Remote proxy operations are generated when PXN is in use. Refer to this article for more information
about PXN and how it works:
https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
# Reported events
The example profiler generates traces using the json format. An example of trace is reported below:
```
[
{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
... [ trace truncated for brevity ]
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
{}]
```
Details about the fields used in the trace can be found at this link:
https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
one collective and this is what is presented in the traces above).
The `AllReduce` event encloses traces for the proxy operation associated to the `ncclAllReduce` operation. The `args`
field in the traces contains NCCL specific information (aside from the chrome trace event format).
## AllReduce trace
The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
- SeqNum : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
- CommHash : communicator unique identifier
- Rank : NCCL rank for the ncclAllReduce
- Count : value of the count argument of the ncclAllReduce
- Datatype : NCCL datatype
- Algorithm : algorithm used to process the ncclAllReduce
- Protocol : protocol used to process the ncclAllReduce
- nMaxChannels: max number of channels used to process the ncclAllReduce
If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
of collective and p2p operations`.
### Proxy Send
The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to send data to the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes to the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send
- DONE : struct containing the number of network sends completed and the time stamp of the last send completed
In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
which could help identify at which point the network problem occurred.
The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy SendBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
#### Proxy SendGPUWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
buffer.
#### Proxy SendWait
Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
### Proxy Recv
The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to recv data from the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes from the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted
- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
- DONE : struct containing the number of flush completed and the time stamp for the last flush completed
The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy RecvBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
become available.
#### Proxy RecvWait
Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
#### Proxy RecvFlushWait
Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU.
#### Proxy RecvGPUWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
+30
Voir le fichier
@@ -0,0 +1,30 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "event.h"
/* Report whether the group's task event queue holds no events (1) or at least one (0). */
int taskEventQueueEmpty(struct group* g) {
  if (g->eventHead != NULL) {
    return 0;
  }
  return 1;
}
/*
 * Append `event` to the tail of the group's singly linked task event queue.
 * The event's `next` link is reset, so it always becomes the new last element.
 */
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
  event->next = NULL;
  if (g->eventHead == NULL) {
    /* Empty queue: the new event is also the head. */
    g->eventHead = event;
  } else {
    /* Non-empty queue: link the new event after the current tail. */
    g->eventTail->next = event;
  }
  g->eventTail = event;
}
/* Peek at the first event of the group's queue without removing it (NULL when empty). */
struct taskEventBase* taskEventQueueHead(struct group* g) {
  struct taskEventBase* first = g->eventHead;
  return first;
}
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
struct taskEventBase* tmp = g->eventHead;
g->eventHead = g->eventHead->next;
if (g->eventHead == NULL) g->eventTail = NULL;
return tmp;
}
+194
Voir le fichier
@@ -0,0 +1,194 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef EVENT_H_
#define EVENT_H_
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include "profiler.h"
// Compile-time capacities of the fixed-size event arrays declared in this header.
#define MAX_CHANNELS 128 // Match RCCL's MAXCHANNELS; sizes the per-channel arrays in struct collective and struct p2p
#define MAX_STEPS 16 // number of proxyStep slots embedded in each struct proxyOp
#define MAX_OPS 16 // Up to 64K ranks for PAT; struct collective reserves 2*MAX_OPS proxyOp slots per channel
#define MAX_EVENTS_PER_REQ (8) // number of netPlugin slots embedded in each struct proxyStep
struct proxyOp;
struct proxyStep;
// Profiler-side record of a network plugin event.
// The anonymous union carries the same fields as the plugin-defined IB (qp) and
// socket (sock) event descriptors; note the qp fields are declared in a different
// order than in ncclProfilerNetIbDescr_v1_t.
struct netPlugin {
uint8_t type;
int pluginType; // presumably the plugin type part of the plugin id (e.g. IB vs socket) — TODO confirm
int pluginVer; // presumably the descriptor version part of the plugin id — TODO confirm
uint8_t pluginEvent; // presumably the plugin-defined event type (e.g. ncclProfileQp) — TODO confirm
union {
struct {
int device;
int qpNum;
int opcode;
uint64_t wr_id;
size_t length;
} qp;
struct {
int fd;
int op;
size_t length;
} sock;
};
double startTs; // start timestamp (units not visible in this header)
double stopTs; // stop timestamp (units not visible in this header)
struct proxyStep* parent; // proxy step event this network event belongs to
};
// Profiler-side record of a kernel channel event.
struct kernelCh {
uint8_t type;
uint8_t channelId;
struct taskEventBase* parent; // owning task-level event (collective or p2p)
double startTs;
double stopTs;
uint64_t startGpuClk; // presumably the kernel-provided start timestamp — TODO confirm
uint64_t stopGpuClk; // presumably the kernel-provided stop timestamp — TODO confirm
};
// Per-step state identifiers. Send and recv steps share the value range 0..2,
// which matches the size of proxyStep.timestamp[] (PROXY_STEP_MAX_STATES);
// presumably they are used as indices into that array — TODO confirm.
#define PROXY_STEP_SEND_GPU_WAIT 0
#define PROXY_STEP_SEND_PEER_WAIT 1
#define PROXY_STEP_SEND_WAIT 2
#define PROXY_STEP_RECV_WAIT 0
#define PROXY_STEP_RECV_FLUSH_WAIT 1
#define PROXY_STEP_RECV_GPU_WAIT 2
#define PROXY_STEP_MAX_STATES 3
// Profiler-side record of a proxy step (one network transfer of a proxy operation).
struct proxyStep {
uint8_t type; // type of event: network transfer
int state;
int step; // network transfer id in given channel
int isSend; // send/recv channel operation
double timestamp[PROXY_STEP_MAX_STATES]; // one timestamp slot per step state
double startTs;
double stopTs;
struct proxyOp* parent; // proxy operation this step belongs to
struct netPlugin net[MAX_EVENTS_PER_REQ]; // embedded storage for network plugin events of this step
int nNetEvents; // presumably the number of entries of net[] in use — TODO confirm
};
// Profiler-side record of a proxy operation; embeds storage for up to MAX_STEPS proxy steps.
struct proxyOp {
uint8_t type; // type of event: proxy operation
uint8_t channelId; // channel id for this proxy operation
pid_t pid; // presumably the PID of the process that originated the operation (see PXN) — TODO confirm
int rank;
int peer; // peer rank for this proxy operation
int nSteps; // total number of network transfers for this proxy operation
int chunkSize; // chunk size for this proxy operation
int isSend; // send/recv channel operation
size_t transSize; // transfer data size for this proxy operation
double startTs;
double progrTs; // In progress state transition
double stopTs;
int stepCount; // last processed network operation for this proxy operation
struct proxyStep step[MAX_STEPS]; // array of network transfer events
struct taskEventBase* parent; // parent event p2p/collective
};
struct group;
struct context;
// Profiler-side record of a proxy control event. Unlike the other proxy events it
// has no parent pointer and refers to the profiler context directly.
struct proxyCtrl {
uint8_t type;
struct context* ctx; // profiler context
double startTs;
double stopTs;
int state;
int appended; // appended proxy operations
};
// Task-level event base structure.
// Embedded as the first member (`base`) of struct collective and struct p2p, and
// used as the node type of the per-group event queue through its `next` link.
struct taskEventBase {
uint8_t type; // event type: collective/p2p
int rank; // rank of the operation in NCCL communicator
const char* func; // ncclFunc*
int refCount; // number of references for this operation
struct group* parent; // parent event group
struct taskEventBase* next; // next top level event in group
double startTs;
double stopTs;
};
// Profiler-side record of a collective operation, with embedded per-channel
// storage for its proxy operation and kernel channel events.
struct collective {
struct taskEventBase base; // base structure for this event
uint64_t seqNumber; // sequence number for this collective in communicator
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
const char* algo;
const char* proto;
int nWarps;
struct proxyOp op[MAX_CHANNELS][2*MAX_OPS]; // proxy operation slots, indexed by channel first
int nProxyOps[MAX_CHANNELS]; // presumably the number of op[channel][] slots in use — TODO confirm
struct kernelCh kernel[MAX_CHANNELS]; // one kernel channel event slot per channel
};
// Profiler-side record of a point-to-point operation, with one proxy operation
// slot and one kernel channel event slot per channel.
struct p2p {
struct taskEventBase base; // base structure for this event
uint8_t func;
void const* buff;
size_t count;
const char* datatype;
int peer;
uint8_t nChannels;
struct proxyOp op[MAX_CHANNELS];
struct kernelCh kernel[MAX_CHANNELS];
};
// Profiler-side record of a group event; owns a singly linked queue of the
// task-level events (collective/p2p) that belong to the group.
struct group {
uint8_t type;
struct context* ctx; // profiler context
int groupId;
int refCount;
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs;
double stopTs;
struct group* next; // next group event in queue
};
// Profiler context: communicator identification plus the arrays (pools) of the
// different event objects. Each pool is described by a Size/Base/Index triple and
// a pointer to its storage; the exact meaning of Base and Index is not visible in
// this header — NOTE(review): confirm against the pool allocation code.
struct context {
const char* commName;
uint64_t commHash;
int nranks;
int rank;
int groupPoolSize;
int groupPoolBase;
int groupPoolIndex;
struct group* groupPool;
int collPoolSize;
int collPoolBase;
int collPoolIndex;
struct collective* collPool;
int p2pPoolSize;
int p2pPoolBase;
int p2pPoolIndex;
struct p2p* p2pPool;
int proxyCtrlPoolSize;
int proxyCtrlPoolBase;
int proxyCtrlPoolIndex;
struct proxyCtrl* proxyCtrlPool;
};
// Task event queue API: a FIFO of taskEventBase nodes stored in a group
// (eventHead/eventTail) and linked through taskEventBase.next.
// Returns non-zero when the group's queue contains no events.
int taskEventQueueEmpty(struct group* g);
// Appends event at the tail of the group's queue.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
// Returns the first event of the queue without removing it (NULL when empty).
struct taskEventBase* taskEventQueueHead(struct group* g);
// Removes and returns the first event; callers should check taskEventQueueEmpty() first.
struct taskEventBase* taskEventQueueDequeue(struct group* g);
#endif
+15
Voir le fichier
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
/* Log levels accepted by the debug logger callback. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_ERROR   = 1,
  NCCL_LOG_VERSION = 2,
  NCCL_LOG_WARN    = 3,
  NCCL_LOG_INFO    = 4,
  NCCL_LOG_ABORT   = 5,
  NCCL_LOG_TRACE   = 6
} ncclDebugLogLevel;
/* Subsystem flags for debug logging: one bit per subsystem, NCCL_ALL sets every bit. */
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;
/*
 * Signature of the printf-style debug logger callback: log level, flags,
 * source file and line of the call site, then a format string followed by
 * its variadic arguments.
 */
typedef void (*ncclDebugLogger_t)(
    ncclDebugLogLevel level,
    unsigned long flags,
    const char *file,
    int line,
    const char *fmt,
    ...);
#endif
+19
Voir le fichier
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
/* Error type for plugins */
/* Result codes returned by plugin entry points. */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;
#endif
+34
Voir le fichier
@@ -0,0 +1,34 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_IB_V1_H_
#define NET_IB_V1_H_
#define NCCL_PROFILER_NET_IB_VER 1
// Event types defined by the IB network plugin (one bit per type).
enum {
  ncclProfileQp = 1 << 0,  // QP event
};
// Version 1 of the event descriptor defined by the IB network plugin.
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
// NOTE(review): this header uses uint8_t, uint64_t and size_t but includes
// nothing itself, so it relies on the including file for <stdint.h>/<stddef.h>.
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct {
int device; // network device id
uint64_t wr_id; // work request id
int opcode; // ibv opcode
int qpNum; // QP number
size_t length; // work request data length
} qp;
};
} ncclProfilerNetIbDescr_v1_t;
#endif
+32
Voir le fichier
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NET_SOCKET_V1_H_
#define NET_SOCKET_V1_H_
#define NCCL_PROFILER_NET_SOCKET_VER 1
// Event types defined by the socket network plugin (one bit per type).
enum {
  ncclProfileSocket = 1 << 0,  // socket event
};
// Version 1 of the event descriptor defined by the socket network plugin.
// The data structure version is encoded in the plugin identifier bitmask and
// passed to NCCL core through the profiler callback. NCCL copies the plugin
// identifier in the event descriptor before calling the profiler startEvent
// function. The profiler should inspect the plugin id to find out the source
// plugin as well as the version of the event struct
// NOTE(review): this header uses uint8_t and size_t but includes nothing
// itself, so it relies on the including file for <stdint.h>/<stddef.h>.
typedef struct {
uint8_t type; // event type (plugin defined)
union {
struct {
int fd; // presumably the socket file descriptor — TODO confirm
int op; // presumably the socket operation identifier — TODO confirm
size_t length; // presumably the data length of the operation — TODO confirm
} sock;
};
} ncclProfilerNetSockDescr_v1_t;
#endif
+76
Voir le fichier
@@ -0,0 +1,76 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_H_
#define PROFILER_H_
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
#include "err.h"
// Profiler event type flags. Every type owns a distinct bit, so the values
// can be OR-ed together into a single bitmask.
enum {
  ncclProfileGroup     = 0x01,  // bit 0: group event type
  ncclProfileColl      = 0x02,  // bit 1: host collective call event type
  ncclProfileP2p       = 0x04,  // bit 2: host point-to-point call event type
  ncclProfileProxyOp   = 0x08,  // bit 3: proxy operation event type
  ncclProfileProxyStep = 0x10,  // bit 4: proxy step event type
  ncclProfileProxyCtrl = 0x20,  // bit 5: proxy control event type
  ncclProfileKernelCh  = 0x40,  // bit 6: kernel channel event type
  ncclProfileNetPlugin = 0x80,  // bit 7: network plugin-defined events
};
// Event states reported through the profiler's recordEventState callback.
// Every enumerator carries an explicit value, so the numeric codes do not
// depend on declaration order. The same enum is shared by all interface
// versions (see the aliases below).
typedef enum {
  /* Proxy operation states */
  ncclProfilerProxyOpSendPosted        = 0,   // deprecated in v4
  ncclProfilerProxyOpSendRemFifoWait   = 1,   // deprecated in v4
  ncclProfilerProxyOpSendTransmitted   = 2,   // deprecated in v4
  ncclProfilerProxyOpSendDone          = 3,   // deprecated in v4
  ncclProfilerProxyOpRecvPosted        = 4,   // deprecated in v4
  ncclProfilerProxyOpRecvReceived      = 5,   // deprecated in v4
  ncclProfilerProxyOpRecvTransmitted   = 6,   // deprecated in v4
  ncclProfilerProxyOpRecvDone          = 7,   // deprecated in v4
  ncclProfilerProxyOpInProgress_v4     = 19,

  /* Legacy proxy profiler states (proxy step) */
  ncclProfilerProxyStepSendGPUWait     = 8,
  ncclProfilerProxyStepSendPeerWait_v4 = 20,
  ncclProfilerProxyStepSendWait        = 9,
  ncclProfilerProxyStepRecvWait        = 10,
  ncclProfilerProxyStepRecvFlushWait   = 11,
  ncclProfilerProxyStepRecvGPUWait     = 12,

  /* Legacy proxy control states */
  ncclProfilerProxyCtrlIdle            = 13,
  ncclProfilerProxyCtrlActive          = 14,
  ncclProfilerProxyCtrlSleep           = 15,
  ncclProfilerProxyCtrlWakeup          = 16,
  ncclProfilerProxyCtrlAppend          = 17,
  ncclProfilerProxyCtrlAppendEnd       = 18,

  /* Network defined events states */
  ncclProfilerNetPluginUpdate          = 21,

  /* Kernel event states */
  ncclProfilerKernelChStop             = 22,
} ncclProfilerEventState_t;

// Per-version names for the event state type; all of them alias the one enum.
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t,
                                 ncclProfilerEventState_v2_t,
                                 ncclProfilerEventState_v3_t,
                                 ncclProfilerEventState_v4_t;
#include "profiler_v4.h"
#include "profiler_v3.h"
#include "profiler_v2.h"
#include "profiler_v1.h"
#include "profiler_net.h"
// Unversioned names used by the rest of the code base; they currently resolve
// to the v4 plugin interface, event descriptor and state arguments.
typedef ncclProfiler_v4_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
#endif // end include guard
+22
Voir le fichier
@@ -0,0 +1,22 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_NET_H_
#define PROFILER_NET_H_
// Layout of the network plugin identifier: the low NCCL_PROFILER_NET_VER_BITS
// bits are covered by the version mask, the bits above them by the type mask.
#define NCCL_PROFILER_NET_VER_BITS (16)
// Selects the version bits (0x0000FFFF when unsigned int is 32 bits wide).
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
// Selects the type bits (0xFFFF0000 when unsigned int is 32 bits wide).
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
// Network plugin type codes, already shifted into the type bits so they can be
// compared against (id & NCCL_PROFILER_NET_TYPE_MASK).
typedef enum {
  NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS),
  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
} ncclProfilerNetType;
#include "net_ib_v1.h"
#include "net_socket_v1.h"
#endif
+109
Voir le fichier
@@ -0,0 +1,109 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V1_H_
#define PROFILER_V1_H_
#include <stdint.h>
// Version 1 profiler event descriptor: the argument type of the v1 startEvent
// callback. The anonymous union carries the per-event payload; the member that
// applies is expected to follow `type` (e.g. ncclProfileColl -> coll) --
// NOTE(review): confirm against the code that fills the descriptor.
// Member order and types are part of the plugin ABI; do not rearrange.
typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
  int rank;                     // originating rank
  union {
    // Collective call payload. In this version func, datatype, algo and proto
    // are numeric codes (uint8_t) and op is a 32-bit code.
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      uint8_t func;
      void const* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      uint8_t datatype;
      uint32_t op;
      size_t trafficBytes;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      uint8_t algo;
      uint8_t proto;
      int isCollnet;
      int isNvls;
    } coll;
    // Point-to-point call payload.
    struct {
      const char* name;
      uint64_t commHash;
      uint8_t func;
      void* buff;
      uint8_t datatype;
      size_t count;
      int peer;
    } p2p;
    // Proxy operation payload.
    struct {
      pid_t pid;                // pid of the originating process
      uint8_t channelId;        // channel id for this proxy operation
      int peer;                 // remote rank for send/recv
      int nSteps;               // number of steps for this proxy operation
      int chunkSize;            // amount of data transferred by this proxy operation
      int isSend;               // presumably non-zero for send, zero for recv -- TODO confirm
    } proxyOp;
    // Proxy step payload.
    struct {
      int step;
    } proxyStep;
  };
} ncclProfilerEventDescr_v1_t;
// Optional arguments for the v1 recordEventState callback. Only one member is
// meaningful per call; which one is expected to depend on the event whose
// state is being recorded -- NOTE(review): confirm against the caller.
typedef union {
  // Attribute updates for a proxy operation event.
  struct {
    size_t transSize;
    int steps;
  } proxyOp;
  // Attribute updates for a proxy control event.
  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Version 1 profiler plugin interface: a plugin name plus the table of
// callbacks the plugin provides. Every callback returns an ncclResult_t.
typedef struct {
  const char* name;

  // init - initialize the profiler plugin
  // Input
  //  - context        : opaque profiler context object for separating profiler behavior across comms
  // Output
  //  - eActivationMask: bitmask of active events set by the plugin
  ncclResult_t (*init)(void** context, int* eActivationMask);

  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
  // Input
  //  - context: opaque profiler context object
  //  - eDescr : pointer to ncclProfilerEventDescr_v1_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);

  // stopEvent - stop/finalize an event inside an event set
  // Input
  //  - eHandle: handle to event object
  ncclResult_t (*stopEvent)(void* eHandle);

  // recordEventState - record event state transitions and event attribute updates
  // Input
  //  - eHandle   : handle to event object created through startEvent
  //  - eState    : event state transition
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
#endif
+106
Voir le fichier
@@ -0,0 +1,106 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V2_H_
#define PROFILER_V2_H_
#include <stdint.h>
// Version 2 profiler event descriptor: the argument type of the v2 startEvent
// callback. The anonymous union carries the per-event payload; the member that
// applies is expected to follow `type` (e.g. ncclProfileColl -> coll) --
// NOTE(review): confirm against the code that fills the descriptor.
// Member order and types are part of the plugin ABI; do not rearrange.
typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
  int rank;                     // originating rank
  union {
    // Collective call payload. func, datatype, algo and proto are C strings
    // in this version.
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      const char* func;
      void const* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      size_t trafficBytes;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;
    // Point-to-point call payload.
    struct {
      const char* name;
      uint64_t commHash;
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
    } p2p;
    // Proxy operation payload.
    struct {
      pid_t pid;                // pid of the originating process
      uint8_t channelId;        // channel id for this proxy operation
      int peer;                 // remote rank for send/recv
      int nSteps;               // number of steps for this proxy operation
      int chunkSize;            // amount of data transferred by this proxy operation
      int isSend;               // presumably non-zero for send, zero for recv -- TODO confirm
    } proxyOp;
    // Proxy step payload.
    struct {
      int step;
    } proxyStep;
  };
} ncclProfilerEventDescr_v2_t;
// Optional arguments for the v2 recordEventState callback. Only one member is
// meaningful per call; which one is expected to depend on the event whose
// state is being recorded -- NOTE(review): confirm against the caller.
typedef union {
  // Attribute updates for a proxy operation event.
  struct {
    size_t transSize;
    int steps;
  } proxyOp;
  // Attribute updates for a proxy control event.
  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v2_t;
// Version 2 profiler plugin interface: a plugin name plus the table of
// callbacks the plugin provides. Every callback returns an ncclResult_t.
typedef struct {
  const char* name;

  // init - initialize the profiler plugin
  // Input
  //  - context        : opaque profiler context object for separating profiler behavior across comms
  // Output
  //  - eActivationMask: bitmask of active events set by the plugin
  ncclResult_t (*init)(void** context, int* eActivationMask);

  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
  // Input
  //  - context: opaque profiler context object
  //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);

  // stopEvent - stop/finalize an event inside an event set
  // Input
  //  - eHandle: handle to event object
  ncclResult_t (*stopEvent)(void* eHandle);

  // recordEventState - record event state transitions and event attribute updates
  // Input
  //  - eHandle   : handle to event object created through startEvent
  //  - eState    : event state transition
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
} ncclProfiler_v2_t;
#endif
+114
Voir le fichier
@@ -0,0 +1,114 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V3_H_
#define PROFILER_V3_H_
#include <stdint.h>
// Version 3 profiler event descriptor: the argument type of the v3 startEvent
// callback. The anonymous union carries the per-event payload; the member that
// applies is expected to follow `type` (e.g. ncclProfileColl -> coll) --
// NOTE(review): confirm against the code that fills the descriptor.
// Member order and types are part of the plugin ABI; do not rearrange.
typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
  int rank;                     // originating rank
  union {
    // Collective call payload. func, datatype, algo and proto are C strings.
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      const char* func;
      void const* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;
    // Point-to-point call payload.
    struct {
      const char* name;
      uint64_t commHash;
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
    } p2p;
    // Proxy operation payload.
    struct {
      pid_t pid;                // pid of the originating process
      uint8_t channelId;        // channel id for this proxy operation
      int peer;                 // remote rank for send/recv
      int nSteps;               // number of steps for this proxy operation
      int chunkSize;            // amount of data transferred by this proxy operation
      int isSend;               // presumably non-zero for send, zero for recv -- TODO confirm
    } proxyOp;
    // Proxy step payload.
    struct {
      int step;
    } proxyStep;
    // Kernel channel payload.
    struct {
      uint8_t channelId;
    } kernelCh;
    // Network plugin payload: `id` is presumably the plugin identifier
    // (type + version bits) and `data` the plugin-defined event descriptor --
    // NOTE(review): confirm against the network plugin code.
    struct {
      int64_t id;
      void* data;
    } netPlugin;
  };
} ncclProfilerEventDescr_v3_t;
// Optional arguments for the v3 recordEventState callback. Only one member is
// meaningful per call; which one is expected to depend on the event whose
// state is being recorded -- NOTE(review): confirm against the caller.
typedef union {
  // Attribute updates for a proxy operation event.
  struct {
    size_t transSize;
    int steps;
  } proxyOp;
  // Attribute updates for a proxy control event.
  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v3_t;
// Version 3 profiler plugin interface: a plugin name plus the table of
// callbacks the plugin provides. Every callback returns an ncclResult_t.
typedef struct {
  const char* name;

  // init - initialize the profiler plugin
  // Input
  //  - context        : opaque profiler context object for separating profiler behavior across comms
  // Output
  //  - eActivationMask: bitmask of active events set by the plugin
  ncclResult_t (*init)(void** context, int* eActivationMask);

  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
  // Input
  //  - context: opaque profiler context object
  //  - eDescr : pointer to ncclProfilerEventDescr_v3_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);

  // stopEvent - stop/finalize an event inside an event set
  // Input
  //  - eHandle: handle to event object
  ncclResult_t (*stopEvent)(void* eHandle);

  // recordEventState - record event state transitions and event attribute updates
  // Input
  //  - eHandle   : handle to event object created through startEvent
  //  - eState    : event state transition
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
} ncclProfiler_v3_t;
#endif
+123
Voir le fichier
@@ -0,0 +1,123 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V4_H_
#define PROFILER_V4_H_
// Version 4 profiler event descriptor: the argument type of the v4 startEvent
// callback. The anonymous union carries the per-event payload; the member that
// applies is expected to follow `type` (e.g. ncclProfileColl -> coll) --
// NOTE(review): confirm against the code that fills the descriptor.
// The coll and p2p payloads carry no communicator name or hash in this
// version; the v4 init callback receives commName and commHash instead.
// Member order and types are part of the plugin ABI; do not rearrange.
typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
  int rank;                     // originating rank
  union {
    // Collective call payload. func, datatype, algo and proto are C strings.
    struct {
      uint64_t seqNumber;
      const char* func;
      void const* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      uint8_t nChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;
    // Point-to-point call payload.
    struct {
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
      uint8_t nChannels;
    } p2p;
    // Proxy operation payload.
    struct {
      pid_t pid;                // pid of the originating process
      uint8_t channelId;        // channel id for this proxy operation
      int peer;                 // remote rank for send/recv
      int nSteps;               // number of steps for this proxy operation
      int chunkSize;            // amount of data transferred by this proxy operation
      int isSend;               // presumably non-zero for send, zero for recv -- TODO confirm
    } proxyOp;
    // Proxy step payload.
    struct {
      int step;
    } proxyStep;
    // Kernel channel payload.
    struct {
      uint8_t channelId;
      uint64_t pTimer;          // start timestamp from GPU globaltimer
    } kernelCh;
    // Network plugin payload: `id` is presumably the plugin identifier
    // (type + version bits) and `data` the plugin-defined event descriptor --
    // NOTE(review): confirm against the network plugin code.
    struct {
      int64_t id;
      void* data;
    } netPlugin;
  };
} ncclProfilerEventDescr_v4_t;
// Optional arguments for the v4 recordEventState callback. Only one member is
// meaningful per call; which one is expected to depend on the event whose
// state is being recorded -- NOTE(review): confirm against the caller.
typedef union {
  // Attribute updates for a proxy step event.
  struct {
    size_t transSize;
  } proxyStep;
  // Attribute updates for a proxy control event.
  struct {
    int appendedProxyOps;
  } proxyCtrl;
  // Plugin-defined data for a network plugin event.
  struct {
    void* data;
  } netPlugin;
  // Attribute updates for a kernel channel event.
  struct {
    uint64_t pTimer;  // presumably a GPU globaltimer timestamp, as in the v4 descriptor -- TODO confirm
  } kernelCh;
} ncclProfilerEventStateArgs_v4_t;
// Version 4 profiler plugin interface: a plugin name plus the table of
// callbacks the plugin provides. Every callback returns an ncclResult_t.
typedef struct {
  const char* name;

  // init - initialize the profiler plugin
  // Input
  //  - context        : opaque profiler context object for separating profiler behavior across comms
  //  - commName       : user assigned communicator name
  //  - commHash       : communicator id
  //  - nNodes         : number of nodes in communicator
  //  - nranks         : number of ranks in communicator
  //  - rank           : rank identifier in communicator
  //  - logfn          : logger function
  // Output
  //  - eActivationMask: bitmask of active events set by the plugin
  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);

  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
  // Input
  //  - context: opaque profiler context object
  //  - eDescr : pointer to ncclProfilerEventDescr_v4_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);

  // stopEvent - stop/finalize an event inside an event set
  // Input
  //  - eHandle: handle to event object
  ncclResult_t (*stopEvent)(void* eHandle);

  // recordEventState - record event state transitions and event attribute updates
  // Input
  //  - eHandle   : handle to event object created through startEvent
  //  - eState    : event state transition
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
#endif

Certains fichiers ne sont pas affichés car ce diff contient trop de modifications Voir plus