Add 'projects/rccl/' from commit '1f2f9f33bac3e8ecfd84c69af6063d7352c362fc'

git-subtree-dir: projects/rccl git-subtree-mainline: 3fd8a0d393 git-subtree-split: 1f2f9f33ba
2025-12-11 20:46:05 +00:00
Parent 3fd8a0d393 1f2f9f33ba
@@ -0,0 +1,71 @@
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+- name: pytestFolder
+  value: '.azuredevops/tests/pytest'
+
+parameters:
+- name: pytestList
+  type: object
+  default:
+    - HelloWorld
+
+trigger: none
+pr: none
+# Nightly builds of `develop`. Cron expressions are evaluated in UTC, so two
+# schedules approximate 11 PM US Central time across the CST/CDT switch.
+# `always: false` skips a run when nothing changed since the last scheduled run.
+schedules:
+  # Months are listed explicitly: a wrap-around range such as "11-3" is not
+  # portable across cron parsers (some reject it, others reorder it to "3-11").
+  - cron: "0 5 * 11,12,1,2,3 *"  # 11 PM CST (November - March)
+    displayName: "Nightly Build (CST)"
+    branches:
+      include:
+        - develop
+    always: false
+
+  - cron: "0 4 * 4-10 *"  # 11 PM CDT (April - October)
+    displayName: "Nightly Build (CDT)"
+    branches:
+      include:
+        - develop
+    always: false
+
+# Single job: wipe leftover files, check out the sources, build RCCL with its
+# tests enabled, run the gtest unit tests, then run each listed pytest script.
+# The checkout/build/test steps come from shared templates in pipelines_repo.
+jobs:
+- job: rccl
+  timeoutInMinutes: 180
+  pool: rocm-ci_rccl_pool
+  workspace:
+    clean: all
+  steps:
+  # Delete everything matching '**/*' before the checkout step runs.
+  - task: DeleteFiles@1
+    inputs:
+      Contents: '**/*'
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+    parameters:
+      submoduleBehaviour: recursive
+  # Release build with tests, generated for Ninja; no install step.
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
+    parameters:
+      installEnabled: false
+      printDiskSpace: false
+      extraBuildFlags: >-
+        -DCMAKE_BUILD_TYPE=Release
+        -DBUILD_TESTS=ON
+        -GNinja
+  # Run rccl-UnitTests from the build tree; the freshly built library
+  # directory is prepended to LD_LIBRARY_PATH on the test command line.
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
+    parameters:
+      componentName: rccl
+      testDir: $(Build.SourcesDirectory)/build/test
+      testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
+      testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+  # One Bash step is generated per entry of the pytestList parameter;
+  # continueOnError lets the job carry on when a script fails.
+  - ${{ each pytestScript in parameters.pytestList }}:
+    - task: Bash@3
+      displayName: Test ${{ pytestScript }}
+      continueOnError: true
+      inputs:
+        targetType: inline
+        workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
+        script: pytest ${{ pytestScript }}.py
@@ -0,0 +1,77 @@
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+- name: pytestFolder
+  value: '.azuredevops/tests/pytest'
+
+parameters:
+- name: pytestList
+  type: object
+  default:
+    - HelloWorld
+
+trigger: none
+pr:
+  autoCancel: true
+  branches:
+    include:
+    - develop
+  paths:
+    exclude:
+    - .github
+    - .jenkins
+    - docs
+    - '*.md'
+    - LICENSE.txt
+    - NOTICES.txt
+  drafts: false
+
+# PR validation stage. It holds a deployment job bound to the `rccl`
+# environment (displayed as "CI Run Requires Approval") and the build/test job.
+# NOTE(review): the `rccl` job has no dependsOn on the approval job; presumably
+# the environment's checks gate the stage as a whole - confirm.
+stages:
+- stage: rcclStage
+  displayName: 'RCCL develop PR'
+  jobs:
+  - deployment: rccl_pr_approval
+    displayName: "CI Run Requires Approval"
+    environment: rccl
+  - job: rccl
+    timeoutInMinutes: 180
+    pool: rocm-ci_rccl_pool
+    workspace:
+      clean: all
+    steps:
+    # Delete everything matching '**/*' before the checkout step runs.
+    - task: DeleteFiles@1
+      inputs:
+        Contents: '**/*'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+      parameters:
+        submoduleBehaviour: recursive
+    # Release build with tests for the gfx942 GPU target only, generated for
+    # Ninja; no install step.
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
+      parameters:
+        installEnabled: false
+        printDiskSpace: false
+        extraBuildFlags: >-
+          -DCMAKE_BUILD_TYPE=Release
+          -DBUILD_TESTS=ON
+          -DGPU_TARGETS=gfx942
+          -GNinja
+    # Run rccl-UnitTests from the build tree; the freshly built library
+    # directory is prepended to LD_LIBRARY_PATH on the test command line.
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
+      parameters:
+        componentName: rccl
+        testDir: $(Build.SourcesDirectory)/build/test
+        testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
+        testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
+    # One Bash step is generated per entry of the pytestList parameter;
+    # continueOnError lets the job carry on when a script fails.
+    - ${{ each pytestScript in parameters.pytestList }}:
+      - task: Bash@3
+        displayName: Test ${{ pytestScript }}
+        continueOnError: true
+        inputs:
+          targetType: inline
+          workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
+          script: pytest ${{ pytestScript }}.py
@@ -0,0 +1,44 @@
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger: none
+pr: none
+# Nightly builds of `develop`. Cron expressions are evaluated in UTC, so two
+# schedules approximate 11 PM US Central time across the CST/CDT switch.
+# `always: false` skips a run when nothing changed since the last scheduled run.
+schedules:
+  # Months are listed explicitly: a wrap-around range such as "11-3" is not
+  # portable across cron parsers (some reject it, others reorder it to "3-11").
+  - cron: "0 5 * 11,12,1,2,3 *"  # 11 PM CST (November - March)
+    displayName: "Nightly Build (CST)"
+    branches:
+      include:
+        - develop
+    always: false
+
+  - cron: "0 4 * 4-10 *"  # 11 PM CDT (April - October)
+    displayName: "Nightly Build (CDT)"
+    branches:
+      include:
+        - develop
+    always: false
+
+# Single job on the Slurm-backed pool: wipe leftover files, check out the
+# sources, then run the repository-local build and test step templates.
+jobs:
+- job: rccl
+  timeoutInMinutes: 240
+  pool: rocm-ci_rccl_slurm_pool
+  workspace:
+    clean: all
+  steps:
+  # Delete everything matching '**/*' before the checkout step runs.
+  - task: DeleteFiles@1
+    inputs:
+      Contents: '**/*'
+  - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+    parameters:
+      submoduleBehaviour: recursive
+  # These templates are resolved relative to this pipeline file (no @repo suffix).
+  - template: templates/build.yml
+  - template: templates/test_rccl-UnitTests.yml
+  - template: templates/test_rccl-tests.yml
@@ -0,0 +1,49 @@
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger: none
+pr:
+  autoCancel: true
+  branches:
+    include:
+    - develop
+  paths:
+    exclude:
+    - .github
+    - .jenkins
+    - docs
+    - '*.md'
+    - LICENSE.txt
+    - NOTICES.txt
+  drafts: false
+
+# PR validation stage on the Slurm-backed pool. It holds a deployment job
+# bound to the `rccl` environment (displayed as "CI Run Requires Approval")
+# and the build/test job.
+# NOTE(review): the `rccl` job has no dependsOn on the approval job; presumably
+# the environment's checks gate the stage as a whole - confirm.
+stages:
+- stage: rcclStage
+  displayName: 'RCCL develop PR'
+  jobs:
+  - deployment: rccl_pr_approval
+    displayName: "CI Run Requires Approval"
+    environment: rccl
+  - job: rccl
+    timeoutInMinutes: 240
+    pool: rocm-ci_rccl_slurm_pool
+    workspace:
+      clean: all
+    steps:
+    # Delete everything matching '**/*' before the checkout step runs.
+    - task: DeleteFiles@1
+      inputs:
+        Contents: '**/*'
+    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
+      parameters:
+        submoduleBehaviour: recursive
+    # These templates are resolved relative to this pipeline file (no @repo suffix).
+    - template: templates/build.yml
+    - template: templates/test_rccl-UnitTests.yml
+    - template: templates/test_rccl-tests.yml
@@ -0,0 +1,73 @@
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+parameters:
+- name: pipelinesRepoRef
+  type: string
+  default: refs/heads/develop
+- name: systemsRepoRef
+  type: string
+  default: refs/heads/develop
+- name: systemsSparseCheckoutDir
+  type: string
+  default: 'projects/rocprofiler-sdk'
+- name: triggerDownstreamJobs
+  type: boolean
+  default: true
+
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+    ref: ${{ parameters.pipelinesRepoRef }}
+  - repository: systems_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/rocm-systems
+    ref: ${{ parameters.systemsRepoRef }}
+
+trigger:
+  batch: true
+  branches:
+    include:
+    - develop
+    - mainline
+  paths:
+    exclude:
+    - .github
+    - .jenkins
+    - docs
+    - '.*.y*ml'
+    - '*.md'
+    - LICENSE.txt
+    - NOTICES.txt
+
+pr:
+  autoCancel: true
+  branches:
+    include:
+    - develop
+    - mainline
+  paths:
+    exclude:
+    - .github
+    - .jenkins
+    - docs
+    - '.*.y*ml'
+    - '*.md'
+    - LICENSE.txt
+    - NOTICES.txt
+  drafts: false
+
+# The job definitions are delegated to the rccl component template in
+# pipelines_repo; this file only forwards the repository alias and parameters.
+stages:
+- stage: rccl
+  jobs:
+  - template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo
+    parameters:
+      # Empty string for this repository's own sparse checkout directory.
+      sparseCheckoutDir: ''
+      # Alias of the ROCm/rocm-systems repository resource declared above.
+      systemsRepo: systems_repo
+      systemsSparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
+      triggerDownstreamJobs: ${{ parameters.triggerDownstreamJobs }}
@@ -0,0 +1,54 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-build
+#SBATCH --output=rccl-build-%j.out
+#SBATCH --error=rccl-build-%j.out
+#SBATCH --time=60
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.1
+
+# Setup local binary path
+export PATH="$HOME/.local/bin:$PATH"
+mkdir -p "$HOME/.local/bin"
+
+# Install Ninja if not already available
+if ! command -v ninja &>/dev/null; then
+  echo "Ninja not found. Installing locally..."
+  wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
+  unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
+  chmod +x "$HOME/.local/bin/ninja"
+fi
+
+echo "Using Ninja at: $(which ninja)"
+ninja --version
+
+# Define GPU target
+export GPU_TARGETS="gfx942"
+
+cd "${SLURM_SUBMIT_DIR:-$PWD}"
+## Building RCCL
+mkdir -p build
+cd build
+cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
+cmake --build .
+cmake --build . --target install
+
+# Building RCCL Replayer
+cd ../tools/RcclReplayer 2>/dev/null || cd ../RcclReplayer
+RCCL_DIR="../../build" ROCM_DIR="$ROCM_PATH" MPI_DIR="$MPI_HOME" make
+
+cd "${SLURM_SUBMIT_DIR:-$PWD}"
+## Building RCCL-Tests
+git clone https://github.com/ROCm/rccl-tests
+cd rccl-tests
+mkdir -p build
+cd build
+cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${GPU_TARGETS} -DROCM_PATH="$ROCM_PATH" ..
+cmake --build .
+cmake --build . --target install
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-UnitTests
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.out
+#SBATCH --time=180
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+# Slurm batch script: runs the installed rccl-UnitTests binary and writes a
+# gtest XML report to $PIPELINE_WORKSPACE/rccl-UnitTests_output.xml.
+# Reads BINARIES_DIR and PIPELINE_WORKSPACE from the environment.
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.1
+
+# Stop early with a clear message when the install directory is missing.
+cd "$BINARIES_DIR/bin" || { echo "ERROR: cannot enter $BINARIES_DIR/bin" >&2; exit 1; }
+
+# The report path is quoted so a workspace path containing spaces stays one argument.
+LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output="xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml" --gtest_color=yes
@@ -0,0 +1,62 @@
+#!/bin/bash
+#SBATCH --job-name=rccl-tests
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.out
+#SBATCH --time=60
+#SBATCH --nodes=1
+#SBATCH --exclusive
+#SBATCH --partition=gt
+
+# Slurm batch script: runs the rccl-tests performance binaries through mpirun
+# for every collective/datatype combination below. Each run writes a log and
+# a JSON result file under $PIPELINE_WORKSPACE/TestResults/rccl-tests_logs.
+# Reads BINARIES_DIR, PIPELINE_WORKSPACE and MPI_HOME from the environment.
+
+short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
+echo "Node identifier: $short_id"
+
+source /etc/profile.d/lmod.sh
+module load rocm/6.4.1
+
+# Create the result directories before entering them, so the cd cannot fail
+# merely because TestResults does not exist yet.
+export WORKDIR="${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs"
+mkdir -p "${WORKDIR}"
+cd "${PIPELINE_WORKSPACE}/TestResults" || { echo "ERROR: cannot enter ${PIPELINE_WORKSPACE}/TestResults" >&2; exit 1; }
+
+export PATH="$BINARIES_DIR/bin:$PATH"
+export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"
+
+### create hostlist
+#nodelist=($(scontrol show hostnames))
+#echo "SLURM nodes:"
+#echo ${nodelist[@]}
+#echo ""
+#
+#hosts_8ppn=()
+#for node in "${nodelist[@]}"
+#do
+#    hosts_8ppn+=("${node}:8")
+#done
+#echo ${hosts_8ppn[@]}
+
+### Run multi- and single-node RCCL-Tests
+## Run single-node RCCL-Tests
+for n in 1
+do
+    # One MPI rank per GPU, 8 GPUs per node.
+    total=$((n*8))
+    #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
+
+    for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
+    do
+        for dtype in float bfloat16 half fp8_e5m2
+        do
+            out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
+            #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
+            cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
+
+            # The first tee truncates the log; later ones append to it.
+            echo "Running ${coll}" 2>&1 | tee "${out_filename}"
+            echo "Run cmd: ${cmd}" 2>&1 | tee -a "${out_filename}"
+            eval "${cmd}" 2>&1 | tee -a "${out_filename}"
+
+            sleep 2
+        done
+    done
+done
+
+## To add
+### Summarize results
+### Convert to junit
@@ -0,0 +1,86 @@
+# small subset of files to check for install to determine pass/fail
+parameters:
+- name: expectedInstallFiles
+  type: object
+  default:
+    - bin/rccl-UnitTests
+    - include/rccl/rccl.h
+    - lib/cmake/rccl/rccl-config.cmake
+    - lib/librccl.so
+    - share/doc/rccl/LICENSE.txt
+    - share/rccl/msccl-algorithms
+    - share/rccl/msccl-unit-test-algorithms
+
+steps:
+  # Submits the Slurm build job, waits for it to leave the queue, reads its
+  # final state from sacct and then verifies the expected install files.
+  - task: Bash@3
+    displayName: Build Job
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+    inputs:
+      targetType: inline
+      script: |
+        echo "##[section]Starting build job..."
+
+        rm -rf "$(Build.BinariesDirectory)"/*
+
+        echo "Submitting build job..."
+        mkdir -p "$(Build.BinariesDirectory)"
+        BUILD_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/build.sh")
+        # Without a job id every later squeue/sacct query would be meaningless.
+        if [ -z "$BUILD_JOB_ID" ]; then
+          echo "##vso[task.logissue type=error]sbatch did not return a job id"
+          exit 1
+        fi
+        echo "Submitted build job: $BUILD_JOB_ID"
+        echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"
+
+        echo "Waiting for build job to start..."
+        while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
+          echo "##[section]Build job $BUILD_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=30  # Maximum of 30 loops (30 minutes)
+        while true; do
+          STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
+          echo "##[section]Build job state: $STATE"
+          # Prefix patterns also match decorated states such as "CANCELLED+".
+          case "$STATE" in
+            COMPLETED)
+              break
+              ;;
+            FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_MEMORY*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*)
+              # A build that did not complete must fail the step, even if
+              # some of the expected files happen to be installed.
+              echo "Build failed with state $STATE"
+              echo "##vso[task.logissue type=error]Build job $BUILD_JOB_ID ended in state $STATE"
+              exit 1
+              ;;
+          esac
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1  # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking for expected installed files..."
+        MISSING_FILES=0
+
+        # Space-separated list rendered at template expansion time; the
+        # entries contain no whitespace, so plain word splitting is safe.
+        expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
+        for relpath in $expectedFiles; do
+          fullpath="$BINARIES_DIR/$relpath"
+          if [ ! -e "$fullpath" ]; then
+            echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
+            MISSING_FILES=1
+          fi
+        done
+
+        if [ "$MISSING_FILES" -eq 1 ]; then
+          echo "One or more expected files are missing from the install directory."
+          exit 1
+        else
+          echo "All expected files are present in the install directory."
+        fi
+  # Always print the Slurm job log, including after a failed build.
+  - task: Bash@3
+    displayName: Build Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat rccl-build-${BUILD_JOB_ID}.out || echo "No log found"
@@ -0,0 +1,69 @@
+steps:
+  # Submits the Slurm unit-test job, waits for it to leave the queue, reads
+  # its final state from sacct and then inspects the gtest XML report.
+  - task: Bash@3
+    displayName: RCCL UnitTests
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
+    inputs:
+      targetType: inline
+      script: |
+        echo "Submitting test job..."
+        TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh")
+        # Without a job id every later squeue/sacct query would be meaningless.
+        if [ -z "$TEST_JOB_ID" ]; then
+          echo "##vso[task.logissue type=error]sbatch did not return a job id"
+          exit 1
+        fi
+        echo "Submitted test job: $TEST_JOB_ID"
+        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
+
+        echo "Waiting for test job to start..."
+        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
+          echo "##[section]Test job $TEST_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=120  # Maximum of 120 loops (120 minutes)
+        JOB_OK=1
+        while true; do
+          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
+          echo "##[section]Test job state: $STATE"
+          # Prefix patterns also match decorated states such as "CANCELLED+".
+          case "$STATE" in
+            COMPLETED)
+              break
+              ;;
+            FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_MEMORY*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*)
+              # Remember the failure but still inspect the XML report below,
+              # so individual test failures are reported when available.
+              echo "Test failed with state $STATE"
+              JOB_OK=0
+              break
+              ;;
+          esac
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1  # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking test result XML for failures..."
+        TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
+        if [ -z "$TEST_XML" ]; then
+          # Name the file literally: the variable is empty on this path.
+          echo "##vso[task.logissue type=error]No rccl-UnitTests_output.xml file found"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        fi
+
+        # Any failures="N" attribute whose value does not start with 0.
+        if grep -q 'failures="[^0]' "$TEST_XML"; then
+          echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        elif [ "$JOB_OK" -ne 1 ]; then
+          # A clean report does not excuse a job that crashed or timed out.
+          echo "##vso[task.logissue type=error]Test job $TEST_JOB_ID ended in state $STATE"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        else
+          echo "No test failures detected."
+        fi
+  # Always print the Slurm job log, including after a failed run.
+  - task: Bash@3
+    displayName: Test Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found"
+  - task: PublishTestResults@2
+    displayName: 'Publish Results'
+    condition: succeededOrFailed()
+    inputs:
+      searchFolder: $(Pipeline.Workspace)
+      testResultsFormat: JUnit
+      testResultsFiles: '**/rccl-UnitTests_output.xml'
@@ -0,0 +1,77 @@
+steps:
+  # Submits the Slurm rccl-tests job, waits for it to leave the queue, reads
+  # its final state from sacct and then checks that JSON results were written.
+  - task: Bash@3
+    displayName: RCCL-Tests
+    env:
+      BINARIES_DIR: $(Build.BinariesDirectory)
+      PIPELINE_WORKSPACE: $(Pipeline.Workspace)
+    inputs:
+      targetType: inline
+      script: |
+        echo "Submitting test job..."
+        TEST_JOB_ID=$(sbatch --export=ALL --parsable "$(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh")
+        # Without a job id every later squeue/sacct query would be meaningless.
+        if [ -z "$TEST_JOB_ID" ]; then
+          echo "##vso[task.logissue type=error]sbatch did not return a job id"
+          exit 1
+        fi
+        echo "Submitted test job: $TEST_JOB_ID"
+        echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
+
+        echo "Waiting for test job to start..."
+        while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
+          echo "##[section]Test job $TEST_JOB_ID is still running..."
+          sleep 60
+        done
+
+        echo "Waiting for final status via sacct..."
+        LOOP_COUNT=0
+        MAX_LOOPS=120  # Maximum of 120 loops (120 minutes)
+        JOB_OK=1
+        while true; do
+          STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
+          echo "##[section]Test job state: $STATE"
+          # Prefix patterns also match decorated states such as "CANCELLED+".
+          case "$STATE" in
+            COMPLETED)
+              break
+              ;;
+            FAILED*|CANCELLED*|TIMEOUT*|OUT_OF_MEMORY*|NODE_FAIL*|BOOT_FAIL*|DEADLINE*)
+              echo "Test failed with state $STATE"
+              JOB_OK=0
+              break
+              ;;
+          esac
+          sleep 60
+          LOOP_COUNT=$((LOOP_COUNT + 1))
+          if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
+            echo "Time limit reached while waiting for final status."
+            exit 1  # Exit with an error code if time limit is reached
+          fi
+        done
+
+        echo "Checking test result json for failures..."
+        TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
+        if [ -z "$TEST_JSON" ]; then
+          # Name the pattern literally: the variable is empty on this path.
+          echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        fi
+
+        # Partial results from a job that timed out, was cancelled or failed
+        # must not let the step pass.
+        if [ "$JOB_OK" -ne 1 ]; then
+          echo "##vso[task.logissue type=error]Test job $TEST_JOB_ID ended in state $STATE"
+          echo "##vso[task.complete result=Failed;]DONE"
+          exit 1
+        fi
+
+        #echo "Checking test result XML for failures..."
+        #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
+        #if [ -z "$TEST_XML" ]; then
+        #  echo "##vso[task.logissue type=error]No rccl-tests_output.xml file found"
+        #  echo "##vso[task.complete result=Failed;]DONE"
+        #  exit 1
+        #fi
+
+        #if grep -q 'failures="[^0]' "$TEST_XML"; then
+        #  echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
+        #  echo "##vso[task.complete result=Failed;]DONE"
+        #  exit 1
+        #else
+        #  echo "No test failures detected."
+        #fi
+  # Always print the Slurm job log, including after a failed run.
+  - task: Bash@3
+    displayName: Test Logs
+    condition: always()
+    inputs:
+      targetType: inline
+      script: |
+        cat rccl-tests-${TEST_JOB_ID}.out || echo "No log found"
+#  - task: PublishTestResults@2
+#    displayName: 'Publish Results'
+#    condition: succeededOrFailed()
+#    inputs:
+#      searchFolder: $(Pipeline.Workspace)
+#      testResultsFormat: JUnit
+#      testResultsFiles: '**/rccl-tests_output.xml'
@@ -0,0 +1,5 @@
+import pytest
+
+def test_HelloWorld():
+    greeting = "Hello, World!"
+    assert greeting == "Hello, World!"
@@ -0,0 +1,139 @@
+# Style file for MLSE Libraries based on the modified rocBLAS style
+
+# Common settings
+BasedOnStyle:  WebKit
+TabWidth:        4
+IndentWidth:     4
+UseTab:          Never
+ColumnLimit: 100
+UseCRLF: false
+
+# Other languages JavaScript, Proto
+---
+Language: Json
+DisableFormat: true
+
+---
+Language: Cpp
+
+# http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code
+# int formatted_code;
+# // clang-format off
+#     void    unformatted_code  ;
+# // clang-format on
+# void formatted_code_again;
+
+DisableFormat: false
+Standard: Cpp11
+
+AccessModifierOffset: -4
+AlignAfterOpenBracket: true
+AlignArrayOfStructures: Right
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: true
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BitFieldColonSpacing: Both
+# Configure each individual brace in BraceWrapping
+BreakBeforeBraces: Custom
+# Control of individual brace wrapping cases
+BraceWrapping:
+    AfterCaseLabel: true
+    AfterClass: true
+    AfterControlStatement: Always
+    AfterEnum: true
+    AfterExternBlock: false
+    AfterFunction: true
+    AfterNamespace: true
+    AfterStruct: true
+    AfterUnion: true
+    BeforeCatch: true
+    BeforeElse: true
+    BeforeLambdaBody: true
+    BeforeWhile: true
+    IndentBraces: false
+    SplitEmptyFunction: false
+    SplitEmptyRecord: false
+    SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: All
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+BreakStringLiterals: true
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: false
+DerivePointerAlignment: false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: Always
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros: []
+IfMacros: []
+IncludeBlocks: Preserve
+IndentAccessModifiers: false
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentExternBlock: NoIndent
+IndentPPDirectives: BeforeHash
+IndentWrappedFunctionNames: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+LambdaBodyIndentation: Signature
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PPIndentWidth: -1
+PackConstructorInitializers: NextLine
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+QualifierAlignment: Leave
+ReferenceAlignment: Pointer
+ReflowComments: false
+ShortNamespaceLines: 0
+SortIncludes: CaseSensitive
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: Never
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: Never
+SpacesInCStyleCastParentheses: false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+
+---
@@ -0,0 +1,9 @@
+* @ROCm/rccl-reviewers
+
+# Documentation files
+docs/ @ROCm/rocm-documentation
+*.md @ROCm/rocm-documentation
+*.rst @ROCm/rocm-documentation
+.readthedocs.yaml @ROCm/rocm-documentation
+
+src/include/api_trace.h @ROCm/ROCM-DevTools-Team
@@ -0,0 +1,23 @@
+## Details
+___Do not mention proprietary info or link to internal work items in this PR.___
+
+**Work item:** _"Internal", or link to GitHub issue (if applicable)._
+
+**What were the changes?**  
+_One sentence describing the work done._
+
+**Why were the changes made?**  
+_Explain the motivation behind the work. Provide any publicly-available historical context._
+
+**How was the outcome achieved?**  
+_Technical details behind the work. Explain any publicly-available hardware peculiarities._
+
+**Additional Documentation:**  
+_What else should the reviewer know?_
+
+## Approval Checklist
+___Do not approve until these items are satisfied.___
+- [ ] Verify the CHANGELOG has been updated, if
+  - there are any NCCL API version changes,
+  - any changes impact library users, and/or
+  - any changes impact any other ROCm library.
@@ -0,0 +1,17 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/docs/sphinx" # Location of package manifests
+    open-pull-requests-limit: 10
+    schedule:
+      interval: "daily"
+    labels:
+      - "dependencies"
+      - "ci:docs-only"
+    reviewers:
+      - "samjwu"
@@ -0,0 +1,134 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT 
+
+import fnmatch
+import json
+import os
+from pathlib import Path
+import subprocess
+import sys
+from typing import Iterable, Optional, Mapping
+
+def gha_set_output(vars: Mapping[str, str | Path]):
+    """Sets values in a step's output parameters.
+
+    This appends to the file located at the $GITHUB_OUTPUT environment variable.
+
+    See
+      * https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
+      * https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
+    """
+    print(f"Setting github output:\n{vars}")
+
+    step_output_file = os.getenv("GITHUB_OUTPUT")
+    if not step_output_file:
+        print("  Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
+        return
+
+    with open(step_output_file, "a") as f:
+        f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
+
+def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
+    """Returns the paths of modified files relative to the base reference."""
+    try:
+        return subprocess.run(
+            ["git", "diff", "--name-only", base_ref],
+            stdout=subprocess.PIPE,
+            check=True,
+            text=True,
+            timeout=60,
+        ).stdout.splitlines()
+    except TimeoutError:
+        print(
+            "Computing modified files timed out. Not using PR diff to determine"
+            " jobs to run.",
+            file=sys.stderr,
+        )
+        return None
+    
+GITHUB_WORKFLOWS_CI_PATTERNS = [
+    "therock*.yml",
+]
+
+
+def is_path_workflow_file_related_to_ci(path: str) -> bool:
+    return any(
+        fnmatch.fnmatch(path, ".github/workflows/" + pattern)
+        for pattern in GITHUB_WORKFLOWS_CI_PATTERNS
+    )
+    
+def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
+    if paths is None:
+        return False
+    return any(is_path_workflow_file_related_to_ci(p) for p in paths)
+
+# Paths matching any of these patterns are considered to have no influence over
+# build or test workflows so any related jobs can be skipped if all paths
+# modified by a commit/PR match a pattern in this list.
+SKIPPABLE_PATH_PATTERNS = [
+    "docs/*",
+    "*.gitignore",
+    "*.md",
+    "*LICENSE*",
+    "*NOTICES*",
+    '.github/CODEOWNERS',
+    '.github/*.md',
+    '.github/dependabot.yml',
+    '.azuredevops*',
+]
+
+def is_path_skippable(path: str) -> bool:
+    """Determines if a given relative path to a file matches any skippable patterns."""
+    return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS)
+
+def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if at least one path is not in the skippable set."""
+    if paths is None:
+        return False
+    return any(not is_path_skippable(p) for p in paths)
+
+def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
+    """Returns true if CI workflows should run given a list of modified paths."""
+
+    if paths is None:
+        print("No files were modified, skipping TheRock CI jobs")
+        return False
+
+    paths_set = set(paths)
+    github_workflows_paths = set(
+        [p for p in paths if p.startswith(".github/workflows")]
+    )
+    other_paths = paths_set - github_workflows_paths
+    
+    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
+    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)
+
+    print("should_ci_run_given_modified_paths findings:")
+    print(f"  contains_other_non_skippable_files: {contains_other_non_skippable_files}")
+
+    if related_to_ci:
+        print("Enabling build jobs since a related workflow file was modified")
+        return True
+    elif contains_other_non_skippable_files:
+        print("Enabling TheRock CI jobs since a non-skippable path was modified")
+        return True
+    else:
+        print(
+            "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
+        )
+        return False
+
+def main(args):
+    base_ref = args.get("base_ref")
+    modified_paths = get_modified_paths(base_ref)
+    print("modified_paths (max 200):", modified_paths[:200])
+    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
+    output = {
+        'enable_therock_ci': json.dumps(enable_jobs)
+    }
+    gha_set_output(output)
+
+if __name__ == "__main__":
+    args = {}
+    args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
+    main(args)
@@ -0,0 +1,147 @@
+name: TheRock CI Linux
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      extra_cmake_options:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  # Checks out TheRock at a pinned commit with rccl and rccl-tests next to it,
+  # configures and builds, then uploads the build results.
+  therock-build-linux:
+    name: Build Linux Packages
+    runs-on: azure-linux-scale-rocm
+    permissions:
+      id-token: write
+    container:
+      image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:1f1ce0ab151146c7f86ee4345be74c42d8ca83200d9d26843e8a71df01ecad4e
+      options: -v /runner/config:/home/awsconfig/
+    env:
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+      TEATIME_FORCE_INTERACTIVE: 0
+      AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
+      CACHE_DIR: ${{ github.workspace }}/.container-cache
+      # The ccache.conf will be written by setup_ccache.py before this gets used.
+      CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf
+    steps:
+      - name: Checkout TheRock repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
+
+      - name: Checkout rccl repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/rccl"
+          path: rccl
+
+      - name: Checkout rccl-tests repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/rccl-tests"
+          path: rccl-tests
+
+      - name: Install python deps
+        run: |
+          pip install -r requirements.txt
+
+      # safe.directory must be set before Runner Health Status
+      - name: Adjust git config
+        run: |
+          git config --global --add safe.directory "$PWD"
+          git config fetch.parallel 10
+
+      - name: Setup ccache
+        run: |
+          ./build_tools/setup_ccache.py \
+            --config-preset "github-oss-presubmit" \
+            --dir "$(dirname "$CCACHE_CONFIGPATH")" \
+            --local-path "$CACHE_DIR/ccache"
+
+      - name: Runner health status
+        run: |
+          ./build_tools/health_status.py
+
+      - name: Fetch sources
+        run: |
+          ./build_tools/fetch_sources.py --jobs 12
+
+      # The step-level env entries below are the inputs handed to
+      # build_configure.py.
+      - name: Configure Projects
+        env:
+          amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
+          package_version: ADHOCBUILD
+          extra_cmake_options: ${{ inputs.extra_cmake_options }}
+          BUILD_DIR: build
+        run: |
+          python3 build_tools/github_actions/build_configure.py
+
+      - name: Build therock-dist
+        run: cmake --build build
+
+      - name: Build therock-archives
+        run: cmake --build build --target therock-archives
+
+      - name: Report
+        #if: ${{ !cancelled() }}
+        run: |
+          echo "Full SDK du:"
+          echo "------------"
+          du -h -d 1 build/dist/rocm
+          echo "Artifact Archives:"
+          echo "------------------"
+          ls -lh build/artifacts/*.tar.xz
+          echo "Artifacts:"
+          echo "----------"
+          du -h -d 1 build/artifacts
+          echo "CCache Stats:"
+          echo "-------------"
+          ccache -s -v
+          tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log
+
+      - name: Configure AWS Credentials for non-forked repos
+        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
+        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
+
+      # Upload under the artifact_group input, which is the same value the test
+      # jobs in this workflow pass on as their artifact_group. Falls back to
+      # the GPU family when the input is not provided.
+      - name: Post Build Upload
+        if: always()
+        run: |
+          python3 build_tools/github_actions/post_build_upload.py \
+            --run-id ${{ github.run_id }} \
+            --artifact-group "${{ inputs.artifact_group || inputs.amdgpu_families }}" \
+            --build-dir build \
+            --upload
+
+  # Calls the multi-node test workflow once the build job has finished, passing
+  # this run's id as artifact_run_id. Only runs for the gfx950-dcgpu family.
+  therock-test-linux-multi-node:
+    name: "Test multi-node"
+    if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
+    permissions:
+      contents: read
+      id-token: write
+    needs: [therock-build-linux]
+    uses: ./.github/workflows/therock-test-packages-multi-node.yml
+    with:
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      artifact_group: ${{ inputs.artifact_group }}
+      test_runs_on: nova-linux-slurm-scale-runner
+      artifact_run_id: ${{ github.run_id }}
+
+  # Calls the single-node test workflow once the build job has finished,
+  # passing this run's id as artifact_run_id. Only runs for the gfx94X-dcgpu
+  # family.
+  therock-test-linux-single-node:
+    name: "Test single-node"
+    if: ${{ inputs.amdgpu_families == 'gfx94X-dcgpu' }}
+    needs: [therock-build-linux]
+    uses: ./.github/workflows/therock-test-packages-single-node.yml
+    with:
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      artifact_group: ${{ inputs.artifact_group }}
+      test_runs_on: linux-mi325-1gpu-ossci-rocm-frac
+      artifact_run_id: ${{ github.run_id }}
@@ -0,0 +1,91 @@
+name: TheRock CI for rccl
+
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    types:
+      - labeled
+      - opened
+      - synchronize
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  # Runs therock_configure_ci.py and exposes the "configure" step's
+  # enable_therock_ci output as a job output for downstream jobs.
+  setup:
+    runs-on: ubuntu-24.04
+    env:
+      # The commit being checked out is the merge commit for a PR. Its first
+      # parent will be the tip of the base branch.
+      BASE_REF: HEAD^
+    outputs:
+      enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
+    steps:
+      - name: "Checking out repository"
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          # We need the parent commit to do a diff
+          fetch-depth: 2
+
+      # The step id "configure" is what the job-level outputs mapping above
+      # refers to.
+      - name: "Configuring CI options"
+        id: configure
+        run: python .github/scripts/therock_configure_ci.py
+
+  # Calls the reusable Linux workflow once per GPU family in the matrix. Runs
+  # only when the setup job reported enable_therock_ci == 'true'.
+  therock-ci-linux:
+    name: TheRock CI Linux
+    needs: setup
+    if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
+    permissions:
+      contents: read
+      id-token: write
+    strategy:
+      fail-fast: false
+      matrix:
+        amdgpu_family: [gfx94X-dcgpu, gfx950-dcgpu]
+    uses: ./.github/workflows/therock-ci-linux.yml
+    secrets: inherit
+    with:
+      amdgpu_families: ${{ matrix.amdgpu_family }}
+      artifact_group: ${{ matrix.amdgpu_family }}
+      # Folded scalar: the lines below are joined with single spaces into one
+      # option string. Keep them free of trailing whitespace, which would
+      # otherwise become part of the value.
+      extra_cmake_options: >
+        -DTHEROCK_ENABLE_ALL=OFF
+        -DTHEROCK_BUILD_TESTING=ON
+        -DTHEROCK_BUNDLE_SYSDEPS=ON
+        -DTHEROCK_ENABLE_COMM_LIBS=ON
+        -DTHEROCK_ENABLE_ROCPROFV3=ON
+        -DTHEROCK_USE_EXTERNAL_RCCL=ON
+        -DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
+        -DTHEROCK_RCCL_SOURCE_DIR=./rccl
+        -DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
+        -DTHEROCK_ENABLE_MPI=ON
+
+  # Always runs after the jobs listed in `needs` and fails if any of them
+  # ended with a result other than "success" or "skipped".
+  therock_ci_summary:
+    name: TheRock CI Summary
+    if: always()
+    needs:
+      - setup
+      - therock-ci-linux
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Output failed jobs
+        env:
+          # Passed through the environment instead of being spliced into the
+          # script text, so quotes inside the JSON (for example in job outputs)
+          # cannot break or alter the shell command.
+          NEEDS_JSON: ${{ toJson(needs) }}
+        run: |
+          echo "${NEEDS_JSON}"
+          FAILED_JOBS="$(echo "${NEEDS_JSON}" \
+            | jq --raw-output \
+            'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
+          )"
+          if [[ "${FAILED_JOBS}" != "" ]]; then
+            echo "The following jobs failed: ${FAILED_JOBS}"
+            exit 1
+          fi
@@ -0,0 +1,96 @@
+name: TheRock Test Packages multi-node
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      test_runs_on:
+        type: string
+      artifact_run_id:
+        type: string
+  workflow_dispatch:
+    inputs:
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      test_runs_on:
+        type: string
+      artifact_run_id:
+        type: string
+        
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  # Sets up the test environment from a pinned TheRock checkout, runs the
+  # multi-node RCCL pytest suite under a Slurm allocation, then uploads the
+  # test report.
+  test_rccl_multi_node:
+    name: 'Test multi-node'
+    runs-on: ${{ inputs.test_runs_on }}
+    defaults:
+      run:
+        shell: bash
+    permissions:
+      contents: read
+      id-token: write
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
+      # NOTE(review): this path, and the /home/arravikum paths used in the test
+      # and upload steps below, point into one user's home directory on the
+      # runner; confirm they are meant to be hard-coded.
+      OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
+      THEROCK_BIN_DIR: "./build/bin"
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
+
+      # Local composite action taken from the TheRock checkout above.
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: "--rccl --tests"
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      # The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster.
+      # salloc will hold 4 nodes while the commands inside the block run. After the block completes, salloc automatically releases the nodes.
+      - name: Test gfx950
+        if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
+        run: |
+          salloc -N 4 -p meta64 -t 04:00:00 --exclusive bash -c "
+          source /home/arravikum/TheRock/.venv/bin/activate &&
+          cd /home/arravikum/cvs &&
+          python input/setup.py &&
+          pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
+              --cluster_file ./input/cluster.json \
+              --config_file ./input/mi350_config.json \
+              --log-file=/tmp/rccl_log.log \
+              --html=/home/arravikum/cvs/test_reports/ci_test_report.html \
+              --capture=tee-sys \
+              --self-contained-html"
+
+      - name: Configure AWS Credentials for non-forked repos
+        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
+        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
+
+      # Runs even when earlier steps failed. --report-path is the directory the
+      # pytest --html report above is written into.
+      # NOTE(review): --log-destination is hard-coded to gfx950-dcgpu while
+      # --amdgpu-family comes from the workflow input; confirm this is intended.
+      - name: Post test report upload
+        if: always()
+        working-directory: ${{ github.workspace }}
+        run: |
+          export PYTHONPATH="${PYTHONPATH}:${{ github.workspace }}/build_tools"
+          python3 build_tools/github_actions/upload_test_report_script.py \
+            --run-id "${{ github.run_id }}" \
+            --amdgpu-family "${{ inputs.amdgpu_families }}" \
+            --report-path "/home/arravikum/cvs/test_reports" \
+            --log-destination "/logs/gfx950-dcgpu" \
+            --index-file-name "index_rccl_test_report.html"
@@ -0,0 +1,74 @@
+name: TheRock Test Packages single-node
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      test_runs_on:
+        type: string
+      artifact_run_id:
+        type: string
+  workflow_dispatch:
+    inputs:
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      test_runs_on:
+        type: string
+      artifact_run_id:
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  # Sets up the test environment from a pinned TheRock checkout inside a
+  # container and runs the RCCL pytest script, excluding the correctness tests.
+  test_rccl_single_node:
+    name: 'Test single-node'
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
+      OUTPUT_ARTIFACTS_DIR: "./build"
+      THEROCK_BIN_DIR: "./build/bin"
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "ROCm/TheRock"
+          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
+
+      # Local composite action taken from the TheRock checkout above.
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: "--rccl --tests"
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Test
+        timeout-minutes: 15
+        # Currently, TheRock CI in RCCL always builds with MPI support enabled, which causes the
+        # RCCL correctness tests to fail on the mi325 runners, which don't have MPI pre-installed.
+        # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
+        run: |
+          pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
+            --log-cli-level=info \
+            -k "not test_rccl_correctness_tests"
@@ -0,0 +1,8 @@
+# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+*.gcov
+/coverage/
+build/
+ext/
+
+# Visual Studio Code
+.vscode
@@ -0,0 +1,10 @@
+[submodule "ext-src/mscclpp"]
+	path = ext-src/mscclpp
+	url = https://github.com/microsoft/mscclpp.git
+	ignore = dirty
+	shallow = true
+[submodule "ext-src/json"]
+	path = ext-src/json
+	url = https://github.com/nlohmann/json.git
+	ignore = dirty
+	shallow = true
@@ -0,0 +1,18 @@
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+version: 2
+
+# Build environment for the documentation.
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
+
+# Sphinx configuration file.
+sphinx:
+  configuration: docs/conf.py
+
+# Additional output formats.
+formats:
+  - htmlzip
+  - pdf
+  - epub
+
+# Python requirements to install for the docs build.
+python:
+  install:
+    - requirements: docs/sphinx/requirements.txt
@@ -0,0 +1,370 @@
+# Changelog for RCCL
+
+Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
+
+## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
+
+### Changed
+
+* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
+* Disabled `reduceCopyPacks` pipelining for `gfx950`.
+
+## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
+
+### Changed
+* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes.
+
+### Resolved Issues
+
+* Fixed crash when using the librccl-profiler plugin with the all-to-all collective after the 2.27 update.
+
+## RCCL 2.27.7 for ROCm 7.1.0
+
+### Added
+* Added `RCCL_IB_QPS_PER_P2P` to set the number of QPs per connection for P2P operations. When set (≥1), P2P operations (Send/Recv) use `RCCL_IB_QPS_PER_P2P`, while other collective operations continue to use `NCCL_IB_QPS_PER_CONNECTION`. When not set, `NCCL_IB_QPS_PER_CONNECTION` applies to all operations.
+* Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks.
+* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
+* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
+* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold.
+
+
+### Changed
+
+* The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script.
+* Compatibility with NCCL 2.27.7.
+
+### Optimized
+* Enabled and optimized batched P2P operations to improve small message performance for AllToAll and AllGather.
+* Optimized channel count selection to improve efficiency for small to medium message sizes in ReduceScatter.
+* Changed code inlining to improve latency for small message sizes for AllReduce, AllGather, and ReduceScatter.
+
+### Known issues
+* Symmetric memory kernels are currently disabled due to ongoing CUMEM enablement work.
+* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
+
+## RCCL 2.26.6 for ROCm 7.0.0
+
+### Resolved issues
+
+* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
+* Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.
+* Fixed the suboptimal algorithmic switching point for AllReduce on MI300X.
+* Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes.
+* Fixed broken functionality within the LL protocol on gfx950 by disabling inlining of LLGenericOp kernels.
+
+### Added
+
+* Added new GPU target `gfx950`.
+* Added support for `unroll=1` in device-code generation to improve performance.
+* Set a default of 112 channels for a single node with `8 * gfx950`.
+* Enabled LL128 protocol on `gfx950`.
+* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
+* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS`, or `NCCL_MAX_NCHANNELS` are set.
+* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
+* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
+* Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled.
+* Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap `bf16` arithmetic and bridge the gap between `fp32` performance and `bf16` for both `gfx942` and `gfx950`. Pipelining has been made tunable via `rcclSetPipelining`, similar to algorithms/protocols so that regression is avoided in certain message sizes.
+* Added a direct allgather algorithm. This is enabled by default for multi-node if there are 16 nodes or fewer. The message size threshold is 4MB.
+* Added `RCCL_OVERRIDE_PROTO` and `RCCL_OVERRIDE_ALGO` to allow direct replacement of protocol and algorithm choices. Unlike `NCCL_PROTO` and `NCCL_ALGO`, which re-run the model across enabled combinations and may not guarantee the intended override, these new options enforce the specified selections explicitly.
+
+### Changed
+
+* Compatibility with NCCL 2.23.4.
+* Compatibility with NCCL 2.24.3.
+* Compatibility with NCCL 2.25.1.
+* Compatibility with NCCL 2.26.6.
+
+### Optimized
+* Improved the performance of the `FP8` Sum operation by upcasting to `FP16`.
+
+### Known Issues
+* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
+
+## RCCL 2.22.3 for ROCm 6.4.2
+
+### Added
+
+* Added support for the LL128 protocol on gfx942.
+
+## RCCL 2.22.3 for ROCm 6.4.1
+
+### Resolved issues
+
+* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode.
+* Fixed IntraNet performance.
+* Fixed an issue where, in rare circumstances, the application could stop responding due to a proxy thread synchronization issue.
+
+### Known issues
+
+* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault.
+  The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
+* Within the RCCL-UnitTests test suite, failures occur in tests ending with the `ManagedMem` and `ManagedMemGraph` suffixes. These failures only affect the test results and do not affect the RCCL component itself. This issue will be resolved in the next major release.
+
+## RCCL 2.22.3 for ROCm 6.4.0
+
+### Added
+
+* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters.
+* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`.
+* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`).
+
+### Changed
+
+* Compatibility with NCCL 2.22.3
+* Added support for the rail-optimized tree algorithm for the MI300 series. This feature requires the use of all eight GPUs within
+  each node. It limits NIC traffic to use only GPUs of the same index across nodes and should not impact performance
+  on non-rail-optimized network topologies. The original method of building trees can be enabled by setting the
+  environment variable `RCCL_DISABLE_RAIL_TREES=1`.
+* Additional debug information about how the trees are built can be logged to the GRAPH logging subsys by setting
+  `RCCL_OUTPUT_TREES=1`.
+* Added documentation about the NPS4 and CPX partition modes performance benefits on the MI300X.
+
+## RCCL 2.21.5 for ROCm 6.3.1
+
+### Added
+
+### Changed
+
+* Enhanced user documentation
+
+### Resolved issues
+
+* Corrected user help strings in `install.sh`
+
+## RCCL 2.21.5 for ROCm 6.3.0
+
+### Added
+
+* MSCCL++ integration for AllReduce and AllGather on gfx942
+* Performance collection to rccl_replayer
+* Tuner Plugin example for MI300
+* Tuning table for large number of nodes
+* Support for amdclang++
+* Allow NIC ID remapping using `NCCL_RINGS_REMAP` environment variable
+
+### Changed
+
+* Compatibility with NCCL 2.21.5
+* Increased channel count for MI300X multi-node
+* Enabled MSCCL for single-process multi-threaded contexts
+* Enabled gfx12
+* Enabled CPX mode for MI300X
+* Enabled tracing with rocprof
+* Improved version reporting
+* Enabled GDRDMA for Linux kernel 6.4.0+
+
+### Resolved issues
+
+* Fixed model matching with PXN enable
+
+## RCCL 2.20.5 for ROCm 6.2.1
+### Fixed
+- GDR support flag now set with DMABUF
+### Known issues
+- On systems running Linux kernel 6.8.0, such as Ubuntu 24.04, Direct Memory Access (DMA) transfers between the GPU and NIC are disabled, which impacts multi-node RCCL performance.
+  - This issue was reproduced with RCCL 2.20.5 (ROCm 6.2.0 and 6.2.1) on systems with Broadcom Thor-2 NICs and affects other systems with RoCE networks using Linux 6.8.0 or newer.
+  - Older RCCL versions are also impacted.
+  - This issue will be addressed in a future ROCm release.
+
+## RCCL 2.20.5 for ROCm 6.2.0
+### Changed
+- Compatibility with NCCL 2.20.5
+- Compatibility with NCCL 2.19.4
+- Performance tuning for some collective operations on MI300
+- Enabled NVTX code in RCCL
+- Replaced rccl_bfloat16 with hip_bfloat16
+- NPKit updates:
+  - Warm-up iteration removal is no longer performed by default; it must now be opted into
+  - Doubled the size of buffers to accommodate for more channels
+- Modified rings to be rail-optimized topology friendly
+- Replaced ROCmSoftwarePlatform links with ROCm links
+### Added
+- Support for fp8 and rccl_bfloat8
+- Support for using HIP contiguous memory
+- Implemented ROC-TX for host-side profiling
+- Enabled static build
+- Added new rome model
+- Added fp16 and fp8 cases to unit tests
+- New unit test for main kernel stack size
+- New -n option for topo_expl to override # of nodes
+- Improved debug messages of memory allocations
+### Fixed
+- Bug when configuring RCCL for only LL128 protocol
+- Scratch memory allocation after API change for MSCCL
+
+## RCCL 2.18.6 for ROCm 6.1.0
+### Changed
+- Compatibility with NCCL 2.18.6
+
+## RCCL 2.18.3 for ROCm 6.0.0
+### Changed
+- Compatibility with NCCL 2.18.3
+
+## RCCL 2.17.1-1 for ROCm 5.7.0
+### Changed
+- Compatibility with NCCL 2.17.1-1
+- Performance tuning for some collective operations
+### Added
+- Minor improvements to MSCCL codepath
+- NCCL_NCHANNELS_PER_PEER support
+- Improved compilation performance
+- Support for gfx94x
+### Fixed
+- Potential race-condition during ncclSocketClose()
+
+## RCCL 2.16.2 for ROCm 5.6.0
+### Changed
+- Compatibility with NCCL 2.16.2
+### Fixed
+- Remove workaround and use indirect function call
+
+## RCCL 2.15.5 for ROCm 5.5.0
+### Changed
+- Compatibility with NCCL 2.15.5
+- Unit test executable renamed to rccl-UnitTests
+### Added
+- HW-topology aware binary tree implementation
+- Experimental support for MSCCL
+- New unit tests for hipGraph support
+- NPKit integration
+### Fixed
+- rocm-smi ID conversion
+- Support for HIP_VISIBLE_DEVICES for unit tests
+- Support for p2p transfers to non (HIP) visible devices
+### Removed
+- Removed TransferBench from tools.  Exists in standalone repo: https://github.com/ROCm/TransferBench
+
+## RCCL-2.13.4 for ROCm 5.4.0
+### Changed
+- Compatibility with NCCL 2.13.4
+- Improvements to RCCL when running with hipGraphs
+- RCCL_ENABLE_HIPGRAPH environment variable is no longer necessary to enable hipGraph support
+- Minor latency improvements
+### Fixed
+- Resolved potential memory access error due to asynchronous memset
+
+## RCCL-2.12.10 for ROCm 5.3.0
+### Changed
+- Improvements to LL128 algorithms
+### Added
+- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
+- Integrating with NPKit (https://github.com/microsoft/NPKit) profiling code
+
+## RCCL-2.12.10 for ROCm 5.2.3
+### Added
+- Compatibility with NCCL 2.12.10
+- Packages for test and benchmark executables on all supported OSes using CPack.
+- Adding custom signal handler - opt-in with RCCL_ENABLE_SIGNALHANDLER=1
+  - Additional details provided if Binary File Descriptor library (BFD) is pre-installed
+- Adding support for reusing ports in NET/IB channels
+  - Opt-in with NCCL_IB_SOCK_CLIENT_PORT_REUSE=1 and NCCL_IB_SOCK_SERVER_PORT_REUSE=1
+  - When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
+    (e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
+    to resolve the massive port usage issue
+  - Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
+### Removed
+- Removed experimental clique-based kernels
+
+## RCCL-2.11.4 for ROCm 5.2.0
+### Changed
+- Unit testing framework rework
+- Minor bug fixes
+### Known issues
+- Managed memory is not currently supported for clique-based kernels
+
+## RCCL-2.11.4 for ROCm 5.1.0
+### Added
+- Compatibility with NCCL 2.11.4
+### Known issues
+- Managed memory is not currently supported for clique-based kernels
+
+## RCCL-2.10.3 for ROCm 5.0.0
+### Added
+- Compatibility with NCCL 2.10.3
+### Known issues
+- Managed memory is not currently supported for clique-based kernels
+
+## RCCL-2.9.9 for ROCm 4.5.0
+### Changed
+- Packaging split into a runtime package called rccl and a development package called rccl-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release.
+### Added
+- Compatibility with NCCL 2.9.9
+### Known issues
+- Managed memory is not currently supported for clique-based kernels
+
+## [RCCL-2.8.4 for ROCm 4.3.0]
+### Added
+- Ability to select the number of channels to use for clique-based all reduce (RCCL_CLIQUE_ALLREDUCE_NCHANNELS).  This can be adjusted to tune for performance when computation kernels are being executed in parallel.
+### Optimizations
+- Additional tuning for clique-based kernel AllReduce performance (still requires opt in with RCCL_ENABLE_CLIQUE=1)
+- Modification of default values for number of channels / byte limits for clique-based all reduce based on device architecture
+### Changed
+- Replaced RCCL_FORCE_ENABLE_CLIQUE with RCCL_CLIQUE_IGNORE_TOPO
+- Clique-based kernels can now be enabled on topologies where all active GPUs are XGMI-connected
+- Topologies not normally supported by clique-based kernels require RCCL_CLIQUE_IGNORE_TOPO=1
+### Fixed
+- Install script '-r' flag invoked alone no longer incorrectly deletes any existing builds.
+### Known issues
+- Managed memory is not currently supported for clique-based kernels
+
+## [RCCL-2.8.4 for ROCm 4.2.0]
+### Added
+- Compatibility with NCCL 2.8.4
+
+### Optimizations
+- Additional tuning for clique-based kernels
+- Enabling GPU direct RDMA read from GPU
+- Fixing potential memory leak issue when re-creating multiple communicators within same process
+- Improved topology detection
+### Known issues
+- None
+
+## [RCCL-2.7.8 for ROCm 4.1.0]
+### Added
+- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
+- Clique-based kernels may offer better performance for smaller input sizes
+- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
+### Optimizations
+- Performance improvements for Rome-based systems
+### Known issues
+- Clique-based kernels are currently experimental and have not been fully tested on all topologies.  By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
+- Clique-based kernels may hang if there are differences between environment variables set across ranks.
+- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
+
+
+## [RCCL-2.7.8 for ROCm 3.9.0]
+### Added
+- Adding support for alltoallv RCCL kernel
+### Optimizations
+- Modifications to topology based on XGMI links
+### Known issues
+- None
+
+## [RCCL-2.7.6 for ROCm 3.8.0]
+### Added
+- Support for static library builds
+### Known issues
+- None
+
+## [RCCL-2.7.6 for ROCm 3.7.0]
+### Added
+- Updated to RCCL API version of 2.7.6
+- Added gather, scatter and all-to-all collectives
+
+## [RCCL-2.7.0 for ROCm 3.6.0]
+### Added
+- Updated to RCCL API version of 2.6.4
+
+## [RCCL-2.7.0 for ROCm 3.5.0]
+### Added
+- Compatibility with NCCL 2.6
+- Network interface improvements with API v3
+### Optimizations
+- Fixing issues and build time improvements for hip-clang
+- Network topology detection
+- Improved CPU type detection
+- Infiniband adaptive routing support
+### Changed
+- Switched to hip-clang as default compiler
+### Deprecated
+- Deprecated hcc build
@@ -0,0 +1,84 @@
+arrayIndexThenCheck:src/bootstrap.cc:304
+arrayIndexThenCheck:src/debug.cc:88
+arrayIndexThenCheck:src/graph/search.cc:844
+arrayIndexThenCheck:src/graph/search.cc:916
+arrayIndexThenCheck:src/graph/search.cc:927
+clarifyCalculation:src/graph/topo.cc:702
+clarifyCalculation:src/graph/topo.cc:720
+clarifyCondition:src/enqueue.cc:416
+funcArgNamesDifferent:src/graph/topo.cc:135
+funcArgNamesDifferent:src/graph/topo.h:144
+nullPointerRedundantCheck:src/misc/utils.cc:102
+nullPointerRedundantCheck:src/misc/utils.cc:109
+nullPointerRedundantCheck:src/proxy.cc:143
+nullPointerRedundantCheck:src/proxy.cc:144
+nullPointerRedundantCheck:src/proxy.cc:147
+nullPointerRedundantCheck:src/proxy.cc:148
+nullPointerRedundantCheck:src/proxy.cc:149
+nullPointerRedundantCheck:src/proxy.cc:150
+nullPointerRedundantCheck:src/proxy.cc:151
+nullPointerRedundantCheck:src/proxy.cc:155
+nullPointerRedundantCheck:src/proxy.cc:159
+nullPointerRedundantCheck:src/proxy.cc:160
+nullPointerRedundantCheck:src/proxy.cc:161
+nullPointerRedundantCheck:src/proxy.cc:163
+nullPointerRedundantCheck:src/proxy.cc:165
+nullPointerRedundantCheck:src/proxy.cc:167
+nullPointerRedundantCheck:src/proxy.cc:168
+nullPointerRedundantCheck:src/proxy.cc:340
+nullPointerRedundantCheck:src/proxy.cc:342
+nullPointerRedundantCheck:src/proxy.cc:93
+nullPointerRedundantCheck:src/proxy.cc:94
+redundantAssignment:src/proxy.cc:161
+redundantAssignment:src/proxy.cc:163
+redundantCopy:src/graph/rings.cc:16
+redundantCopy:src/graph/rings.cc:17
+terminateStrncpy:src/misc/utils.cc:99
+terminateStrncpy:src/transport/net_socket.cc:245
+unreachableCode:src/transport/net.cc:555
+unreadVariable:src/graph/tuning.cc:109
+unreadVariable:src/graph/tuning.cc:110
+unreadVariable:src/graph/tuning.cc:113
+unusedFunction:src/graph/topo.cc:37
+unusedFunction:src/graph/topo.cc:836
+unusedFunction:src/misc/gdrwrap.cc:109
+unusedFunction:src/misc/gdrwrap.cc:117
+unusedFunction:src/misc/gdrwrap.cc:130
+unusedFunction:src/misc/gdrwrap.cc:144
+unusedFunction:src/misc/gdrwrap.cc:158
+unusedFunction:src/misc/gdrwrap.cc:172
+unusedFunction:src/misc/gdrwrap.cc:186
+unusedFunction:src/misc/gdrwrap.cc:200
+unusedFunction:src/misc/gdrwrap.cc:209
+unusedFunction:src/misc/gdrwrap.cc:218
+unusedFunction:src/misc/gdrwrap.cc:232
+unusedFunction:src/misc/gdrwrap.cc:52
+unusedFunction:src/misc/ibvwrap.cc:203
+unusedFunction:src/misc/ibvwrap.cc:239
+unusedFunction:src/misc/ibvwrap.cc:255
+unusedFunction:src/misc/nvmlwrap.cc:112
+unusedFunction:src/misc/nvmlwrap_stub.cc:31
+unusedFunction:src/misc/nvmlwrap_stub.cc:35
+unusedFunction:src/transport.cc:71
+unusedLabel:src/bootstrap.cc:349
+unusedLabel:src/clique/ShmObject.h:112
+unusedLabel:src/clique/ShmObject.h:204
+unusedLabel:src/enqueue.cc:108
+unusedLabel:src/enqueue.cc:1093
+unusedLabel:src/enqueue.cc:989
+unusedLabel:src/init.cc:1189
+unusedLabel:src/init.cc:1240
+unusedLabel:src/init.cc:1267
+unusedLabel:src/transport.cc:238
+unusedStructMember:src/graph/xml.cc:410
+unusedStructMember:src/graph/xml.cc:411
+unusedStructMember:src/graph/xml.cc:412
+unusedStructMember:src/graph/xml.cc:428
+unusedStructMember:src/graph/xml.cc:431
+unusedStructMember:src/graph/xml.cc:432
+unusedStructMember:src/graph/xml.cc:435
+unusedStructMember:src/graph/xml.cc:437
+variableScope:src/graph/search.cc:494
+variableScope:src/init.cc:240
+variableScope:src/transport/net_ib.cc:117
+variableScope:src/transport/net_socket.cc:431
@@ -0,0 +1,46 @@
+
+Attributions
+
+Contains contributions from NVIDIA.
+
+Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+*  Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+*  Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+*  Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+https://github.com/NVIDIA/NVTX
+
+for more information and license details.
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# Top-level driver: forwards work to the src/ and pkg/ sub-makes.
+# NOTE(review): 'all' is declared phony but no 'all' rule is defined in this
+# file, while default/install/lic are not declared phony -- confirm intent.
+.PHONY : all clean
+
+# 'default' is the first ordinary target, so a bare `make` builds src.build.
+default : src.build
+install : src.install
+# Build output directory; overridable from the command line or environment.
+BUILDDIR ?= $(abspath ./build)
+# Absolute form of BUILDDIR, passed down to the sub-makes below.
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := src pkg
+# Expands to: clean: src.clean pkg.clean
+clean: ${TARGETS:%=%.clean}
+# Adds src.build as a prerequisite of test.build; no recipe for test.build is
+# defined in this file.
+test.build: src.build
+# License files that 'lic' copies into BUILDDIR.
+LICENSE_FILES := LICENSE.txt
+LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
+lic: $(LICENSE_TARGETS)
+
+# Copy a top-level .txt file into BUILDDIR, creating the directory if needed.
+${BUILDDIR}/%.txt: %.txt
+	@printf "Copying    %-35s > %s\n" $< $@
+	mkdir -p ${BUILDDIR}
+	cp $< $@
+
+# src.<goal> runs <goal> in src/ with the absolute build directory.
+src.%:
+	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
+
+# pkg.<goal> runs <goal> in pkg/ with the absolute build directory.
+pkg.%:
+	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
+
+# These packaging goals additionally require the license copy to exist first.
+pkg.debian.prep: lic
+pkg.txz.prep: lic
@@ -0,0 +1,128 @@
+Notices and Licenses file
+_______________________________________________________________
+
+Dependencies on nvidia-nccl v2.27.3-1 (BSD3)
+
+Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
+
+_______________________________________________________________
+
+Dependencies on NPKit (MIT License)
+
+    Copyright (c) Microsoft Corporation.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
+
+_______________________________________________________________
+
+Dependencies on MSCCL++ (MIT License)
+
+    Copyright (c) Microsoft Corporation.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
+
+See:
+
+    https://github.com/microsoft/mscclpp
+
+for more information and license details.
+
+_______________________________________________________________
+
+Dependencies on Latency Profiler (MIT License)
+
+    Copyright (c) Meta Platforms, Inc. and affiliates.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.
+
+See:
+
+    src/include/latency_profiler
+    src/misc/latency_profiler
@@ -0,0 +1,147 @@
+# RCCL
+
+ROCm Communication Collectives Library
+
+[![RCCL](https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/status%2Frccl?repoName=ROCm%2Frccl&branchName=develop)](https://dev.azure.com/ROCm-CI/ROCm-CI/_build/latest?definitionId=107&repoName=ROCm%2Frccl&branchName=develop)
+[![TheRock CI](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml/badge.svg?branch=develop&event=push)](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml)
+
+> **Note:** The published documentation is available at [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/index.html) in an organized easy-to-read format that includes a table of contents and search functionality. The documentation source files reside in the [rccl/docs](https://github.com/ROCm/rccl/tree/develop/docs) folder in this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
+
+## Introduction
+
+RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations.  It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+
+The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
+
+## Requirements
+
+1. ROCm supported GPUs
+2. ROCm stack installed on the system (HIP runtime & HIP-Clang)
+
+## Quickstart RCCL Build
+
+RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
+For ROCm installation instructions, see https://github.com/ROCm/ROCm.
+
+The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL.
+
+### To build the library using the install script:
+
+```shell
+./install.sh
+```
+
+For more info on build options/flags when using the install script, use `./install.sh --help`
+```shell
+./install.sh --help
+RCCL build & installation helper script
+ Options:
+       --address-sanitizer     Build with address sanitizer enabled
+    -c|--enable-code-coverage  Enable code coverage
+    -d|--dependencies          Install RCCL dependencies
+       --debug                 Build debug library
+       --enable_backtrace      Build with custom backtrace support
+       --disable-colltrace     Build without collective trace
+       --disable-msccl-kernel  Build without MSCCL kernels
+       --enable-mscclpp        Build with MSCCL++ support
+       --enable-mscclpp-clip   Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
+       --disable-roctx         Build without ROCTX logging
+    -f|--fast                  Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
+    -h|--help                  Prints this help message
+    -i|--install               Install RCCL library (see --prefix argument below)
+    -j|--jobs                  Specify how many parallel compilation jobs to run ($nproc by default)
+    -l|--local_gpu_only        Only compile for local GPU architecture
+       --amdgpu_targets        Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
+       --no_clean              Don't delete files if they already exist
+       --npkit-enable          Compile with npkit enabled
+       --log-trace             Build with log trace enabled (i.e. NCCL_DEBUG=TRACE)
+       --openmp-test-enable    Enable OpenMP in rccl unit tests
+    -p|--package_build         Build RCCL package
+       --prefix                Specify custom directory to install RCCL to (default: `/opt/rocm`)
+       --run_tests_all         Run all rccl unit tests (must be built already)
+    -r|--run_tests_quick       Run small subset of rccl unit tests (must be built already)
+       --static                Build RCCL as a static library instead of shared library
+    -t|--tests_build           Build rccl unit tests, but do not run
+       --time-trace            Plot the build time of RCCL (requires `ninja-build` package installed on the system)
+       --verbose               Show compile commands
+```
+
+By default, RCCL builds for all GPU targets defined in `DEFAULT_GPUS` in `CMakeLists.txt`. To target specific GPU(s), and potentially reduce build time, use `--amdgpu_targets` as a `;` separated string listing GPU(s) to target.
+
+## Manual build
+
+### To build the library using CMake:
+
+```shell
+$ git clone --recursive https://github.com/ROCm/rccl.git
+$ cd rccl
+$ mkdir build
+$ cd build
+$ cmake ..
+$ make -j 16      # Or some other suitable number of parallel jobs
+```
+If you have already cloned the repository, you can check out the external submodules manually.
+```shell
+$ git submodule update --init --recursive --depth=1
+```
+You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example:
+```shell
+$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release ..
+```
+Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
+
+### To build and install the RCCL package:
+
+Assuming you have already cloned this repository and built the library as shown in the previous section:
+
+```shell
+$ cd rccl/build
+$ make package
+$ sudo dpkg -i *.deb
+```
+
+RCCL package install requires sudo/root access because it installs under `/opt/rocm/`. This is an optional step as RCCL can instead be used directly by including the path containing `librccl.so`.
+
+## Docker build
+
+Refer to [docker/README.md](docker/README.md "docker/README.md")
+
+## Tests
+
+There are rccl unit tests implemented with the Googletest framework in RCCL.  The rccl unit tests require Googletest 1.10 or higher to build and execute properly (installed with the -d option to install.sh).
+To invoke the rccl unit tests, go to the build folder, then the test subfolder, and execute the appropriate rccl unit test executable(s).
+
+rccl unit test names are now of the format:
+
+    CollectiveCall.[Type of test]
+
+Filtering of rccl unit tests should be done with environment variables and by passing the `--gtest_filter` command line flag, for example:
+
+```shell
+UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
+```
+
+will run only the AllReduce correctness tests with the bfloat16 datatype and the prod reduction operation. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests for more information on how to form more advanced filters.
+
+There are also other performance and error-checking tests for RCCL.  These are maintained separately at https://github.com/ROCm/rccl-tests.
+See the rccl-tests README for more information on how to build and run those tests.
+
+## Library and API Documentation
+
+Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation.
+
+### How to build documentation
+
+Run the steps below to build documentation locally.
+
+```shell
+cd docs
+pip3 install -r sphinx/requirements.txt
+python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
+```
+
+## Copyright
+
+All source code and accompanying documentation is copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+
+All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -0,0 +1,40 @@
+# MIT License
+#
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile,
+# while ROCMChecks gives a warning if this variable is modified manually without a target.
+
+# We now choose to disable ROCMChecks for this one case.
+
+# Switch consulted by the rocm_check_toolchain_var() override below; OFF means
+# the ROCm toolchain-variable check runs as usual.
+set(DISABLE_ROCM_CHECK OFF)
+
+# Override of rocm_check_toolchain_var(): forwards to the previous definition
+# unless DISABLE_ROCM_CHECK is set.
+# NOTE(review): this relies on CMake keeping an overridden command reachable
+# under a leading-underscore name (_rocm_check_toolchain_var), so this file
+# presumably has to be included after the original definition -- confirm.
+function(rocm_check_toolchain_var var access value list_file)
+  if(NOT DISABLE_ROCM_CHECK)
+    _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
+  endif()
+endfunction()
+
+# Override of CHECK_SYMBOL_EXISTS: runs the previous definition
+# (_check_symbol_exists) with the ROCm check switched off for its duration.
+# Being a macro, the set() calls act on the caller's scope. The flag is always
+# reset to OFF afterwards rather than restored to its prior value.
+macro(CHECK_SYMBOL_EXISTS)
+  set(DISABLE_ROCM_CHECK ON)
+  _check_symbol_exists(${ARGN})
+  set(DISABLE_ROCM_CHECK OFF)
+endmacro()
@@ -0,0 +1,192 @@
+# MIT License
+#
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Dependencies
+
+# HIP dependency is handled earlier in the project cmake file
+# when VerifyCompiler.cmake is included.
+
+# GIT
+
+# Test dependencies
+
+# For downloading, building, and installing required dependencies
+include(cmake/DownloadProject.cmake)
+
+include(FetchContent)
+
+# Look for an installed GoogleTest unless the caller asked for dependencies to
+# be downloaded and built (INSTALL_DEPENDENCIES).
+if(NOT INSTALL_DEPENDENCIES)
+    find_package(GTest 1.11)
+endif()
+
+# Download and build GoogleTest when tests are requested but no installed copy
+# was found, or unconditionally when INSTALL_DEPENDENCIES is set. CMake
+# evaluates AND/OR left to right, so the parentheses only spell out the
+# grouping that was already in effect.
+if((NOT GTest_FOUND AND BUILD_TESTS) OR INSTALL_DEPENDENCIES)
+    if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")
+        # hip-clang cannot compile googlebenchmark for some reason
+        set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++")
+    endif()
+
+    # unset(GTEST_INCLUDE_DIR CACHE)
+    # unset(GTEST_INCLUDE_DIRS CACHE)
+    message(STATUS "GTest not found. Downloading and building GTest.")
+    # Download, build and install googletest library
+    set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "")
+
+    download_project(PROJ                googletest
+                     GIT_REPOSITORY      https://github.com/google/googletest.git
+                     GIT_TAG             release-1.12.0
+                     INSTALL_DIR         ${GTEST_ROOT}
+                     CMAKE_ARGS          -DBUILD_GTEST=ON -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${COMPILER_OVERRIDE} -DBUILD_SHARED_LIBS=OFF
+                     LOG_DOWNLOAD        TRUE
+                     LOG_CONFIGURE       TRUE
+                     LOG_BUILD           TRUE
+                     LOG_INSTALL         TRUE
+                     UPDATE_DISCONNECTED TRUE
+    )
+    set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")
+    # NOTE(review): the install prefix above is .../gtest, yet this points at
+    # .../gmock/include -- confirm that directory is actually populated.
+    set(GMOCK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gmock/include CACHE PATH "")
+    # The static libraries land in lib/ or lib64/ depending on the platform.
+    if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib)
+        set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest_main.a CACHE PATH "")
+        set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock_main.a CACHE PATH "")
+    elseif(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64)
+        set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest_main.a CACHE PATH "")
+        set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock_main.a CACHE PATH "")
+    else()
+        # FATAL_ERROR aborts configuration, so nothing placed after it in this
+        # branch can run.
+        message(FATAL_ERROR "Cannot find gtest library installation path.")
+    endif()
+elseif(GTest_FOUND AND BUILD_TESTS)
+    # Installed GoogleTest: link through its imported targets.
+    set(GTEST_BOTH_LIBRARIES "GTest::gtest;GTest::gtest_main")
+    set(GMOCK_BOTH_LIBRARIES "GTest::gmock;GTest::gmock_main")
+endif()
+
+# Find or download/install rocm-cmake project
+set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
+find_package(ROCM 0.7.3 QUIET CONFIG PATHS /opt/rocm)
+if(NOT ROCM_FOUND)
+    # Fall back to fetching a source archive and installing it under
+    # ${PROJECT_EXTERN_DIR}/rocm-cmake.
+    set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
+    file(
+        DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
+        ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
+        STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
+    )
+    # STATUS is a two-element list; element 0 is the numeric error code.
+    list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
+    if(rocm_cmake_download_error_code)
+        message(FATAL_ERROR "Error: downloading "
+            "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
+            "error_code: ${rocm_cmake_download_error_code} "
+            "log: ${rocm_cmake_download_log} "
+        )
+    endif()
+
+    execute_process(
+        COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
+        WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
+        RESULT_VARIABLE rocm_cmake_unpack_error_code
+    )
+    # Stop before configuring/building if the archive could not be unpacked;
+    # the steps below operate on the unpacked source tree.
+    if(rocm_cmake_unpack_error_code)
+        message(FATAL_ERROR "Error: unpacking ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
+    endif()
+
+    # Configure in-source, then build the install target into the local prefix.
+    execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
+      WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
+    execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
+      WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
+
+    # REQUIRED turns a failed configure/build above into a configuration error.
+    find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
+endif()
+
+# Force the library install directory to "lib", overriding any cached value.
+set(CMAKE_INSTALL_LIBDIR lib CACHE STRING "Define install directory for libraries" FORCE)
+
+# Prefer an installed fmt; otherwise fetch a pinned revision with FetchContent.
+find_package(fmt QUIET)
+if(fmt_FOUND)
+    message(STATUS "Using system fmt")
+    # Report where the header-only target takes its headers from.
+    get_target_property(FMT_INCLUDE_DIRS fmt::fmt-header-only INTERFACE_INCLUDE_DIRECTORIES)
+    message(STATUS "fmt include directories: ${FMT_INCLUDE_DIRS}")
+else()
+    message(STATUS "fmt not found, fetching from source...")
+    # NOTE(review): presumably keeps the fetched fmt from adding its own
+    # install rules -- confirm against fmt's CMake options.
+    set(FMT_INSTALL OFF)
+    FetchContent_Declare(
+        fmt
+        GIT_REPOSITORY https://github.com/fmtlib/fmt
+        GIT_TAG        e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1
+    )
+    FetchContent_MakeAvailable(fmt)
+endif()
+
+# Find available local ROCM targets
+# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
+# rocm_local_targets(<out-var>)
+#   Runs rocm_agent_enumerator and stores the de-duplicated list of reported
+#   agents, excluding "gfx000", in <out-var> in the caller's scope. <out-var>
+#   is left as "NOTFOUND" when the tool is missing, exits non-zero, or reports
+#   nothing usable.
+function(rocm_local_targets VARIABLE)
+  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
+
+  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
+  if(_rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
+    return()
+  endif()
+
+  execute_process(
+    COMMAND "${_rocm_agent_enumerator}"
+    RESULT_VARIABLE _enumerator_status
+    OUTPUT_VARIABLE _agent_lines
+    ERROR_QUIET
+    )
+  if(NOT _enumerator_status EQUAL 0)
+    return()
+  endif()
+
+  # One agent per output line: turn the output into a CMake list.
+  string(REPLACE "\n" ";" _agent_lines "${_agent_lines}")
+  unset(_local_targets)
+  foreach(_agent IN LISTS _agent_lines)
+    if(NOT _agent STREQUAL "gfx000")
+      list(APPEND _local_targets "${_agent}")
+    endif()
+  endforeach()
+
+  if(_local_targets)
+    list(REMOVE_DUPLICATES _local_targets)
+    set(${VARIABLE} "${_local_targets}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# add_file_unique(<list-var> <file>)
+#   Checks whether any entry of the list named by <list-var> has the same file
+#   name (directory stripped) as <file>. If so, sets HIP_FILE in the caller's
+#   scope to "<dir>/<name-without-ext>_tmp<ext>" for <file>. The list itself is
+#   not modified, and HIP_FILE is left untouched when there is no clash.
+# NOTE: This is due to compiler bug '--save-temps' and can be removed when a fix is available
+function(add_file_unique FILE_LIST FILE)
+  get_filename_component(FILE_NAME "${FILE}" NAME)
+
+  # Iterate over whatever is in the list so far
+  foreach(curr_file IN LISTS ${FILE_LIST})
+    get_filename_component(curr_file_name "${curr_file}" NAME)
+
+    # Check if duplicate. Operands are quoted so empty values or paths with
+    # spaces cannot break the comparison.
+    if("${FILE_NAME}" STREQUAL "${curr_file_name}")
+      get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
+      get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
+      get_filename_component(FILE_EXT "${FILE}" EXT)
+
+      # Construct a new file name by adding _tmp
+      set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)
+    endif()
+  endforeach()
+endfunction()
+
+include(ROCMSetupVersion)
+include(ROCMCreatePackage)
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMInstallSymlinks)
+include(ROCMCheckTargetIds)
+include(ROCMClients)
+include(ROCMHeaderWrapper)
@@ -0,0 +1,14 @@
+# Distributed under the OSI-approved MIT License.  See accompanying
+# file LICENSE or https://github.com/Crascit/DownloadProject for details.
+
+# CMake 4.0 rejects policy versions older than 3.5, so a bare 2.8.2 minimum
+# makes this sub-project fail to configure there. The <min>...<max> form keeps
+# 2.8.2 as the real minimum while satisfying newer CMake releases.
+cmake_minimum_required(VERSION 2.8.2...3.5)
+
+# No languages are needed: this sub-project only drives ExternalProject.
+# NOTE(review): the DL_ARGS_* values are presumably substituted when
+# DownloadProject.cmake configures this template -- confirm there.
+project(${DL_ARGS_PROJ}-download NONE)
+
+include(ExternalProject)
+ExternalProject_Add(${DL_ARGS_PROJ}-download
+                    ${DL_ARGS_UNPARSED_ARGUMENTS}
+                    SOURCE_DIR          "${DL_ARGS_SOURCE_DIR}"
+                    BUILD_IN_SOURCE     TRUE
+                    TEST_COMMAND        ""
+)
@@ -0,0 +1,170 @@
+# Distributed under the OSI-approved MIT License.  See accompanying
+# file LICENSE or https://github.com/Crascit/DownloadProject for details.
+#
+# MODULE:   DownloadProject
+#
+# PROVIDES:
+#   download_project( PROJ projectName
+#                    [PREFIX prefixDir]
+#                    [DOWNLOAD_DIR downloadDir]
+#                    [SOURCE_DIR srcDir]
+#                    [BINARY_DIR binDir]
+#                    [QUIET]
+#                    ...
+#   )
+#
+#       Provides the ability to download and unpack a tarball, zip file, git repository,
+#       etc. at configure time (i.e. when the cmake command is run). How the downloaded
+#       and unpacked contents are used is up to the caller, but the motivating case is
+#       to download source code which can then be included directly in the build with
+#       add_subdirectory() after the call to download_project(). Source and build
+#       directories are set up with this in mind.
+#
+#       The PROJ argument is required. The projectName value will be used to construct
+#       the following variables upon exit (obviously replace projectName with its actual
+#       value):
+#
+#           projectName_SOURCE_DIR
+#           projectName_BINARY_DIR
+#
+#       The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically
+#       need to be provided. They can be specified if you want the downloaded source
+#       and build directories to be located in a specific place. The contents of
+#       projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the
+#       locations used whether you provide SOURCE_DIR/BINARY_DIR or not.
+#
+#       The DOWNLOAD_DIR argument does not normally need to be set. It controls the
+#       location of the temporary CMake build used to perform the download.
+#
+#       The PREFIX argument can be provided to change the base location of the default
+#       values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments
+#       are provided, then PREFIX will have no effect. The default value for PREFIX is
+#       CMAKE_BINARY_DIR.
+#
+#       The QUIET option can be given if you do not want to show the output associated
+#       with downloading the specified project.
+#
+#       In addition to the above, any other options are passed through unmodified to
+#       ExternalProject_Add() to perform the actual download, patch and update steps.
+#
+#       Only those ExternalProject_Add() arguments which relate to downloading, patching
+#       and updating of the project sources are intended to be used. Also note that at
+#       least one set of download-related arguments is required.
+#
+#       If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
+#       prevent a check at the remote end for changes every time CMake is run
+#       after the first successful download. See the documentation of the ExternalProject
+#       module for more information. It is likely you will want to use this option if it
+#       is available to you. Note, however, that the ExternalProject implementation contains
+#       bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
+#       using the URL download method or when specifying a SOURCE_DIR with no download
+#       method. Fixes for these have been created, the last of which is scheduled for
+#       inclusion in CMake 3.8.0. Details can be found here:
+#
+#           https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
+#           https://gitlab.kitware.com/cmake/cmake/issues/16428
+#
+#       If you experience build errors related to the update step, consider avoiding
+#       the use of UPDATE_DISCONNECTED.
+#
+# EXAMPLE USAGE:
+#
+#   include(DownloadProject)
+#   download_project(PROJ                googletest
+#                    GIT_REPOSITORY      https://github.com/google/googletest.git
+#                    GIT_TAG             master
+#                    UPDATE_DISCONNECTED 1
+#                    QUIET
+#   )
+#
+#   add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
+#
+#========================================================================================
+
+
+# Remember the directory of this module while it is being included.
+# download_project() uses it later to locate the CMakeLists template that
+# lives next to this file; inside the function CMAKE_CURRENT_LIST_DIR would
+# refer to the list file of the caller instead.
+set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")
+
+include(CMakeParseArguments)
+
+# download_project(PROJ <name> [PREFIX <dir>] [DOWNLOAD_DIR <dir>]
+#                  [SOURCE_DIR <dir>] [BINARY_DIR <dir>] [QUIET] ...)
+#
+# Generates a small helper project in DOWNLOAD_DIR from
+# DownloadProject.CMakeLists.cmake.in, then configures and builds it with
+# execute_process() at configure time.  All unrecognised arguments are passed
+# on to ExternalProject_Add() through the template.
+#
+# Sets <name>_SOURCE_DIR and <name>_BINARY_DIR in the caller's scope and stops
+# with a fatal error if PROJ is missing or either helper step fails.
+function(download_project)
+
+    set(options QUIET)
+    set(oneValueArgs
+        PROJ
+        PREFIX
+        DOWNLOAD_DIR
+        SOURCE_DIR
+        BINARY_DIR
+    )
+    set(multiValueArgs "")
+
+    cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # PROJ names the helper project and the result variables, so fail early
+    # and clearly instead of generating a project called "-download".
+    if (NOT DEFINED DL_ARGS_PROJ OR "x${DL_ARGS_PROJ}" STREQUAL "x")
+        message(FATAL_ERROR "download_project() requires the PROJ argument")
+    endif()
+
+    # Hide output if requested
+    if (DL_ARGS_QUIET)
+        set(OUTPUT_QUIET "OUTPUT_QUIET")
+    else()
+        unset(OUTPUT_QUIET)
+        message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
+    endif()
+
+    # Set up where we will put our temporary CMakeLists.txt file and also
+    # the base point below which the default source and binary dirs will be.
+    # The prefix must always be an absolute path.
+    if (NOT DL_ARGS_PREFIX)
+        set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
+    else()
+        get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
+                               BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    endif()
+    if (NOT DL_ARGS_DOWNLOAD_DIR)
+        set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
+    endif()
+
+    # Ensure the caller can know where to find the source and build directories
+    if (NOT DL_ARGS_SOURCE_DIR)
+        set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
+    endif()
+    if (NOT DL_ARGS_BINARY_DIR)
+        set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
+    endif()
+    set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
+    set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)
+
+    # The way that CLion manages multiple configurations, it causes a copy of
+    # the CMakeCache.txt to be copied across due to it not expecting there to
+    # be a project within a project.  This causes the hard-coded paths in the
+    # cache to be copied and builds to fail.  To mitigate this, we simply
+    # remove the cache if it exists before we configure the new project.  It
+    # is safe to do so because it will be re-generated.  Since this is only
+    # executed at the configure step, it should not cause additional builds or
+    # downloads.
+    file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")
+
+    # Create and build a separate CMake project to carry out the download.
+    # If we've already previously done these steps, they will not cause
+    # anything to be updated, so extra rebuilds of the project won't occur.
+    # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project
+    # has this set to something not findable on the PATH.
+    configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
+                   "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")
+    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
+                        -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
+                        .
+                    RESULT_VARIABLE result
+                    ${OUTPUT_QUIET}
+                    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
+    )
+    if(result)
+        message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}")
+    endif()
+    # NOTE(review): the job count is hard-coded to 16, and "-j" on
+    # "cmake --build" needs CMake 3.12 or newer.
+    execute_process(COMMAND ${CMAKE_COMMAND} --build . -j16
+                    RESULT_VARIABLE result
+                    ${OUTPUT_QUIET}
+                    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
+    )
+    if(result)
+        message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}")
+    endif()
+
+endfunction()
@@ -0,0 +1,39 @@
+# MIT License
+#
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Find module for libibverbs.
+#
+# Optional hint variables read by this module:
+#   IBVERBS_INCLUDE_DIR - directory expected to contain infiniband/verbs.h
+#   IBVERBS_LIB_DIR     - directory expected to contain the ibverbs library
+#   IBVERBS_ROOT_DIR    - installation prefix (its include/ and lib/ are tried)
+#
+# Result variables (cache entries):
+#   IBVERBS_INCLUDE_DIRS - directory in which infiniband/verbs.h was found
+#   IBVERBS_LIBRARIES    - path of the ibverbs library
+# IBVerbs_FOUND is set by find_package_handle_standard_args().
+find_path(IBVERBS_INCLUDE_DIRS
+  NAMES infiniband/verbs.h
+  HINTS
+  ${IBVERBS_INCLUDE_DIR}
+  ${IBVERBS_ROOT_DIR}
+  ${IBVERBS_ROOT_DIR}/include)
+
+find_library(IBVERBS_LIBRARIES
+  NAMES ibverbs
+  HINTS
+  ${IBVERBS_LIB_DIR}
+  ${IBVERBS_ROOT_DIR}
+  ${IBVERBS_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
+# Hide the two cache entries created above.  IBVERBS_INCLUDE_DIRS (plural) is
+# the entry find_path() creates; the singular IBVERBS_INCLUDE_DIR is only an
+# input hint.
+mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
@@ -0,0 +1,36 @@
+# MIT License
+#
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Find module for the MSCCL++ NCCL interface library.
+#
+# Input:   MSCCLPP_ROOT - installation prefix used as a search hint
+# Results: MSCCLPP_INCLUDE_DIRS - directory containing mscclpp/gpu.hpp
+#          MSCCLPP_LIBRARIES    - path of the mscclpp_nccl library
+#          mscclpp_nccl_FOUND   - set by find_package_handle_standard_args()
+
+# Header location
+find_path(
+    MSCCLPP_INCLUDE_DIRS
+    NAMES mscclpp/gpu.hpp
+    HINTS ${MSCCLPP_ROOT}/include
+)
+
+# Library location
+find_library(
+    MSCCLPP_LIBRARIES
+    NAMES mscclpp_nccl
+    HINTS ${MSCCLPP_ROOT}/lib
+)
+
+# Report the result and hide both cache entries
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+    mscclpp_nccl
+    DEFAULT_MSG
+    MSCCLPP_INCLUDE_DIRS
+    MSCCLPP_LIBRARIES
+)
+mark_as_advanced(
+    MSCCLPP_INCLUDE_DIRS
+    MSCCLPP_LIBRARIES
+)
+    
@@ -0,0 +1,229 @@
+# MIT License
+#
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Dependencies
+
+# HIP dependency is handled earlier in the project cmake file
+# when VerifyCompiler.cmake is included.
+
+# GIT
+
+# Test dependencies
+
+# For downloading, building, and installing required dependencies
+include(cmake/DownloadProject.cmake)
+
+# When ENABLE_MSCCLPP is set: patch the MSCCL++ sources under ext-src/mscclpp,
+# build and install them at configure time through download_project(), undo
+# the patches again, and expose the result as the imported static target
+# `mscclpp_nccl` (a copy of the installed archive with symbols renamed by
+# objcopy).
+if(ENABLE_MSCCLPP)
+    # Try to find the mscclpp install
+    set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")
+    execute_process(
+        COMMAND mkdir -p ${MSCCLPP_ROOT}
+    )
+    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+    find_package(mscclpp_nccl)
+
+    # NOTE: the guard below is commented out, so the build steps run on every
+    # configure regardless of whether an install was found above.
+    #if(NOT mscclpp_nccl_FOUND)
+        # Ensure the source code is checked out
+        set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
+        set(JSON_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json CACHE PATH "")
+        if((NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt) OR (NOT EXISTS ${JSON_SOURCE}/CMakeLists.txt))
+            message(STATUS "Checking out external code")
+            execute_process(
+                COMMAND git submodule update --init --recursive
+                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+            )
+        endif()
+
+        # Patches from ext-src/, listed in the order in which they are applied.
+        # They are reverted in the opposite order once MSCCL++ has been built.
+        set(_mscclpp_patches
+            cpx.patch
+            read-allred.patch
+            mscclpp_ibv_access_relaxed_ordering.patch
+            mem-reg.patch
+            non-multiple-128-fix.patch
+            bf16-tuning.patch
+            reg-fix.patch
+            no-cache.patch
+            device-flag.patch
+            remove-clip.patch
+            disable-executor.patch
+            disable-format-checks.patch
+        )
+
+        # NOTE(review): the exit status of `git apply` is not checked, so a
+        # patch that fails to apply does not stop the configure step.
+        foreach(_mscclpp_patch IN LISTS _mscclpp_patches)
+            execute_process(
+                COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/${_mscclpp_patch}
+                WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+            )
+        endforeach()
+
+        # Forward selected settings of this project to the MSCCL++ build
+        set(CMAKE_INHERITED_ARGS "")
+        set(CMAKE_ARGS_LIST "CMAKE_PREFIX_PATH;CMAKE_INSTALL_RPATH_USE_LINK_PATH;HIP_COMPILER")
+        foreach(arg IN LISTS CMAKE_ARGS_LIST)
+            if(DEFINED ${arg})
+                string(REPLACE ";" "%" ARG_VALUE "${${arg}}") # Replace ; with new list separator symbol % to avoid CMake errors
+                string(STRIP "${ARG_VALUE}" ARG_VALUE) # Eliminate whitespace, reducing to empty string if necessary
+
+                # Only add a cmake argument if it has a value
+                if("${ARG_VALUE}" STREQUAL "")
+                    continue()
+                endif()
+                string(APPEND CMAKE_INHERITED_ARGS "-D${arg}=\"${ARG_VALUE}\" ")
+            endif()
+        endforeach()
+
+        # Default GPU targets unless MSCCLPP_GPU_TARGETS is present in the cache
+        if(NOT DEFINED CACHE{MSCCLPP_GPU_TARGETS})
+            message(STATUS "Building MSCCL++ only for supported variants: gfx942;gfx950")
+            set(MSCCLPP_GPU_TARGETS "gfx942;gfx950")
+            if(BUILD_ADDRESS_SANITIZER)
+                set(MSCCLPP_GPU_TARGETS "gfx942:xnack+;gfx950:xnack+")
+            endif()
+        else()
+            message(STATUS "Building MSCCL++ for ${MSCCLPP_GPU_TARGETS}")
+        endif()
+
+        # Same ';' -> '%' substitution as above (see LIST_SEPARATOR below)
+        string(REPLACE ";" "%" MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}")
+
+        download_project(PROJ                mscclpp_nccl
+                         #GIT_REPOSITORY      https://github.com/microsoft/mscclpp.git
+                         #GIT_TAG             4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
+                         INSTALL_DIR         ${MSCCLPP_ROOT}
+                         LIST_SEPARATOR      %
+                         CMAKE_ARGS          "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}"
+                                             -DMSCCLPP_BYPASS_GPU_CHECK=ON
+                                             -DMSCCLPP_USE_ROCM=ON
+                                             -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+                                             -DMSCCLPP_BUILD_APPS_NCCL=ON
+                                             -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF
+                                             -DMSCCLPP_BUILD_TESTS=OFF
+                                             -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP}
+                                             -DMSCCLPP_ENABLE_EXECUTOR=${ENABLE_MSCCLPP_EXECUTOR}
+                                             -DMSCCLPP_ENABLE_FORMAT_CHECKS=${ENABLE_MSCCLPP_FORMAT_CHECKS}
+                                             -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
+                                             -DCMAKE_VERBOSE_MAKEFILE=1
+                                             "${CMAKE_INHERITED_ARGS}"
+                                             -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
+                         LOG_DOWNLOAD        FALSE
+                         LOG_CONFIGURE       FALSE
+                         LOG_BUILD           FALSE
+                         LOG_INSTALL         FALSE
+                         UPDATE_DISCONNECTED TRUE
+                         SOURCE_DIR          ${MSCCLPP_SOURCE}
+        )
+
+        find_package(mscclpp_nccl REQUIRED)
+
+        # Undo the patches, last applied first
+        list(REVERSE _mscclpp_patches)
+        foreach(_mscclpp_patch IN LISTS _mscclpp_patches)
+            execute_process(
+                COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/${_mscclpp_patch}
+                WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+            )
+        endforeach()
+        unset(_mscclpp_patches)
+
+    #endif()
+
+    # Copy the installed static archive into the build tree, renaming symbols
+    # according to mscclpp_nccl_syms.txt, and import that copy as a target.
+    execute_process(COMMAND objcopy
+                    --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
+                    "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a"
+                    "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a"
+    )
+    add_library(mscclpp_nccl STATIC IMPORTED)
+    set_target_properties(mscclpp_nccl PROPERTIES IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a)
+
+endif()
@@ -0,0 +1,24 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+
+cmake_minimum_required(VERSION 3.16)
+
+message("Building rccl RAS client executable")
+
+# The client is built from the hipified copy of the source in the build tree
+add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc")
+
+# Header search path: generated headers first, then the hipified sources
+target_include_directories(rcclras
+  PRIVATE
+    ${PROJECT_BINARY_DIR}/include
+    ${HIPIFY_DIR}/src
+    ${HIPIFY_DIR}/src/include
+)
+
+# Libraries needed for both shared and static rccl builds
+target_link_libraries(rcclras PRIVATE hip::host dl)
+
+if(NOT BUILD_SHARED_LIBS)
+  # Static build: rccl is only a build-order dependency here and is linked
+  # through raw -l/-L flags together with the HIP runtime
+  add_dependencies(rcclras rccl)
+  target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
+else()
+  # Shared build: link against the rccl target directly
+  target_link_libraries(rcclras PRIVATE rccl hip::device)
+endif()
+
+
+rocm_install(TARGETS rcclras)
@@ -0,0 +1,27 @@
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+HIP_FILE=$1
+
+if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
+  sed -i "s/__syncthreads()/__syncthreads(); insert_random_delay_per_warp()/" "$HIP_FILE"
+
+  echo "Added fault injection to $HIP_FILE"
+fi
@@ -0,0 +1,42 @@
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Usage: <this script> <file>
+#
+# Rewrites <file> in place (perl -pi / sed -i) when its path contains
+# "/src/device/" followed, somewhere later, by ".h"; other files are left
+# alone.  The substitutions thread three extra template parameters
+# (USE_ACC, COLL_UNROLL, Pipeline) through template declarations and through
+# the instantiations listed below.  They build on each other, so their order
+# matters.
+HIP_FILE=$1
+
+if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
+  # Declarations: "template<typename T, typename RedOp[, typename Proto]>" gains
+  # ", int USE_ACC, int COLL_UNROLL, int Pipeline"; an optional trailing
+  # ", bool isNetOffload..." part is kept after the new parameters.
+  perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?)(, bool isNetOffload.*?)?>/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
+  # Same rewrite, additionally accepting an ", int RCCLMetadata" parameter
+  # after Proto.
+  perl -pi -e 's/(template<typename T, typename RedOp(?:, typename Proto)?(?:, int RCCLMetadata)?)(, bool isNetOffload.*?)?>/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
+  # ProtoSimple<a, b> (exactly two arguments) -> ProtoSimple<a, b, USE_ACC, COLL_UNROLL>
+  perl -pi -e 's/(ProtoSimple<[^,]*?,[^,]+?)>/\1, USE_ACC, COLL_UNROLL>/g' "$HIP_FILE"
+  # runRing<T...>( calls: insert ", USE_ACC, COLL_UNROLL" before the closing
+  # ">(", or before a trailing ", true" / ", false" argument when present.
+  perl -pi -e 's/(runRing<T.*?)((, (true|false))?>\()/\1, USE_ACC, COLL_UNROLL\2/g' "$HIP_FILE"
+  # runTreeUpDown / runTreeSplit calls: append the two arguments.  These and
+  # all following perl substitutions have no /g flag, so only the first match
+  # on each line is rewritten.
+  perl -pi -e 's/(runTreeUpDown<T.*?)>\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
+  perl -pi -e 's/(runTreeSplit<T.*?)>\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
+
+  # Instantiations that name ProtoLL or ProtoLL128 explicitly get a literal 0
+  # as one more trailing argument (presumably the Pipeline slot added to the
+  # declarations above -- TODO confirm).
+  perl -pi -e 's/(runTreeSplit<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
+  perl -pi -e 's/(runTreeUpDown<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
+  perl -pi -e 's/(runRing<T, RedOp, (ProtoLL|ProtoLL128), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
+  perl -pi -e 's/(runRing<T, RedOp, (ProtoLL|ProtoLL128), (RCCL_ONE_NODE_RING_SIMPLE|RCCL_METADATA_EMPTY), USE_ACC, COLL_UNROLL.*?)>/\1, 0>/' "$HIP_FILE"
+
+  # Instantiations that pass the generic Proto parameter forward Pipeline
+  # instead.
+  perl -pi -e 's/(runRing<T, RedOp, Proto, (RCCL_ONE_NODE_RING_SIMPLE|RCCL_METADATA_EMPTY), USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
+  perl -pi -e 's/(runRing<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
+  perl -pi -e 's/(runTreeSplit<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
+  perl -pi -e 's/(runTreeUpDown<T, RedOp, Proto, USE_ACC, COLL_UNROLL.*?)>/\1, Pipeline>/' "$HIP_FILE"
+  # "struct RunWorkBatch<ncclFunc...": append all three arguments; any run of
+  # ">" following the argument list is replaced by a single ">".
+  sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
+  # RunWorkColl<...> with at least five arguments (four commas before ">"):
+  # append all three arguments.
+  sed -i "s/\\(RunWorkColl<[^,]*,[^,]*,[^,]*,[^,]*,[^>]*\\)>/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
+fi
@@ -0,0 +1,81 @@
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Lists the code objects embedded in librccl.so with roc-obj-ls and extracts
+# those for gfx90a, gfx942 and gfx950 with roc-obj-extract.  Every failure is
+# reported as a WARNING only, so the surrounding build keeps going.
+#
+# EXTRACT_TIMEOUT: timeout in seconds applied to each roc-obj-* invocation.
+set(EXTRACT_TIMEOUT 5 CACHE STRING "Timeout in seconds for roc-obj-* calls")
+
+## List the objects for each gfx architecture
+execute_process( COMMAND roc-obj-ls librccl.so
+    RESULT_VARIABLE list_result
+    OUTPUT_VARIABLE cmd_output
+    ERROR_VARIABLE cmd_error
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_STRIP_TRAILING_WHITESPACE
+    TIMEOUT ${EXTRACT_TIMEOUT}
+)
+
+# NOTE: when TIMEOUT expires, execute_process() stores a descriptive string in
+# RESULT_VARIABLE ("Process terminated due to timeout"), not the literal
+# "TIMEOUT".  The timeout checks below therefore also match on "timeout".
+
+if(list_result EQUAL 0)
+    ## Convert cmd output to list of lines
+    string(REGEX REPLACE "\n$" "" cmd_output "${cmd_output}")
+    string(REPLACE "\n" ";" cmd_output "${cmd_output}")
+
+    ## Extract file paths for the selected gfx archs
+    set(file_paths "")
+    foreach(line ${cmd_output})
+        if(line MATCHES "(gfx90a|gfx942|gfx950)")
+            # Keep everything from "file://" to the end of the line
+            string(REGEX MATCH "\\file://(.*)" file_match "${line}")
+            if(file_match)
+                list(APPEND file_paths ${file_match})
+            endif()
+        endif()
+    endforeach()
+
+    ## Extract objects from files
+    foreach(file ${file_paths})
+        execute_process(
+          COMMAND roc-obj-extract ${file}
+          RESULT_VARIABLE extraction_result
+          ERROR_VARIABLE extraction_error
+          OUTPUT_STRIP_TRAILING_WHITESPACE
+          ERROR_STRIP_TRAILING_WHITESPACE
+          TIMEOUT ${EXTRACT_TIMEOUT}
+        )
+        if(extraction_result STREQUAL "TIMEOUT" OR extraction_result MATCHES "timeout")
+          message(
+            WARNING
+              "[Timeout] Extraction of '${file}' did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${extraction_error}.
+                    Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc."
+          )
+        elseif(NOT extraction_result EQUAL 0)
+          message(
+            WARNING
+              "[Error ${extraction_result}] Could not extract objects from '${file}'. stderr: ${extraction_error}"
+          )
+        endif()
+    endforeach()
+
+elseif(list_result STREQUAL "TIMEOUT" OR list_result MATCHES "timeout")
+  message(
+    WARNING
+      "[Timeout] roc-obj-ls did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${cmd_error}.
+                     Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc"
+  )
+else()
+    ## We don't want to stop building unit-tests if this command fails.
+    message(WARNING "[Error ${list_result}] roc-obj-ls failed. stderr: ${cmd_error}")
+endif()
@@ -0,0 +1,73 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Writes ${RCCL_BINARY_DIR}/git_version.cpp, which defines
+#   const char *rcclGitHash = "<branch>:<short hash>[+]";
+# ("Unknown " when no git information is available; '+' marks local changes).
+# The file is only rewritten when its content changes, so its timestamp stays
+# stable otherwise.
+#
+# Use RCCL_SOURCE_DIR if passed, otherwise fallback to CMAKE_CURRENT_SOURCE_DIR
+if(NOT DEFINED RCCL_SOURCE_DIR)
+  set(RCCL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
+if(NOT DEFINED RCCL_BINARY_DIR)
+  set(RCCL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Attempt to collect the latest git hash.
+# execute_process() does not go through a shell, so the format must not be
+# wrapped in quotes: they would be passed to git and printed literally.
+execute_process(COMMAND git log --pretty=format:%h -n 1
+                WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
+                OUTPUT_VARIABLE GIT_REV
+                ERROR_QUIET)
+
+# Check if git information was found
+if ("${GIT_REV}" STREQUAL "")
+  set(CURR_GIT_VERSION "const char *rcclGitHash =\"Unknown \";")
+else()
+  # Check for changes (denote with a '+') after hash
+  execute_process(
+    COMMAND bash -c "git diff --quiet --exit-code || echo +"
+    WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
+    OUTPUT_VARIABLE GIT_DIFF)
+  # Collect branch information
+  execute_process(
+    COMMAND git rev-parse --abbrev-ref HEAD
+    WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
+    OUTPUT_VARIABLE GIT_BRANCH)
+
+  string(STRIP "${GIT_REV}" GIT_REV)
+  # Keep at most the first 7 characters of the abbreviated hash
+  string(SUBSTRING "${GIT_REV}" 0 7 GIT_REV)
+  string(STRIP "${GIT_DIFF}" GIT_DIFF)
+  string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
+
+  set(CURR_GIT_VERSION "const char *rcclGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
+endif()
+
+# Compare file with older git version file (git_version.cpp)
+if (EXISTS ${RCCL_BINARY_DIR}/git_version.cpp)
+  #MESSAGE(STATUS "Found ${RCCL_BINARY_DIR}/git_version.cpp")
+  file(READ ${RCCL_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
+  #message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
+  #message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
+  if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
+    message(STATUS "Updating git_version.cpp")
+    file(WRITE ${RCCL_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+  else()
+    message(STATUS "No changes to git_version.cpp required")
+  endif()
+else()
+  # Create git_version.cpp if it doesn't exist yet
+  file(WRITE ${RCCL_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+endif()
@@ -0,0 +1,124 @@
+## base docker image
+## (both ARGs are declared before FROM so they can select the image; override
+## them with --build-arg)
+ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04
+ARG ROCM_IMAGE_TAG=latest
+FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"
+
+## rccl repo
+## (repository URL and branch cloned in the "building RCCL" step)
+ARG RCCL_REPO=https://github.com/ROCm/rccl
+ARG RCCL_BRANCH=develop
+
+## rccl-tests repo
+## (repository URL and branch cloned in the "building RCCL-Tests" step)
+ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests
+ARG RCCL_TESTS_BRANCH=develop
+
+## AMD GPU Targets
+## (passed to both the RCCL and the RCCL-Tests builds)
+ARG GPU_TARGETS=gfx942
+
+## creating scratch space
+## (key=value form of ENV, consistent with every other ENV in this file)
+ENV WORKDIR=/workspace
+RUN mkdir -p ${WORKDIR}
+WORKDIR ${WORKDIR}
+
+## install dependencies
+## (single layer: refresh the package index, install without recommended
+## packages, then drop the apt caches so they do not end up in the image)
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    make \
+    rocm-cmake \
+    ninja-build \
+    gfortran \
+    build-essential \
+    libomp5 \
+    libomp-dev \
+    libbfd-dev \
+    libboost-all-dev \
+    libnuma1 \
+    libnuma-dev \
+    libpthread-stubs0-dev \
+    libzstd-dev \
+    lcov \
+    zip \
+    zlib1g-dev \
+    wget \
+    pkg-config \
+    unzip \
+    chrpath \
+    doxygen \
+    lshw \
+    libssl-dev \
+    curl \
+    libncursesw5-dev \
+    xz-utils \
+    liblzma-dev \
+    python3-pip \
+    python3-setuptools \
+    python3-venv \
+    python3-dev \
+    python3-tk \
+    python3-yaml \
+    vim \
+    less \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+## install CMake 3.28.0 from the upstream Kitware installer into /usr, then
+## remove the downloaded installer script
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \
+    && chmod +x cmake-3.28.0-linux-x86_64.sh \
+    && bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \
+    && rm cmake-3.28.0-linux-x86_64.sh
+
+## Set ROCm path
+## (used below for the UCX configure step and for PATH / LD_LIBRARY_PATH)
+ENV ROCM_PATH=/opt/rocm
+
+## Install UCX
+## (downloads the 1.16.0 release tarball, configures it with ROCm support,
+## installs into UCX_INSTALL_PREFIX and removes the source tree afterwards)
+ENV UCX_INSTALL_PREFIX=/opt/ucx
+RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \
+    && mkdir -p ucx \
+    && tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \
+    && cd ucx \
+    && mkdir build \
+    && cd build \
+    && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \
+    && make -j16 install \
+    && cd ../.. \
+    && rm -rf ucx ucx-1.16.0.tar.gz
+
+## Install OpenMPI
+## (downloads the 4.1.6 release tarball, configures it against the UCX build
+## above, installs into MPI_INSTALL_PREFIX and removes the source tree)
+ENV MPI_INSTALL_PREFIX=/opt/ompi
+RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \
+    && mkdir -p ompi4 \
+    && tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \
+    && cd ompi4 \
+    && mkdir build \
+    && cd build \
+    && ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \
+    && make -j16 install \
+    && cd ../.. \
+    && rm -rf ompi4 openmpi-4.1.6.tar.gz
+
+
+## building RCCL
+## (clones RCCL_REPO at RCCL_BRANCH with submodules and runs its install.sh
+## for GPU_TARGETS with RCCL_INSTALL_PREFIX as the prefix)
+ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl/install
+RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" \
+    && cd ./rccl \
+    && ./install.sh --amdgpu_targets=${GPU_TARGETS} --prefix=${RCCL_INSTALL_PREFIX}
+
+## building RCCL-Tests
+## (clones RCCL_TESTS_REPO at RCCL_TESTS_BRANCH and builds it with MPI enabled;
+## CMAKE_PREFIX_PATH points CMake at the RCCL and OpenMPI prefixes from above)
+RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \
+    && cd ./rccl-tests \
+    && mkdir build \
+    && cd build \
+    && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DCMAKE_PREFIX_PATH="${RCCL_INSTALL_PREFIX};${MPI_INSTALL_PREFIX}" -DGPU_TARGETS=${GPU_TARGETS} .. \
+    && make -j16
+
+
+## set environment variables
+## NOTE(review): the first LD_LIBRARY_PATH entry is the RCCL install prefix
+## itself, whereas the MPI and ROCm entries point at a lib/ subdirectory --
+## confirm where install.sh places librccl.so.
+ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"
+ENV UCX_WARN_UNUSED_ENV_VARS=n
+ENV OMPI_ALLOW_RUN_AS_ROOT=1
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+ENV NCCL_DEBUG=VERSION
+
@@ -0,0 +1,42 @@
+# Using RCCL/RCCL-Tests in a docker environment
+
+## Docker build
+
+Assuming you have docker installed on your system:
+
+### To build the docker image :
+
+By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch), targeting `gfx942` GPUs.
+```shell
+$ docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
+```
+
+The base docker image, rccl repo, rccl-tests repo, and GPU targets can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image for the MI250 GPU:
+```shell
+$ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull .
+```
+
+### To start an interactive docker container on a system with AMD GPUs :
+
+```shell
+$ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash
+```
+
+### To run rccl-tests (all\_reduce\_perf) on 8 AMD GPUs (inside the docker container) :
+
+If using ROCm 6.3.x or earlier
+```shell
+$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
+```
+
+If using ROCm 6.4.0 or later
+```shell
+$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
+```
+
+For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests.
+
+
+## Copyright
+
+All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -0,0 +1,5 @@
+_build/
+_doxygen/
+doxygen/html
+doxygen/xml
+sphinx/_toc.yml
@@ -0,0 +1,18 @@
+.. meta::
+   :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
+   :keywords: RCCL, ROCm, library, API
+
+.. _api-library:
+
+***********
+API library
+***********
+
+RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+
+The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
+
+Operations
+==========
+
+.. doxygenindex::
@@ -0,0 +1,165 @@
+.. meta::
+   :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
+   :keywords: RCCL, ROCm, library, API, reference, environment variable, environment
+
+.. _env-variables:
+
+********************************************************************
+RCCL environment variables
+********************************************************************
+
+This section describes the most important RCCL environment variables,
+which are grouped by functionality.
+
+Configuration and setup
+========================
+
+The configuration and setup environment variables for RCCL are collected
+in the following table.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``NCCL_CONF_FILE``
+        | Specifies the path to the RCCL configuration file.
+      - | String path to configuration file
+        | Default: ``~/.rccl.conf`` or ``/etc/rccl.conf``
+
+    * - | ``NCCL_HOSTID``
+        | Sets the host identifier for multi-node communication.
+      - | String value for host identification
+        | Used for host hash generation
+
+Logging and debugging
+=====================
+
+The logging and debugging environment variables for RCCL are collected
+in the following table.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``RCCL_LOG_LEVEL``
+        | Controls RCCL logging verbosity.
+      - | Integer value (default: ``1``)
+        | Higher values increase logging detail
+
+    * - | ``NCCL_DEBUG_SUBSYS``
+        | Controls which subsystems generate debug output.
+      - | Comma-separated list of subsystems (e.g., ``INIT,COLL``)
+        | Prefix with ``^`` to invert selection
+
+Algorithm and protocol control
+==============================
+
+The algorithm and protocol control environment variables for RCCL are
+collected in the following table.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``NCCL_ALGO``
+        | Forces specific algorithm selection for collectives.
+      - | Algorithm name string
+        | Used to override automatic algorithm selection
+
+    * - | ``NCCL_PROTO``
+        | Forces specific protocol selection for communication.
+      - | Protocol name string
+        | Used to override automatic protocol selection
+
+Network and topology
+====================
+
+The network and topology environment variables for RCCL are collected
+in the following table.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``NCCL_IB_HCA``
+        | Specifies InfiniBand device:port to use.
+      - | Device specification string
+        | Prefix with ``^`` for exclusion, ``=`` for exact match
+
+    * - | ``NCCL_IB_GID_INDEX``
+        | Defines the Global ID index used in RoCE mode.
+      - | Integer value (default: ``-1``)
+        | See InfiniBand ``show_gids`` command for valid values
+
+    * - | ``NCCL_SOCKET_IFNAME``
+        | Specifies which IP interfaces to use for communication.
+      - | Interface prefix string or list
+        | Multiple prefixes separated by ``,``
+        | Prefix with ``^`` for exclusion, ``=`` for exact match
+        | Example: ``eth`` (all eth interfaces), ``=eth0`` (exact match)
+
+    * - | ``NCCL_SOCKET_FAMILY``
+        | Forces IPv4/IPv6 interface selection.
+      - | ``AF_INET``: Force IPv4
+        | ``AF_INET6``: Force IPv6
+        | Unset: Use first available
+
+    * - | ``NCCL_NET_MERGE_LEVEL``
+        | Controls network device merging behavior.
+      - | Integer value specifying merge level
+        | Default: ``PATH_PORT``
+
+    * - | ``NCCL_NET_FORCE_MERGE``
+        | Forces merging of network devices.
+      - | String specifying forced merge configuration
+
+    * - | ``NCCL_RINGS``
+        | Defines custom ring topology.
+      - | Ring topology specification string
+        | Overrides automatic topology detection
+
+    * - | ``RCCL_TREES``
+        | Defines custom tree topology.
+      - | Tree topology specification string
+        | Alternative to ring topology
+
+    * - | ``NCCL_RINGS_REMAP``
+        | Controls ring remapping for specific topologies.
+      - | Remapping specification string
+        | Used with Rome 4P2H topology
+
+Development and testing (advanced)
+==================================
+
+The development and testing environment variables for RCCL are
+collected in the following table. These variables are primarily
+intended for debugging and development purposes.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``CUDA_LAUNCH_BLOCKING``
+        | Controls CUDA kernel launch blocking behavior.
+      - | ``0``: Non-blocking launches
+        | ``1`` or non-zero: Blocking launches
+
+    * - | ``NCCL_COMM_ID``
+        | Enables multi-process mode in test applications.
+      - | Any non-empty value enables multi-process mode
+        | Used with test executables for distributed testing
@@ -0,0 +1,114 @@
+.. meta::
+   :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
+   :keywords: RCCL, ROCm, library, API
+
+.. _library-specification:
+
+============================
+RCCL library specification
+============================
+
+This document provides details of the API library. 
+
+Communicator functions
+----------------------
+
+.. doxygenfunction:: ncclGetUniqueId
+
+.. doxygenfunction:: ncclCommInitRank
+
+.. doxygenfunction:: ncclCommInitAll
+
+.. doxygenfunction:: ncclCommDestroy
+
+.. doxygenfunction:: ncclCommAbort
+
+.. doxygenfunction:: ncclCommCount
+
+.. doxygenfunction:: ncclCommCuDevice
+
+.. doxygenfunction:: ncclCommUserRank
+
+Collective communication operations
+-----------------------------------
+
+Collective communication operations must be called separately for each communicator in a communicator clique.
+
+They return when operations have been enqueued on the HIP stream.
+
+Since they may perform inter-CPU synchronization, each call has to be done from a different thread or process, or needs to use group semantics (see below).
+
+.. doxygenfunction:: ncclReduce
+
+.. doxygenfunction:: ncclBcast
+
+.. doxygenfunction:: ncclBroadcast
+
+.. doxygenfunction:: ncclAllReduce
+
+.. doxygenfunction:: ncclReduceScatter
+
+.. doxygenfunction:: ncclAllGather
+
+.. doxygenfunction:: ncclSend
+
+.. doxygenfunction:: ncclRecv
+
+.. doxygenfunction:: ncclGather
+
+.. doxygenfunction:: ncclScatter
+
+.. doxygenfunction:: ncclAllToAll
+
+Group semantics
+---------------
+When managing multiple GPUs from a single thread, and since NCCL collective
+calls may perform inter-CPU synchronization, we need to "group" calls for
+different ranks/devices into a single call.
+
+Grouping NCCL calls as being part of the same collective operation is done
+using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+collective calls until the ncclGroupEnd call, which will wait for all calls
+to be complete. Note that for collective communication, ncclGroupEnd only
+guarantees that the operations are enqueued on the streams, not that
+the operation is effectively done.
+
+Both collective communication and ncclCommInitRank can be used in conjunction
+with ncclGroupStart/ncclGroupEnd.
+
+.. doxygenfunction:: ncclGroupStart
+
+.. doxygenfunction:: ncclGroupEnd
+
+Library functions
+-----------------
+
+.. doxygenfunction:: ncclGetVersion
+
+.. doxygenfunction:: ncclGetErrorString
+
+Types
+-----
+
+There are a few data structures that are internal to the library. The pointer types to these
+structures are given below. The user would need to use these types to create handles and pass them
+between different library functions.
+
+.. doxygentypedef:: ncclComm_t
+
+.. doxygenstruct:: ncclUniqueId
+
+
+
+Enumerations
+------------
+
+This section provides all the enumerations used.
+
+.. doxygenenum:: ncclResult_t
+
+.. doxygenenum:: ncclRedOp_t
+
+.. _rccl-supported-data-types:
+
+.. doxygenenum:: ncclDataType_t
@@ -0,0 +1,47 @@
+.. meta::
+   :description: RCCL attributions information
+   :keywords: RCCL, ROCm, library, API, attributions
+
+.. toctree::
+   :maxdepth: 4
+   :caption: Attributions
+
+Attributions
+============
+
+Contains contributions from NVIDIA.
+
+Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+-  Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+-  Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+-  Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+For more information and license details, see `https://github.com/NVIDIA/NVTX <https://github.com/NVIDIA/NVTX>`_
@@ -0,0 +1,36 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+import subprocess
+
+from rocm_docs import ROCmDocs
+
+name = "RCCL"
+get_major = r'sed -n -e "s/^NCCL_MAJOR.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'
+get_minor = r'sed -n -e "s/^NCCL_MINOR.*\([0-9]\{2,\}\).*/\1/p" ../makefiles/version.mk'
+get_patch = r'sed -n -e "s/^NCCL_PATCH.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk'
+major = subprocess.getoutput(get_major)
+minor = subprocess.getoutput(get_minor)
+patch = subprocess.getoutput(get_patch)
+version_number = f"{major}.{minor}.{patch}"
+
+# for PDF output on Read the Docs
+project = f"{name} Documentation"
+author = "Advanced Micro Devices, Inc."
+copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
+# Both Sphinx version fields carry the same string derived from version.mk.
+version = version_number
+release = version_number
+
+# Location of the table-of-contents file, relative to this directory.
+external_toc_path = "./sphinx/_toc.yml"
+
+# Build the ROCmDocs helper with the page title, run Doxygen from the
+# "doxygen" directory (XML expected under "doxygen/xml"), then finish setup.
+docs_core = ROCmDocs(f"{name} {version_number} Documentation")
+docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
+docs_core.setup()
+
+external_projects_current_project = "rccl"
+
+# Copy every attribute named in ROCmDocs.SPHINX_VARS from the helper into this
+# module's globals so they become module-level settings of this conf.py.
+for sphinx_var in ROCmDocs.SPHINX_VARS:
+    globals()[sphinx_var] = getattr(docs_core, sphinx_var)
@@ -0,0 +1,281 @@
+.. meta::
+   :description: Usage tips for the RCCL library of collective communication primitives
+   :keywords: RCCL, ROCm, library, API, peer-to-peer, transport
+
+.. _rccl-usage-tips:
+
+
+*****************************************
+RCCL usage tips
+*****************************************
+
+This topic describes some of the more common RCCL extensions, such as NPKit and MSCCL, and provides tips on how to
+configure and customize the application.
+
+NPKit
+=====
+
+RCCL integrates `NPKit <https://github.com/microsoft/npkit>`_, a profiler framework that
+enables the collection of fine-grained trace events in RCCL components, especially in giant collective GPU kernels.
+See the `NPKit sample workflow for RCCL <https://github.com/microsoft/NPKit/tree/main/rccl_samples>`_ for
+a fully-automated usage example. It also provides useful templates for the following manual instructions.
+
+To manually build RCCL with NPKit enabled, pass ``-DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_...(other NPKit compile-time switches)"`` to the ``cmake`` command. 
+All NPKit compile-time switches are declared in the RCCL code base as macros with the prefix ``ENABLE_NPKIT_``.
+These switches control the information that is collected.
+
+.. note::
+   
+   NPKit only supports the collection of non-overlapped events on the GPU.
+   The ``-DNPKIT_FLAGS`` settings must follow this rule.
+
+To manually run RCCL with NPKit enabled, set the environment variable ``NPKIT_DUMP_DIR``
+to the NPKit event dump directory. NPKit only supports one GPU per process.
+To manually analyze the NPKit dump results, use `npkit_trace_generator.py <https://github.com/microsoft/NPKit/blob/main/rccl_samples/npkit_trace_generator.py>`_.
+
+MSCCL/MSCCL++
+=============
+
+RCCL integrates `MSCCL <https://github.com/microsoft/msccl>`_ and `MSCCL++ <https://github.com/microsoft/mscclpp>`_ to
+leverage these highly efficient GPU-GPU communication primitives for collective operations.
+Microsoft Corporation collaborated with AMD for this project.
+
+MSCCL uses XMLs for different collective algorithms on different architectures. 
+RCCL collectives can leverage these algorithms after the user provides the corresponding XML.
+The XML files contain sequences of send-recv and reduction operations for the kernel to run.
+
+MSCCL is enabled by default on the AMD Instinct™ MI300X accelerator. On other platforms, users might have to enable it
+using the setting ``RCCL_MSCCL_FORCE_ENABLE=1``. By default, MSCCL is only used if every rank belongs
+to a unique process. To disable this restriction for multi-threaded or single-threaded configurations,
+use the setting ``RCCL_MSCCL_ENABLE_SINGLE_PROCESS=1``.
+
+RCCL allreduce and allgather collectives can leverage the efficient MSCCL++ communication kernels
+for certain message sizes. MSCCL++ support is available whenever MSCCL support is available.
+To run a RCCL workload with MSCCL++ support, set the following RCCL environment variable:
+
+.. code-block:: shell
+
+   RCCL_MSCCLPP_ENABLE=1
+
+To set the message size threshold for using MSCCL++, use the environment variable ``RCCL_MSCCLPP_THRESHOLD``,
+which has a default value of 1MB. After ``RCCL_MSCCLPP_THRESHOLD`` has been set,
+RCCL invokes MSCCL++ kernels for all message sizes less than or equal to the specified threshold.
+
+The following restrictions apply when using MSCCL++. If these restrictions are not met,
+operations fall back to using MSCCL or RCCL.
+
+*  The message size must be a non-zero multiple of 32 bytes
+*  It does not support ``hipMallocManaged`` buffers
+*  Allreduce only supports the ``float16``, ``int32``, ``uint32``, ``float32``, and ``bfloat16`` data types
+*  Allreduce only supports the sum operation
+
+Enabling peer-to-peer transport
+===============================
+
+To enable peer-to-peer access on machines with PCIe-connected GPUs,
+set the HSA environment variable as follows:
+
+.. code-block:: shell
+
+   HSA_FORCE_FINE_GRAIN_PCIE=1
+
+This feature requires GPUs that support peer-to-peer access along with
+proper large BAR addressing support.
+
+Ignoring CPU affinity with multi-node
+=====================================
+
+Depending on the job launcher and the requirements of your workload, performance as the communication workload scales
+can be improved by setting ``NCCL_IGNORE_CPU_AFFINITY``.  This allows the RCCL communication library to 
+ignore the job's supplied CPU affinity and use the GPU affinity only.
+
+.. code-block:: shell
+
+   NCCL_IGNORE_CPU_AFFINITY=1
+
+For general usage, this environment variable is not set so it doesn't interfere with the user or launcher
+supplied preferences.
+
+Improving performance on the MI300X 
+===================================
+
+This section outlines ways to improve RCCL performance on MI300X systems,
+including guidelines for systems with fewer than eight GPUs and the most efficient
+GPU partition modes.
+
+Configuration with fewer than eight GPUs
+----------------------------------------
+
+On a system with eight MI300X accelerators, each pair of accelerators is
+connected with dedicated Infinity Fabric™ links in a fully connected topology.
+For collective operations, this can achieve good performance when all eight
+accelerators (and all Infinity Fabric links) are used. When fewer than eight
+GPUs are used, however, this can only achieve a fraction of the potential
+bandwidth on the system. However, if your workload warrants using fewer than
+eight MI300X accelerators on a system, you can set the run-time variable
+``NCCL_MIN_NCHANNELS`` to increase the number of channels. For example:
+
+.. code-block:: shell
+
+   export NCCL_MIN_NCHANNELS=32
+
+Increasing the number of channels can benefit performance, but it also increases
+GPU utilization for collective operations.
+Additionally, RCCL pre-defines a higher number of channels when only two or four
+accelerators are in use on an 8\*MI300X system. In this situation, RCCL uses 32
+channels with two MI300X accelerators and 24 channels for four MI300X
+accelerators.
+
+.. _nps4_cpx_mi300_rccl:
+
+NPS4 and CPX partition modes
+----------------------------
+
+The term compute partitioning modes, or Modular Chiplet Platform (MCP), refers to the
+logical partitioning of XCDs into devices in the ROCm stack. The names are
+derived from the number of logical partitions that are created out of the eight
+XCDs. In the default mode, SPX (Single Partition X-celerator), all eight XCDs are
+viewed as a single logical compute element, meaning that the :doc:`amd-smi <amdsmi:index>`
+utility will show a single MI300X device. In CPX (Core Partitioned X-celerator)
+mode, each XCD appears as a separate logical GPU, for example, as eight separate
+GPUs in :doc:`amd-smi <amdsmi:index>` per MI300X. CPX mode can be viewed as
+having explicit scheduling privileges for each individual compute element (XCD).
+
+While compute partitioning modes change the space on which you can assign work
+to compute units, the memory partitioning modes (known as Non-Uniform Memory
+Access (NUMA) Per Socket (NPS)) change the number of NUMA domains that a device
+exposes. In other words, it changes the number of HBM stacks which are
+accessible to a compute unit, and therefore the size of its memory space. However,
+for the MI300X, the number of memory partitions must be less than or equal to
+the number of compute partitions. NPS4 (viewing pairs of HBM stacks as a
+disparate element), for example, is only enabled when in CPX mode (viewing each
+XCD as a disparate element).
+
+- Compute partition modes 
+
+  - In SPX mode, workgroups launched to the device are distributed
+    round-robin to the XCDs in the device, meaning that the programmer cannot
+    have explicit control over which XCD a workgroup is assigned to.
+
+  - In CPX mode, workgroups are launched to a single XCD, meaning the
+    programmer has explicit control over work placement onto the XCDs.
+  
+- Memory partition modes 
+
+  - In NPS1 mode (compatible with CPX and SPX), the entire memory is accessible
+    to all XCDs.
+
+  - In NPS4 mode (compatible with CPX), each memory quadrant of the memory is
+    directly visible to the logical devices in its quadrant. An XCD can still
+    access all portions of memory through multi-GPU programming techniques.
+
+The MI300 CPX mode can be accessed using the following :doc:`amdsmi:index`
+commands.
+
+.. code-block:: shell
+
+   amd-smi set --gpu all --compute-partition CPX
+   amd-smi set --gpu all --memory-partition NPS4
+
+RCCL performance with CPX and NPS4
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To run RCCL allreduce on 64 GPUs with CPX+NPS4 mode on the MI300X, use this
+example:
+
+.. code-block:: shell
+
+   mpirun -np 64 --bind-to numa rccl-tests/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1
+
+To run RCCL allreduce on 8 GPUs in the same OAM with CPX+NPS4 mode on the
+MI300X, use this example:
+
+.. code-block:: shell
+
+   export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+   mpirun -np 8 --bind-to numa rccl-tests/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1
+
+RCCL delivers improved allreduce performance in CPX mode for TP=8 (8 GPUs in
+the same OAM) on the MI300X.
+
+.. code-block:: shell
+
+   export HIP_FORCE_DEV_KERNARG=1
+   export RCCL_MSCCLPP_THRESHOLD=1073741824
+
+   export MSCCLPP_READ_ALLRED=1 
+   export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+   mpirun -np 8 --bind-to numa rccl-tests/build/all_reduce_perf -b 32 -e 1G -f 2 -g 1 -G 2 -w 20 -n 50
+
+Here are the benchmark results for in-place (where the output buffer is used as
+the input buffer) and out-of-place allreduce bus bandwidth.
+
+.. figure:: ../data/how-to/rccl-usage-tips/in-place_allreduce.png
+    :alt: In-place allreduce benchmark results
+    :align: center
+
+.. figure:: ../data/how-to/rccl-usage-tips/out-of-place_allreduce.png
+    :alt: Out-of-place allreduce benchmark results
+    :align: center
+
+A significant performance improvement is achievable with optimized CPX mode,
+which peaks at ~340 GB/s with a single OAM. The difference in bus bandwidth
+between the unoptimized and optimized modes increases as the buffer size grows.
+
+Using RCCL and CPX in PyTorch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PyTorch all_reduce benchmark is used to reproduce the performance reported
+by RCCL-Tests with the RCCL and CPX optimizations.
+
+.. note::
+
+   To use RCCL with CPX mode in PyTorch, check the RCCL version used by PyTorch.
+
+   For a virtualenv with a .whl-based PyTorch setup (such as nightly/rocm6.2),
+   this would be in
+   ``<path-to-your-venv>/lib/<python-version>/site-packages/torch/lib/librccl.so``.
+   This is the version of RCCL that is packaged as part of ROCm version 6.2.
+
+   RCCL for CPX mode was enabled in ROCm 6.3.0. To use the CPX features, replace
+   the existing ``librccl.so`` with one from ROCm 6.3.0 or newer or from a local
+   build of the RCCL develop branch.
+
+To test the effects of RCCL on PyTorch, the `stas00 all reduce benchmark <https://github.com/stas00/ml-engineering/blob/master/network/benchmarks/all_reduce_bench.py>`_
+was used. The following command is used to run a single OAM allreduce
+benchmark:
+
+.. code-block:: shell
+
+   export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+   python -u -m torch.distributed.run --nproc_per_node=8 --rdzv_endpoint localhost:6000  --rdzv_backend c10d all_reduce_bench.py
+
+For better performance, the ``HIP_FORCE_DEV_KERNARG``, ``RCCL_MSCCLPP_THRESHOLD``,
+and ``TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK`` environment variables are
+set during the benchmark in the following manner:
+
+.. code-block:: shell
+
+   export TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK=1
+   export HIP_FORCE_DEV_KERNARG=1
+   export RCCL_MSCCLPP_THRESHOLD=$((2*1024*1024*1024))
+   export MSCCLPP_READ_ALLRED=1
+   export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+   python -u -m torch.distributed.run --nproc_per_node=8 --rdzv_endpoint localhost:6000  --rdzv_backend c10d all_reduce_bench.py
+
+The default allreduce PyTorch benchmark peak bus bandwidth performance is
+~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a
+single OAM peaks at ~315 GB/s.
+
+Context tracking on GPUs
+----------------------------------------
+Context tracking is disabled by default for optimal performance. However, enabling context tracking can significantly improve performance
+in certain scenarios. To enable context tracking, set the following environment variable:
+
+.. code-block:: shell
+
+
+   export RCCL_ENABLE_CONTEXT_TRACKING=1
+
@@ -0,0 +1,249 @@
+.. meta::
+   :description: A guide to troubleshooting the RCCL library of multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
+   :keywords: RCCL, ROCm, library, API, debug
+
+.. _troubleshooting-rccl:
+
+*********************
+Troubleshooting RCCL
+*********************
+
+This topic explains the steps to troubleshoot functional and performance issues with RCCL.
+While debugging, collect the output from the commands in this guide. This data
+can be used as supporting information when submitting an issue report to AMD.
+
+.. _debugging-system-info:
+
+Collecting system information
+=============================
+
+Collect this information about the ROCm version, GPU/accelerator, platform, and configuration.
+
+*  Verify the ROCm version. This might be a release version or a
+   mainline or staging version. Use this command to display the version:
+
+   .. code:: shell
+
+      cat /opt/rocm/.info/version
+
+   Run the following command and collect the output:
+
+   .. code:: shell
+
+      rocm_agent_enumerator
+
+   Also, collect the name of the GPU or accelerator:
+
+   .. code:: shell
+
+      rocminfo
+
+*  Run these ``rocm-smi`` commands to display the system topology.
+
+   .. code:: shell
+
+      rocm-smi
+      rocm-smi --showtopo
+      rocm-smi --showdriverversion
+
+*  Determine the values of the ``PATH`` and ``LD_LIBRARY_PATH`` environment variables.
+
+   .. code:: shell
+
+      echo $PATH
+      echo $LD_LIBRARY_PATH
+
+*  Collect the HIP configuration.
+
+   .. code:: shell
+
+      /opt/rocm/bin/hipconfig --full
+
+*  Verify the network settings and setup. Use the ``ibv_devinfo`` command 
+   to display information about the available RDMA devices and determine 
+   whether they are installed and functioning properly. Run ``rdma link``
+   to print a summary of the network links.
+
+   .. code:: shell
+
+      ibv_devinfo
+      rdma link
+
+Isolating the issue
+-------------------
+
+The problem might be a general issue or specific to the architecture or system.
+To narrow down the issue, collect information about the GPU or accelerator and other
+details about the platform and system. Some issues to consider include:
+
+*  Is ROCm running on:
+
+   *  A bare-metal setup
+   *  In a Docker container (determine the name of the Docker image)
+   *  In an SR-IOV virtualized environment
+   *  Some combination of these configurations
+
+*  Is the problem only seen on a specific GPU architecture?
+*  Is it only seen on a specific system type?
+*  Is it happening on a single node or multinode setup?
+*  Use the following troubleshooting techniques to attempt to isolate the issue.
+
+   *  Build or run the develop branch version of RCCL and see if the problem persists.
+   *  Try an earlier RCCL version (minor or major).
+   *  If you recently changed the ROCm runtime configuration, AMD Kernel-mode GPU Driver (KMD), or compiler,
+      rerun the test with the previous configuration.
+
+.. _collecting-rccl-info:
+
+Collecting RCCL information
+=============================
+
+Collect the following information about the RCCL installation and configuration.
+
+*  Run the ``ldd`` command to list any dynamic dependencies for RCCL.
+
+   .. code:: shell
+
+      ldd <specify-path-to-librccl.so>
+
+*  Determine the RCCL version. This might be the pre-packaged component in
+   ``/opt/rocm/lib`` or a version that was built from source. To verify the RCCL version,
+   enter the following command, then run either rccl-tests or an e2e application.
+
+   .. code:: shell
+
+      export NCCL_DEBUG=VERSION
+
+*  Run rccl-tests and collect the results. For information on how to build and run rccl-tests, see the
+   `rccl-tests GitHub <https://github.com/ROCm/rccl-tests/blob/develop/README.md>`_.
+
+*  Collect the RCCL logging information. Enable the debug logs, 
+   then run rccl-tests or any e2e workload to collect the logs. Use the 
+   following command to enable the logs.
+
+   .. code:: shell
+
+      export NCCL_DEBUG=INFO
+
+.. _use-rccl-replayer:
+
+Using the RCCL Replayer
+------------------------
+
+The RCCL Replayer is a debugging tool designed to analyze and replay the collective logs obtained from RCCL runs. 
+It can be helpful when trying to reproduce problems, because it uses dummy data and doesn't have any dependencies 
+on non-RCCL calls. For more information, 
+see `RCCL Replayer GitHub documentation <https://github.com/ROCm/rccl/tree/develop/tools/RcclReplayer>`_.
+
+You must build the RCCL Replayer before you can use it. To build it, run these commands. Ensure ``MPI_DIR`` is set to 
+the path where MPI is installed.
+
+.. code:: shell
+
+   cd rccl/tools/rccl_replayer
+   MPI_DIR=/path/to/mpi make
+
+To use the RCCL Replayer, follow these steps: 
+
+#. Collect the per-rank logs from the RCCL run by adding the following environment variables.
+   This prevents any race conditions that might cause ranks to interrupt the output from other ranks.
+
+   .. code:: shell
+
+      NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=COLL NCCL_DEBUG_FILE=some_name_here.%h.%p.log
+
+#. Combine all the logs into a single file. This will become the input to the RCCL Replayer.
+
+   .. code:: shell
+
+      cat some_name_here.*.log > some_name_here.log
+
+#. Run the RCCL Replayer using the following command. Replace ``<numProcesses>`` with the number of MPI processes to 
+   run, ``</path/to/logfile>`` with the path to the collective log file generated during 
+   the RCCL runs, and ``<numGpusPerMpiRank>`` with the number of GPUs per MPI rank used in the application.
+
+   .. code:: shell
+
+      mpirun -np <numProcesses> ./rcclReplayer </path/to/logfile> <numGpusPerMpiRank>
+
+   In a multi-node application environment, you can replay the collective logs on multiple nodes
+   using the following command:
+
+   .. code:: shell
+
+      mpirun --hostfile <path/to/hostfile.txt> -np <numProcesses> ./rcclReplayer </path/to/logfile> <numGpusPerMpiRank>
+
+   .. note::
+
+      Depending on the MPI library you're using, you might need to modify the ``mpirun`` command.
+
+.. _analyze-performance-info:
+
+Analyzing performance issues
+=============================
+
+If the problem involves performance issues in an e2e workload, try the following
+microbenchmarks and collect the results. Follow the instructions in the subsequent sections
+to run these benchmarks and provide the results to the support team.
+
+*  TransferBench
+*  RCCL Unit Tests
+*  rccl-tests
+  
+Collect the TransferBench data
+---------------------------------
+
+TransferBench allows you to benchmark simultaneous copies between
+user-specified devices. For more information, 
+see the :doc:`TransferBench documentation <transferbench:index>`.
+
+To collect the TransferBench data, follow these steps:
+
+#. Clone the TransferBench Git repository.
+
+   .. code:: shell
+
+      git clone https://github.com/ROCm/TransferBench.git 
+
+#. Change to the new directory and build the component.
+
+   .. code:: shell
+
+      cd TransferBench
+      make
+
+#. Run the TransferBench utility with the following parameters and save the results.
+
+   .. code:: shell
+
+      USE_FINE_GRAIN=1 GFX_UNROLL=2 ./TransferBench a2a 64M 8
+
+Collect the RCCL microbenchmark data
+-------------------------------------
+
+To use the RCCL tests to collect the RCCL benchmark data, follow these steps:
+
+#. Disable NUMA auto-balancing using the following command:
+
+   .. code:: shell
+
+      sudo sysctl kernel.numa_balancing=0
+
+   Run the following command to verify the setting. The expected output is ``0``.
+
+   .. code:: shell
+
+      cat /proc/sys/kernel/numa_balancing
+
+#. Build MPI, RCCL, and rccl-tests. To download and install MPI, see either 
+   `OpenMPI <https://www.open-mpi.org/software/ompi/v5.0/>`_ or `MPICH <https://www.mpich.org/>`_.
+   To learn how to build and run rccl-tests, see the `rccl-tests GitHub <https://github.com/ROCm/rccl-tests/blob/develop/README.md>`_.
+
+#. Run rccl-tests with MPI and collect the performance numbers.
+
+RCCL and NCCL comparisons
+=============================
+
+If you are also using NVIDIA hardware or NCCL and notice a performance gap between the two systems,
+collect the system and performance data on the NVIDIA platform. 
+Provide both sets of data to the support team.
@@ -0,0 +1,333 @@
+.. meta::
+   :description: How to use the NCCL Net API
+   :keywords: RCCL, ROCm, library, API, NCCL Net, plugin
+
+.. _using-nccl:
+
+*****************************
+Using the NCCL Net plugin API
+*****************************
+
+NCCL provides a way to use external plugins to let NCCL run on many network types. This 
+topic describes the NCCL Net plugin API and explains how to implement a network plugin for NCCL.
+
+Plugins implement the NCCL network API and decouple NCCL binary builds, which are built against a
+particular version of the GPU stack (such as NVIDIA CUDA), from the network code, which is built against a
+particular version of the networking stack. Using this method, you can easily integrate any CUDA version
+with any network stack version.
+
+NCCL network plugins are packaged as a shared library called ``librccl-net.so``. The shared library
+contains one or more implementations of the NCCL Net API in the form of versioned structs,
+which are filled with pointers to all required functions.
+
+Plugin architecture
+===================
+
+When NCCL is initialized, it searches for a ``librccl-net.so`` library and dynamically loads it,
+then searches for symbols inside the library.
+
+The ``NCCL_NET_PLUGIN`` environment variable allows multiple plugins to coexist. If it's set, NCCL
+looks for a library named ``librccl-net-${NCCL_NET_PLUGIN}.so``. It is therefore
+recommended that you name the library according to that pattern, with a symlink pointing from ``librccl-net.so``
+to ``librccl-net-${NCCL_NET_PLUGIN}.so``. This lets users select the correct plugin
+if there are multiple plugins in the path.
+
+Struct versioning
+-----------------
+
+After a library is found, NCCL looks for a symbol named ``ncclNet_vX``, with ``X`` increasing
+over time. This versioning pattern ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide a number of these symbols, implementing many versions
+of the NCCL Net API. This is so the same plugin can be compiled for and support a wide range of NCCL
+versions.
+
+Conversely, and to ease transition, NCCL can choose to support different plugin versions. It can look
+for the latest ``ncclNet`` struct version but also search for older versions, so that older plugins
+still work.
+
+In-network collective operations (collNet)
+----------------------------------------------
+
+In addition to the ``ncclNet`` structure, network plugins can provide a ``collNet`` structure which
+implements any supported in-network collective operations. This is an optional
+structure provided by the network plugin,
+but its versioning is tied to the ``ncclNet`` structure and many functions are common between the two to
+ease implementation. The ``collNet`` structure can be used by the NCCL ``collNet``
+algorithm to accelerate inter-node reductions in allReduce.
+
+Header management
+------------------
+
+To help users effortlessly build plugins, plugins should copy the ``ncclNet_vX`` definitions
+they support to their list of internal includes. An example is shown in ``ext-net/example/``, which stores
+all headers in the ``nccl/`` directory and provides thin layers to implement old versions on top
+of newer ones.
+
+The ``nccl/`` directory is populated with ``net_vX.h`` files, which extract all relevant definitions
+from the old API versions. It also provides error codes in ``err.h``.
+
+API (v6)
+=========
+
+Here is the main ``ncclNet_v6`` struct. Each function is explained in later sections.
+
+.. code:: c
+
+    typedef struct {
+    // Name of the network (mainly for logs)
+    const char* name;
+    // Initialize the network.
+    ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+    // Return the number of adapters.
+    ncclResult_t (*devices)(int* ndev);
+    // Get various device properties.
+    ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+    // Create a receiving object and provide a handle to connect to it. The
+    // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+    // between ranks to create a connection.
+    ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+    // Connect to a handle and return a sending comm object for that peer.
+    // This call must not block for the connection to be established, and instead
+    // should return successfully with sendComm == NULL with the expectation that
+    // it will be called again until sendComm != NULL.
+    ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+    // Finalize connection establishment after remote peer has called connect.
+    // This call must not block for the connection to be established, and instead
+    // should return successfully with recvComm == NULL with the expectation that
+    // it will be called again until recvComm != NULL.
+    ncclResult_t (*accept)(void* listenComm, void** recvComm);
+    // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+    // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+    ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+    /* DMA-BUF support */
+    ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+    ncclResult_t (*deregMr)(void* comm, void* mhandle);
+    // Asynchronous send to a peer.
+    // May return request == NULL if the call cannot be performed (or would block)
+    ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+    // Asynchronous recv from a peer.
+    // May return request == NULL if the call cannot be performed (or would block)
+    ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+    // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+    // visible to the GPU
+    ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+    // Test whether a request is complete. If size is not NULL, it returns the
+    // number of bytes sent/received.
+    ncclResult_t (*test)(void* request, int* done, int* sizes);
+    // Close and free send/recv comm objects
+    ncclResult_t (*closeSend)(void* sendComm);
+    ncclResult_t (*closeRecv)(void* recvComm);
+    ncclResult_t (*closeListen)(void* listenComm);
+    } ncclNet_v6_t;
+
+Error codes
+-----------
+
+All plugin functions use NCCL error codes as their return value. ``ncclSuccess`` should be returned upon
+success. Otherwise, plugins can return one of the following codes:
+
+* ``ncclSystemError`` is the most common error for network plugins. It should be returned when a call to the Linux kernel or a system library fails. This typically includes all network and hardware errors.
+* ``ncclInternalError`` is returned when the NCCL core code is using the network plugin in an incorrect way, for example, allocating more requests than it should or passing an invalid argument in API calls.
+* ``ncclInvalidUsage`` should be returned when the error is most likely due to user error. This can include misconfiguration, but also size mismatches.
+* ``ncclInvalidArgument`` should not typically be used by plugins because arguments should be checked by the NCCL core layer.
+* ``ncclUnhandledCudaError`` is returned when an error is received from NVIDIA CUDA. Network plugins should not need to rely on CUDA, so this error should not be common.
+
+Operational overview
+--------------------
+
+NCCL first calls the ``init`` function, queries the number of network devices with the
+``devices`` function, and retrieves the properties from each network device using ``getProperties``.
+
+To establish a connection between two network devices, NCCL first calls ``listen`` on the
+receiving side. It passes the returned handle to the sender side of the connection, and uses it to call ``connect``.
+Finally, ``accept`` is called on the receiving side to finalize the establishment of the connection.
+
+After the connection is established, communication is performed using the functions ``isend``,
+``irecv``, and ``test``. Prior to calling ``isend`` or ``irecv``, NCCL calls the ``regMr`` function on
+all buffers to allow RDMA NICs to prepare the buffers. ``deregMr`` is used to unregister buffers.
+
+In certain conditions, ``iflush`` is called after a ``receive`` call completes to allow the network
+plugin to flush data and ensure the GPU processes the newly written data.
+
+To close the connections, NCCL calls ``closeListen`` to close the object returned by ``listen``,
+``closeSend`` to close the object returned by ``connect``, and ``closeRecv`` to close the object returned
+by ``accept``.
+
+API Functions
+-------------
+
+The NCCL Net plugin API provides the following interface for initialization, connection management, and
+communications.
+
+Initialization
+^^^^^^^^^^^^^^
+
+*  ``name`` - The ``name`` field should point to a character string with the name of the network plugin. This name is used for all logging, especially when ``NCCL_DEBUG=INFO`` is set.
+
+   .. note::
+
+      Setting ``NCCL_NET=<plugin name>`` ensures a specific network implementation is used, with
+      a matching ``name``. This is not to be confused with ``NCCL_NET_PLUGIN`` which defines a suffix for the
+      ``librccl-net.so`` library name to load.
+
+*  ``init`` - As soon as NCCL finds the plugin and the correct ``ncclNet`` symbol, it calls the ``init`` function. This allows the plugin to discover network devices and ensure they are usable.
+   If the ``init`` function does not return ``ncclSuccess``, then NCCL does not use the plugin and falls back to internal ones.
+
+   To allow the plugin logs to seamlessly integrate into the NCCL logs, NCCL provides a logging function to ``init``. This function is typically used to allow ``INFO`` and ``WARN`` macros within the plugin code by adding the following definitions:
+
+   .. code:: c
+
+      #define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+      #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+
+*  ``devices`` - After the plugin is initialized, NCCL queries the number of devices available. 
+   This should not be zero. Otherwise, NCCL initialization will fail. If no device is present or usable, the ``init`` function should not return ``ncclSuccess``.
+
+*  ``getProperties`` - Right after retrieving the number of devices, NCCL queries the properties for each available network device. 
+   These properties are necessary when multiple adapters are present to ensure NCCL uses each adapter in the optimal way.
+
+   *  The ``name`` is only used for logging.
+
+   *  The ``pciPath`` is the base for all topology detection and should point to the PCI device directory
+      in ``/sys``. This is typically the directory pointed to by ``/sys/class/net/eth0/device`` or
+      ``/sys/class/infiniband/mlx5_0/device``. If the network interface is virtual, then ``pciPath`` should
+      be ``NULL``.
+
+   *  The ``guid`` field is used to determine whether network adapters are connected to multiple PCI
+      endpoints. For normal cases, this is set to the device number. If multiple network devices have
+      the same ``guid``, then NCCL understands them to be sharing the same network port to the fabric. In this case,
+      it will not use the port multiple times.
+
+   *  The ``ptrSupport`` field indicates whether or not CUDA pointers are supported. If so, it should be
+      set to ``NCCL_PTR_HOST|NCCL_PTR_CUDA``. Otherwise, it should be set to ``NCCL_PTR_HOST``. If the plugin
+      supports ``dmabuf``, it should set ``ptrSupport`` to ``NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF`` and
+      provide a ``regMrDmaBuf`` function.
+
+   *  The ``regIsGlobal`` field allows NCCL to register buffers in advance, for example, using a loopback connection.
+      Later, it also lets NCCL expect that a subsequent registration on a buffer from a previous registration
+      will happen nearly immediately, because the buffer is already known by the network adapter. A typical
+      implementation maintains a registration cache, with the call to ``ncclCommRegister`` creating the
+      initial entry in the cache using ``regMr()`` on a loopback connection. Any later call to the NCCL
+      system can call ``regMr()`` again on the real connection, with the real buffer (which could be at a
+      different offset within the original buffer, with a smaller size, for example). It
+      could then call ``deregMr()`` immediately afterwards.
+      The ``ncclCommDeregister`` call should issue the final call to ``deregMr()`` and effectively remove the mapping
+      on the network adapter.
+
+   *  The ``speed`` field indicates the speed of the network port in Mbps (10^6 bits per second).
+      This ensures proper optimization of flows within the node.
+
+   *  The ``port`` field indicates the port number. This is important for topology detection and
+      flow optimization within the node when a NIC with a single PCI connection is connected to the fabric through multiple ports.
+
+   *  The ``latency`` field indicates the network latency in microseconds. This can be useful to
+      improve the NCCL tuning and ensure NCCL switches from tree to ring at the correct size.
+
+   *  The ``maxComms`` field indicates the maximum number of connections that can be created.
+
+   *  The ``maxRecvs`` field indicates the maximum number of grouped receive operations (see ``irecv``).
+
+Connection establishment
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Connections are used in a unidirectional manner, with a sender side and a receiver
+side.
+
+*  ``listen`` - To create a connection, NCCL calls ``listen`` on the receiver side.
+   This function accepts a device number as an input argument and returns a local ``listenComm`` object and a ``handle``
+   to pass to the other side of the connection, so that the sender can connect to the receiver.
+   The ``handle`` is a buffer of size ``NCCL_NET_HANDLE_MAXSIZE`` and is provided by NCCL.
+   This call should never block, but unlike ``connect`` and ``accept``, ``listenComm`` should never be ``NULL``
+   if the call succeeds.
+
+*  ``connect`` - NCCL uses its bootstrap infrastructure to provide the ``handle`` to the sender side,
+   then calls ``connect`` on the sender side on a given device index ``dev`` and provides the ``handle``.
+   ``connect`` should not block either. If the connection cannot be established immediately, it should set
+   ``sendComm`` to ``NULL`` and return ``ncclSuccess``. In that case, NCCL will keep calling ``connect`` again until it succeeds.
+
+*  ``accept`` - To finalize the connection, the receiver side calls ``accept`` on the ``listenComm`` object
+   previously returned by the ``listen`` call. If the sender did not connect yet, ``accept`` should not block.
+   It should return ``ncclSuccess``, setting ``recvComm`` to ``NULL``. NCCL will keep calling ``accept``
+   again until it succeeds.
+
+*  ``closeListen`` / ``closeSend`` / ``closeRecv`` - When a ``listenComm``, ``sendComm``, or ``recvComm`` object is no longer
+   needed, NCCL calls ``closeListen``, ``closeSend``, or ``closeRecv`` to free the associated resources.
+
+Communication
+^^^^^^^^^^^^^
+
+Communication is handled using the asynchronous send and receive operations: ``isend``, ``irecv``, and ``test``.
+To support RDMA capabilities, buffer registration and flush functions are provided.
+
+To keep track of asynchronous send, receive, and flush operations, requests are returned to NCCL,
+then queried using ``test``. Each ``sendComm`` or ``recvComm`` must be able to handle
+``NCCL_NET_MAX_REQUESTS`` requests in parallel.
+
+.. note::
+
+   This value should be multiplied by the multi-receive capability of the plugin for the sender
+   side, so the plugin can effectively have ``NCCL_NET_MAX_REQUESTS`` multi-receive operations happening
+   in parallel. If ``maxRecvs`` is 8 and ``NCCL_NET_MAX_REQUESTS`` is 8, then each
+   ``sendComm`` must be able to handle up to 64 (8x8) concurrent ``isend`` operations.
+
+*  ``regMr`` - Prior to sending or receiving data, NCCL calls ``regMr`` with any buffers later used for communication.
+   It provides a ``sendComm`` or ``recvComm`` object for the ``comm`` argument,
+   the buffer pointer ``data``, the ``size``, and the ``type``. The type is either ``NCCL_PTR_HOST`` or ``NCCL_PTR_CUDA`` if
+   the network supports CUDA pointers.
+
+   The network plugin can use the output argument ``mhandle`` to store any reference to the memory registration, because
+   ``mhandle`` is returned for all ``isend``, ``irecv``, ``iflush``, and ``deregMr`` calls.
+
+*  ``regMrDmaBuf`` - If the plugin has set the ``NCCL_PTR_DMABUF`` property in ``ptrSupport``, 
+   NCCL uses ``regMrDmaBuf`` instead of ``regMr``. If the property was not set, ``regMrDmaBuf`` can be set to ``NULL``.
+
+*  ``deregMr`` - When buffers are no longer used for communication, NCCL calls ``deregMr`` to let the plugin
+   free resources. This function is used to deregister handles returned by ``regMr`` and ``regMrDmaBuf``.
+
+*  ``isend`` - Data is sent through the connection using ``isend``, passing the ``sendComm`` object previously created
+   by ``connect``, the buffer described by ``data``, ``size``, and ``mhandle``. A ``tag`` must
+   be used if the network supports multi-receive operations (see ``irecv``) to distinguish between different send requests
+   matching the same multi-receive. Otherwise it can be set to ``0``.
+
+   The ``isend`` operation returns a handle in the ``request`` argument for further calls to ``test``.
+   If the ``isend`` operation cannot be initiated, ``request`` is set to ``NULL``. NCCL will call ``isend`` again later.
+
+*  ``irecv`` - To receive data, NCCL calls ``irecv`` with the ``recvComm`` returned by ``accept``.
+   The argument ``n`` configures NCCL for multi-receive, to allow grouping of multiple sends
+   through a single network connection. Each buffer can be described by the ``data``, ``sizes``, and ``mhandles`` arrays.
+   ``tags`` specify a tag for each receive so that each of the ``n`` independent ``isend`` operations is received
+   into the right buffer.
+
+   If all receive operations can be initiated, ``irecv`` returns a handle in the ``request`` pointer. Otherwise,
+   it sets the pointer to ``NULL``. In the case of multi-receive, all ``n`` receive operations are handled by a single request handle.
+
+   The sizes provided to ``irecv`` can (and will) be larger than the size of the ``isend`` operation.
+   However, it is an error if the receive size is smaller than the send size.
+
+   .. note::
+
+      For a given connection, send and receive operations should always match in the order they were
+      posted. Tags provided for receive operations are only used to assign a given send operation to one
+      of the buffers of the first (multi-)receive operation in the queue, not to allow for out-of-order tag
+      matching on any receive operation posted.
+
+*  ``test`` - After an ``isend`` or ``irecv`` operation is initiated, NCCL calls ``test`` on the request handles until
+   the operation completes. When that happens, ``done`` is set to ``1`` and ``sizes`` is set to the real size sent or received.
+   The latter could potentially be lower than the size passed to ``irecv``.
+
+   In the case of a multi-receive, all receives are considered as part of a single operation, the goal
+   being to allow aggregation. Therefore, they share a single request and a single ``done`` status. However,
+   they can have different sizes, so if ``done`` is non-zero, the ``sizes`` array should contain the ``n`` sizes
+   corresponding to the buffers passed to ``irecv``.
+
+   After ``test`` returns ``1`` in ``done``, the request handle can be freed. This means that NCCL will never
+   call ``test`` again on that request, unless it is reallocated by another call to ``isend`` or ``irecv``.
+
+*  ``iflush`` - After a receive operation completes, if the operation was targeting GPU memory and received
+   a non-zero number of bytes, NCCL calls ``iflush``. This lets the network flush any buffer to ensure
+   the GPU can read it immediately without seeing stale data. This flush operation is decoupled from
+   the ``test`` code to improve the latency of ``LL*`` protocols, because those are capable of determining
+   when data is valid or not.
+
+   ``iflush`` returns a request which must be queried using ``test`` until it completes.
@@ -0,0 +1,135 @@
+.. meta::
+   :description: How to use the RCCL Tuner plugin API
+   :keywords: RCCL, ROCm, library, API, Tuner, plugin
+
+.. _using-rccl-tuner-plugin:
+
+*******************************
+Using the RCCL Tuner plugin API
+*******************************
+
+An external plugin enables users to hand-tailor the selection of an algorithm,
+protocol, and number of channels (thread blocks) based on an input configuration specifying the
+message size, number of nodes and GPUs, and link types (for instance, PCIe, XGMI, or NET).
+One advantage of this plugin is that each user can create and maintain their own hand-tailored tuner
+without relying on RCCL to develop and maintain it. This topic describes the API required to implement
+an external tuner plugin for RCCL.
+
+The following usage notes are relevant when using the RCCL Tuner plugin API:
+
+*  The API allows partial outputs: tuners can set only the algorithm and protocol and let RCCL set the remaining fields,
+   such as the number of channels.
+*  If ``getCollInfo()`` fails, RCCL uses its default internal mechanisms to determine the best collective configuration.
+*  ``getCollInfo`` is called for each collective invocation per communicator, so special care
+   must be taken to avoid introducing excessive latency.
+*  The supported RCCL algorithms are ``NCCL_ALGO_TREE`` and ``NCCL_ALGO_RING``.
+*  The supported RCCL protocols are ``NCCL_PROTO_SIMPLE``, ``NCCL_PROTO_LL`` and ``NCCL_PROTO_LL128``.
+
+   *  Until support is present for network collectives, use the example in the ``pluginGetCollInfo`` API implementation
+      to ignore other algorithms as follows:
+
+      .. code-block:: cpp
+
+         if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
+         if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1) continue;
+         if (a == NCCL_ALGO_NVLS && collNetSupport != 1) continue;
+
+.. note::
+   
+   The `example plugin <https://github.com/ROCm/rccl/blob/develop/ext-tuner/example/plugin.c>`_
+   uses math models to approximate the bandwidth and latency of the available selection of algorithms and protocols
+   and select the one with the lowest calculated latency. It is customized for the AMD Instinct MI300 accelerators and RoCEv2 networks
+   on a limited number of nodes. This example, which is intended for demonstration purposes only, is not meant to be inclusive of all potential AMD GPUs and network configurations.
+
+API description
+================
+
+To build a custom tuner, implement the ``ncclTuner_v1_t`` structure.
+
+Structure: ncclTuner_v1_t
+---------------------------
+
+**Fields**
+
+*  ``name``
+  
+   *  **Type**: ``const char*``
+   *  **Description**: The name of the tuner, which can be used for logging purposes when ``NCCL_DEBUG=info`` and ``NCCL_DEBUG_SUBSYS=tune`` are set.
+
+**Functions**
+
+*  ``init`` (called upon communicator initialization with ``ncclCommInitRank``)
+
+   Initializes the tuner states. Each communicator initializes its tuner. ``nNodes`` x ``nRanks`` = the total number of GPUs participating in the collective communication.
+
+   *  **Parameters**:
+
+      * ``nRanks`` (``size_t``): The number of devices (GPUs).
+      * ``nNodes`` (``size_t``): The number of operating system nodes (physical nodes or VMs).
+      * ``logFunction`` (``ncclDebugLogger_t``): A log function for certain debugging info.
+
+   *  **Return**:
+
+      *  **Type**: ``ncclResult_t``
+      *  **Description**: The result of the initialization.
+
+*  ``getCollInfo`` (called for each collective call per communicator)
+
+   Retrieves information about the collective algorithm, protocol, and number of channels for the given input parameters.
+
+   *  **Parameters**:
+
+      * ``collType`` (``ncclFunc_t``): The collective type, for example, ``allreduce`` or ``allgather``.
+      * ``nBytes`` (``size_t``): The size of the collective in bytes.
+      * ``collNetSupport`` (``int``): Whether ``collNet`` supports this type.
+      * ``nvlsSupport`` (``int``): Whether NVLink SHARP supports this type.
+      * ``numPipeOps`` (``int``): The number of operations in the group.
+  
+   *  **Outputs**:
+
+      * ``algorithm`` (``int*``): The selected algorithm to be used for the given collective.
+      * ``protocol`` (``int*``): The selected protocol to be used for the given collective.
+      * ``nChannels`` (``int*``): The number of channels (and SMs) to be used.
+     
+   *  **Description**:
+
+      If ``getCollInfo()`` does not return ``ncclSuccess``, RCCL falls back to its default tuning for the given collective.
+      The tuner is allowed to leave fields unset, in which case RCCL automatically sets those fields.
+
+   *  **Return**:
+
+      *  **Type**: ``ncclResult_t``
+      *  **Description**: The result of the operation.
+
+*  ``destroy`` (called upon communicator finalization with ``ncclCommFinalize``)
+
+   Terminates the plugin and cleans up any resources allocated by the tuner.
+
+   *  **Return**:
+
+      *  **Type**: ``ncclResult_t`` 
+      *  **Description**: The result of the cleanup process.
+
+Build and usage instructions
+============================
+
+To use the external plugin, implement the desired algorithm and protocol selection technique using the API described above.
+As a reference, the `following example <https://github.com/ROCm/rccl/blob/develop/ext-tuner/example/plugin.c>`_ is based on the
+MI300 tuning table by default.
+
+Building and using the example libnccl-tuner.so file
+-----------------------------------------------------
+
+#. Build the ``libnccl-tuner.so`` file following `the example Makefile <https://github.com/ROCm/rccl/blob/develop/ext-tuner/example/Makefile>`_.
+
+   .. code-block:: shell
+
+      cd $RCCL_HOME/ext-tuner/example/
+      make
+
+#. Tell RCCL to use the custom ``libnccl-tuner.so`` file by setting the following environment variable
+   to the file path:
+
+   .. code-block:: shell
+
+      export NCCL_TUNER_PLUGIN=$RCCL_HOME/ext-tuner/example/libnccl-tuner.so
@@ -0,0 +1,50 @@
+.. meta::
+   :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
+   :keywords: RCCL, ROCm, library, API
+
+.. _index:
+
+******************
+RCCL documentation
+******************
+
+The ROCm Communication Collectives Library (RCCL) is a stand-alone library
+that provides multi-GPU and multi-node collective communication primitives
+optimized for AMD GPUs. It uses PCIe and xGMI high-speed interconnects.
+To learn more, see :doc:`what-is-rccl`.
+
+The RCCL public repository is located at `<https://github.com/ROCm/rccl>`_.
+
+.. grid:: 2
+  :gutter: 3
+
+  .. grid-item-card:: Install
+
+    * :doc:`Installing RCCL using the install script <./install/installation>`
+    * :doc:`Running RCCL using Docker <./install/docker-install>`
+    * :doc:`Building and installing RCCL from source code <./install/building-installing>`
+
+  .. grid-item-card:: How to
+
+    * :doc:`Using the RCCL Tuner plugin <./how-to/using-rccl-tuner-plugin-api>`
+    * :doc:`Using the NCCL Net plugin <./how-to/using-nccl>`
+    * :doc:`Troubleshoot RCCL <./how-to/troubleshooting-rccl>`
+    * :doc:`RCCL usage tips <./how-to/rccl-usage-tips>`
+
+
+  .. grid-item-card:: Examples
+
+    * `RCCL Tuner plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-tuner/example>`_
+    * `NCCL Net plugin examples <https://github.com/ROCm/rccl/tree/develop/ext-net/example>`_
+
+  .. grid-item-card:: API reference
+
+    * :ref:`Library specification<library-specification>`
+    * :ref:`api-library`
+    * :ref:`Environment variables<env-variables>`
+
+To contribute to the documentation, see
+`Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
+
+You can find licensing information on the
+`Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
@@ -0,0 +1,103 @@
+.. meta::
+   :description: Information on how to build the RCCL library from source code
+   :keywords: RCCL, ROCm, library, API, build, install
+
+.. _building-from-source:
+
+*********************************************
+Building and installing RCCL from source code
+*********************************************
+
+To build RCCL directly from the source code, follow these steps. This guide also includes
+instructions explaining how to test the build.
+For information on using the quick start install script to build RCCL, see :doc:`installation`.
+
+Requirements
+============
+
+The following prerequisites are required to build RCCL:
+
+1. ROCm-supported GPUs
+2. Having the ROCm stack installed on the system, including the :doc:`HIP runtime <hip:index>` and the HIP-Clang compiler.
+
+Building the library using CMake:
+---------------------------------
+
+To build the library from source, follow these steps:
+
+.. code-block:: shell
+
+    git clone --recursive https://github.com/ROCm/rccl.git
+    cd rccl
+    mkdir build
+    cd build
+    cmake ..
+    make -j 16      # Or some other suitable number of parallel jobs
+
+If you have already cloned the repository, you can check out the external submodules manually.
+
+.. code-block:: shell
+
+    git submodule update --init --recursive --depth=1
+
+You can substitute a different installation path by providing the path as a parameter
+to ``CMAKE_INSTALL_PREFIX``, for example:
+
+.. code-block:: shell
+
+    cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release ..
+
+.. note::
+
+    Ensure ROCm CMake is installed using the command ``apt install rocm-cmake``. By default,
+    CMake builds the component in debug mode unless ``-DCMAKE_BUILD_TYPE`` is specified.
+
+
+Building the RCCL package and install package:
+----------------------------------------------
+
+After you have cloned the repository and built the library as described in the previous section,
+use this command to build the package:
+
+.. code-block:: shell
+
+    cd rccl/build
+    make package
+    sudo dpkg -i *.deb
+
+.. note::
+   
+   The RCCL package install process requires ``sudo`` or root access because it creates a directory
+   named ``rccl`` in ``/opt/rocm/``. This is an optional step. RCCL can be used directly by including the path containing ``librccl.so``.
+
+Testing RCCL
+============
+
+The RCCL unit tests are implemented using the Googletest framework in RCCL. These unit tests require Googletest 1.10
+or higher to build and run (this dependency can be installed using the ``-d`` option for ``install.sh``).
+To run the RCCL unit tests, go to the ``build`` folder and the ``test`` subfolder,
+then run the appropriate RCCL unit test executables.
+
+The RCCL unit test names follow this format:
+
+.. code-block:: shell
+
+    CollectiveCall.[Type of test]
+
+Filtering of the RCCL unit tests can be done using environment variables
+and by passing the ``--gtest_filter`` command line flag:
+
+.. code-block:: shell
+
+    UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
+
+This command runs only the ``AllReduce`` correctness tests with the ``bfloat16`` datatype and the ``prod`` reduction operation.
+A list of the available environment variables for filtering appears at the top of every run.
+See the `Googletest documentation <https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests>`_
+for more information on how to form advanced filters.
+
+There are also other performance and error-checking tests for RCCL. They are maintained separately at `<https://github.com/ROCm/rccl-tests>`_.
+
+.. note::
+
+    For more information on how to build and run rccl-tests, see the `rccl-tests README file <https://github.com/ROCm/rccl-tests/blob/develop/README.md>`_.
@@ -0,0 +1,52 @@
+.. meta::
+   :description: Instructions on how to install the RCCL library for collective communication primitives using Docker
+   :keywords: RCCL, ROCm, library, API, install, Docker
+
+.. _install-docker:
+
+*****************************************
+Running RCCL using Docker
+*****************************************
+
+To use Docker to run RCCL, Docker must already be installed on the system.
+To build the Docker image and run the container, follow these steps.
+
+#. Build the Docker image
+
+   By default, the Dockerfile uses ``docker.io/rocm/dev-ubuntu-22.04:latest`` as the base Docker image.
+   It then installs RCCL and rccl-tests (in both cases, it uses the version from the ``develop`` branch).
+
+   Use this command to build the Docker image:
+
+   .. code-block:: shell
+
+      docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
+
+   The base Docker image, rccl repository, rccl-tests repository, and GPU targets can be modified
+   by using ``--build-arg`` in the ``docker build`` command above. For example, to use a different base Docker image for the MI250 GPU,
+   use this command:
+
+   .. code-block:: shell
+
+      docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull .
+
+#. Launch an interactive Docker container on a system with AMD GPUs:
+
+   .. code-block:: shell
+
+      docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash
+
+To run, for example, the ``all_reduce_perf`` test from rccl-tests on 8 AMD GPUs from inside the Docker container, use this command
+for ROCm 6.4.1 or earlier:
+
+.. code-block:: shell
+
+   mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
+
+For ROCm 6.4.2 or later, use this command:
+
+.. code-block:: shell
+
+   mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1
+
+For more information on the rccl-tests options, see the `Usage guidelines <https://github.com/ROCm/rccl-tests#usage>`_ in the GitHub repository.
@@ -0,0 +1,85 @@
+.. meta::
+   :description: Instructions on how to install the RCCL library for collective communication primitives using the quick start install script
+   :keywords: RCCL, ROCm, library, API, install
+
+.. _install:
+
+*****************************************
+Installing RCCL using the install script
+*****************************************
+
+To quickly install RCCL using the install script, follow these steps.
+For instructions on building RCCL from the source code, see :doc:`building-installing`.
+For additional tips, see :doc:`../how-to/rccl-usage-tips`.
+
+Requirements
+============
+
+The following prerequisites are required to use RCCL:
+
+1. ROCm-supported GPUs
+2. The ROCm stack must be installed on the system, including the :doc:`HIP runtime <hip:index>` and the HIP-Clang compiler.
+
+Quick start RCCL build
+======================
+
+RCCL directly depends on the HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
+For ROCm installation instructions, see the :doc:`package manager installation guide <rocm-install-on-linux:install/install-methods/package-manager-index>`.
+
+Use the `install.sh helper script <https://github.com/ROCm/rccl/blob/develop/install.sh>`_,
+located in the root directory of the RCCL repository,
+to build and install RCCL with a single command. It uses hard-coded configurations that can be specified directly
+when using cmake. However, it's a great way to get started quickly and provides an
+example of how to build and install RCCL.
+
+Building the library using the install script:
+----------------------------------------------
+
+To build the library using the install script, use this command:
+
+.. code-block:: shell
+
+    ./install.sh
+
+For more information on the build options and flags for the install script, run the following command:
+
+.. code-block:: shell
+
+    ./install.sh --help
+
+The RCCL build and installation helper script options are as follows:
+
+.. code-block:: shell
+
+       --address-sanitizer     Build with address sanitizer enabled
+    -d|--dependencies          Install RCCL dependencies
+       --debug                 Build debug library
+       --enable_backtrace      Build with custom backtrace support
+       --disable-colltrace     Build without collective trace
+       --disable-msccl-kernel  Build without MSCCL kernels
+       --enable-mscclpp        Build with MSCCL++ support
+    -f|--fast                  Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
+    -h|--help                  Prints this help message
+    -i|--install               Install RCCL library (see --prefix argument below)
+    -j|--jobs                  Specify how many parallel compilation jobs to run ($nproc by default)
+    -l|--local_gpu_only        Only compile for local GPU architecture
+       --amdgpu_targets        Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
+       --no_clean              Don't delete files if they already exist
+       --npkit-enable          Compile with npkit enabled
+       --openmp-test-enable    Enable OpenMP in rccl unit tests
+       --roctx-enable          Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program)
+    -p|--package_build         Build RCCL package
+       --prefix                Specify custom directory to install RCCL to (default: `/opt/rocm`)
+       --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility
+       --run_tests_all         Run all rccl unit tests (must be built already)
+    -r|--run_tests_quick       Run small subset of rccl unit tests (must be built already)
+       --static                Build RCCL as a static library instead of shared library
+    -t|--tests_build           Build rccl unit tests, but do not run
+       --time-trace            Plot the build time of RCCL (requires `ninja-build` package installed on the system)
+       --verbose               Show compile commands
+
+.. tip::
+
+    By default, the RCCL install script builds all the GPU targets that are defined in ``DEFAULT_GPUS`` in `CMakeLists.txt <https://github.com/ROCm/rccl/blob/develop/CMakeLists.txt>`_.
+    To target specific GPUs and potentially reduce the build time, use ``--amdgpu_targets`` along with
+    a semicolon (``;``) separated string list of the GPU targets.
@@ -0,0 +1,8 @@
+.. meta::
+   :description: RCCL licensing information
+   :keywords: RCCL, ROCm, library, API, license
+
+License
+=======
+
+.. include:: ../LICENSE.txt
@@ -0,0 +1,45 @@
+root: index
+subtrees:
+
+- entries:
+  - file: what-is-rccl.rst
+    title: What is RCCL?
+
+- caption: Install
+  entries:
+  - file: install/installation
+    title: Installation guide
+  - file: install/docker-install
+    title: Running RCCL using Docker
+  - file: install/building-installing
+    title: Building and installing from source
+
+- caption: How to
+  entries:
+  - file: how-to/using-rccl-tuner-plugin-api
+    title: Using the RCCL Tuner plugin
+  - file: how-to/using-nccl
+    title: Using the NCCL Net plugin
+  - file: how-to/troubleshooting-rccl
+    title: Troubleshoot RCCL
+  - file: how-to/rccl-usage-tips
+
+- caption: Examples
+  entries:
+  - url: https://github.com/ROCm/rccl/tree/develop/ext-tuner/example
+    title: RCCL Tuner plugin examples
+  - url: https://github.com/ROCm/rccl/tree/develop/ext-net/example
+    title: NCCL Net plugin examples
+
+- caption: API reference
+  entries:
+  - file: api-reference/library-specification
+    title: Library specification
+  - file: api-reference/api-library
+  - file: api-reference/env-variables
+    title: Environment variables
+
+- caption: About
+  entries:
+  - file: license
+  - file: attributions
@@ -0,0 +1 @@
+rocm-docs-core==1.26.0
@@ -0,0 +1,277 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile requirements.in
+#
+accessible-pygments==0.0.5
+    # via pydata-sphinx-theme
+alabaster==0.7.16
+    # via sphinx
+asttokens==3.0.0
+    # via stack-data
+attrs==24.3.0
+    # via
+    #   jsonschema
+    #   jupyter-cache
+    #   referencing
+babel==2.15.0
+    # via
+    #   pydata-sphinx-theme
+    #   sphinx
+beautifulsoup4==4.12.3
+    # via pydata-sphinx-theme
+breathe==4.35.0
+    # via rocm-docs-core
+certifi==2024.7.4
+    # via requests
+cffi==1.16.0
+    # via
+    #   cryptography
+    #   pynacl
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via
+    #   jupyter-cache
+    #   sphinx-external-toc
+comm==0.2.2
+    # via ipykernel
+cryptography==44.0.1
+    # via pyjwt
+debugpy==1.8.12
+    # via ipykernel
+decorator==5.1.1
+    # via ipython
+deprecated==1.2.14
+    # via pygithub
+docutils==0.21.2
+    # via
+    #   breathe
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   sphinx
+exceptiongroup==1.2.2
+    # via ipython
+executing==2.1.0
+    # via stack-data
+fastjsonschema==2.19.1
+    # via
+    #   nbformat
+    #   rocm-docs-core
+gitdb==4.0.11
+    # via gitpython
+gitpython==3.1.43
+    # via rocm-docs-core
+greenlet==3.1.1
+    # via sqlalchemy
+idna==3.7
+    # via requests
+imagesize==1.4.1
+    # via sphinx
+importlib-metadata==8.6.1
+    # via
+    #   jupyter-cache
+    #   myst-nb
+ipykernel==6.29.5
+    # via myst-nb
+ipython==8.31.0
+    # via
+    #   ipykernel
+    #   myst-nb
+jedi==0.19.2
+    # via ipython
+jinja2==3.1.6
+    # via
+    #   myst-parser
+    #   sphinx
+jsonschema==4.23.0
+    # via nbformat
+jsonschema-specifications==2024.10.1
+    # via jsonschema
+jupyter-cache==1.0.1
+    # via myst-nb
+jupyter-client==8.6.3
+    # via
+    #   ipykernel
+    #   nbclient
+jupyter-core==5.7.2
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   nbclient
+    #   nbformat
+markdown-it-py==3.0.0
+    # via
+    #   mdit-py-plugins
+    #   myst-parser
+markupsafe==2.1.5
+    # via jinja2
+matplotlib-inline==0.1.7
+    # via
+    #   ipykernel
+    #   ipython
+mdit-py-plugins==0.4.1
+    # via myst-parser
+mdurl==0.1.2
+    # via markdown-it-py
+myst-nb==1.1.2
+    # via rocm-docs-core
+myst-parser==3.0.1
+    # via myst-nb
+nbclient==0.10.2
+    # via
+    #   jupyter-cache
+    #   myst-nb
+nbformat==5.10.4
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   nbclient
+nest-asyncio==1.6.0
+    # via ipykernel
+packaging==24.0
+    # via
+    #   ipykernel
+    #   sphinx
+parso==0.8.4
+    # via jedi
+pexpect==4.9.0
+    # via ipython
+platformdirs==4.3.6
+    # via jupyter-core
+prompt-toolkit==3.0.50
+    # via ipython
+psutil==6.1.1
+    # via ipykernel
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.3
+    # via stack-data
+pycparser==2.22
+    # via cffi
+pydata-sphinx-theme==0.16.1
+    # via
+    #   rocm-docs-core
+    #   sphinx-book-theme
+pygithub==2.3.0
+    # via rocm-docs-core
+pygments==2.18.0
+    # via
+    #   accessible-pygments
+    #   ipython
+    #   pydata-sphinx-theme
+    #   sphinx
+pyjwt[crypto]==2.8.0
+    # via pygithub
+pynacl==1.5.0
+    # via pygithub
+python-dateutil==2.9.0.post0
+    # via jupyter-client
+pyyaml==6.0.1
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   myst-parser
+    #   rocm-docs-core
+    #   sphinx-external-toc
+pyzmq==26.2.0
+    # via
+    #   ipykernel
+    #   jupyter-client
+referencing==0.36.1
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+requests==2.32.4
+    # via
+    #   pygithub
+    #   sphinx
+rocm-docs-core==1.26.0
+    # via -r requirements.in
+rpds-py==0.22.3
+    # via
+    #   jsonschema
+    #   referencing
+six==1.17.0
+    # via python-dateutil
+smmap==5.0.1
+    # via gitdb
+snowballstemmer==2.2.0
+    # via sphinx
+soupsieve==2.5
+    # via beautifulsoup4
+sphinx==7.3.7
+    # via
+    #   breathe
+    #   myst-nb
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   rocm-docs-core
+    #   sphinx-book-theme
+    #   sphinx-copybutton
+    #   sphinx-design
+    #   sphinx-external-toc
+    #   sphinx-notfound-page
+sphinx-book-theme==1.1.2
+    # via rocm-docs-core
+sphinx-copybutton==0.5.2
+    # via rocm-docs-core
+sphinx-design==0.6.0
+    # via rocm-docs-core
+sphinx-external-toc==1.0.1
+    # via rocm-docs-core
+sphinx-notfound-page==1.0.2
+    # via rocm-docs-core
+sphinxcontrib-applehelp==1.0.8
+    # via sphinx
+sphinxcontrib-devhelp==1.0.6
+    # via sphinx
+sphinxcontrib-htmlhelp==2.0.5
+    # via sphinx
+sphinxcontrib-jsmath==1.0.1
+    # via sphinx
+sphinxcontrib-qthelp==1.0.7
+    # via sphinx
+sphinxcontrib-serializinghtml==1.1.10
+    # via sphinx
+sqlalchemy==2.0.37
+    # via jupyter-cache
+stack-data==0.6.3
+    # via ipython
+tabulate==0.9.0
+    # via jupyter-cache
+tomli==2.0.1
+    # via sphinx
+tornado==6.5.1
+    # via
+    #   ipykernel
+    #   jupyter-client
+traitlets==5.14.3
+    # via
+    #   comm
+    #   ipykernel
+    #   ipython
+    #   jupyter-client
+    #   jupyter-core
+    #   matplotlib-inline
+    #   nbclient
+    #   nbformat
+typing-extensions==4.12.0
+    # via
+    #   ipython
+    #   myst-nb
+    #   pydata-sphinx-theme
+    #   pygithub
+    #   referencing
+    #   sqlalchemy
+urllib3==2.5.0
+    # via
+    #   pygithub
+    #   requests
+wcwidth==0.2.13
+    # via prompt-toolkit
+wrapt==1.16.0
+    # via deprecated
+zipp==3.21.0
+    # via importlib-metadata
@@ -0,0 +1,31 @@
+.. meta::
+   :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs
+   :keywords: RCCL, ROCm, library, API
+
+.. _what-is:
+
+******************
+What is RCCL?
+******************
+
+The ROCm Communication Collectives Library (RCCL) includes multi-GPU and
+multi-node collective communication primitives optimized for AMD GPUs.
+It implements routines such as ``all-reduce``, ``all-gather``, ``reduce``,
+``broadcast``, ``reduce-scatter``, ``gather``, ``scatter``, ``all-to-allv``,
+and ``all-to-all``, as well as direct point-to-point (GPU-to-GPU) send
+and receive operations. It is optimized to achieve high bandwidth
+on platforms using PCIe and xGMI and networking using InfiniBand Verbs or TCP/IP
+sockets. RCCL supports an arbitrary number of GPUs installed in a single node
+or multiple nodes and can be used in either
+single- or multi-process (for example, MPI) applications.
+
+The collective operations are implemented using ring and tree algorithms and have been optimized
+for throughput and latency by leveraging topology awareness, high-speed interconnects,
+and RDMA-based collectives. For best performance, small operations can be either
+batched into larger operations or aggregated through the API.
+
+RCCL uses PCIe and xGMI high-speed interconnects for intra-node communication
+as well as InfiniBand, RoCE, and TCP/IP for inter-node communication.
+It supports an arbitrary number of GPUs installed in a single-node or
+multi-node platform and can easily integrate into
+single- or multi-process (for example, MPI) applications.
@@ -0,0 +1,419 @@
+# NCCL Net Plugin Documentation
+
+This page describes the NCCL Net plugin API and how to implement a network plugin for NCCL.
+
+# Overview
+
+To allow NCCL to work on any network type, NCCL provides a way to use external plugins. Plugins
+implement the NCCL network API, and decouple NCCL binary builds which are built against a
+particular version of the GPU stack (i.e. CUDA) from the network code which is built against a
+particular version of the networking stack. That way, we can easily integrate any CUDA version
+with any network stack version.
+
+NCCL network plugins come as a shared library called `libnccl-net.so`. That shared library
+contains one or more implementations of the NCCL NET API, in the form of versioned structs,
+filled with pointers to all required functions.
+
+# Plugin architecture
+
+## Plugin name and supporting multiple network plugins
+
+When NCCL is initialized, it will look for a `libnccl-net.so` library and dynamically load it,
+then look for symbols inside the library.
+
+The `NCCL_NET_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
+will look for a library with a name of `libnccl-net-${NCCL_NET_PLUGIN}.so`. It is therefore
+advised to name the library following that pattern, with a symlink pointing `libnccl-net.so`
+to `libnccl-net-${NCCL_NET_PLUGIN}.so`. That way, if there are multiple plugins in the path,
+setting `NCCL_NET_PLUGIN` will allow users to select the right plugin.
+
+## Struct versioning
+
+Once a library is found, NCCL will look for a symbol named `ncclNet_vX`, with `X` increasing
+over time. The versioning ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide multiple of those symbols, implementing multiple versions
+of the NCCL NET API, so that the same plugin can be compiled and support a wide range of NCCL
+versions.
+
+Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
+for the latest ncclNet struct version, but also looking for older ones so that older plugins
+would still work.
+
+## In-network collective operations, a.k.a. collNet
+
+In addition to the ncclNet structure, network plugins can provide a collNet structure which
+implements in-network collective operations, if supported. That can be used by the NCCL collNet
+algorithm to accelerate inter-node reductions in allReduce.
+
+The collNet struct is a different, optional struct provided by the network plugin, but its
+versioning is tied to the ncclNet struct and many functions are common between the two to
+ease the implementation.
+
+## Headers management
+
+To help users build plugins effortlessly, plugins should copy the `ncclNet_vX` definitions
+they support to their internal includes. An example is shown in `ext-net/example/` where we keep
+all headers in the `nccl/` directory and provide thin layers to implement old versions on top
+of newer ones.
+
+The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
+from old API versions. It also provides error codes in `err.h`.
+
+# API (v10)
+
+Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
+
+```
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_t;
+```
+
+## Error codes
+
+All plugin functions use NCCL error codes as return values. `ncclSuccess` should be returned upon
+success.
+
+Otherwise, plugins can return one of the following:
+ - `ncclSystemError` is the most common error for network plugins, when a call to the linux kernel
+or a system library fails. This typically includes all network/hardware errors.
+ - `ncclInternalError` is returned when the NCCL core code is using the network plugin in an
+incorrect way, for example allocating more requests than it should, or passing an invalid argument
+to calls.
+ - `ncclInvalidUsage` should be returned when the error is most likely a user error. This can
+include misconfiguration, but also sizes mismatch.
+ - `ncclInvalidArgument` should usually not be used by plugins since arguments should be checked by
+the NCCL core layer.
+ - `ncclUnhandledCudaError` is returned when an error comes from CUDA. Since network plugins should
+not need to rely on CUDA, this should not be common.
+
+## Operation overview
+
+NCCL will call the `init` function first, then query the number of network devices with the
+`devices` function, getting each network device properties with `getProperties`.
+
+If NCCL wishes to initialize virtual devices, used in NIC fusion currently, it can call `makeVDevice`
+specifying a list of physical devices (the original devices listed from `devices`) it wishes to
+merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to null.
+
+To establish a connection between two network devices, NCCL will first call `listen` on the
+receiving side, pass the returned handle to the sender side of the connection, and call `connect`
+with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
+establishment.
+
+`connect` and `accept` can receive an optional `netDevComm` pointer from the caller, if the caller
+wishes to make use of device networking. This parameter may be ignored by the plugin if it does
+not support device-side networking.
+
+Once the connection is established, communication will be done using the functions `isend`,
+`irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
+all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
+
+In certain conditions, `iflush` will be called after a receive call completes to allow the network
+plugin to flush data and ensure the GPU will observe the newly written data.
+
+To close the connections NCCL will call `closeListen` to close the object returned by `listen`,
+`closeSend` to close the object returned by `connect` and `closeRecv` to close the object returned
+by `accept`.
+
+## API Functions
+
+### Initialization
+`name`
+
+The `name` field should point to a character string with the name of the network plugin. This will
+be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
+
+Note: setting `NCCL_NET=<plugin name>` will ensure a specific network implementation is used, with
+a matching `name`. This is not to be confused with `NCCL_NET_PLUGIN` which defines a suffix to the
+`libnccl-net.so` library name to load.
+
+`init`
+
+As soon as NCCL finds the plugin and the correct ncclNet symbol, it will call the `init` function.
+This will allow the plugin to discover network devices and make sure they are usable. If the
+`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
+internal ones.
+
+To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
+function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
+the plugin code adding the following definitions:
+
+```
+#define WARN(...) logFunction(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+```
+
+The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and
+record its own events with the NCCL profiler plugin.
+
+`devices`
+
+Once the plugin is initialized, NCCL will query the number of devices available. It should not
+be zero, otherwise NCCL initialization will fail. If no device is present or usable, the `init`
+function should not return `ncclSuccess`.
+
+`getProperties`
+
+Right after getting the number of devices, NCCL will query properties for each available network
+device. These properties are critical when multiple adapters are present to ensure NCCL uses each
+adapter in the most optimized way.
+
+The `name` is only used for logging.
+
+The `pciPath` is the base for all topology detection and should point to the PCI device directory
+in /sys. This is typically the directory pointed to by `/sys/class/net/eth0/device` or
+`/sys/class/infiniband/mlx5_0/device`. If the network interface is virtual, then `pciPath` should
+be `NULL`.
+
+The `guid` field is used to determine when network adapters are connected to multiple PCI
+endpoints. For normal cases, it can be set to the device number. If multiple network devices have
+the same guid, then NCCL will consider they are sharing the same network port to the fabric, hence
+it will not use the port multiple times.
+
+The `ptrSupport` field indicates whether or not CUDA pointers are supported. If so, it should be
+set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HOST`. If the plugin
+supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
+provide a `regMrDmaBuf` function.
+
+The `regIsGlobal` field allows NCCL to register buffers in advance using e.g. a loopback connection
+and later on, expect that another registration on a buffer contained within a previous registration
+will be nearly immediate, as the buffer is already known by the network adapter. A typical
+implementation would maintain a registration cache; the call to ncclCommRegister will create the
+initial entry in the cache using regMr() on a loopback connection. Any later call to NCCL
+operations will call regMr() again on the real connection, with the real buffer (could be at a
+different offset within the original buffer, with a smaller size, etc), then deregMr() right after.
+The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping
+on the network adapter.
+
+The `forceFlush` field can request the NCCL core to call flush for all transfers. By default,
+flushes are only called when the GPU architecture or PCI topology would not guarantee correct
+PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the
+completion paths use different PCI links and therefore need a call to flush() to guarantee
+ordering.
+
+The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
+important to ensure proper optimization of flows within the node.
+
+The `port` field indicates the port number. This is important again for topology detection and flow
+optimization within the node when a NIC with a single PCI connection is connected to the fabric
+with multiple ports.
+
+The `latency` field indicates the network latency in microseconds. This can be useful to improve
+the NCCL tuning and make sure NCCL switches from tree to ring at the right size.
+
+The `maxComms` field indicates the maximum number of connections we can create.
+
+The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
+receive).
+
+The `netDeviceType` indicates which type of device networking this plugin supports. The current supported
+options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`.
+
+The `netDeviceVersion` indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as a part of NCCL core's device code.
+
+The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for
+point-to-point and collective calls. This will tell the NCCL core to cut large operations into
+multiple smaller chunks if needed.
+
+`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device.
+
+### Connection establishment
+
+Connections are used in a unidirectional manner. There is therefore a sender side and a receiver
+side.
+
+`listen`
+
+To create a connection, NCCL will start by calling `listen` on the receiver side. This function
+takes a device number as input argument, and should return a local `listenComm` object, and a
+`handle` to pass to the other side, so that the sender side can connect to the receiver.
+
+The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
+
+This call should never block, but contrary to `connect` and `accept`, `listenComm` should never
+be `NULL` if the call succeeds.
+
+`connect`
+
+NCCL will use its bootstrap infrastructure to provide the `handle` to the sender side, then call
+`connect` on the sender side on a given device index `dev`, providing the `handle`. `connect`
+should not block either; if the connection cannot be established yet, it should instead set
+`sendComm` to `NULL` and return `ncclSuccess`. In that case, NCCL will call `connect` again until
+it succeeds.
+
+`accept`
+
+To finalize the connection, the receiver side will call `accept` on the `listenComm` returned by
+the `listen` call previously. If the sender did not connect yet, `accept` should not block. It
+should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
+succeeds.
+
+The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
+This field can be used by the network plugin to specify the QoS level of the connection. By default,
+`trafficClass` is set to -1 but can be configured by the application during communicator initialization
+to select a plugin-supported QoS level.
+
+`closeListen`/`closeSend`/`closeRecv`
+
+Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
+`closeListen`/`closeSend`/`closeRecv` to free the associated resources.
+
+### Communication
+
+Communication is done using asynchronous send and receive operations: `isend`, `irecv` and `test`.
+To support RDMA capabilities, buffer registration and flush functions are provided.
+
+To keep track of asynchronous send, receive and flush operations, requests are returned to NCCL,
+then queried with `test`. Each `sendComm` or `recvComm` must be able to handle
+`NCCL_NET_MAX_REQUESTS` requests in parallel.
+
+Note: That value should be multiplied by the multi-receive capability of the plugin for the sender
+side, so that we can effectively have `NCCL_NET_MAX_REQUESTS` multi-receive operations happening
+in parallel. So, if we have a `maxRecvs` value of 8 and `NCCL_NET_MAX_REQUESTS` is 8, then each
+`sendComm` must be able to handle up to 8x8=64 concurrent `isend` operations.
+
+`regMr`
+
+Prior to sending or receiving data, NCCL will call `regMr` with any buffers later used for
+communication. It will provide a `sendComm` or `recvComm` as `comm` argument, then the buffer
+pointer `data`, `size`, and `type` being either `NCCL_PTR_HOST`, or `NCCL_PTR_CUDA` if the network
+supports CUDA pointers.
+
+The network plugin can use the output argument `mhandle` to keep any reference to that memory
+registration, as this `mhandle` will be passed back for all `isend`, `irecv`, `iflush` and
+`deregMr` calls.
+
+`regMrDmaBuf`
+
+If the plugin has set the `NCCL_PTR_DMABUF` property in `ptrSupport`, NCCL will use `regMrDmaBuf`
+instead of `regMr`. If the property was not set, `regMrDmaBuf` can be set to `NULL`.
+
+
+`deregMr`
+
+When buffers will no longer be used for communication, NCCL will call `deregMr` to let the plugin
+free resources. This function is used to deregister handles returned by both `regMr` and
+`regMrDmaBuf`.
+
+`isend`
+
+Data will be sent through the connection using `isend`, passing the `sendComm` previously
+created by `connect`, and the buffer described by `data`, `size`, and `mhandle`. A `tag` must be
+used if the network supports multi-receive operations (see `irecv`) to distinguish between
+different sends matching the same multi-receive. Otherwise it can be set to 0.
+
+The `isend` operation returns a handle in the `request` argument for further calls to `test`. If
+the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
+`isend` again later.
+
+The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin
+to support network defined events.
+
+`irecv`
+
+To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
+`n` will allow NCCL to perform a multi-receive, to allow grouping of multiple sends through a
+single network connection. Each buffer will be described by the `data`, `sizes`, and `mhandles`
+arrays. `tags` will specify a tag for each receive so that each of the `n` independent `isend`
+operations is received into the right buffer.
+
+If all receive operations can be initiated, `irecv` will return a handle in the `request` pointer,
+otherwise it will set it to `NULL`. In the case of multi-receive, all `n` receive operations are
+handled by a single request handle.
+
+The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
+The contrary (receive size being lower than the send size) is an error, however.
+
+NCCL sets request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using
+LL or LL128 protocols. In these cases, NCCL polls on flag embedded in data to detect completion
+of irecv and is resilient to redundant network writes. This allows the plugin to optimize request
+completions on such irecvs (for example, complete the request immediately). The plugin is still
+expected to set a valid request pointer on return which NCCL can poll to check for completion.
+
+The `pHandles` argument allows NCCL to pass an array of opaque handles that can be used by the
+network plugin to support network defined events.
+
+Note: for a given connection, send/receive operations should always match in the order they were
+posted. Tags provided for receive operations are only used to assign a given send operation to one
+of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
+matching on any receive operation posted.
+
+`test`
+
+After an `isend` or `irecv` operation is initiated, NCCL will call `test` on the request handles
+until they complete. When that happens, `done` will be set to 1 and `sizes` will be set to the
+real size sent or received, the latter being potentially lower than the size passed to `irecv`.
+
+In the case of a multi-receive, all receives will be considered as done as a single operation (the
+goal being to allow aggregation), hence they share a single request and a single `done` status.
+However, they can have different sizes, so when `done` is non-zero, the `sizes` array should
+contain the `n` sizes corresponding to the buffers passed to `irecv`.
+
+Once `test` returns 1 in `done`, the request handle can be freed, meaning that NCCL will never
+call `test` again on that request (until it is reallocated by another call to `isend` or `irecv`).
+
+`iflush`
+
+After a receive operation completes, if the operation was targeting GPU memory and received a
+non-zero number of bytes, NCCL will call `iflush` to let the network flush any buffer and ensure
+the GPU can read it right after without seeing stale data. This flush operation is decoupled from
+the `test` code to improve latency of `LL*` protocols, as those are capable of determining when
+data is valid or not.
+
+`iflush` returns a request which needs to be queried with `test` until it completes.
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# .DEFAULT_GOAL is a GNU Make special variable, so it has to be assigned.
+# Written as a rule ("​.DEFAULT_GOAL: build") it only declares an inert target
+# with that name. Assigning it before the include keeps `build` the default
+# goal even if common.mk defines targets of its own.
+.DEFAULT_GOAL := build
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)
+
+# Every C file in this directory goes into the example plugin.
+SRC_FILES := $(wildcard *.c)
+
+# Neither goal produces a file of its own name, so never let a stray file
+# called "build" or "clean" make them look up to date.
+.PHONY: build clean
+
+build: ${BUILDDIR}/libnccl-net-example.so
+
+# Compile and link all sources in a single step; -Inccl puts the local nccl/
+# header directory on the include path.
+${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES}
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl -fPIC -shared -o $@ $^
+
+clean:
+	rm -f ${BUILDDIR}/libnccl-net-example.so
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+#include <stdint.h>
+
+// Log severity levels; this is the type of the logger callback's `level` argument.
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;
+// Subsystem identifiers. Every named value is a distinct single bit, so they can be OR-ed into a
+// mask; NCCL_ALL (~0) has every bit set.
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+// Logging callback type: takes a severity level, a flags word, the call site (`file`, `line`) and a
+// format string followed by variadic arguments.
+// NOTE(review): `flags` presumably carries ncclDebugLogSubSys bits -- confirm.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+// Network profiler event kinds, numbered consecutively from 0.
+// NOTE(review): presumably the values passed as `type` to ncclProfilerCallback_t -- confirm.
+enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop };
+
+// Profiler callback type. ncclResult_t is not declared in this header, so its definition must
+// already be visible wherever this header is included (e.g. by including "err.h" first).
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
+
+#endif
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins: the return type of the plugin API functions. */
+typedef enum { ncclSuccess                 =  0,  /* Returned upon success */
+               ncclUnhandledCudaError      =  1,  /* An error came from CUDA */
+               ncclSystemError             =  2,  /* A kernel or system library call failed (network/hardware errors) */
+               ncclInternalError           =  3,  /* The NCCL core used the plugin in an incorrect way */
+               ncclInvalidArgument         =  4,  /* Invalid argument; normally checked by the NCCL core, not by plugins */
+               ncclInvalidUsage            =  5,  /* Most likely a user error, e.g. misconfiguration or size mismatch */
+               ncclRemoteError             =  6 } ncclResult_t;
+
+#endif
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_H_
+#define NET_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "err.h"
+#include "net_device.h"
+#include "common.h"
+
+// Upper bound, in bytes, on the connection handle filled in by listen() and exchanged between ranks.
+#define NCCL_NET_HANDLE_MAXSIZE 128
+// 1 TB. The leading factors multiply as int (reaching 2^30) and only the final 1024L widens the
+// product to long, so the value is only representable where long is 64 bits wide.
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
+// Sentinel described in the plugin documentation: NCCL stores it in the irecv request pointer when
+// it uses the LL or LL128 protocols, allowing the plugin to optimize request completion.
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
+
+// Pointer/memory type flags. Each is a distinct bit so they can be OR-ed together (see the
+// ptrSupport field of the device properties).
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 32
+
+// Versioned API definitions, newest first. They are included after the macros above and after the
+// headers pulled in earlier in this file, whose types they use.
+#include "net_v10.h"
+#include "net_v9.h"
+#include "net_v8.h"
+#include "net_v7.h"
+#include "net_v6.h"
+#include "net_v5.h"
+#include "net_v4.h"
+#include "net_v3.h"
+#include "net_v2.h"
+
+// Unversioned names resolve to the v10 definitions.
+typedef ncclNet_v10_t ncclNet_t;
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
+#endif // end include guard
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_DEVICE_H_
+#define NET_DEVICE_H_
+
+// Version value 0 is reserved to mean "invalid".
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
+
+// Kind of device networking a plugin supports.
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+
+// Device networking handle, introduced with API v7.
+// NOTE: size_t is used below but this header includes nothing, so the including file must
+// already provide it (e.g. via <stddef.h> or <stdlib.h>).
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;                    // NOTE(review): presumably opaque plugin-defined data -- confirm
+  size_t size;                     // NOTE(review): presumably the size of the data behind `handle` -- confirm
+  int needsProxyProgress;
+} ncclNetDeviceHandle_v7_t;
+
+// Later API versions alias the v7 layout unchanged; the unversioned name tracks v10.
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
+
+#endif
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V10_H_
+#define NET_V10_H_
+
+// Capacity of the devs[] array below.
+#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
+// Describes the devices that make up a virtual (fused) NIC. Used as the argument of makeVDevice
+// and as the vProps member of ncclNetProperties_v10_t.
+typedef struct {
+  int ndevs;                                // NOTE(review): presumably the number of valid entries in devs[] -- confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];  // Each entry is an index pointing to a child device
+} ncclNetVDeviceProps_v10_t;
+
+
+// "Undefined" traffic class; -1 is the documented default for trafficClass.
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+// Per-connection configuration handed to the connect() callback.
+typedef struct {
+  // Plugin-specific TC value
+  int trafficClass;
+} ncclNetCommConfig_v10_t;
+
+
+// Properties of one network device, filled in by the getProperties() callback.
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys. NULL for virtual interfaces.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency in microseconds.
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v10_t vProps; // Devices that have been fused into this device
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v10_t;
+
+// Version 10 of the network plugin API: a name plus a table of function pointers.
+// Every callback returns an ncclResult_t.
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network. logFunction and profFunction are callbacks supplied by the caller
+  // for logging and profiling.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer. Posts n receives at once; data, sizes, tags,
+  // mhandles and phandles are per-receive arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
+} ncclNet_v10_t;
+
+#endif // end include guard
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V2_H_
+#define NET_V2_H_
+
+typedef struct {  // v2 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Return the device path in /sys. NCCL will call free on this path.
+  ncclResult_t (*pciPath)(int dev, char** path);
+  // Return whether this device supports host pointers and/or CUDA pointers
+  // as data from the current GPU. Supported types should be composed with
+  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after the remote peer has called connect.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm is a sendComm or a recvComm; type is NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v2_t;
+
+#endif // end include guard
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V3_H_
+#define NET_V3_H_
+
+#define NCCL_NET_MAX_REQUESTS_V3 16
+
+typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;  // v3 properties are the v4 layout under another name
+typedef struct {  // v3 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after the remote peer has called connect.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v3_t;
+
+#endif // end include guard
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V4_H_
+#define NET_V4_H_
+
+#define NCCL_NET_HANDLE_MAXSIZE_V4 64
+
+typedef struct {  // Per-device properties, filled in through getProperties()
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  int maxComms;   // Maximum number of comms we can create
+} ncclNetProperties_v4_t;
+
+// v4 network interface (a name plus a table of function pointers), kept for backwards compatibility
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after the remote peer has called connect.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. Unlike the v2/v3 flush, this variant also takes a request output.
+  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v4_t;
+
+#endif // end include guard
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V5_H_
+#define NET_V5_H_
+
+typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;  // v5 properties are the v6 layout under another name
+typedef struct {  // v5 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. data/sizes/tags/mhandles presumably hold n entries each (TODO confirm).
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+#endif // end include guard
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V6_H_
+#define NET_V6_H_
+
+typedef struct {  // Per-device properties, filled in through getProperties()
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  float latency;  // Network latency (units are not stated in this header)
+  int maxComms;   // Maximum number of comms we can create
+  int maxRecvs;   // Maximum number of grouped receives.
+}ncclNetProperties_v6_t;
+
+typedef struct {  // v6 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: like regMr, with an extra offset and file descriptor */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. data/sizes/tags/mhandles presumably hold n entries each (TODO confirm).
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v6_t;
+
+#endif // end include guard
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V7_H_
+#define NET_V7_H_
+
+typedef struct {  // Per-device properties, filled in through getProperties()
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency (units are not stated in this header)
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;
+
+typedef struct {  // v7 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: like regMr, with an extra offset and file descriptor */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. data/sizes/tags/mhandles presumably hold n entries each (TODO confirm).
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+#endif // end include guard
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V8_H_
+#define NET_V8_H_
+
+typedef struct {  // Per-device properties, filled in through getProperties()
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency (units are not stated in this header)
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v8_t;
+
+typedef struct {  // v8 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: like regMr, with an extra offset and file descriptor */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. data/sizes/tags/mhandles presumably hold n entries each (TODO confirm).
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+#endif // end include guard
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V9_H_
+#define NET_V9_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4  // Capacity of ncclNetVDeviceProps_v9_t.devs
+typedef struct {  // Properties handed to makeVDevice() to describe a virtual NIC
+  int ndevs;  // presumably the number of valid entries in devs (TODO confirm)
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];  // presumably device indices making up the virtual NIC (TODO confirm)
+} ncclNetVDeviceProps_v9_t;
+
+typedef struct {  // Per-device properties, filled in through getProperties()
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency (units are not stated in this header)
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v9_t vProps; // Virtual device properties (see ncclNetVDeviceProps_v9_t)
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v9_t;
+
+typedef struct {  // v9 network interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: like regMr, with an extra offset and file descriptor */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. data/sizes/tags/mhandles presumably hold n entries each (TODO confirm).
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. Note: sizes is int* here, whereas irecv takes size_t*.
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
+} ncclNet_v9_t;
+
+#endif // end include guard
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
+
+/* Data types. Identifiers listed on the same line are aliases that share one value. */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+               ncclFloat8e4m3 = 10,
+               ncclFloat8e5m2 = 11,
+} ncclDataType_t;
+
+#endif
@@ -0,0 +1,418 @@
+/*************************************************************************
+ * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "net.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+#define NCCL_PLUGIN_MAX_RECVS 1
+
+// Starts at NCCL_NET_MAX_REQUESTS; pluginInit_v3 overwrites it with
+// NCCL_NET_MAX_REQUESTS_V3.
+int max_requests = NCCL_NET_MAX_REQUESTS;
+
+// Plugin initialisation hook. The example ignores both arguments and succeeds.
+__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
+  return ncclSuccess;
+}
+
+// Writes the device count to *ndev; the example reports zero devices.
+__hidden ncclResult_t pluginDevices(int* ndev) {
+  *ndev = 0;
+  return ncclSuccess;
+}
+
+// Not implemented in the example: always fails.
+__hidden ncclResult_t pluginPciPath(int dev, char** path) {
+  return ncclInternalError;
+}
+
+// Not implemented in the example: always fails.
+__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
+  // Every value below is a default for the example device; leave as-is if unsure.
+
+  // ---- Identity and topology ----
+  props->name = "Example";
+  // Set to the device's sysfs PCI path for proper topology detection,
+  // e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
+  props->pciPath = NULL;
+  // guid is only used to detect NICs with multiple PCI attachments;
+  // port is used in conjunction with guid.
+  props->guid = 0;
+  props->port = 0;
+  // Tells NCCL core whether this is a virtual device fusing multiple physical
+  // devices. Here it is a single device that refers to itself.
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+
+  // ---- Memory registration and data path ----
+  // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
+  props->ptrSupport = NCCL_PTR_HOST;
+  // Set to 1 if regMr has a fast registration cache. With 0, user buffer
+  // registration may be disabled.
+  props->regIsGlobal = 0;
+  // Force a flush after receive. Needed if the control path and data path use
+  // a different path to the GPU.
+  props->forceFlush = 0;
+
+  // ---- Performance hints ----
+  // Speed in *Mbps*. 100000 means 100G.
+  props->speed = 100000;
+  // Custom latency (used to help tuning if latency is high). 0 means use the
+  // default NCCL values.
+  props->latency = 0;
+
+  // ---- Limits ----
+  // Maximum number of comm objects we can create.
+  props->maxComms = 1024*1024;
+  // Maximum number of receive operations taken by irecv().
+  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
+  // Maximum transfer sizes the plugin can handle.
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+
+  // ---- Coupling with NCCL network device-side code ----
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+
+  return ncclSuccess;
+}
+
+// The entry points below are placeholders in this example plugin: each one
+// ignores its arguments and returns ncclInternalError.
+
+// ---- Connection establishment ----
+__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) {
+  return ncclInternalError;
+}
+
+// ---- Memory registration ----
+__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) {
+  return ncclInternalError;
+}
+
+// ---- Data transfer ----
+__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginTest(void* request, int* done, int* size) {
+  return ncclInternalError;
+}
+
+// ---- Teardown ----
+__hidden ncclResult_t pluginCloseSend(void* sendComm) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginCloseRecv(void* recvComm) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginCloseListen(void* listenComm) {
+  return ncclInternalError;
+}
+
+// ---- Device offload and virtual NICs ----
+__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
+  return ncclInternalError;
+}
+
+#define PLUGIN_NAME "Plugin"
+
+// v10 dispatch table: every slot is wired to the unversioned implementation.
+const ncclNet_v10_t ncclNetPlugin_v10 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect,
+  .accept        = pluginAccept,
+  // memory registration
+  .regMr         = pluginRegMr,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend,
+  .irecv         = pluginIrecv,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  // device offload and virtual NICs
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice,
+};
+
+// v9 shims: each adapts a v9 signature to the current implementation by
+// supplying NULL for the arguments v9 does not carry.
+
+// v9 init takes no profiler callback.
+__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction)
+{
+  return pluginInit(logFunction, NULL);
+}
+
+// Forwards the v9 properties struct by reinterpreting the pointer as the
+// current properties type.
+__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props)
+{
+  ncclNetProperties_t* current = (ncclNetProperties_t*)props;
+  return pluginGetProperties(dev, current);
+}
+
+// v9 connect takes no comm config.
+__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm)
+{
+  return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
+}
+
+// v9 isend takes no phandle.
+__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request)
+{
+  return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request);
+}
+
+// v9 irecv takes no phandles array.
+__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request)
+{
+  return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
+}
+
+// Not implemented in the example: always fails.
+__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props)
+{
+  return ncclInternalError;
+}
+
+// v9 dispatch table: uses the _v9 shims where the signature differs from the
+// current API and the unversioned functions everywhere else.
+const ncclNet_v9_t ncclNetPlugin_v9 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v9,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v9,
+  .accept        = pluginAccept,
+  // memory registration
+  .regMr         = pluginRegMr,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v9,
+  .irecv         = pluginIrecv_v9,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  // device offload and virtual NICs
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice_v9,
+};
+
+// Fills a v8 properties struct: queries the current properties into a local
+// and copies across the fields assigned below. Any error from
+// pluginGetProperties is returned unchanged and props_v8 is left untouched.
+__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) {
+  ncclNetProperties_t latest;
+  ncclResult_t status = pluginGetProperties(dev, &latest);
+  if (status != ncclSuccess) {
+    return status;
+  }
+
+  // identity
+  props_v8->name             = latest.name;
+  props_v8->pciPath          = latest.pciPath;
+  props_v8->guid             = latest.guid;
+  props_v8->port             = latest.port;
+  // memory registration
+  props_v8->ptrSupport       = latest.ptrSupport;
+  props_v8->regIsGlobal      = latest.regIsGlobal;
+  // performance hints
+  props_v8->speed            = latest.speed;
+  props_v8->latency          = latest.latency;
+  // limits
+  props_v8->maxComms         = latest.maxComms;
+  props_v8->maxRecvs         = latest.maxRecvs;
+  // device-side coupling
+  props_v8->netDeviceType    = latest.netDeviceType;
+  props_v8->netDeviceVersion = latest.netDeviceVersion;
+  return ncclSuccess;
+}
+
+// v8 isend: size is an int and there is no phandle. The int size is converted
+// to size_t at the call; NULL is passed for phandle.
+__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+  const int nbytes = size;
+  return pluginIsend(sendComm, data, nbytes, tag, mhandle, NULL, request);
+}
+
+// v8 irecv: sizes are ints and there is no phandles array. The sizes are
+// widened into a size_t scratch array before forwarding to pluginIrecv.
+//
+//  - n must be in [0, NCCL_PLUGIN_MAX_RECVS]; anything else would overrun the
+//    scratch array, so it is rejected with ncclInternalError.
+//  - The actual receive count n is forwarded (not a hard-coded 1), so
+//    pluginIrecv never reads scratch entries that were not filled in.
+__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+  size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
+  if (n < 0 || n > NCCL_PLUGIN_MAX_RECVS) return ncclInternalError;
+  for (int i=0; i<n; i++) sizesOut[i] = sizes[i];
+  return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, NULL, request);
+}
+
+// v8 dispatch table: reuses the v9 init/connect shims and adds v8-specific
+// getProperties, isend and irecv adapters.
+const ncclNet_v8_t ncclNetPlugin_v8 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v8,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v9,
+  .accept        = pluginAccept,
+  // memory registration
+  .regMr         = pluginRegMr,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  // device offload
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v7->name = props.name;
+  props_v7->pciPath = props.pciPath;
+  props_v7->guid = props.guid;
+  props_v7->ptrSupport = props.ptrSupport;
+  props_v7->speed = props.speed;
+  props_v7->latency = props.latency;
+  props_v7->port = props.port;
+  props_v7->maxComms = props.maxComms;
+  props_v7->maxRecvs = props.maxRecvs;
+  props_v7->netDeviceType = props.netDeviceType;
+  props_v7->netDeviceVersion = props.netDeviceVersion;
+  return ncclSuccess;
+}
+
+// v7 regMr takes the buffer size as an int; convert it to size_t and forward.
+__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
+  const size_t nbytes = size;
+  return pluginRegMr(collComm, data, nbytes, type, mhandle);
+}
+
+// v7 dispatch table: same wiring as v8 except for the v7 getProperties and
+// regMr adapters.
+const ncclNet_v7_t ncclNetPlugin_v7 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v7,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v9,
+  .accept        = pluginAccept,
+  // memory registration
+  .regMr         = pluginRegMr_v7,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+  // device offload
+  .getDeviceMr   = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v6->name = props.name;
+  props_v6->pciPath = props.pciPath;
+  props_v6->guid = props.guid;
+  props_v6->ptrSupport = props.ptrSupport;
+  props_v6->speed = props.speed;
+  props_v6->latency = props.latency;
+  props_v6->port = props.port;
+  props_v6->maxComms = props.maxComms;
+  props_v6->maxRecvs = props.maxRecvs;
+  return ncclSuccess;
+}
+
+// v6 connect/accept have no device-handle output. In this example they are
+// standalone placeholders that always fail; they do not forward to
+// pluginConnect / pluginAccept.
+__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) {
+  return ncclInternalError;
+}
+__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) {
+  return ncclInternalError;
+}
+
+// v6 dispatch table: no getDeviceMr / irecvConsumed slots; connect and accept
+// use the v6 placeholders.
+const ncclNet_v6_t ncclNetPlugin_v6 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v6,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v6,
+  .accept        = pluginAccept_v6,
+  // memory registration
+  .regMr         = pluginRegMr_v7,
+  .regMrDmaBuf   = pluginRegMrDmaBuf,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen
+};
+
+/* v5 Compat */
+// v5 dispatch table: same wiring as v6 minus the regMrDmaBuf slot.
+const ncclNet_v5_t ncclNetPlugin_v5 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v6,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v6,
+  .accept        = pluginAccept_v6,
+  // memory registration
+  .regMr         = pluginRegMr_v7,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v8,
+  .irecv         = pluginIrecv_v8,
+  .iflush        = pluginIflush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v4 Compat */
+static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v4->name = props.name;
+  props_v4->pciPath = props.pciPath;
+  props_v4->guid = props.guid;
+  props_v4->ptrSupport = props.ptrSupport;
+  props_v4->speed = props.speed;
+  props_v4->port = props.port;
+  props_v4->maxComms = props.maxComms;
+  return ncclSuccess;
+}
+// v4 isend has no tag; forward to the v8 shim with tag 0.
+static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
+  const int noTag = 0;
+  return pluginIsend_v8(sendComm, data, size, noTag, mhandle, request);
+}
+
+// v4 irecv posts a single untagged receive; wrap the scalar arguments as
+// one-element arrays for the v8 shim.
+static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
+  int noTag = 0;
+  return pluginIrecv_v8(recvComm, 1, &data, &size, &noTag, &mhandle, request);
+}
+
+// v4 iflush covers a single buffer; wrap the scalar arguments as one-element
+// arrays for pluginIflush.
+static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
+  return pluginIflush(recvComm, 1, &data, &size, &mhandle, request);
+}
+// v4 connect: has no comm config and no device-handle output, and keeps
+// retrying pluginConnect for as long as it succeeds without producing a
+// send comm.
+//
+//  - dev      : device index, forwarded unchanged
+//  - handle   : connect handle supplied by the caller, forwarded unchanged
+//  - sendComm : output send comm; the loop ends once *sendComm is non-NULL
+// Returns the last pluginConnect result (an error ends the loop immediately).
+static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
+  ncclResult_t ret;
+  do {
+    // Scratch slot for the device handle that pluginConnect outputs and v4
+    // callers cannot receive. It needs its own name: calling it `handle`
+    // would shadow the parameter and pass NULL as the connect handle.
+    ncclNetDeviceHandle_t* devHandle = NULL;
+    ret = pluginConnect(dev, NULL, handle, sendComm, &devHandle);
+  } while (ret == ncclSuccess && *sendComm == NULL);
+  return ret;
+}
+// v4 accept: has no device-handle output, and keeps retrying pluginAccept for
+// as long as it succeeds without producing a recv comm. Returns the last
+// pluginAccept result.
+static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
+  ncclResult_t status;
+  for (;;) {
+    // Scratch slot for the device handle that v4 callers cannot receive.
+    ncclNetDeviceHandle_v7_t* devComm = NULL;
+    status = pluginAccept(listenComm, recvComm, &devComm);
+    if (status != ncclSuccess || *recvComm != NULL) break;
+  }
+  return status;
+}
+// v4 dispatch table: connect/accept use the retrying v4 shims and the data
+// path uses the single-buffer v4 adapters.
+const ncclNet_v4_t ncclNetPlugin_v4 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v9,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v4,
+  // connection establishment
+  .listen        = pluginListen,
+  .connect       = pluginConnect_v4,
+  .accept        = pluginAccept_v4,
+  // memory registration
+  .regMr         = pluginRegMr_v7,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v4,
+  .irecv         = pluginIrecv_v4,
+  .iflush        = pluginIflush_v4,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v3 Compat */
+// Synchronous flush for the v2/v3 tables: starts a flush with pluginIflush_v4
+// and then polls pluginTest until it reports completion or an error.
+// Returns the first non-success result, or ncclSuccess once done.
+static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhandle) {
+  void* pending;
+  int finished = 0;
+  ncclResult_t status = pluginIflush_v4(recvComm, data, size, mhandle, &pending);
+  // pending is only read when the flush was started successfully.
+  while (status == ncclSuccess && !finished) {
+    status = pluginTest(pending, &finished, NULL);
+  }
+  return status;
+}
+// v2/v3 init: switches the global max_requests to NCCL_NET_MAX_REQUESTS_V3
+// before running the common init with no profiler callback.
+static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction)
+{
+  max_requests = NCCL_NET_MAX_REQUESTS_V3;
+
+  return pluginInit(logFunction, NULL);
+}
+#include <string.h>
+// v3 listen: the caller's handle buffer holds NCCL_NET_HANDLE_MAXSIZE_V4
+// bytes, while pluginListen is given a NCCL_NET_HANDLE_MAXSIZE buffer. Listen
+// into a local full-size buffer and copy the leading V4-sized part back.
+//
+//  - dev        : device index, forwarded unchanged
+//  - handle     : output; written only when pluginListen succeeds
+//  - listenComm : output listen comm, forwarded unchanged
+// Returns the pluginListen result.
+static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
+  // Zero-initialised so that no indeterminate stack bytes can ever reach the
+  // caller, even if pluginListen fills only part of the buffer.
+  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0};
+  ncclResult_t ret = pluginListen(dev, pluginHandle, listenComm);
+  // On failure the local buffer holds nothing meaningful; leave the caller's
+  // handle untouched rather than overwriting it.
+  if (ret == ncclSuccess) {
+    memcpy(handle, pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
+  }
+  return ret;
+}
+// v3 connect: the caller's handle holds NCCL_NET_HANDLE_MAXSIZE_V4 bytes.
+// Copy it into a local NCCL_NET_HANDLE_MAXSIZE buffer and connect through
+// the v4 shim.
+//
+//  - dev      : device index, forwarded unchanged
+//  - handle   : input handle of NCCL_NET_HANDLE_MAXSIZE_V4 bytes
+//  - sendComm : output send comm, forwarded unchanged
+// Returns the pluginConnect_v4 result.
+static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
+  // Zero-initialised so the bytes beyond the V4-sized prefix are well defined
+  // instead of indeterminate stack contents.
+  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE] = {0};
+  memcpy(pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4);
+  return pluginConnect_v4(dev, pluginHandle, sendComm);
+}
+// v3 dispatch table: listen/connect go through the handle-resizing v3 shims,
+// and flush is the synchronous pluginFlush rather than an iflush.
+const ncclNet_v3_t ncclNetPlugin_v3 = {
+  .name          = PLUGIN_NAME,
+  // setup and discovery
+  .init          = pluginInit_v3,
+  .devices       = pluginDevices,
+  .getProperties = pluginGetProperties_v4,
+  // connection establishment
+  .listen        = pluginListen_v3,
+  .connect       = pluginConnect_v3,
+  .accept        = pluginAccept_v4,
+  // memory registration
+  .regMr         = pluginRegMr_v7,
+  .deregMr       = pluginDeregMr,
+  // data transfer
+  .isend         = pluginIsend_v4,
+  .irecv         = pluginIrecv_v4,
+  .flush         = pluginFlush,
+  .test          = pluginTest,
+  // teardown
+  .closeSend     = pluginCloseSend,
+  .closeRecv     = pluginCloseRecv,
+  .closeListen   = pluginCloseListen,
+};
+
+/* v2 Compat */
+// v2 dispatch table: exposes pciPath/ptrSupport instead of getProperties.
+// NOTE(review): unlike the v3 table, listen/connect are wired to pluginListen
+// and pluginConnect_v4 rather than the handle-resizing _v3 shims - confirm
+// this is intended for the v2 handle size.
+const ncclNet_v2_t ncclNetPlugin_v2 = {
+  .name        = PLUGIN_NAME,
+  // setup and discovery
+  .init        = pluginInit_v3,
+  .devices     = pluginDevices,
+  .pciPath     = pluginPciPath,
+  .ptrSupport  = pluginPtrSupport,
+  // connection establishment
+  .listen      = pluginListen,
+  .connect     = pluginConnect_v4,
+  .accept      = pluginAccept_v4,
+  // memory registration
+  .regMr       = pluginRegMr_v7,
+  .deregMr     = pluginDeregMr,
+  // data transfer
+  .isend       = pluginIsend_v4,
+  .irecv       = pluginIrecv_v4,
+  .flush       = pluginFlush,
+  .test        = pluginTest,
+  // teardown
+  .closeSend   = pluginCloseSend,
+  .closeRecv   = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+};
@@ -0,0 +1,22 @@
+# Builds libnccl-net.so from the sources of the google/nccl-fastsocket
+# repository, which is cloned on demand.
+
+# CUDA install prefix; only set here if not already provided by the caller.
+CUDA_HOME?=/usr/local/cuda
+# Header search path for the CUDA includes.
+INC:=-I$(CUDA_HOME)/include
+# Name of the shared library produced (also used as its soname).
+PLUGIN_SO:=libnccl-net.so
+
+# Default goal: build the plugin library in the current directory.
+default: $(PLUGIN_SO)
+
+# Compile and link every .cc file under nccl-fastsocket/ into one shared object.
+$(PLUGIN_SO): nccl-fastsocket/*.cc
+	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+# Fetch the plugin sources when they are not present yet.
+nccl-fastsocket/*.cc:
+	git clone https://github.com/google/nccl-fastsocket.git
+
+# BUILDDIR is not defined in this file; it has to be supplied by the caller
+# (environment or make command line).
+install: $(BUILDDIR)/lib/$(PLUGIN_SO)
+
+# Copy the built library into $(BUILDDIR)/lib, creating the directory if needed.
+$(BUILDDIR)/lib/$(PLUGIN_SO): $(PLUGIN_SO)
+	@printf "Grabbing %-35s > %s\n" $< $@
+	mkdir -p $(BUILDDIR)/lib
+	install -m 644 $< $@
+
+# Remove the built library and the cloned source tree.
+clean:
+	rm -f $(PLUGIN_SO)
+	rm -Rf nccl-fastsocket
@@ -0,0 +1,461 @@
+# NCCL Profiler Plugin Documentation
+
+This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
+
+# Overview
+
+To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
+interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
+use it for debugging and analysis.
+
+Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library
+called `librccl-profiler.so`. That shared library contains one or more implementations of the
+NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
+functions.
+
+# Plugin architecture
+
+## Plugin name and supporting multiple profiler plugins
+
+When NCCL is initialized, it will look for a `librccl-profiler.so` library and dynamically load
+it, then look for symbols inside the library.
+
+The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
+will look for a library with a name of `librccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
+advised to name the library following that pattern, with a symlink pointing `librccl-profiler.so`
+to `librccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
+path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
+the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `librccl-profiler.so` library.
+
+## Struct versioning
+
+Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
+over time. The versioning ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
+NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
+
+Conversely, and to ease transition, NCCL can choose to support different plugin versions, looking
+for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
+would still work.
+
+## Headers management
+
+To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
+they support to their internal includes. An example is shown in `ext-profiler/example` where we
+keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top
+of newer ones.
+
+The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
+from old API versions. It also provides error codes in `err.h`.
+
+# API (v4)
+
+Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
+
+```
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+```
+
+## Error codes
+
+As a rule of thumb, profiler-generated errors should not be propagated to NCCL and alter its normal
+functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
+them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
+The only exception is `init` that can return an error so that NCCL can disable the plugin.
+
+## Operation overview
+
+NCCL will call the `init` function first for every new communicator that is initialized. The profiler
+returns an opaque context handle that is used to isolate profiler instances across communicators.
+Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
+
+The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
+These are used to start, stop and update events in the profiler, respectively.
+
+## API Functions
+
+### Initialization
+
+#### name
+
+The `name` field should point to a character string with the name of the profiler plugin. This will
+be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
+
+#### init
+
+As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
+This allows the plugin to initialize its internal context, used during profiling of NCCL events.
+If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
+
+#### finalize
+
+When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
+up resources.
+
+### Profiling
+
+#### startEvent
+
+When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
+context, previously created by `init`, an event descriptor of type `ncclProfilerEventDescr_t` and
+returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
+later in the document.
+
+
+The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
+is the `ncclProfilerEventDescr_t` struct.
+
+```
+typedef struct {
+  uint8_t type;             // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
+  void* parentObj;          // pointer to parent event used to expose the event hierarchy to the profiler
+  int rank;                 // rank that generated the event
+  union {
+    struct {                // collective events metadata
+      uint64_t seqNumber;   // sequence number of this collective operation in the communicator
+      const char* func;     // string containing name of the collective
+      void const* sendBuff; // address of send buffer
+      void* recvBuff;       // address of recv buffer
+      size_t count;         // data count
+      int root;             // root rank
+      const char* datatype; // string containing the name of the datatype
+      uint8_t nChannels;    // number of channels for this collective
+      uint8_t nWarps;       // number of GPU warps for this collective
+      const char* algo;     // string containing name of the algorithm for this collective
+      const char* proto;    // string containing name of the protocol for this collective
+    } coll;
+
+    struct {                // point-to-point events metadata
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;             // peer rank for this point-to-point
+      uint8_t nChannels;    // number of channels for this p2p
+    } p2p;
+
+    struct {                // proxyOp events metadata
+      pid_t pid;            // process id that generated the associated `ncclProxyOp` object
+      uint8_t channelId;    // id of the channel used by the associated `ncclProxyOp` object
+      int peer;             // peer rank
+      int nSteps;           // number of network transfers/steps required by the `ncclProxyOp`
+      int chunkSize;        // chunk size for this `ncclProxyOp`
+      int isSend;           // type of network operation
+    } proxyOp;
+
+    struct {                // proxyStep events metadata
+      int step;             // individual step in `ncclProxyOp`
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;    // id of the channel used by the kernel
+      uint64_t ptimer;      // kernel supplied timestamp
+    } kernelCh;
+
+    struct {
+      int64_t id;           // net plugin id (used by net and profiler plugins to agree on event definitions)
+      void* data;           // pointer to network plugin defined event
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v4_t;
+```
+
+NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
+`ncclProfileNetPlugin`.
+
+#### stopEvent
+
+`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
+has been stopped the handle can no longer be used with other profiler calls. Using the event
+handle after `stopEvent` is undefined behavior.
+
+#### recordEventState
+
+Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
+`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
+
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
+`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
+
+The state of these events can be updated, along with event attributes, using `recordEventState`.
+These events can go through several states during their lifecycle.
+
+The list of supported states for the updatable events is reported below.
+
+```
+typedef enum {
+  // ncclProfileProxyOp event states
+  ncclProfilerProxyOpSendPosted        = 0, // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1, // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2, // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3, // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4, // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5, // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6, // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7, // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,// state marks transition of proxy op to progress
+
+  // ncclProfileProxyStep event states
+  ncclProfilerProxyStepSendGPUWait     = 8, // state marks the waiting of send data from GPU for given network transfer/step
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,// state marks the waiting of recv clear to send credits for given network transfer/step
+  ncclProfilerProxyStepSendWait        = 9, // state marks the waiting of send data from network for given network transfer/step
+  ncclProfilerProxyStepRecvWait        = 10,// state marks the waiting of recv data from network for given network transfer/step
+  ncclProfilerProxyStepRecvFlushWait   = 11,// state marks the waiting of recv data flush to GPU for given network transfer/step
+  ncclProfilerProxyStepRecvGPUWait     = 12,// state marks the waiting of recv data consumption from GPU for given network transfer/step
+
+  // ncclProfileProxyCtrl event states
+  ncclProfilerProxyCtrlIdle            = 13,// state marks proxy progress thread idle
+  ncclProfilerProxyCtrlActive          = 14,// state marks proxy progress thread active
+  ncclProfilerProxyCtrlSleep           = 15,// state marks proxy progress thread sleeping
+  ncclProfilerProxyCtrlWakeup          = 16,// state marks proxy progress thread waking up
+  ncclProfilerProxyCtrlAppend          = 17,// state marks append of new network work item begin
+  ncclProfilerProxyCtrlAppendEnd       = 18,// state marks append of new network work item end
+
+  // ncclProfileNetPlugin event states
+  ncclProfilerNetPluginUpdate          = 21,// state marks update of network defined event
+
+  // ncclProfileKernelCh event states
+  ncclProfilerKernelChStop             = 22,// state marks stop of kernelCh event and timestamp update
+} ncclProfilerEventState_v4_t;
+```
+
+`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyOp events are generated for every active channel and
+provide a summary of the activity of the proxy progress thread for that channel. Most of the
+states for this event were duplicated with `ncclProfileProxyStep` events. Therefore, starting
+with version 4 of the profiler interface these states have been deprecated. The same level of
+information can still be obtained through the `ncclProfileProxyStep` events.
+
+`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyStep events describe individual network transfers in
+the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
+
+`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
+network requests for the GPU kernel. This includes everything else that the proxy thread might be
+doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
+
+`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
+processes work items for the enqueued NCCL operations.
+
+`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
+their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
+the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugin can agree on the
+network defined event definition using the plugin id in the event descriptor. The plugin identifier
+is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
+16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
+unused and available for future extensions.
+
+A network IB plugin can use this infrastructure to define a QP event as:
+
+```C
+#define NCCL_PROFILER_NET_IB_VER 1
+
+enum {
+  ncclProfileQp = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+```
+
+The network event infrastructure is network agnostic. A different network socket plugin can
+use it to define a socket event as:
+
+```C
+#define NCCL_PROFILER_NET_SOCKET_VER 1
+
+enum {
+  ncclProfileSocket = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+```
+
+The network plugin creates an event (descriptor) and passes it to the profiler callback,
+along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
+event descriptor, attaches the network plugin defined event as external data, and calls
+the profiler `startEvent` function.
+
+```C
+ncclResult_t isend(..., void* phandle, ...) {
+  ...
+  int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
+  ncclProfilerNetIbDescr_v1_t eDescr = { };
+  eDescr.type = ncclProfileQp;
+  eDescr.qp = { ... };
+  ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
+  ...
+}
+```
+
+State transitions for the events described can also come with event attribute updates. For this
+reason the profiler defines the `ncclProfilerEventStateArgs_t` union, reported below.
+
+```
+typedef union {
+  struct {                // attributes to update for ncclProfileProxyStep events
+    size_t transSize;     // transfer size field for this proxy step
+  } proxyStep;
+
+  struct {                // attributes to update for ncclProfileProxyCtrl events
+    int appendedProxyOps; // number of appended proxy ops thus far
+  } proxyCtrl;
+
+  struct {                // attributes to update for ncclProfileNetPlugin events
+    void* data;           // network plugin opaque update data field
+  } netPlugin;
+
+  struct {                // attribute to update for ncclProfileKernelCh events
+    uint64_t pTimer;      // timestamp provided by the NCCL kernel
+  } kernelCh;
+} ncclProfilerEventStateArgs_v4_t;
+```
+
+The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
+
+### Event hierarchy
+
+NCCL core events (reported above) are organized into a hierarchy as reported below:
+
+```
+Group event
+   |
+   +- Collective event
+   |  |
+   |  +- ProxyOp event
+   |  |  |
+   |  |  +- ProxyStep event
+   |  |     |
+   |  |     +- NetPlugin event
+   |  |
+   |  +- KernelCh event
+   |
+   +- Point-to-point event
+      |
+      +- ProxyOp event
+      |  |
+      |  +- ProxyStep event
+      |     |
+      |     +- NetPlugin event
+      |
+      +- KernelCh event
+
+ProxyCtrl event
+```
+
+# Profiler instrumentation and logging
+
+## Profiling of collective and p2p operations
+
+The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
+collective and point-to-point operations, as well as proxy, kernel and network activity. Due to the asynchronous nature
+of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit
+precisely. For example, without both proxy and/or kernel activity it is impossible for the profiler to
+figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
+the profiler that the collective has been enqueued. The profiler can leverage proxy and/or kernel event information, if
+these are enabled, to estimate when the collective ends. For example, the profiler can look at the `stopEvent`
+call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This
+can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent`
+increment and decrement the reference counter, respectively.
+
+## PXN
+
+PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
+generated the operation. When this happens, the event hierarchy reported above breaks. Because the
+profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
+parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy
+thread originating the operation. To prevent the profiler instance in the remote proxy address space from
+dereferencing a pointer from another address space, the event descriptor includes the PID of the originator.
+The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
+parent event.
+
+# Known Limitations
+
+In intra-node communication, or whenever a rank does not have any network activity (so that proxy events
+are unavailable), the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
+enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
+collective. However, this time only represents the launch time of the collective and not the actual
+execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
+
+With version 3 of the profiler interface network activity is no longer required to do intra-node profiling.
+Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
+thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
+the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of
+accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
+delayed, a similar loss of accuracy can be encountered.
+
+To mitigate this effect, with version 4 of the profiler NCCL uses a per-channel ring buffer of 64 elements.
+Every counter is complemented by two timestamps (ptimers) supplied by the NCCL kernel (one for start and one
+for stop of the operation in the kernel). NCCL propagates these timestamps to the profiler plugin so that it can
+convert them to the CPU time domain.
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# Builds the example profiler plugin as a shared library, librccl-profiler.so,
+# from every .c file in this directory.
+
+# .DEFAULT_GOAL is a GNU make special variable, not a target: it must be
+# assigned. Written as a rule ("name: prereq") it only declares a target called
+# ".DEFAULT_GOAL" and leaves the real default goal unset. It is assigned before
+# the include so that a target defined in common.mk cannot become the default.
+.DEFAULT_GOAL := build
+include ../../makefiles/common.mk
+SRCDIR   ?= $(abspath ../..)
+BUILDDIR ?= .
+NCCLDIR  := $(BUILDDIR)
+
+SRC_FILES := $(wildcard *.c)
+
+# Neither target produces a file of its own name; keep a stray file called
+# "build" or "clean" from shadowing them.
+.PHONY: build clean
+
+build: ${BUILDDIR}/librccl-profiler.so
+
+# All sources are compiled and linked in a single compiler invocation ($^).
+${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${BUILDDIR}
+	$(CC) -Inccl -fPIC -shared -o $@ $^
+
+clean:
+	rm -f ${BUILDDIR}/librccl-profiler.so
@@ -0,0 +1,239 @@
+# NCCL Example Profiler Plugin Usage
+
+This page describes how to use the NCCL example profiler plugin
+
+# Overview
+
+The example profiler plugin implements the NCCL profiler plugin API introduced in NCCL v2.23. The API
+defines a set of events and data structures that NCCL uses to share event information with profiler
+plugins. The user can control what events are instrumented by NCCL and when traces collected by the
+profiler should be dumped through environment variables, as described in the rest of the document.
+The user can also control other profiler parameters that alter its behavior. For example, users can
+change the size of the event window the profiler keeps track of.
+
+## Building the profiler plugin
+
+To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
+You can override `NCCL_HOME` to where the NCCL installation is on your system.
+
+## Using the profiler plugin
+
+1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set the `NCCL_PROFILER_PLUGIN`,
+   as documented in `ext-profiler/README.md`.
+
+2. Set `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
+   default, all collectives and send/recv operations will be traced. For more details about the event
+   representation used by the profiler refer to `ext-profiler/README.md`.
+
+   As an example, setting:
+
+   `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
+
+   enables the profiling of the group, the collective and the proxy op events. The same events can be
+   expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
+   NCCL also captures all the events that sit above the requested one in the event hierarchy. The advantage
+   is that the profiler can easily correlate events that belong to the same NCCL operation and present
+   them accordingly.
+
+3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
+   ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
+   event format (more precisely, using asynchronous events).
+
+4. If you set the dump file variable, type chrome://tracing on your chromium browser search bar and
+   open the created dump file to visualize the traces.
+
+# Changing the profiler memory pool sizes
+
+The example profiler uses separate memory pools for different types of events. The size of these memory
+pools (i.e., the # events) determines the number of events that the profiler can keep track of at the
+same time. When NCCL requests a new event (e.g., collective event) to profile a `ncclAllReduce`
+operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
+finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used the
+profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
+handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
+event handle. The profiler has a total of 5 memory pools.
+
+The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
+contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
+generated by remote proxies. A list of pools and their size is reported below:
+
+- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
+- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
+- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
+- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
+- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
+
+Remote proxy operations are generated when PXN is in use. Refer to this article for more information
+about PXN and how it works:
+https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
+
+# Reported events
+
+The example profiler generates traces using the JSON format. An example trace is reported below:
+
+```
+[
+{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
+{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
+{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
+{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
+{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
+ ... [ trace truncated for brevity ]
+{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
+{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
+{}]
+```
+
+Details about the fields used in the trace can be found at this link:
+https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
+
+The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
+the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
+(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
+one collective and this is what is presented in the traces above).
+
+
+The `AllReduce` event encloses traces for the proxy operation associated to the `ncclAllReduce` operation. The `args`
+field in the traces contains NCCL specific information (aside from the chrome trace event format).
+
+## AllReduce trace
+
+The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
+
+- seqNum      : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
+- commHash    : communicator unique identifier
+- rank        : NCCL rank for the ncclAllReduce
+- datatype    : NCCL datatype
+- algorithm   : algorithm used to process the ncclAllReduce
+- protocol    : protocol used to process the ncclAllReduce
+- nMaxChannels: max number of channels used to process the ncclAllReduce
+
+If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
+consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
+of collective and p2p operations`.
+
+### Proxy Send
+The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
+info in the args field:
+
+- Channel      : id of the channel used by this proxy operation to send data to the peer
+- Peer         : peer rank
+- Steps        : number of network steps required to transfer transSize bytes to the peer
+- ChunkSize    : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize    : bytes transferred across the channel by this proxy operation
+- POSTED       : struct containing the number of buffer posts to the GPU and the time stamp for the last post
+- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
+- TRANSMITTED  : struct containing the number of network sends and the time stamp of the last send
+- DONE         : struct containing the number of network sends completed and the time stamp of the last send completed
+
+In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
+which could help identify at which point the network problem occurred.
+
+The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
+entries below are also reported by the profiler.
+
+#### Proxy SendBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
+
+#### Proxy SendGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
+buffer.
+
+#### Proxy SendWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
+
+### Proxy Recv
+
+The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
+info in the args field:
+
+- Channel    : id of the channel used by this proxy operation to recv data from the peer
+- Peer       : peer rank
+- Steps      : number of network steps required to transfer transSize bytes from the peer
+- ChunkSize  : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize  : bytes transferred across the channel by this proxy operation
+- POSTED     : struct containing the number of recvs posted and the time stamp for the last recv posted
+- RECEIVED   : struct containing the number of recvs completed and the time stamp for the last recv completed
+- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
+- DONE       : struct containing the number of flushes completed and the time stamp for the last flush completed
+
+The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
+entries below are also reported by the profiler.
+
+
+#### Proxy RecvBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
+become available.
+
+#### Proxy RecvWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
+
+#### Proxy RecvFlushWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU.
+
+#### Proxy RecvGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
@@ -0,0 +1,30 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include "event.h"
+
+// Reports whether the group's task-event queue is empty.
+// Returns 1 when the queue has no head element, 0 otherwise.
+int taskEventQueueEmpty(struct group* g) {
+  return !g->eventHead;
+}
+
+// Appends `event` to the tail of the group's singly linked task-event queue.
+// The event's `next` link is cleared first, so the new element always
+// terminates the list.
+void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
+  event->next = NULL;
+  if (g->eventHead == NULL) {
+    // Empty queue: the new event becomes the head.
+    g->eventHead = event;
+  } else {
+    // Non-empty queue: link the new event after the current tail.
+    g->eventTail->next = event;
+  }
+  g->eventTail = event;
+}
+
+// Returns the event at the head of the group's task-event queue without
+// removing it; the result is NULL when the queue is empty.
+struct taskEventBase* taskEventQueueHead(struct group* g) {
+  struct taskEventBase* first = g->eventHead;
+  return first;
+}
+
+// Removes and returns the event at the head of the group's task-event queue.
+// Returns NULL, leaving the queue untouched, when the queue is empty.
+struct taskEventBase* taskEventQueueDequeue(struct group* g) {
+  struct taskEventBase* tmp = g->eventHead;
+  // Guard against an empty queue: tmp->next must not be read through NULL.
+  if (tmp == NULL) return NULL;
+  g->eventHead = tmp->next;
+  // Removing the last element also invalidates the tail pointer.
+  if (g->eventHead == NULL) g->eventTail = NULL;
+  return tmp;
+}
@@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef EVENT_H_
+#define EVENT_H_
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <unistd.h>
+#include "profiler.h"
+
+// Compile-time capacities for the fixed-size arrays embedded in the event
+// structures declared below.
+// MAX_CHANNELS sizes the per-channel arrays in struct collective and struct p2p.
+#define MAX_CHANNELS                     128 // Match RCCL's MAXCHANNELS
+// MAX_STEPS sizes the proxyStep array embedded in every struct proxyOp.
+#define MAX_STEPS                        16
+// struct collective reserves 2*MAX_OPS proxyOp slots per channel.
+#define MAX_OPS                          16 // Up to 64K ranks for PAT
+// MAX_EVENTS_PER_REQ sizes the netPlugin array embedded in every struct proxyStep.
+#define MAX_EVENTS_PER_REQ               (8)
+
+// Forward declarations: netPlugin and proxyStep hold parent pointers to these
+// types before their full definitions appear.
+struct proxyOp;
+struct proxyStep;
+
+// Network plugin event record. Carries the plugin type/version, a
+// plugin-defined event kind, and a payload describing either an IB queue-pair
+// operation (qp) or a socket operation (sock). Its parent is a proxyStep.
+struct netPlugin {
+  uint8_t type;                     // event type tag (first field of every event struct in this header)
+  int pluginType;                   // network plugin type
+  int pluginVer;                    // network plugin event version
+  uint8_t pluginEvent;              // plugin-defined event kind -- presumably ncclProfileQp/ncclProfileSocket; TODO confirm
+  union {
+    struct {                        // IB queue-pair payload
+      int device;
+      int qpNum;
+      int opcode;
+      uint64_t wr_id;
+      size_t length;
+    } qp;
+    struct {                        // socket payload
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double stopTs;                    // stop timestamp
+  struct proxyStep* parent;         // proxy step this network event belongs to
+};
+
+// Kernel channel event record: one per channel, embedded in the `kernel`
+// arrays of struct collective and struct p2p.
+struct kernelCh {
+  uint8_t type;                     // event type tag
+  uint8_t channelId;                // channel id for this kernel event
+  struct taskEventBase* parent;     // parent task event (collective/p2p base)
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double stopTs;                    // stop timestamp
+  uint64_t startGpuClk;             // NOTE(review): presumably the GPU-side start timer value -- confirm
+  uint64_t stopGpuClk;              // NOTE(review): presumably the GPU-side stop timer value -- confirm
+};
+
+// Proxy step state indices. The send and recv sets deliberately share the same
+// 0..2 range; PROXY_STEP_MAX_STATES is the number of states per direction and
+// sizes the `timestamp` array in struct proxyStep.
+// NOTE(review): presumably proxyStep.isSend selects which set applies -- confirm.
+#define PROXY_STEP_SEND_GPU_WAIT 0
+#define PROXY_STEP_SEND_PEER_WAIT 1
+#define PROXY_STEP_SEND_WAIT 2
+#define PROXY_STEP_RECV_WAIT 0
+#define PROXY_STEP_RECV_FLUSH_WAIT 1
+#define PROXY_STEP_RECV_GPU_WAIT 2
+#define PROXY_STEP_MAX_STATES 3
+
+// Proxy step event record: one network transfer within a proxy operation.
+// Embeds up to MAX_EVENTS_PER_REQ network plugin events.
+struct proxyStep {
+  uint8_t type;                     // type of event: network transfer
+  int state;                        // current state -- presumably one of the PROXY_STEP_* indices; TODO confirm
+  int step;                         // network transfer id in given channel
+  int isSend;                       // send/recv channel operation
+  double timestamp[PROXY_STEP_MAX_STATES]; // one timestamp slot per PROXY_STEP_* state
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double stopTs;                    // stop timestamp
+  struct proxyOp* parent;           // proxy operation this step belongs to
+  struct netPlugin net[MAX_EVENTS_PER_REQ]; // network plugin events attached to this step
+  int nNetEvents;                   // presumably the number of `net` entries in use -- TODO confirm
+};
+
+// Proxy operation event record: the proxy activity for one channel of a
+// collective or point-to-point task. Embeds its proxyStep events inline.
+struct proxyOp {
+  uint8_t type;                     // type of event: proxy operation
+  uint8_t channelId;                // channel id for this proxy operation
+  pid_t pid;                        // NOTE(review): presumably the PID of the originating process -- confirm
+  int rank;                         // rank associated with this proxy operation
+  int peer;                         // peer rank for this proxy operation
+  int nSteps;                       // total number of network transfers for this proxy operation
+  int chunkSize;                    // chunk size for this proxy operation
+  int isSend;                       // send/recv channel operation
+  size_t transSize;                 // transfer data size for this proxy operation
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double progrTs;                   // In progress state transition
+  double stopTs;                    // stop timestamp
+  int stepCount;                    // last processed network operation for this proxy operation
+  struct proxyStep step[MAX_STEPS]; // array of network transfer events
+  struct taskEventBase* parent;     // parent event p2p/collective
+};
+
+// Forward declarations: proxyCtrl and taskEventBase point to these types
+// before their full definitions appear.
+struct group;
+struct context;
+
+// Proxy control event record. Unlike the task-level events it has no parent
+// event pointer; it refers directly to its profiler context.
+struct proxyCtrl {
+  uint8_t type;                     // event type tag
+  struct context* ctx;              // profiler context
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double stopTs;                    // stop timestamp
+  int state;                        // current state of the control event (values not defined in this header)
+  int appended;                     // appended proxy operations
+};
+
+// task level event base structure
+// Embedded as the first member (`base`) of struct collective and struct p2p,
+// and linked into its parent group's event queue through `next`.
+struct taskEventBase {
+  uint8_t type;                     // event type: collective/p2p
+  int rank;                         // rank of the operation in NCCL communicator
+  const char* func;                 // ncclFunc*
+  int refCount;                     // number of references for this operation
+  struct group* parent;             // parent event group
+  struct taskEventBase* next;       // next top level event in group
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double stopTs;                    // stop timestamp
+};
+
+// Collective event record. Embeds, per channel, up to 2*MAX_OPS proxy
+// operations and one kernel channel event, so a single instance is large.
+struct collective {
+  struct taskEventBase base;        // base structure for this event
+  uint64_t seqNumber;               // sequence number for this collective in communicator
+  void const* sendBuff;             // send buffer pointer
+  void* recvBuff;                   // receive buffer pointer
+  size_t count;                     // element count -- presumably in units of `datatype`; TODO confirm
+  int root;                         // root rank
+  const char* datatype;             // datatype name string
+  uint8_t nChannels;                // number of channels
+  const char* algo;                 // algorithm name string
+  const char* proto;                // protocol name string
+  int nWarps;                       // number of warps
+  struct proxyOp op[MAX_CHANNELS][2*MAX_OPS]; // proxy operations, indexed [channel][op]
+  int nProxyOps[MAX_CHANNELS];      // presumably the number of `op` entries in use per channel -- TODO confirm
+  struct kernelCh kernel[MAX_CHANNELS]; // kernel channel events, one per channel
+};
+
+// Point-to-point event record. Embeds one proxy operation and one kernel
+// channel event per channel.
+struct p2p {
+  struct taskEventBase base;        // base structure for this event
+  uint8_t func;                     // NOTE(review): presumably distinguishes send from recv -- confirm
+  void const* buff;                 // data buffer pointer
+  size_t count;                     // element count -- presumably in units of `datatype`; TODO confirm
+  const char* datatype;             // datatype name string
+  int peer;                         // peer rank
+  uint8_t nChannels;                // number of channels
+  struct proxyOp op[MAX_CHANNELS];  // proxy operations, one per channel
+  struct kernelCh kernel[MAX_CHANNELS]; // kernel channel events, one per channel
+};
+
+// Group event record. Owns a singly linked queue of task events (collective or
+// p2p bases), which the taskEventQueue* functions manipulate through
+// eventHead/eventTail.
+struct group {
+  uint8_t type;                     // event type tag
+  struct context* ctx;              // profiler context
+  int groupId;                      // group identifier
+  int refCount;                     // number of references for this group
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;                   // start timestamp (clock/units not visible in this header)
+  double stopTs;                    // stop timestamp
+  struct group* next;               // next group event in queue
+};
+
+// arrays for different event objects
+// Profiler context: communicator identity plus four event pools (group,
+// collective, p2p, proxyCtrl). Each pool is described by a size, a base, an
+// index and a pointer to its backing array.
+// NOTE(review): the exact meaning of *PoolBase / *PoolIndex is not visible in
+// this header -- presumably a window base and a next-allocation cursor; confirm
+// against the pool allocation code.
+struct context {
+  const char* commName;             // communicator name
+  uint64_t commHash;                // communicator hash
+  int nranks;                       // number of ranks
+  int rank;                         // rank associated with this context
+
+  // group event pool
+  int groupPoolSize;
+  int groupPoolBase;
+  int groupPoolIndex;
+  struct group* groupPool;
+
+  // collective event pool
+  int collPoolSize;
+  int collPoolBase;
+  int collPoolIndex;
+  struct collective* collPool;
+
+  // point-to-point event pool
+  int p2pPoolSize;
+  int p2pPoolBase;
+  int p2pPoolIndex;
+  struct p2p* p2pPool;
+
+  // proxy control event pool
+  int proxyCtrlPoolSize;
+  int proxyCtrlPoolBase;
+  int proxyCtrlPoolIndex;
+  struct proxyCtrl* proxyCtrlPool;
+};
+
+// FIFO queue of task events kept in a group (eventHead/eventTail).
+// Returns non-zero when the group's queue holds no events.
+int taskEventQueueEmpty(struct group* g);
+// Appends `event` at the tail of the group's queue.
+void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
+// Returns the head event without removing it (NULL when the queue is empty).
+struct taskEventBase* taskEventQueueHead(struct group* g);
+// Removes and returns the head event of the group's queue.
+struct taskEventBase* taskEventQueueDequeue(struct group* g);
+
+#endif
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCCL_LOG_WARN=3, NCCL_LOG_INFO=4, NCCL_LOG_ABORT=5, NCCL_LOG_TRACE=6} ncclDebugLogLevel;  // log levels, numbered 0..6
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;  // one bit per subsystem; NCCL_ALL sets every bit
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);  // variadic logger callback; this is the type of the logfn argument of the v4 init callback
+
+#endif
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins: the result code returned by every profiler plugin callback */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6 } ncclResult_t;
+
+#endif
@@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_IB_V1_H_
+#define NET_IB_V1_H_
+
+#define NCCL_PROFILER_NET_IB_VER 1  // version number of the IB plugin event descriptor defined in this header
+
+enum {                             // event types defined by the IB network plugin (bit flags)
+  ncclProfileQp = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct.
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {              // NOTE(review): member presumably selected by type (qp for ncclProfileQp) -- confirm
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+
+#endif
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_SOCKET_V1_H_
+#define NET_SOCKET_V1_H_
+
+#define NCCL_PROFILER_NET_SOCKET_VER 1  // version number of the socket plugin event descriptor defined in this header
+
+enum {                             // event types defined by the socket network plugin (bit flags)
+  ncclProfileSocket = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct.
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {              // NOTE(review): member presumably selected by type (sock for ncclProfileSocket) -- confirm
+    struct {
+      int fd;          // NOTE(review): presumably the socket file descriptor -- confirm
+      int op;          // NOTE(review): operation code; encoding not visible here
+      size_t length;   // NOTE(review): presumably the data length of the operation -- confirm
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+
+#endif
@@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_H_
+#define PROFILER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+#include "err.h"
+
+enum {                              // event types, one bit each
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin = (1 << 7),  // network plugin-defined events
+};
+
+typedef enum {                               // event states; values are pinned explicitly and are not in declaration order (19 and 20 sit among 0..18)
+  ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait     = 8,
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,
+  ncclProfilerProxyStepSendWait        = 9,
+  ncclProfilerProxyStepRecvWait        = 10,
+  ncclProfilerProxyStepRecvFlushWait   = 11,
+  ncclProfilerProxyStepRecvGPUWait     = 12,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle            = 13,
+  ncclProfilerProxyCtrlActive          = 14,
+  ncclProfilerProxyCtrlSleep           = 15,
+  ncclProfilerProxyCtrlWakeup          = 16,
+  ncclProfilerProxyCtrlAppend          = 17,
+  ncclProfilerProxyCtrlAppendEnd       = 18,
+
+  /* Network defined events states */
+  ncclProfilerNetPluginUpdate          = 21,
+
+  /* Kernel event states */
+  ncclProfilerKernelChStop             = 22,
+} ncclProfilerEventState_t;
+
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;  // every versioned name aliases the same single enum
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
+
+#include "profiler_v4.h"
+#include "profiler_v3.h"
+#include "profiler_v2.h"
+#include "profiler_v1.h"
+#include "profiler_net.h"
+
+typedef ncclProfiler_v4_t ncclProfiler_t;  // the unversioned names alias the v4 definitions
+typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
+
+#endif // end include guard
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_NET_H_
+#define PROFILER_NET_H_
+
+#define NCCL_PROFILER_NET_VER_BITS  (16)  // number of low bits of the plugin identifier that carry the version
+#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)  // selects the version bits (low 16 bits when unsigned int is 32 bits wide)
+#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)  // selects the plugin type bits above the version bits
+
+typedef enum {  // plugin type ids, shifted above the version bits
+  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),
+  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
+} ncclProfilerNetType;
+
+#include "net_ib_v1.h"
+#include "net_socket_v1.h"
+
+#endif
@@ -0,0 +1,109 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V1_H_
+#define PROFILER_V1_H_
+
+#include <stdint.h>
+
+typedef struct {                // v1 event descriptor, passed to the v1 startEvent callback
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // per-event-type fields; NOTE(review): member presumably selected by type -- confirm
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;                     // collective call fields
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // point-to-point call fields
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;                  // proxy operation fields
+
+    struct {
+      int step;
+    } proxyStep;                // proxy step fields
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {                 // v1 optional arguments for the recordEventState callback
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;                    // attribute updates for proxy operation events
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // attribute updates for proxy control events
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {                // v1 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v1_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
@@ -0,0 +1,106 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+#include <stdint.h>
+
+typedef struct {                // v2 event descriptor, passed to the v2 startEvent callback
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // per-event-type fields; NOTE(review): member presumably selected by type -- confirm
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;                     // collective call fields
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // point-to-point call fields
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;                  // proxy operation fields
+
+    struct {
+      int step;
+    } proxyStep;                // proxy step fields
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {                 // v2 optional arguments for the recordEventState callback
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;                    // attribute updates for proxy operation events
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // attribute updates for proxy control events
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {                // v2 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
@@ -0,0 +1,114 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+#include <stdint.h>
+
+typedef struct {                // v3 event descriptor, passed to the v3 startEvent callback
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // per-event-type fields; NOTE(review): member presumably selected by type -- confirm
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;                     // collective call fields
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // point-to-point call fields
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;                  // proxy operation fields
+
+    struct {
+      int step;
+    } proxyStep;                // proxy step fields
+
+    struct {
+      uint8_t channelId;
+    } kernelCh;                 // kernel channel fields
+
+    struct {
+      int64_t id;               // NOTE(review): presumably the plugin identifier (type and version bits) -- confirm
+      void* data;               // NOTE(review): presumably points to the plugin-defined event descriptor -- confirm
+    } netPlugin;                // network plugin-defined event fields
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {                 // v3 optional arguments for the recordEventState callback
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;                    // attribute updates for proxy operation events
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // attribute updates for proxy control events
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {                // v3 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v3_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
@@ -0,0 +1,123 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V4_H_
+#define PROFILER_V4_H_
+
+typedef struct {                // v4 event descriptor, passed to the v4 startEvent callback
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // per-event-type fields; NOTE(review): member presumably selected by type -- confirm
+    struct {
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;                     // collective call fields
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+    } p2p;                      // point-to-point call fields
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;                  // proxy operation fields
+
+    struct {
+      int step;
+    } proxyStep;                // proxy step fields
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;                 // kernel channel fields
+
+    struct {
+      int64_t id;               // NOTE(review): presumably the plugin identifier (type and version bits) -- confirm
+      void* data;               // NOTE(review): presumably points to the plugin-defined event descriptor -- confirm
+    } netPlugin;                // network plugin-defined event fields
+  };
+} ncclProfilerEventDescr_v4_t;
+
+typedef union {                 // v4 optional arguments for the recordEventState callback
+  struct {
+    size_t transSize;
+  } proxyStep;                  // attribute updates for proxy step events
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // attribute updates for proxy control events
+
+  struct {
+    void* data;
+  } netPlugin;                  // attribute updates for network plugin-defined events
+
+  struct {
+    uint64_t pTimer;            // NOTE(review): presumably a GPU globaltimer timestamp, like kernelCh.pTimer in the v4 descriptor -- confirm
+  } kernelCh;                   // attribute updates for kernel channel events
+} ncclProfilerEventStateArgs_v4_t;
+
+typedef struct {                // v4 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v4_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+
+#endif
--- a/Voir plus
+++ b/Voir plus