2
0

RCCL 2.4 update

Este cometimento está contido em:
Wenkai Du
2019-07-05 15:43:00 -07:00
ascendente 4d579e51cc
cometimento f11c8f60cd
95 ficheiros modificados com 7829 adições e 614 eliminações
+206
Ver ficheiro
@@ -0,0 +1,206 @@
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
cmake_minimum_required(VERSION 2.8.12)

# Default install location. This is a cache entry without FORCE, so a value
# given on the command line (-DCMAKE_INSTALL_PREFIX=...) takes precedence.
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")

project(rccl CXX)

# Locate the ROCM CMake package (searched under /opt/rocm) and pull in the
# helper modules used further down for versioning, install and packaging.
find_package(ROCM REQUIRED PATHS /opt/rocm)
foreach(rocm_module
        ROCMInstallTargets
        ROCMPackageConfigHelpers
        ROCMSetupVersion
        ROCMInstallSymlinks
        ROCMCreatePackage)
  include(${rocm_module})
endforeach()

# When ON, the test/ subdirectory is added to the build.
option(BUILD_TESTS "Build test programs" OFF)
# Parse the version components from makefiles/version.mk.
#   NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH and PKG_REVISION must be present and
#   numeric; NCCL_SUFFIX is optional.
# NCCL_VERSION is formatted as ((X) * 1000 + (Y) * 100 + (Z)), so the patch
# level has to be checked for one or two digits before it is appended.
file(READ makefiles/version.mk version_mk_text)

# Required components. "[0-9]+" (rather than "[0-9]*") makes sure a key that is
# present but has no number triggers the error instead of yielding an empty
# version component.
foreach(version_key NCCL_MAJOR NCCL_MINOR NCCL_PATCH)
  if("${version_mk_text}" MATCHES "${version_key} *:= *([0-9]+)")
    set(${version_key} ${CMAKE_MATCH_1})
  else()
    message(FATAL_ERROR "Failed to parse ${version_key}")
  endif()
endforeach()

# Optional suffix: left unset when no numeric suffix is given.
if("${version_mk_text}" MATCHES "NCCL_SUFFIX *:= *([0-9]*)")
  set(NCCL_SUFFIX ${CMAKE_MATCH_1})
else()
  set(NCCL_SUFFIX)
endif()

if("${version_mk_text}" MATCHES "PKG_REVISION *:= *([0-9]+)")
  set(PKG_REVISION ${CMAKE_MATCH_1})
else()
  message(FATAL_ERROR "Failed to parse PKG_REVISION")
endif()

# A single-digit patch level is zero-padded to two digits.
if("${NCCL_PATCH}" MATCHES "[0-9][0-9]")
  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}${NCCL_PATCH}")
else()
  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}0${NCCL_PATCH}")
endif()
# Setup VERSION
set(VERSION_STRING "2.6.0.")

# Use BUILD_NUMBER as the last version component when it is defined (for
# example in a Jenkins environment) and numeric; otherwise fall back to 0.
# The environment value is compared as a quoted string so that it is never
# evaluated as an if() expression, and plain set() is used instead of
# string(CONCAT), which is not available in the CMake 2.8.12 baseline declared
# by this file.
if("$ENV{BUILD_NUMBER}" MATCHES "^[0-9]+$")
  set(BUILD_VERSION "${VERSION_STRING}$ENV{BUILD_NUMBER}")
else()
  set(BUILD_VERSION "${VERSION_STRING}0")
endif()

rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION)
# Let find_package() search the ROCm, HIP and HCC install trees.
list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)

find_package(hip REQUIRED)
message(STATUS "HIP compiler: ${HIP_COMPILER}")
message(STATUS "HIP runtime: ${HIP_RUNTIME}")

option(BUILD_SHARED_LIBS "Build as a shared library" ON)

# The same template is configured twice so the header is available under both
# the rccl.h and the nccl.h name in the build tree.
foreach(generated_header rccl.h nccl.h)
  configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/${generated_header})
endforeach()

include_directories(
  ${PROJECT_BINARY_DIR} # for generated rccl.h header
  src
  src/include
  src/collectives
  src/collectives/device)
# Device-side collective sources, kept as .cu files in the source tree.
set(CU_SOURCES
    src/collectives/device/all_reduce.cu
    src/collectives/device/all_gather.cu
    src/collectives/device/reduce.cu
    src/collectives/device/broadcast.cu
    src/collectives/device/reduce_scatter.cu
    src/collectives/device/functions.cu)

# Each .cu file is copied into the build tree under a .cpp name so that it is
# compiled as C++ by the project's CXX compiler.
set(CPP_SOURCES)
foreach(filename ${CU_SOURCES})
  # Only rewrite the trailing extension, not any other ".cu" in the path.
  string(REGEX REPLACE "\\.cu$" ".cpp" cpp_filename "${filename}")
  # Use the full build-tree path: a relative source name is looked up in the
  # source directory first, so a stale copy there would otherwise shadow the
  # freshly generated file.
  set(generated_cpp "${CMAKE_CURRENT_BINARY_DIR}/${cpp_filename}")
  configure_file(${filename} "${generated_cpp}" COPYONLY)
  list(APPEND CPP_SOURCES "${generated_cpp}")
endforeach()
# Host-side sources; these are compiled directly from the source tree.
set(CC_SOURCES
    src/init.cc
    src/collectives/all_reduce.cc
    src/collectives/all_gather.cc
    src/collectives/reduce.cc
    src/collectives/broadcast.cc
    src/collectives/reduce_scatter.cc
    src/channel.cc
    src/misc/trees.cc
    src/misc/rings.cc
    src/misc/argcheck.cc
    src/misc/group.cc
    src/misc/utils.cc
    src/misc/ibvwrap.cc
    src/misc/nvmlwrap_stub.cc
    src/misc/topo.cc
    src/transport/net.cc
    src/transport/net_ib.cc
    src/transport/net_socket.cc
    src/transport/p2p.cc
    src/transport/shm.cc
    src/transport.cc
    src/bootstrap.cc
    src/enqueue.cc)

# Append the host sources after the device sources already in CPP_SOURCES,
# keeping their order, and build the library from the combined list.
list(APPEND CPP_SOURCES ${CC_SOURCES})
add_library(rccl ${CPP_SOURCES})
# Optional instrumentation, enabled with -DTRACE=ON / -DPROFILE=ON.
if(TRACE)
  add_definitions(-DENABLE_TRACE)
endif()
if(PROFILE)
  add_definitions(-DENABLE_PROFILING)
endif()

# GPU architectures the library is built for; the same flags are used at link
# time and (for the clang-based HIP compiler) at compile time.
set(rccl_gpu_target_flags
    --amdgpu-target=gfx803
    --amdgpu-target=gfx900
    --amdgpu-target=gfx906)

target_link_libraries(rccl PRIVATE ${rccl_gpu_target_flags})

# Extra flags when HIP reports a clang-based compiler.
if("${HIP_COMPILER}" MATCHES "clang")
  target_compile_options(rccl PRIVATE ${rccl_gpu_target_flags} -fgpu-rdc)
  target_link_libraries(rccl PRIVATE -fgpu-rdc)
  target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
endif()

# Extra link flag when HIP reports the hcc compiler.
if("${HIP_COMPILER}" MATCHES "hcc")
  target_link_libraries(rccl PRIVATE -hc-function-calls)
endif()

# Prefer the hip::device / hip::host imported targets when the hip package
# defines them; otherwise fall back to hip::hip_hcc plus the hcc libraries.
if(TARGET hip::device)
  target_link_libraries(rccl PRIVATE hip::device INTERFACE hip::host)
else()
  target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
endif()
# Install the library target and the generated rccl.h header under the
# "rccl" prefix of the install tree.
rocm_install_targets(TARGETS rccl PREFIX rccl)
install(FILES ${PROJECT_BINARY_DIR}/rccl.h
        DESTINATION rccl/${CMAKE_INSTALL_INCLUDEDIR})

# Export the target in the roc:: namespace, declaring hip as a dependency.
rocm_export_targets(NAMESPACE roc:: PREFIX rccl TARGETS rccl DEPENDS hip)

# Package metadata: both DEB and RPM packages depend on hip_hcc, and the RPM
# must not claim ownership of the shared /opt and /opt/rocm directories.
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")

rocm_create_package(
  NAME rccl
  DESCRIPTION "Optimized primitives for collective multi-GPU communication"
  MAINTAINER "<no-reply@amd.com>"
  LDCONFIG)

rocm_install_symlink_subdir(rccl)

# Unit tests are only configured on request (see the BUILD_TESTS option).
if(BUILD_TESTS)
  add_subdirectory(test)
endif()
+89
Ver ficheiro
@@ -0,0 +1,89 @@
#!/usr/bin/env groovy
// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rccl
@Library('rocJenkins@noDocker') _
// This is file for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
////////////////////////////////////////////////////////////////////////
// Mostly generated from snippet generator 'properties; set job properties'
// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
properties([
pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
buildDiscarder(logRotator(
artifactDaysToKeepStr: '',
artifactNumToKeepStr: '',
daysToKeepStr: '',
numToKeepStr: '10')),
disableConcurrentBuilds(),
[$class: 'CopyArtifactPermissionProperty', projectNames: '*']
])
////////////////////////////////////////////////////////////////////////
import java.nio.file.Path;
rcclCI:
{
def rccl = new rocProject('rccl')
// customize for project
rccl.paths.build_command = './install.sh -t'
// Define test architectures, optional rocm version argument is available
def nodes = new dockerNodes(['RCCL'], rccl)
boolean formatCheck = false
def compileCommand =
{
platform, project->
project.paths.construct_build_prefix()
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}
LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=${project.compiler.compiler_path} ${project.paths.build_command}
"""
sh command
}
def testCommand =
{
platform, project->
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/release/test
HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
"""
sh command
//junit "${project.paths.project_build_prefix}/build/release/*.xml"
}
// Packaging step handed to buildProjectNoDocker as its last closure argument.
// It assembles a shell script that runs `make package` in the build
// directory, moves the resulting .deb files into a fresh package/ directory
// and installs them with dpkg.
// NOTE(review): unlike compileCommand and testCommand, this closure never
// passes `command` to `sh`, so as written the script text is only built and
// not executed -- confirm whether packaging is intentionally disabled.
def packageCommand =
{
platform, project->
def command = """
set -x
cd ${project.paths.project_build_prefix}/build
make package
rm -rf package && mkdir -p package
mv *.deb package/
sudo dpkg -i package/*.deb
"""
// Artifact archiving is currently switched off.
//platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
}
buildProjectNoDocker(rccl, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}
+1
Ver ficheiro
@@ -1,5 +1,6 @@
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
+66
Ver ficheiro
@@ -0,0 +1,66 @@
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl v2.3.7-1 (BSD3)
Copyright (c) 2015-2018, NVIDIA CORPORATION.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
nvidia-nccl v2.3.7-1 (BSD2)
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+52 -64
Ver ficheiro
@@ -1,92 +1,80 @@
# NCCL
# RCCL
Optimized primitives for collective multi-GPU communication.
ROCm Communication Collectives Library
## Introduction
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node, and can be used in either single- or multi-process (e.g., MPI) applications. Multi node support is planned for a future release.
For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
## What's inside
At present, the library implements the following collectives operations:
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
The collective operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
## Requirements
NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
1. ROCm supported GPUs
2. ROCm stack installed on the system (HIP runtime & HCC)
3. For building and running the unit tests, chrpath will need to be installed on your machine first. (sudo apt-get install chrpath)
## Build
## Quickstart RCCL Build
Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
RCCL directly depends on HIP runtime & HCC C++ compiler which are part of the ROCm software stack.
In addition, HC Direct Function call support needs to be present on your machine. There are binaries for hcc and HIP that need to be installed to get HC Direct Function call support. These binaries are currently packaged with roc-master, and will be included in ROCm 2.4.
To build the library :
The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install.
* `./install.sh` -- builds library including unit tests
* `./install.sh -i` -- builds and installs the library to /opt/rocm/rccl; installation path can be changed with --prefix argument (see below.)
* `./install.sh -h` -- shows help
* `./install.sh -t` -- builds library including unit tests
* `./install.sh -r` -- runs unit tests (must be already built)
* `./install.sh -p` -- builds RCCL package
* `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm)
## Manual build
#### To build the library :
```shell
$ cd nccl
$ make -j src.build
$ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
$ cd rccl
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
$ make -j 8
```
You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :
#### To build the RCCL package and install package :
Assuming you have already cloned this repository and built the library as shown in the previous section:
```shell
$ make src.build CUDA_HOME=<path to cuda install>
$ cd rccl/build
$ make package
$ sudo dpkg -i *.deb
```
NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
```shell
$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
```
## Install
To install NCCL on the system, create a package then install it as root.
Debian/Ubuntu :
```shell
$ # Install tools to create debian packages
$ sudo apt install build-essential devscripts debhelper fakeroot
$ # Build NCCL deb package
$ make pkg.debian.build
$ ls build/pkg/deb/
```
RedHat/CentOS :
```shell
$ # Install tools to create rpm packages
$ sudo yum install rpm-build rpmdevtools
$ # Build NCCL rpm package
$ make pkg.redhat.build
$ ls build/pkg/rpm/
```
OS-agnostic tarball :
```shell
$ make pkg.txz.build
$ ls build/pkg/txz/
```
RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so.
## Tests
Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
There are unit tests implemented with the Googletest framework in RCCL, which are currently a work-in-progress. To invoke the unit tests, go to the rccl-install folder, then the test/ subfolder, and execute the appropriate unit test executable(s). Several notes for running the unit tests:
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests.
An example call to the unit tests:
```shell
$ git clone https://github.com/NVIDIA/nccl-tests.git
$ cd nccl-tests
$ make
$ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
$ LD_LIBRARY_PATH=rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 rccl-install/test/UnitTests
```
There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests.
See the rccl-tests README for more information on how to build and run those tests.
## Library and API Documentation
Please refer to the [Library documentation](http://rccl.readthedocs.io/) for current documentation.
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+2456
Ver ficheiro
A apresentação das diferenças no ficheiro foi suprimida por ser demasiado grande Carregar diff
BIN
Ver ficheiro
Ficheiro binário não mostrado.

Depois

Largura:  |  Altura:  |  Tamanho: 6.6 KiB

Ficheiro executável
+16
Ver ficheiro
@@ -0,0 +1,16 @@
#!/bin/bash
# Builds the HTML documentation: runs Doxygen on a temporary copy of the
# public header, then runs the Sphinx build in source/.

# Start from a clean Doxygen output directory.
if [ -d docBin ]; then
    rm -rf docBin
fi

# Generate the header for Doxygen from this project's header template (the
# same input run_doxygen.sh uses); the previous rocfft.h path was a leftover
# from another project and does not exist here.
sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h
doxygen Doxyfile

# Abort rather than running make in the wrong directory if source/ is missing.
cd source || exit 1
make clean
make html
cd ..

# Remove the temporary header again.
rm -f nccl.h
Ficheiro executável
+12
Ver ficheiro
@@ -0,0 +1,12 @@
#!/bin/bash
# Generates the Doxygen output from a copy of the public header template.

# Start from a clean Doxygen output directory.
if [ -d docBin ]; then
    rm -rf docBin
fi

# Drop any header left over from a previous run; -f keeps this quiet and
# successful when the file does not exist yet (e.g. on a fresh checkout).
rm -f nccl.h
sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h

doxygen Doxyfile
# The generated nccl.h is deliberately left in place after the run.
+20
Ver ficheiro
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = RCCL
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+11
Ver ficheiro
@@ -0,0 +1,11 @@
.. toctree::
:maxdepth: 4
:caption: Contents:
=======
All API
=======
.. doxygenindex::
+103
Ver ficheiro
@@ -0,0 +1,103 @@
.. toctree::
:maxdepth: 4
:caption: Contents:
===
API
===
This section provides details of the library API
Communicator Functions
----------------------
.. doxygenfunction:: ncclGetUniqueId
.. doxygenfunction:: ncclCommInitRank
.. doxygenfunction:: ncclCommInitAll
.. doxygenfunction:: ncclCommDestroy
.. doxygenfunction:: ncclCommCount
.. doxygenfunction:: ncclCommCuDevice
.. doxygenfunction:: ncclCommUserRank
Collective Communication Operations
-----------------------------------
Collective communication operations must be called separately for each communicator in a communicator clique.
They return when operations have been enqueued on the hipstream.
Since they may perform inter-CPU synchronization, each call has to be made from a different thread or process, or must use Group Semantics (see below).
.. doxygenfunction:: ncclReduce
.. doxygenfunction:: ncclBcast
.. doxygenfunction:: ncclBroadcast
.. doxygenfunction:: ncclAllReduce
.. doxygenfunction:: ncclReduceScatter
.. doxygenfunction:: ncclAllGather
Group Semantics
---------------
When managing multiple GPUs from a single thread, and since NCCL collective
calls may perform inter-CPU synchronization, we need to "group" calls for
different ranks/devices into a single call.
Grouping NCCL calls as being part of the same collective operation is done
using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
collective calls until the ncclGroupEnd call, which will wait for all calls
to be complete. Note that for collective communication, ncclGroupEnd only
guarantees that the operations are enqueued on the streams, not that
the operation is effectively done.
Both collective communication and ncclCommInitRank can be used in conjunction
with ncclGroupStart/ncclGroupEnd.
.. doxygenfunction:: ncclGroupStart
.. doxygenfunction:: ncclGroupEnd
Library Functions
-----------------
.. doxygenfunction:: ncclGetVersion
.. doxygenfunction:: ncclGetErrorString
Types
-----
There are a few data structures that are internal to the library. The pointer types to these
structures are given below. The user would need to use these types to create handles and pass them
between different library functions.
.. doxygentypedef:: ncclComm_t
.. doxygenstruct:: ncclUniqueId
Enumerations
------------
This section provides all the enumerations used.
.. doxygenenum:: ncclResult_t
.. doxygenenum:: ncclRedOp_t
.. doxygenenum:: ncclDataType_t
+185
Ver ficheiro
@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
#
# RCCL documentation build configuration file, created by
# sphinx-quickstart on Mon Jan 8 16:34:42 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import os
import sys
import subprocess
# True only when the READTHEDOCS environment variable is exactly 'True'.
read_the_docs_build = os.environ.get('READTHEDOCS') == 'True'

if read_the_docs_build:
    # Generate the Doxygen output from the parent directory before Sphinx
    # runs, then return to source/.
    subprocess.call('cd ..; ./run_doxygen.sh; cd source', shell=True)
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.mathjax', 'breathe']
breathe_projects = { "RCCL": "../docBin/xml" }
breathe_default_project = "RCCL"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
# (Company name spelling corrected: "Micro", not "Mirco".)
project = u'RCCL'
copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019 Advanced Micro Devices'
author = u'Advanced Micro Devices'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = u'0.8'
# The full version, including alpha/beta/rc tags.
release = u'0.8'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = 'alabaster'
if not read_the_docs_build:
    # Local builds use the sphinx_rtd_theme package, imported only here so
    # it is not needed when building on Read the Docs.
    import sphinx_rtd_theme
    html_theme = "sphinx_rtd_theme"
    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
else:
    # On Read the Docs the 'default' theme name is used.
    html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
# html_sidebars = {
# '**': [
# 'relations.html', # needs 'show_related': True theme option to display
# 'searchbox.html',
# ]
# }
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'RCCLdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
# One LaTeX document built from the master document.
# (Author spelling corrected: "Micro", not "Mirco".)
latex_documents = [
    (master_doc, 'RCCL.tex', u'RCCL Documentation',
     u'Advanced Micro Devices', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'RCCL', u'RCCL Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'RCCL', u'RCCL Documentation',
author, 'RCCL', 'One line description of project.',
'Miscellaneous'),
]
+21
Ver ficheiro
@@ -0,0 +1,21 @@
.. RCCL documentation master file, created by
sphinx-quickstart on Mon Jan 8 09:51:41 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to RCCL's documentation!
==================================
.. toctree::
:maxdepth: 4
:caption: Contents:
library
api
allapi
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`
+13
Ver ficheiro
@@ -0,0 +1,13 @@
.. toctree::
:maxdepth: 4
:caption: Contents:
======
RCCL
======
Introduction
------------
The RCCL is an AMD port of NCCL.
+3
Ver ficheiro
@@ -0,0 +1,3 @@
breathe
Ficheiro executável
+132
Ver ficheiro
@@ -0,0 +1,132 @@
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# helper functions
# #################################################
# Prints the usage summary for this script to stdout, one option per line.
function display_help()
{
    printf '%s\n' \
        "RCCL build & installation helper script" \
        "./install [-h|--help] " \
        " [-h|--help] prints this help message." \
        " [-i|--install] install RCCL library (see --prefix argument below.)" \
        " [-p|--package_build] Build RCCL package." \
        " [-t|--tests_build] Build unit tests, but do not run." \
        " [-r|--run_tests] Run unit tests (must be built already.)" \
        " [--prefix] Specify custom directory to install RCCL to (default: /opt/rocm)."
}
# #################################################
# global variables
# #################################################
# Defaults; the option parsing below overrides them.
default_path=/opt/rocm
install_prefix=$default_path
build_release=true
build_package=false
build_tests=false
run_tests=false
install_library=false

# #################################################
# Parameter parsing
# #################################################

# `getopt -T` exiting with status 4 is taken as the sign of a modern getopt
# that can handle whitespace and long parameters; anything else is rejected.
getopt -T
if [[ $? -ne 4 ]]; then
    echo "Need a new version of getopt"
    exit 1
fi

GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,package_build,tests_build,run_tests,prefix: --options hiptr -- "$@")
if [[ $? -ne 0 ]]; then
    echo "getopt invocation failed; could not parse the command line";
    exit 1
fi

# Replace the positional parameters with getopt's normalized output and
# consume them one option at a time until the "--" terminator.
eval set -- "${GETOPT_PARSE}"

while true; do
    case "${1}" in
        -h|--help)
            display_help
            exit 0
            ;;
        -i|--install)
            install_library=true
            shift
            ;;
        -p|--package_build)
            build_package=true
            shift
            ;;
        -t|--tests_build)
            build_tests=true
            shift
            ;;
        -r|--run_tests)
            run_tests=true
            shift
            ;;
        --prefix)
            # --prefix takes a value, so two parameters are consumed.
            install_prefix=${2}
            shift 2
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Unexpected command line parameter received; aborting";
            exit 1
            ;;
    esac
done
rocm_path=/opt/rocm/bin

# #################################################
# prep
# #################################################
# Pick the build directory and the matching CMake build type.
if [[ "${build_release}" == true ]]; then
    build_dir=build/release
    cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Release"
else
    build_dir=build/debug
    cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Debug"
fi

# ensure a clean build environment, then create and enter the build directory.
# Abort if the directory cannot be entered so nothing runs in the wrong place.
rm -rf "${build_dir}"
mkdir -p "${build_dir}"
cd "${build_dir}" || exit 1

if [[ "${build_tests}" == true ]]; then
    build_tests_flag=ON
else
    build_tests_flag=OFF
fi

# Configure. cmake_common_options is deliberately unquoted so it splits into
# separate arguments; it carries the build type selected above, which was
# previously computed but never handed to cmake.
# Each build step exits on failure so callers (e.g. CI) see a non-zero status
# instead of the script carrying on and finishing successfully.
CXX=$rocm_path/hcc cmake ${cmake_common_options} \
    -DBUILD_TESTS=${build_tests_flag} \
    -DCMAKE_INSTALL_PREFIX="${install_prefix}" \
    ../../. || exit 1

if [[ "${install_library}" == true ]]; then
    make -j$(nproc) install || exit 1
else
    make -j$(nproc) || exit 1
fi

if [[ "${build_package}" == true ]]; then
    make package || exit 1
fi

# Optionally, run tests if they're enabled. The exit status of the test
# binary becomes the exit status of the script.
if [[ "${run_tests}" == true ]]; then
    if [[ -f "./test/UnitTests" ]]; then
        HSA_FORCE_FINE_GRAIN_PCIE=1 ./test/UnitTests
    else
        echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
        exit 1
    fi
fi
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -39,7 +40,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
// Free Ring index to rank tables
free(channel->ring.userRanks);
CUDACHECK(cudaFree(channel->ring.devUserRanks));
CUDACHECK(hipFree(channel->ring.devUserRanks));
// Free transport proxy resources
for (int r=0; r<nRanks; r++) {
@@ -49,7 +50,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
}
// Free the peer structures.
CUDACHECK(cudaFree(channel->devPeers));
CUDACHECK(hipFree(channel->devPeers));
free(channel->peers);
return ncclSuccess;
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +9,9 @@
#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +9,9 @@
#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+5 -4
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +9,9 @@
#include "collectives.h"
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
@@ -18,9 +19,9 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n
}
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
ncclComm_t comm, hipStream_t stream) {
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
+17 -10
Ver ficheiro
@@ -1,5 +1,7 @@
#include "hip/hip_runtime.h"
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +9,7 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -17,7 +19,7 @@
/* Declare all collective operations */
#define DECL_COLL5(coll, op, dtype) \
extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
#define DECL_COLL4(coll, op, dtype) \
@@ -25,8 +27,7 @@
DECL_COLL5(coll##LL, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
DECL_COLL4(coll##Tree, op, dtype)
DECL_COLL4(coll##Ring, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
@@ -55,12 +56,18 @@
DECL_ALL_COLLS
// CHUNKSIZE must be a multiple of SLICESIZE
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLREDUCE_SLICESTEPS 4
#define ALLREDUCE_CHUNKSTEPS 4
#define ALLGATHER_SLICESTEPS 4
#define ALLGATHER_CHUNKSTEPS 4
#define REDUCESCATTER_SLICESTEPS 4
#define REDUCESCATTER_CHUNKSTEPS 4
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
+6 -3
Ver ficheiro
@@ -1,11 +1,14 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "all_gather.h"
#include "common.h"
#include "all_gather.h"
#include "collectives.h"
IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
#define UNROLL 4
IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
+6 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,9 +10,10 @@
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -67,9 +69,11 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
@@ -128,4 +132,5 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+9 -3
Ver ficheiro
@@ -1,11 +1,17 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "all_reduce.h"
#include "common.h"
#include "all_reduce.h"
#include "collectives.h"
IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
#define UNROLL 4
IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
+25 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,9 +10,10 @@
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -21,6 +23,11 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws, wr;
if (tid == 0) clk = clock64();
#endif
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
@@ -44,7 +51,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
@@ -52,7 +61,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.recvReduceSend(thisInput+offset, nelem);
ACCUMULATE_COUNTER(recvReduceSend);
}
// step k-1: reduce this buffer and data, which will produce the final
@@ -61,7 +72,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvReduceCopySend);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
@@ -69,7 +82,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
offset = chunkOffset + slice * realChunkSize;
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvCopySend);
}
// Make final copy from buffer to dest.
@@ -78,14 +93,20 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
nelem = min(realChunkSize, size-offset);
// Final wait/copy.
INIT_COUNTER;
prims.directRecv(thisOutput+offset, offset, nelem);
ACCUMULATE_COUNTER(directRecv);
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
#endif
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -135,6 +156,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
@@ -210,6 +232,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
+6 -3
Ver ficheiro
@@ -1,11 +1,14 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "broadcast.h"
#include "common.h"
#include "broadcast.h"
#include "collectives.h"
IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
#define UNROLL 4
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
+22 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,9 +10,10 @@
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -23,6 +25,11 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t clk, t0 = 0ULL, ws, wr;
if (tid == 0) clk = clock64();
#endif
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
@@ -39,22 +46,35 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
if (rank == root) {
if (thisInput == thisOutput) {
INIT_COUNTER;
prims.send(thisInput+offset, nelem);
ACCUMULATE_COUNTER(send);
} else {
INIT_COUNTER;
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
ACCUMULATE_COUNTER(copySend);
}
} else if (nextRank == root) {
INIT_COUNTER;
prims.recv(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recv);
} else {
INIT_COUNTER;
prims.recvCopySend(thisOutput+offset, nelem);
ACCUMULATE_COUNTER(recvCopySend);
}
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
#endif
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
@@ -99,4 +119,5 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+127 -61
Ver ficheiro
@@ -1,5 +1,7 @@
#include "hip/hip_runtime.h"
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,11 +12,18 @@
#include "../collectives.h"
#include "devcomm.h"
#include "nccl.h"
#include <type_traits>
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
/* HIP/HCC form of exitIfAbortBarrier, written as a statement macro.
 *   abort      - non-zero when the calling thread wants to abort.
 *   abortCount - pointer to the counter that aborting threads increment.
 * Sequence: every aborting thread atomically adds 1 to *abortCount, all
 * threads meet at __syncthreads(), and then any thread that observes a
 * non-zero counter (read through LOAD, which is defined elsewhere) issues
 * the s_endpgm instruction and returns.
 * The expansion is several statements and contains a bare `return;`, so it
 * must be used as a complete statement inside a function returning void,
 * and never as the unbraced body of an if/else. */
#define exitIfAbortBarrier(abort, abortCount) \
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
__syncthreads(); \
if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
#else
static inline __device__ void exitIfAbortBarrier(int abort) {
uint32_t popc;
asm ("{");
@@ -24,21 +33,116 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
asm ("}");
if (popc) { asm volatile ("exit;"); }
}
#endif
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
extern __device__ ncclKern_t ncclFuncs[];
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype)
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Must be consistent with ncclColl_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
// Table of collective entry points, each a pointer to a function taking a
// CollectiveArgs*. Entries are laid out collective by collective in the order
// written below (broadcast, reduce, all-gather, reduce-scatter, all-reduce);
// the NCCL_FUNCS2A / NCCL_FUNCS2B macros expand each collective into its
// per-op, per-type entries.
// The initializer is only populated when __HIP_DEVICE_COMPILE__ is defined;
// in any other compilation pass the list is empty.
using ncclFunc_t = void (*)(struct CollectiveArgs*);
static const __device__ constexpr ncclFunc_t ncclFuncs[]{
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if defined(__HIP_DEVICE_COMPILE__)
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
#endif
};
// Compile-time binary search over the half-open index range [f, l).
// Each level compares c->funcIndex with the midpoint of the range and recurses
// into the lower or upper half; the recursion ends in the single-element
// specialization below. An index outside [f, l) is not rejected: it simply
// ends up at the first or last entry of the range.
template<unsigned short f, unsigned short l>
struct Caller {
  static
  void call(ncclColl* const c) noexcept
  {
    constexpr unsigned short mid = f + (l - f) / 2;
    if (c->funcIndex < mid) {
      Caller<f, mid>::call(c);
    } else {
      Caller<mid, l>::call(c);
    }
  }
};

// Leaf of the search: the range [f, f + 1) holds exactly one entry, so invoke
// ncclFuncs[f] on the collective's arguments.
template<unsigned short f>
struct Caller<f, f + 1>{
  static
  void call(struct ncclColl* const c) noexcept
  {
    ncclFuncs[f](&c->args);
  }
};
// Runs the device function selected by c->funcIndex on c->args.
// The thresholds are multiples of 72, the number of consecutive ncclFuncs
// slots each collective expands to through the NCCL_FUNCS* macros
// (4 op slots x 9 type slots x 2 variants, plain then LL):
//   [0, 72)    broadcast      - every slot is the copy_i8 pair, so only the
//                               parity of the index (odd = LL) matters
//   [72, 144)  reduce         - looked up through Caller
//   [144, 216) all-gather     - copy_i8 pair again, parity only
//   [216, 360) reduce-scatter and all-reduce - looked up through Caller
inline
__device__
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
  const bool lowLatency = (c->funcIndex % 2) != 0;

  if (c->funcIndex < 72) {
    if (lowLatency) {
      ncclBroadcastRingLL_copy_i8(&c->args);
    } else {
      ncclBroadcastRing_copy_i8(&c->args);
    }
    return;
  }

  if (c->funcIndex < 144) {
    Caller<72, 144>::call(c);
    return;
  }

  if (c->funcIndex < 216) {
    if (lowLatency) {
      ncclAllGatherRingLL_copy_i8(&c->args);
    } else {
      ncclAllGatherRing_copy_i8(&c->args);
    }
    return;
  }

  Caller<216, 360>::call(c);
}
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
int* d = (int*)dst;
int* s = (int*)src;
// When aggregation is effective, if some threads have aborted inside the LL kernel,
// make sure the rest of the threads abort as well
exitIfAbortBarrier(0);
exitIfAbortBarrier(0, abortCount);
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
__syncthreads();
}
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
if (tid == 0) hostColl->active = 0;
}
@@ -56,23 +160,27 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
__shared__ uint32_t abortCount; \
if (tid == 0) abortCount = 0; \
__syncthreads(); \
\
struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
channel->abortCount = &abortCount; \
if (bid == 0) { \
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
c = &firstColl; \
} else { \
c = &localColl; \
load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
} \
while (1) { \
if (tid < c->args.nThreads) { \
if (c->funcIndex == fIndex) { \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
ncclFuncs[c->funcIndex](&c->args); \
NCCL_CALL_FUNCTIONS(c); \
} \
} \
int nextIndex = c->nextIndex; \
@@ -84,7 +192,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
load_coll(c, channel->devCollectives+nextIndex, tid); \
load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
} \
}
#else
@@ -98,61 +206,19 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0)
#if NCCL_TYPE == 0
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8)
#elif NCCL_TYPE == 1
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8)
#elif NCCL_TYPE == 2
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32)
#elif NCCL_TYPE == 3
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32)
#elif NCCL_TYPE == 4
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64)
#elif NCCL_TYPE == 5
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64)
#elif NCCL_TYPE == 6
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16)
#elif NCCL_TYPE == 7
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32)
#elif NCCL_TYPE == 8
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
#endif
// Reduction define all functions
#if NCCL_OP == 0
#define IMPL_COLL_R(collf, colln) \
IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum);
#elif NCCL_OP == 1
#define IMPL_COLL_R(collf, colln) \
IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd);
#elif NCCL_OP == 2
#define IMPL_COLL_R(collf, colln) \
IMPL_COLL2(collf, min, FuncMin, colln, ncclMin);
#elif NCCL_OP == 3
#define IMPL_COLL_R(collf, colln) \
IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
#endif
// Copy primitives only define one
#if NCCL_OP == 0 && NCCL_TYPE == 0
#define IMPL_COLL_C(collf, colln) \
IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
#else
#define IMPL_COLL_C(collf, colln)
#endif
#define COLL_UNROLL 4
#define COLL_UNROLL 2
#endif
+31 -6
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,13 +12,25 @@
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
typedef uint64_t PackType;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
// HIP/HCC variant of MULTI: hands the two packed 64-bit words straight to a
// default-constructed FUNC and returns its result, without splitting them
// into elements of type T first.
// NOTE(review): T is not used in this variant, and FUNC's call operator is
// invoked with PackType arguments - confirm that this gives the intended
// element-wise result for element types narrower than 64 bits.
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const
{
return FUNC()(x, y);
}
};
#else
// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
@@ -192,6 +205,8 @@ struct MULTI<FUNC, int64_t> {
}
};
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
@@ -202,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#if CUDART_VERSION < 9000
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
@@ -239,14 +254,24 @@ struct MULTI128 {
};
// Copies the x and y members of the Pack128 at p into v.
//   HIP/HCC builds: two ordinary member loads.
//   Other builds:   one PTX ld.volatile.global.v2.u64 instruction.
// NOTE(review): the HIP branch reads through a non-volatile pointer, whereas
// the PTX branch is an explicit volatile load - confirm that no caller relies
// on the volatile behaviour on HIP.
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
v.x = p->x;
v.y = p->y;
#else
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
#endif
}
// Writes the x and y members of v to the Pack128 at p.
//   HIP/HCC builds: two ordinary member stores.
//   Other builds:   one PTX st.volatile.global.v2.u64 instruction.
// NOTE(review): the HIP branch writes through a non-volatile pointer, whereas
// the PTX branch is an explicit volatile store - confirm that no caller relies
// on the volatile behaviour on HIP.
inline __device__ void Store128(Pack128* p, Pack128& v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
p->x = v.x;
p->y = v.y;
#else
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
#endif
}
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads,
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
const int offset, const int N) {
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
@@ -263,10 +288,10 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread
}
}
#define WARP_SIZE 32
#define WARP_SIZE 64
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
@@ -316,7 +341,7 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
int N) {
int Nrem = N;
+1 -64
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,69 +9,5 @@
#include "collectives.h"
#include "common.h"
#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
NCCL_COLL_NAME(coll##LL, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##Tree, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(coll) \
NCCL_FUNCS3A(coll, sum ), \
NCCL_FUNCS3A(coll, prod), \
NCCL_FUNCS3A(coll, max ), \
NCCL_FUNCS3A(coll, min )
#define NCCL_FUNCS2B(coll) \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
// Must be consistent with ncclColl_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
NCCL_FUNCS2A(ncclReduceScatter), \
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
// Device-side table of kernels, sized collectives x operators x data types
// x 2 x 2. The two trailing factors of 2 match the four entries that
// NCCL_FUNC4 emits per (coll, op, dtype): Ring/Tree times non-LL/LL.
__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
// The initializer is therefore only present when __CUDA_ARCH__ is non-zero;
// otherwise the braces are left empty.
#if __CUDA_ARCH__
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
NCCL_FUNCS2A(ncclReduceScatter),
NCCL_FUNCS2A(ncclAllReduce)
#endif
};
// Workaround for https://reviews.llvm.org/D55580
// Empty device function with no parameters and no body. No caller is visible
// in this file section; NOTE(review): it presumably only needs to exist in the
// translation unit for the workaround to take effect - confirm before removing.
__device__ void ncclWorkaroundClangD55580() {}
+186 -113
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -51,24 +52,29 @@ class ncclPrimitives {
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
struct ncclDevComm* comm;
uint32_t* abortCount;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
__device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
__device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
inline __device__ void barrier() {
__device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
__syncthreads();
#else
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
#endif
}
uint32_t mismatch = 0;
const uint64_t opCount;
inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
if (mismatch) {
// In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
*(comm->fatalDevError) = ncclDevAssertedMismatch;
} else if (remoteOpCount && *remoteOpCount > opCount) {
STORE(comm->fatalDevError, ncclDevAssertedMismatch);
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
mismatch += 1;
}
}
@@ -76,63 +82,78 @@ class ncclPrimitives {
uint32_t spins = 0;
uint32_t abort = 0;
inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
spins++;
abort = LOAD(comm->abortFlag);
if (spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
checkMismatch(remoteOpCount);
spins = 0;
}
return abort;
}
inline __device__ void waitRecv(int i) {
__device__ void waitRecv(int i) {
spins = 0;
mismatch = 0;
recvStep[i] += SLICESTEPS;
if (tid == i) {
while (*(waitPtr) < recvStep[i]) {
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t t0 = clock64();
#endif
while (LOAD(waitPtr) < recvStep[i]) {
if (checkAbort(recvConn[i]->opCountRem)) break;
}
#ifdef ENABLE_PROFILING
__atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
#endif
}
}
inline __device__ void waitSend(int i) {
__device__ void waitSend(int i) {
spins = 0;
mismatch = 0;
sendStep[i] += SLICESTEPS;
if (tid == WARP_SIZE+i) {
#ifdef ENABLE_PROFILING
auto devProf = comm->devProf;
uint64_t t0 = clock64();
#endif
while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
sendConnHead[i] = *waitPtr;
sendConnHead[i] = LOAD(waitPtr);
if (checkAbort(sendConn[i]->opCountRem)) break;
}
#ifdef ENABLE_PROFILING
__atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
#endif
}
}
inline __device__ void postRecv(int i) {
*(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
STORE(recvConn[i]->head, recvStep[i]);
}
inline __device__ void postSend(int i) {
*(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1);
STORE(sendConn[i]->tail, sendStep[i]);
}
inline __device__ void postSendSize(int i, int size) {
if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
__device__ void postSendSize(int i, int size) {
if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size);
}
template <int DIRECTRECV>
inline __device__ const T* directRecvPtr(int i, int directOffset) {
__device__ const T* directRecvPtr(int i, int directOffset) {
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
}
template <int DIRECTSEND>
inline __device__ T* directSendPtr(int i, int directOffset) {
__device__ T* directSendPtr(int i, int directOffset) {
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
}
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
__device__ void
GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
int offset = 0;
int sliceSize = stepSize * SLICESTEPS;
@@ -154,157 +175,155 @@ class ncclPrimitives {
#pragma unroll 1
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(sliceSize, nelem-offset));
if (tid < nthreads) {
FOR_SEND(waitSend);
FOR_RECV(waitRecv);
if (realSize > 0) {
barrier();
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
}
} else {
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
FOR_SEND(waitSend);
FOR_RECV(waitRecv);
if (realSize > 0) {
barrier();
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
}
} else {
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
exitIfAbortBarrier(abort);
} else {
exitIfAbortBarrier(abort);
FOR_SEND(postSendSize, realSize*sizeof(T));
if (SEND) __threadfence_system();
FOR_SEND(postSend);
FOR_RECV(postRecv);
}
for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
offset += sliceSize;
exitIfAbortBarrier(abort, abortCount);
if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T));
if (SEND) __threadfence_system();
if (tid == 0) FOR_SEND(postSend);
if (tid == 0) FOR_RECV(postRecv);
}
for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
offset += sliceSize;
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
recvConn[i] = conn;
recvBuff[i] = (const T*)recvConn[i]->buff;
recvStep[i] = recvConn[i]->step;
recvBuff[i] = (const T*)LOAD(&recvConn[i]->buff);
recvStep[i] = LOAD(&recvConn[i]->step);
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
// Return credits in case we rounded up.
if (tid == nthreads) *recvConn[i]->head = recvStep[i];
if (tid == 0) STORE(recvConn[i]->head, recvStep[i]);
if (tid == i) {
waitPtr = recvConn[i]->tail;
*(recvConn[i]->opCountLoc) = opCount;
waitPtr = LOAD(&recvConn[i]->tail);
STORE(recvConn[i]->opCountLoc, opCount);
}
recvDirectBuff[i] = NULL;
if (directBuff && recvConn[i]->direct) {
recvDirectBuff[i] = directBuff;
if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff);
}
nrecv++;
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
__device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
sendConn[i] = conn;
sendBuff[i] = (T*)sendConn[i]->buff;
sendStep[i] = sendConn[i]->step;
sendBuff[i] = (T*)LOAD(&sendConn[i]->buff);
sendStep[i] = LOAD(&sendConn[i]->step);
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
if (tid == WARP_SIZE+i) {
waitPtr = sendConn[i]->head;
sendConnHead[i] = *waitPtr;
*(sendConn[i]->opCountLoc) = opCount;
waitPtr = LOAD(&sendConn[i]->head);
sendConnHead[i] = LOAD(waitPtr);
STORE(sendConn[i]->opCountLoc, opCount);
}
sendDirectBuff[i] = NULL;
if (directBuff && sendConn[i]->direct) {
void* volatile* ptr = sendConn[i]->ptrExchange;
while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
__syncthreads();
if (tid == 0) *ptr = NULL;
if (tid == 0) STORE(ptr, NULL);
}
nsend++;
}
__device__ __forceinline__ void saveRecvConn(int i) {
__device__ void saveRecvConn(int i) {
if (tid == i) {
recvConn[i]->step = recvStep[i];
STORE(&recvConn[i]->step, recvStep[i]);
__threadfence_system();
*(recvConn[i]->opCountLoc) += 1;
__atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
}
}
__device__ __forceinline__ void saveSendConn(int i) {
__device__ void saveSendConn(int i) {
if (tid == WARP_SIZE+i) {
sendConn[i]->step = sendStep[i];
STORE(&sendConn[i]->step, sendStep[i]);
__threadfence_system();
*(sendConn[i]->opCountLoc) += 1;
__atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
}
}
public:
__device__ __forceinline__
__device__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
abortCount = channel->abortCount;
__syncthreads();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
// disable directBuff
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
}
__device__ __forceinline__ void
__device__ void
send(const T* src, int nelem) {
GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
directSend(const T* src, int directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
}
__device__ __forceinline__ void
__device__ void
recv(T* dst, int nelem) {
GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
directRecv(T* dst, int directOffset, int nelem) {
GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
}
__device__ __forceinline__ void
__device__ void
copySend(const T* src, T* dst, int nelem) {
GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
directCopySend(const T* src, T* dst, int directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
}
__device__ __forceinline__ void
__device__ void
recvCopySend(T* dst, int nelem) {
GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
directRecvCopySend(T* dst, int directOffset, int nelem) {
GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
}
__device__ __forceinline__ void
__device__ void
recvReduceCopy(const T* src, T* dst, int nelem) {
GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
recvReduceSend(const T* src, int nelem) {
GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
recvReduceCopySend(const T* src, T* dst, int nelem) {
GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
__device__ void
directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
// Direct is only for the send part
GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
}
__device__ __forceinline__ ~ncclPrimitives() {
__device__ ~ncclPrimitives() {
// Save steps for next collective. Have thread 0 do it to be compatible
// with the way LL works.
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
@@ -330,19 +349,22 @@ class ncclLLPrimitives {
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclDevComm* comm;
uint32_t* abortCount;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
__device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
__device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
__device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
__device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
inline __device__ void exitIfAbortLocalBarrier() {
__device__ void exitIfAbortLocalBarrier() {
uint32_t popc;
asm ("{");
asm volatile (" .reg .pred barr_pred;");
@@ -354,20 +376,25 @@ class ncclLLPrimitives {
exitIfAbortBarrier(1);
}
}
#endif
inline __device__ void barrier() {
__device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
__syncthreads();
#else
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
#endif
}
uint32_t mismatch = 0;
const uint64_t opCount;
inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
if (mismatch > 20) {
// We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
// Note that we are not using _threadfence_system in LL so the error cannot be asserted
*(comm->fatalDevError) = ncclDevSuspectedMismatch;
} else if (remoteOpCount && *remoteOpCount > opCount) {
STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
mismatch += 1;
}
}
@@ -375,37 +402,37 @@ class ncclLLPrimitives {
uint32_t spins = 0;
uint32_t abort = 0;
inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
spins++;
abort = LOAD(comm->abortFlag);
if (spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
checkMismatch(remoteOpCount);
spins = 0;
}
return abort;
}
inline __device__ void waitSend(int i, int nbytes) {
__device__ void waitSend(int i, int nbytes) {
spins = 0;
mismatch = 0;
if (tid == WARP_SIZE+i) {
while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
sendConnHead = *waitPtr;
sendConnHead = LOAD(waitPtr);
if (checkAbort(sendConn[i]->opCountRem)) break;
}
if (fifoPtr) {
int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
fifoPtr[sendStep[i]%NCCL_STEPS] = size;
STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);
}
}
}
inline __device__ void postRecv(int i) {
__device__ void postRecv(int i) {
recvStep[i]++;
if (tid == i) *postPtr = recvStep[i];
if (tid == i) STORE(postPtr, recvStep[i]);
}
inline __device__ void postSend(int i, int offset) {
__device__ void postSend(int i, int offset) {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
@@ -414,22 +441,46 @@ class ncclLLPrimitives {
sendStep[i]++;
}
__device__ uint64_t readLL(int i, int offset) {
__device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
spins = 0;
mismatch = 0;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
do {
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
"s_waitcnt vmcnt(0)\n"
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
if (checkAbort(recvConn[i]->opCountRem)) break;
} while (i4[1] != flag || i4[3] != flag);
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
#else
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
if (checkAbort(recvConn[i]->opCountRem)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
#endif
return val64;
}
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
__device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = val & 0xffffffff;
i4[1] = flag;
i4[2] = (val >> 32);
i4[3] = flag;
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
"s_waitcnt vmcnt(0)\n"
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
#endif
}
// Using memcpy handles misaligned pointers.
@@ -453,7 +504,7 @@ class ncclLLPrimitives {
uint64_t* dstPack = (uint64_t*)dstPtr;
int offset = tid;
// Do multiples of 64 bits
#pragma unroll 2
#pragma unroll 1
for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
@@ -478,56 +529,61 @@ class ncclLLPrimitives {
}
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
exitIfAbortBarrier(abort, abortCount);
#else
exitIfAbortLocalBarrier();
#endif
FOR_RECV(postRecv);
FOR_SEND(postSend, offset);
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvConn[i] = conn;
recvBuff[i] = recvConn[i]->llBuff;
recvStep[i] = recvConn[i]->step;
if (tid == i) {
postPtr = recvConn[i]->head;
*(recvConn[i]->opCountLoc) = opCount;
STORE(recvConn[i]->opCountLoc, opCount);
}
nrecv++;
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
__device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendConn[i] = conn;
sendBuff[i] = sendConn[i]->llBuff;
sendStep[i] = sendConn[i]->step;
if (tid == WARP_SIZE+i) {
waitPtr = sendConn[i]->head;
fifoPtr = sendConn[i]->fifo;
sendConnHead = *waitPtr;
*(sendConn[i]->opCountLoc) = opCount;
sendConnHead = LOAD(waitPtr);
STORE(sendConn[i]->opCountLoc, opCount);
}
nsend++;
}
__device__ __forceinline__ void saveRecvConn(int i) {
__device__ void saveRecvConn(int i) {
if (tid == i) {
recvConn[i]->step = recvStep[i];
*(recvConn[i]->opCountLoc) += 1;
__atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
__threadfence_block();
}
}
__device__ __forceinline__ void saveSendConn(int i) {
__device__ void saveSendConn(int i) {
if (tid == WARP_SIZE+i) {
sendConn[i]->step = sendStep[i];
*(sendConn[i]->opCountLoc) += 1;
__atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
__threadfence_block();
}
}
public:
__device__ __forceinline__
__device__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
abortCount = channel->abortCount;
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
@@ -562,10 +618,27 @@ class ncclLLPrimitives {
return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
}
__device__ __forceinline__ ~ncclLLPrimitives() {
__device__ ~ncclLLPrimitives() {
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
}
};
// Optional per-primitive profiling helpers. Both macros expand to a bare
// `if (tid==0) { ... }` statement (not wrapped in do/while), so take care when
// using them directly before an `else`. They rely on names declared at the
// expansion site: tid, t0, ws, wr, devProf, and (for ACCUMULATE_COUNTER)
// nelem and T.
#ifdef ENABLE_PROFILING
// On thread 0: record the current clock64() value in t0 and snapshot this
// block's wait_send_cycle / wait_recv_cycle counters into ws / wr.
#define INIT_COUNTER \
if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }
// On thread 0: atomically add to devProf-><prim>_cycle the cycles elapsed
// since INIT_COUNTER minus the growth of the two wait counters over the same
// interval, then atomically add nelem * sizeof(T) to devProf-><prim>_byte.
#define ACCUMULATE_COUNTER(prim) \
if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
+ ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
+ wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
__ATOMIC_SEQ_CST); \
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
#else
// Profiling disabled: both macros expand to nothing.
#define INIT_COUNTER
#define ACCUMULATE_COUNTER(prim)
#endif
#endif
+9 -3
Ver ficheiro
@@ -1,11 +1,17 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "reduce.h"
#include "common.h"
#include "reduce.h"
#include "collectives.h"
IMPL_COLL_R(ncclReduce, ncclCollReduce);
#define UNROLL 4
IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
+6 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,9 +10,10 @@
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -48,9 +50,11 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
}
// Tree variant of the reduce kernel. The body is empty: calling it performs
// no work. Marked noinline like the other kernels in this file.
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline)) __device__
void ncclReduceTreeKernel(struct CollectiveArgs* args) {
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
@@ -92,4 +96,5 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
}
// Tree/LL variant of the reduce kernel. The body is empty: calling it
// performs no work. Marked noinline like the other kernels in this file.
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline)) __device__
void ncclReduceTreeLLKernel(struct CollectiveArgs* args) {
}
+139
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,6 +19,123 @@ struct FuncNull {
}
};
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
//we really don't need any specializations and we don't need
//to break things into uint32_t
// Returns y only when it compares strictly less than x; otherwise returns x
// (so x wins when the two compare equal or are unordered).
template<typename T>
__device__ inline T ncclMinFunc(T x, T y) {
  if (y < x) {
    return y;
  }
  return x;
}
// Returns x only when y compares strictly less than x; otherwise returns y
// (so y wins when the two compare equal or are unordered).
template<typename T>
__device__ inline T ncclMaxFunc(T x, T y) {
  if (y < x) {
    return x;
  }
  return y;
}
// Common base for the reduction functors: describes how one PackType value is
// viewed as a small vector of T lanes.
template<typename T>
class FuncBase {
protected:
// Number of T elements that fit in one PackType.
static constexpr auto n = sizeof(PackType) / sizeof(T);
// Overlays a PackType with an n-lane vector of T. `data` is declared first,
// so brace-initializing a Cvt from a PackType initializes `data`; the lanes
// are then read or written through `vec`.
union Cvt {
using Vec = T __attribute__((ext_vector_type(n)));
PackType data;
Vec vec;
// Guards against a lane count/type whose vector does not exactly cover data.
static_assert(sizeof(Vec) == sizeof(data), "Vec must be the same size of data.");
};
};
// Specialization for half: same layout helper as the primary template, but
// the lane type is _Float16 rather than half.
// NOTE(review): presumably because half is not usable as an ext_vector_type
// element with arithmetic operators - confirm.
template<>
class FuncBase<half> {
protected:
// Number of _Float16 lanes that fit in one PackType.
static constexpr auto n = sizeof(PackType) / sizeof(_Float16);
// Overlays a PackType with an n-lane _Float16 vector; `data` is the first
// member, so brace-initialization from a PackType initializes `data`.
union Cvt {
using Vec = _Float16 __attribute__((ext_vector_type(n)));
PackType data;
Vec vec;
// Guards against a vector type that does not exactly cover data.
static_assert(sizeof(Vec) == sizeof(data), "Vec must be the same size of data.");
};
};
// Sum reduction functor.
template<typename T>
struct FuncSum : private FuncBase<T> {
  // Packed form: treats both PackType operands as vectors of T lanes and adds
  // them lane by lane, returning the result repacked as a PackType.
  __device__ PackType operator()(PackType x, PackType y) const
  {
    using Cvt = typename FuncBase<T>::Cvt;

    Cvt lhs{x};
    Cvt rhs{y};
    lhs.vec = lhs.vec + rhs.vec;
    return lhs.data;
  }

  // Scalar form: x + y on single elements.
  // NOTE(review): the second template parameter names std::enable_if<...>
  // itself (no ::type), so it does not appear to remove this overload from
  // overload resolution; its visible effect is only to make this overload a
  // template alongside the PackType one - confirm the intent.
  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
  __device__ T operator()(const T x, const T y) const {
    return x + y;
  }
};
// Product reduction functor.
template<typename T>
struct FuncProd : private FuncBase<T> {
  // Packed form: treats both PackType operands as vectors of T lanes and
  // multiplies them lane by lane, returning the result repacked as a PackType.
  __device__ PackType operator()(PackType x, PackType y) const
  {
    using Cvt = typename FuncBase<T>::Cvt;

    Cvt lhs{x};
    Cvt rhs{y};
    lhs.vec = lhs.vec * rhs.vec;
    return lhs.data;
  }

  // Scalar form: x * y on single elements.
  // NOTE(review): the second template parameter names std::enable_if<...>
  // itself (no ::type), so it does not appear to remove this overload from
  // overload resolution - confirm the intent.
  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
  __device__ T operator()(const T x, const T y) const {
    return x * y;
  }
};
// Maximum reduction functor.
template<typename T>
struct FuncMax : private FuncBase<T> {
  // Packed form: views both PackType operands as vectors of T lanes and keeps
  // ncclMaxFunc(lane of x, lane of y) for every lane.
  __device__ PackType operator()(PackType x, PackType y) const
  {
    using Base = FuncBase<T>;
    using Cvt = typename Base::Cvt;

    Cvt acc{x};
    Cvt other{y};
    for (auto lane = 0u; lane < Base::n; ++lane) {
      acc.vec[lane] = ncclMaxFunc(acc.vec[lane], other.vec[lane]);
    }
    return acc.data;
  }

  // Scalar form: returns y when x < y, otherwise x.
  // NOTE(review): the second template parameter names std::enable_if<...>
  // itself (no ::type), so it does not appear to remove this overload from
  // overload resolution - confirm the intent.
  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
  __device__ T operator()(const T x, const T y) const {
    if (x < y) {
      return y;
    }
    return x;
  }
};
// Minimum reduction functor.
template<typename T>
struct FuncMin : private FuncBase<T> {
  // Packed form: views both PackType operands as vectors of T lanes and keeps
  // ncclMinFunc(lane of x, lane of y) for every lane.
  __device__ PackType operator()(PackType x, PackType y) const
  {
    using Base = FuncBase<T>;
    using Cvt = typename Base::Cvt;

    Cvt acc{x};
    Cvt other{y};
    for (auto lane = 0u; lane < Base::n; ++lane) {
      acc.vec[lane] = ncclMinFunc(acc.vec[lane], other.vec[lane]);
    }
    return acc.data;
  }

  // Scalar form: returns x when x < y, otherwise y.
  // NOTE(review): the second template parameter names std::enable_if<...>
  // itself (no ::type), so it does not appear to remove this overload from
  // overload resolution - confirm the intent.
  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
  __device__ T operator()(const T x, const T y) const {
    if (x < y) {
      return x;
    }
    return y;
  }
};
#else
template<typename T>
struct FuncSum {
__device__ T operator()(const T x, const T y) const {
@@ -62,12 +180,15 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
template<>
struct FuncSum<int8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#else
return addChar4(x, y);
#endif
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -77,12 +198,15 @@ struct FuncSum<int8_t> {
template<>
struct FuncSum<uint8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
#else
return addChar4(x, y);
#endif
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -126,6 +250,8 @@ template<>
struct FuncMax<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -139,6 +265,7 @@ struct FuncMax<int8_t> {
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -149,6 +276,8 @@ template<>
struct FuncMax<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -162,6 +291,7 @@ struct FuncMax<uint8_t> {
cr.a.z = max(cx.a.z, cy.a.z);
cr.a.w = max(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -173,6 +303,8 @@ template<>
struct FuncMin<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -186,6 +318,7 @@ struct FuncMin<int8_t> {
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -196,6 +329,8 @@ template<>
struct FuncMin<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -209,6 +344,7 @@ struct FuncMin<uint8_t> {
cr.a.z = min(cx.a.z, cy.a.z);
cr.a.w = min(cx.a.w, cy.a.w);
return cr.storage;
#endif
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -299,4 +435,7 @@ struct FuncMin<half> {
return __float2half(fm);
}
};
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#endif // REDUCE_KERNEL_H_
+10 -3
Ver ficheiro
@@ -1,11 +1,18 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "reduce_scatter.h"
#include "common.h"
#include "reduce_scatter.h"
#include "collectives.h"
IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
#define UNROLL 4
IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
+6 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,9 +10,10 @@
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int nthreads = blockDim.x;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -62,9 +64,11 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
}
template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
@@ -120,4 +124,5 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
}
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +9,9 @@
#include "collectives.h"
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +9,9 @@
#include "collectives.h"
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+56 -57
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,34 +13,33 @@
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype)
NCCL_KERN_NAME(coll##LL, op, dtype), \
NCCL_KERN_NAME(coll##LL, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
(void*)NCCL_FUNC5(coll##Ring, op, dtype), \
(void*)NCCL_FUNC5(coll##Tree, op, dtype)
NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, u8), \
(void*)NCCL_FUNC4(coll, op, i32), \
(void*)NCCL_FUNC4(coll, op, u32), \
(void*)NCCL_FUNC4(coll, op, i64), \
(void*)NCCL_FUNC4(coll, op, u64), \
(void*)NCCL_FUNC4(coll, op, f16), \
(void*)NCCL_FUNC4(coll, op, f32), \
(void*)NCCL_FUNC4(coll, op, f64)
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, u8), \
NCCL_FUNC4(coll, op, i32), \
NCCL_FUNC4(coll, op, u32), \
NCCL_FUNC4(coll, op, i64), \
NCCL_FUNC4(coll, op, u64), \
NCCL_FUNC4(coll, op, f16), \
NCCL_FUNC4(coll, op, f32), \
NCCL_FUNC4(coll, op, f64)
#define NCCL_FUNCS3B(coll, op) \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8), \
(void*)NCCL_FUNC4(coll, op, i8)
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8), \
NCCL_FUNC4(coll, op, i8)
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
#define NCCL_FUNCS2A(coll) \
@@ -53,8 +53,9 @@
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
typedef void(*ncclKern_t)(struct ncclColl);
// Must be consistent with the ncclFuncSet enum
static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
@@ -66,33 +67,31 @@ static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
/* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
#if CUDART_VERSION >= 9000
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
if (cgMode & 0x01) {
CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
// These flags are to reduce the latency of using this API
cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
0));
return ncclSuccess;
}
#endif
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
CUDACHECK(hipGetDevice(&savedDev));
for (int i = 0; i < numDevices; i++) {
struct cudaLaunchParams* params = paramsList+i;
CUDACHECK(cudaSetDevice(cudaDevs[i]));
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
hipLaunchParams* params = paramsList+i;
CUDACHECK(hipSetDevice(cudaDevs[i]));
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
}
CUDACHECK(cudaSetDevice(savedDev));
CUDACHECK(hipSetDevice(savedDev));
return ncclSuccess;
}
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
// Set active = 2 for the last operation
for (int r=0; r<params->gridDim.x; r++) {
struct ncclChannel* channel = comm->channels+r;
channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
}
// Find the first operation, choose the kernel accordingly and pass it
@@ -100,15 +99,15 @@ ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params)
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
memcpy(&comm->args, coll, sizeof(struct ncclColl));
// As we pass that coll directly, we can free it immediately.
coll->active = 0;
STORE(&coll->active, 0);
params->func = ncclKerns[coll->funcIndex];
params->func = (void *)ncclKerns[coll->funcIndex];
return ncclSuccess;
}
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
int val = *ptr;
int val = LOAD(ptr);
bool done = false;
while (done == false) {
if (val >= comm->intraRanks) {
@@ -130,7 +129,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
int val = *ptr;
int val = LOAD(ptr);
if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
WARN("Trying to launch too many collectives");
return ncclInternalError;
@@ -140,28 +139,28 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
while (*ptr < comm->intraRanks) pthread_yield();
while (LOAD(ptr) < comm->intraRanks) pthread_yield();
comm->intraPhase ^= 1;
return ncclSuccess;
}
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
if (comm->nRanks == 1) return ncclSuccess;
struct cudaLaunchParams* params = comm->myParams;
hipLaunchParams* params = comm->myParams;
NCCLCHECK(setupLaunch(comm, params));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
// Enqueue event in user stream
CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
// Create dependency between user stream and internal NCCL stream
CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
params->stream = comm->groupStream;
} else {
if (comm->userStream != params->stream) {
// Stream changed from last call, create dependency against last NCCL kernel launch
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
}
params->stream = comm->userStream;
}
@@ -192,12 +191,12 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
NCCLCHECK(ncclCpuBarrierOut(comm));
struct cudaLaunchParams *params = comm->myParams;
hipLaunchParams *params = comm->myParams;
if (comm->launchMode == ncclComm::PARALLEL) {
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
}
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two or having a cudaFree between the CUDA
// perform any CUDA call between the two or having a hipFree between the CUDA
// launch and the transportStartProxy call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
@@ -212,13 +211,13 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
}
ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
hipLaunchParams *params = comm->myParams;
// Enqueue event after NCCL kernel
CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
// Create dependency between NCCL internal stream and user stream
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
}
comm->userStreamSet = false;
return ncclSuccess;
@@ -292,7 +291,7 @@ static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* n
} else {
*llMode = 0;
*nChannels = info->comm->nChannels;
*nThreads = info->comm->nThreads+1;
*nThreads = info->comm->nThreads;
}
}
@@ -356,7 +355,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
static ncclResult_t saveKernel(struct ncclInfo* info) {
if (info->comm->nRanks == 1) {
if (info->sendbuff != info->recvbuff)
CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
}
@@ -390,12 +389,12 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (activePtr[0] != 0) sched_yield();
while (LOAD(activePtr) != 0) sched_yield();
memcpy(c, &coll, sizeof(struct ncclColl));
c->args.bid = bid;
c->active = 1;
STORE(&c->active, 1);
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
channel->collFifoTail = opIndex;
@@ -418,8 +417,8 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
ncclResult_t ret = ncclSuccess;
int savedDev = -1;
if (info->comm->checkPointers) {
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
}
// Check arguments
NCCLCHECKGOTO(ArgsCheck(info), ret, end);
@@ -428,7 +427,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
NCCLCHECKGOTO(saveKernel(info), ret, end);
end:
if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
ncclAsyncErrCheck(ret);
return ret;
} else {
+9 -6
Ver ficheiro
@@ -12,14 +12,14 @@
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
CUDACHECK(hipHostFree(ptr));
return ncclSuccess;
}
@@ -36,15 +36,18 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
return ncclSuccess;
}
+6 -6
Ver ficheiro
@@ -11,17 +11,17 @@
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
+8 -18
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,21 +8,10 @@
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 128
#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
@@ -66,9 +56,9 @@ struct ncclComm {
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
hipStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
hipEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
@@ -88,7 +78,7 @@ struct ncclComm {
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
hipStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
@@ -111,13 +101,13 @@ struct ncclComm {
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
hipLaunchParams * intraParams;
hipLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
struct ncclColl* argsptr;
// Global proxy thread
pthread_t proxyThread;
+65
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,6 +11,15 @@
#include "nccl.h"
#include <stdint.h>
// Convert volatile access to atomic.
// On HIP/HCC builds LOAD/STORE map to sequentially-consistent atomic builtins;
// on every other build they are plain dereferences through the given pointer.
// The fallback forms are fully parenthesized so the expansion always binds as
// one expression (e.g. LOAD(pp)[1] indexes the loaded pointer, not pp itself).
// STORE should be used as a statement: the atomic variant yields no value.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#else
#define LOAD(VAR) (*(VAR))
#define STORE(DST, SRC) (*(DST) = (SRC))
#endif
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
@@ -73,6 +83,12 @@ struct ncclConnInfo {
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
struct ncclConnector {
@@ -111,6 +127,8 @@ struct ncclPeer {
struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
@@ -165,14 +183,56 @@ struct ncclChannel {
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
uint32_t* abortCount;
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#pragma pack(pop) /* restore original alignment from stack */
#define MAXCHANNELS 16
#ifdef ENABLE_PROFILING
// Profiling counters. The anonymous union overlays the named counters with
// int data[0x80], so the struct occupies at least 0x80*sizeof(int) bytes.
struct ncclProf {
union {
struct {
uint64_t total_cycle;
// Wait counters, one slot per channel (up to MAXCHANNELS)
uint64_t wait_send_cycle[MAXCHANNELS];
uint64_t wait_recv_cycle[MAXCHANNELS];
// primitive cycles
uint64_t send_cycle;
uint64_t directSend_cycle;
uint64_t recv_cycle;
uint64_t directRecv_cycle;
uint64_t copySend_cycle;
uint64_t directCopySend_cycle;
uint64_t recvCopySend_cycle;
uint64_t directRecvCopySend_cycle;
uint64_t recvReduceCopy_cycle;
uint64_t recvReduceSend_cycle;
uint64_t recvReduceCopySend_cycle;
uint64_t directRecvReduceCopySend_cycle;
// primitive bytes (each *_byte counter pairs with the *_cycle counter of the same name)
uint64_t send_byte;
uint64_t directSend_byte;
uint64_t recv_byte;
uint64_t directRecv_byte;
uint64_t copySend_byte;
uint64_t directCopySend_byte;
uint64_t recvCopySend_byte;
uint64_t directRecvCopySend_byte;
uint64_t recvReduceCopy_byte;
uint64_t recvReduceSend_byte;
uint64_t recvReduceCopySend_byte;
uint64_t directRecvReduceCopySend_byte;
};
// Raw view of the same storage; also fixes the minimum size of the struct
int data[0x80];
};
};
#endif
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
@@ -189,6 +249,11 @@ struct ncclDevComm {
// Channels, device side
struct ncclChannel* channels;
#ifdef ENABLE_PROFILING
// Profiling counters
struct ncclProf* devProf;
#endif
};
#endif
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,9 +13,9 @@
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MIN_NTHREADS 64
#define NCCL_LL_MIN_NTHREADS 256
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
+2 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,7 +19,7 @@ typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueI
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAsyncColl(ncclComm_t comm);
#endif
+1 -1
Ver ficheiro
@@ -31,7 +31,7 @@ struct ncclInfo {
ncclRedOp_t op;
int root;
ncclComm_t comm;
cudaStream_t stream;
hipStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -58,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
/* Get the maximum number of NVLinks based on the GPU generation */
static ncclResult_t getMaxNvlinks(int* maxLinks) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
int ccMajor;
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
// 6 for Volta, 4 for Pascal
*maxLinks = (ccMajor > 6) ? 6 : 4;
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
+30
Ver ficheiro
@@ -0,0 +1,30 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVLINK_H_
#define NCCL_NVLINK_H_
#include <sys/stat.h>
#include <fcntl.h>
#include "nvmlwrap.h"
#include "topo.h"
#define CONNECT_NVLINK 0x10
#define CONNECT_NVSWITCH 0x100
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
// Stub: performs no NVLink detection and always reports zero links.
// Both bus-id arguments are accepted only to keep the signature and are ignored.
// Returns CONNECT_NVLINK scaled by the (zero) link count, i.e. 0.
static int getNvlinkGpu(const char* busId1, const char* busId2) {
  (void)busId1;
  (void)busId2;
  const int nvlinkCount = 0;
  return nvlinkCount * CONNECT_NVLINK;
}
#endif
+5 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,8 +9,11 @@
#define NCCL_RINGS_H_
static int getDefaultThreads() {
// On Kepler, rings are doubled later.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
return 256;
#else // On Kepler, rings are doubled later.
return ncclCudaCompCap() == 3 ? 128 : 256;
#endif
}
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
+5 -4
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -39,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
ncclResult_t res = ncclSuccess;
NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError_t);
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
*shmPtr = ptr;
return ncclSuccess;
sysError:
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
cudaError:
hipError_t:
if (fd != -1) close(fd);
if (create) shm_unlink(shmname);
if (ptr != MAP_FAILED) munmap(ptr, shmsize);
@@ -60,7 +61,7 @@ static ncclResult_t shmUnlink(const char* shmname) {
}
static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
CUDACHECK(cudaHostUnregister(shmPtr));
CUDACHECK(hipHostUnregister(shmPtr));
if (munmap(shmPtr, shmsize) != 0) {
WARN("munmap of shared memory failed");
return ncclSystemError;
+2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,6 +12,7 @@
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getnHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
+130 -57
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,7 +20,11 @@
#include "checks.h"
#include "enqueue.h"
#include "topo.h"
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#include "nvlink_stub.h"
#else
#include "nvlink.h"
#endif
#include "cpuset.h"
#include <stdio.h>
#include <stdlib.h>
@@ -29,7 +34,7 @@
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
@@ -47,7 +52,7 @@ FILE *ncclDebugFile = stdout;
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if CUDART_VERSION >= 9020
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -63,9 +68,9 @@ ncclNet_t* ncclNet = NULL;
#pragma weak ncclNvlinkGpu
ncclResult_t ncclNvlinkGpu(int* nvlink) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
*nvlink = getNvlinkGpu(busId, NULL);
return ncclSuccess;
}
@@ -73,17 +78,17 @@ ncclResult_t ncclNvlinkGpu(int* nvlink) {
#pragma weak ncclCudaCompCap
int ncclCudaCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
int ccMajor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
return ccMajor;
}
int ncclCudaFullCompCap() {
int cudaDev;
if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
int ccMajor, ccMinor;
if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
return ccMajor*10+ccMinor;
}
@@ -140,7 +145,7 @@ ncclResult_t initNet() {
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);
NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0);
int ncclThreadThreshold(int minCompCap, int multiNode) {
int threshold = ncclParamThreadThreshold();
@@ -154,6 +159,22 @@ int ncclThreadThreshold(int minCompCap, int multiNode) {
return threshold;
}
bool useFineGrainVramPcie = false;
// Reads HSA_FORCE_FINE_GRAIN_PCIE from the environment and, when it parses to
// exactly 0 or 1, stores it in the global useFineGrainVramPcie.
// - Unset or empty variable: the flag is left untouched and nothing is logged.
// - Any other value, or a strtoll range error: logged via INFO, flag unchanged.
void parseHsaForceFineGrainVramPcie() {
  static const char envName[] = "HSA_FORCE_FINE_GRAIN_PCIE";
  const char* str = getenv(envName);
  if (str == NULL || strlen(str) == 0) return;

  errno = 0;  // strtoll signals out-of-range input only through errno
  const int64_t v = strtoll(str, NULL, 0);
  if (errno || (v != 0 && v != 1)) {
    // Cast: a bool passed through varargs is promoted to int, but %u expects unsigned.
    INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, envName, (unsigned)useFineGrainVramPcie);
  } else {
    useFineGrainVramPcie = (v == 1);
    INFO(NCCL_ALL,"%s set by environment to %u.", envName, (unsigned)useFineGrainVramPcie);
  }
}
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static ncclResult_t ncclInit() {
@@ -165,6 +186,8 @@ static ncclResult_t ncclInit() {
initNet();
initialized = true;
}
// Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
parseHsaForceFineGrainVramPcie();
pthread_mutex_unlock(&initLock);
return ncclSuccess;
}
@@ -192,22 +215,51 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
#ifdef ENABLE_PROFILING
struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost));
uint64_t wait_send_cycle = 0, wait_recv_cycle = 0;
for (int chan=0; chan<comm->nChannels; chan++) {
wait_send_cycle += prof->wait_send_cycle[chan];
wait_recv_cycle += prof->wait_recv_cycle[chan];
}
#define VEGA_GPU_RTC_FREQUENCY 2.7E7
if (comm->rank == 0) {
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", "w_send", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
}
INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
(double)wait_send_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
(double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
(prof->send_cycle) ? (double)prof->send_byte*comm->nChannels/((double)prof->send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->recvReduceSend_cycle) ? (double)prof->recvReduceSend_byte*comm->nChannels/((double)prof->recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->directRecvReduceCopySend_cycle) ? (double)prof->directRecvReduceCopySend_byte*comm->nChannels/((double)prof->directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->directRecvCopySend_cycle) ? (double)prof->directRecvCopySend_byte*comm->nChannels/((double)prof->directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->directRecv_cycle) ? (double)prof->directRecv_byte*comm->nChannels/((double)prof->directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->copySend_cycle) ? (double)prof->copySend_byte*comm->nChannels/((double)prof->copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->recv_cycle) ? (double)prof->recv_byte*comm->nChannels/((double)prof->recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->recvCopySend_cycle) ? (double)prof->recvCopySend_byte*comm->nChannels/((double)prof->recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
free(prof);
CUDACHECK(hipFree(comm->hostDevComm.devProf));
#endif
free(comm->peerInfo);
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
CUDACHECK(cudaFree(comm->hostDevComm.channels));
CUDACHECK(cudaFree(comm->devComm));
CUDACHECK(hipFree(comm->hostDevComm.channels));
CUDACHECK(hipFree(comm->devComm));
for (int channel=0; channel<comm->nChannels; channel++)
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
if (comm->doneEvent != NULL)
CUDACHECK(cudaEventDestroy(comm->doneEvent));
CUDACHECK(hipEventDestroy(comm->doneEvent));
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(cudaStreamDestroy(comm->groupStream));
CUDACHECK(hipStreamDestroy(comm->groupStream));
}
// Last rank frees shared resources between threads
@@ -220,8 +272,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->intraCGMode);
free(comm->intraCC);
}
CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
CUDACHECK(hipHostFree((void *)comm->abortFlag));
CUDACHECK(hipHostFree((void *)comm->fatalDevError));
// Poison comm to try and catch a double free
commPoison(comm);
@@ -242,15 +294,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
// Try to create a CUDA object right away. If there is something wrong with
// the device we're on (failure cause #1) , better know it early.
cudaEvent_t doneEvent;
CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
hipEvent_t doneEvent;
CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
comm->rank = comm->hostDevComm.rank =rank;
comm->nRanks = comm->hostDevComm.nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
hipGetDevice(&comm->cudaDev);
getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
@@ -258,7 +310,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->llThreshold = ncclParamLlThreshold();
comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9020
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to overload the default setting in older CUDA builds
@@ -267,12 +319,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->fatalError = ncclSuccess;
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
*comm->fatalDevError = ncclDevSuccess;
STORE(comm->fatalDevError, ncclDevSuccess);
NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
*comm->abortFlag = 0;
STORE(comm->abortFlag, 0);
comm->argsptr = &comm->args;
#ifdef ENABLE_PROFILING
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1));
#endif
*comret = comm;
return ncclSuccess;
@@ -296,7 +351,11 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
}
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
#else
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
#endif
static void showVersion() {
static int shown = 0;
if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
@@ -308,26 +367,31 @@ static void showVersion() {
}
}
static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
info->rank = rank;
CUDACHECK(cudaGetDevice(&info->cudaDev));
CUDACHECK(hipGetDevice(&info->cudaDev));
NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
info->hostHash=getHostHash();
info->pidHash=getPidHash();
info->hostHash=getHostHash()+commHash;
info->pidHash=getPidHash()+commHash;
// Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
// cudaDev is a CUDA runtime dev number which could be different from the
// NVML device number. Then we get the busID from NVML to be sure it is
// consistent with NVML remote PCI bus Ids.
CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#else
nvmlDevice_t nvmlDevice;
NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
nvmlPciInfo_t pciInfo;
NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
#endif
return ncclSuccess;
}
static ncclResult_t setCpuAffinity(int cudaDev);
template <int type>
static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
for (int t=0; t<NTRANSPORTS; t++) {
@@ -336,8 +400,14 @@ static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeer
ncclTvalue_t ret = 0;
NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
if (ret > 0) {
cpu_set_t affinitySave;
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
int cudaDev;
CUDACHECK(hipGetDevice(&cudaDev));
setCpuAffinity(cudaDev);
connector->transportComm = transportComm;
NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return ncclSuccess;
}
}
@@ -564,13 +634,13 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int
// Polls the pointer-sized slot addressed by p until it holds a non-NULL value,
// then returns that value. p is therefore the address of a pointer, not the
// pointer itself. The calling thread yields the CPU with sched_yield() between
// polls; there is no timeout, so the call never returns if the slot stays NULL.
void* waitForNonNullPtr(void* p) {
// Access the slot through a volatile pointer so every poll re-reads memory.
volatile void** ptr = (volatile void**) p;
while (*ptr == NULL) sched_yield();
return (void*)*ptr;
while (LOAD(ptr) == NULL) sched_yield();
return (void*)(LOAD(ptr));
}
ncclResult_t initParams(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
params->args = &comm->argsptr;
hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
params->args =(void **)&comm->argsptr;
params->stream = NULL;
params->sharedMem = 0;
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -603,7 +673,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
comm->intraCC = CC;
} else {
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
@@ -611,7 +681,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
NCCLCHECK(initParams(comm));
int cgMdLaunch = 0;
int cgMdLaunch = 1;
// Set CG Mode
comm->launchMode = ncclComm::GROUP;
@@ -620,11 +690,11 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
comm->launchMode = ncclComm::PARALLEL;
}
if (comm->launchMode == ncclComm::GROUP) {
CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
#if CUDART_VERSION >= 9000
if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
// Check whether the GPU supports Cooperative Group Multi Device Launch
(void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
(void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
}
#endif
}
@@ -691,7 +761,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int rank = comm->rank;
int nranks = comm->nRanks;
TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
// AllGather1 - begin
@@ -702,7 +773,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
allGather1Data[rank].comm = comm;
NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -945,7 +1016,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
// Make sure all host memory allocation are close to the GPU
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
NCCLCHECK(setCpuAffinity(cudaDev));
ncclResult_t res;
@@ -976,7 +1047,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
if (myrank == 0) showVersion();
// Make sure the CUDA runtime is initialized.
CUDACHECK(cudaFree(NULL));
CUDACHECK(hipFree(NULL));
NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
@@ -986,7 +1057,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
if (ncclAsyncMode()) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
} else {
return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
@@ -997,8 +1068,8 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
struct ncclPeerInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
NCCLCHECK(fillInfo(allInfo+rank, rank));
CUDACHECK(hipSetDevice(devs[rank]));
NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
}
int* connectTransport;
@@ -1020,7 +1091,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
CUDACHECK(hipSetDevice(devs[rank]));
int nringsRank;
int nthreadsRank = getDefaultThreads();
myCompCap = ncclCudaCompCap();
@@ -1061,7 +1132,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
for (int r=0; r<nrings; r++) {
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
CUDACHECK(hipSetDevice(devs[rank]));
struct ncclChannel* channel = comms[rank]->channels+r;
struct ncclRing *ring = &channel->ring;
NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
@@ -1075,7 +1146,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
}
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
CUDACHECK(hipSetDevice(devs[rank]));
struct ncclChannel* channel = comms[rank]->channels+r;
struct ncclRing *ring = &channel->ring;
struct ncclConnector* recv = &channel->peers[ring->prev].recv;
@@ -1118,7 +1189,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
ncclDevList[i] = devlist ? devlist[i] : i;
}
CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
CUDACHECKGOTO(hipGetDevice(&savedDevice), res, cleanup);
for(rank=0; rank<ndev; ++rank)
comms[rank] = NULL;
@@ -1128,7 +1199,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
for (rank=0; rank<ndev; ++rank) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
NCCLCHECK(setCpuAffinity(cudaDev));
@@ -1144,7 +1215,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
for(rank=0; rank<ndev; ++rank) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
}
@@ -1162,7 +1233,7 @@ final:
free(ncclDevList);
if(wrapNvmlShutdown() != ncclSuccess)
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice);
hipSetDevice(savedDevice);
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
@@ -1173,21 +1244,21 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
#ifdef ENABLE_TRACE
int rank = comm->rank;
#endif
CUDACHECK(cudaGetDevice(&savedDevice));
CUDACHECK(hipGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
CUDACHECK(hipSetDevice(commDevice));
}
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);
CUDACHECK(cudaStreamSynchronize(comm->groupStream));
CUDACHECK(hipStreamSynchronize(comm->groupStream));
NCCLCHECK(transportDestroyProxy(comm));
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
CUDACHECK(hipSetDevice(savedDevice));
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
@@ -1216,9 +1287,11 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
return ncclSuccess;
// Ask anything that might still be running on the device to quit
*comm->abortFlag = 1;
STORE(comm->abortFlag, 1);
return commDestroy(comm);
// do not destroy comm because kernel maybe still running
// return commDestroy(comm);
return ncclSuccess;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
@@ -1241,7 +1314,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
// Check device reported error
static ncclDevError_t printedDevErr = ncclDevSuccess;
switch(*comm->fatalDevError) {
switch(LOAD(comm->fatalDevError)) {
case ncclDevSuccess :
break;
case ncclDevAssertedMismatch :
+5 -5
Ver ficheiro
@@ -7,16 +7,16 @@
#include "argcheck.h"
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
if (err != cudaSuccess || attr.devicePointer == NULL) {
hipPointerAttribute_t attr;
hipError_t err = hipPointerGetAttributes(&attr, pointer);
if (err != hipSuccess || attr.devicePointer == NULL) {
WARN("%s : %s is not a valid pointer", opname, ptrname);
return ncclInvalidArgument;
}
#if CUDART_VERSION >= 10000
if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
#else
if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
#endif
WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
return ncclInvalidArgument;
+10 -9
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,7 +53,7 @@ struct ncclAsyncArgs {
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
// Makes cudaDev the current device through the GPU runtime and returns
// ncclSuccess when the call succeeds.
// NOTE(review): failure handling is delegated to the CUDACHECK macro, which is
// defined elsewhere; presumably it returns an error ncclResult_t — confirm.
ncclResult_t ncclSetDevice(int cudaDev) {
CUDACHECK(cudaSetDevice(cudaDev));
CUDACHECK(hipSetDevice(cudaDev));
return ncclSuccess;
}
@@ -116,7 +117,7 @@ ncclResult_t ncclGroupEnd() {
ncclGroupMode--;
if (ncclGroupMode > 0) return ncclSuccess;
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
CUDACHECK(hipGetDevice(&savedDev));
int done = ncclGroupIndex;
int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
@@ -129,22 +130,22 @@ ncclResult_t ncclGroupEnd() {
* 2. Barrier Wait. No CUDA call is permitted
* 3. Enqueue Events. CUDA event wait/enqueue.
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
* cudaFree happens between 1 and 3, it could block that CUDA call and
* hipFree happens between 1 and 3, it could block that CUDA call and
* prevent some ranks from launching their network threads, which would
* prevent the NCCL call from completing, blocking the cudaFree call.
* prevent the NCCL call from completing, blocking the hipFree call.
*/
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == NULL)
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end);
}
}
@@ -152,7 +153,7 @@ ncclResult_t ncclGroupEnd() {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == NULL)
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
doneArray[i] = 1;
done--;
@@ -182,7 +183,7 @@ group_cleanup:
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int i=0; i<channel->collCount; i++) {
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
STORE(&channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active, 0);
}
channel->collFifoTail = channel->collStart;
channel->collCount = 0;
@@ -193,6 +194,6 @@ group_cleanup:
end:
ncclGroupError = ncclSuccess;
ncclGroupIndex = 0;
CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
return ret;
}
+49
Ver ficheiro
@@ -0,0 +1,49 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nvmlwrap.h"
// Stub: performs no work and always reports ncclSuccess.
// NOTE(review): presumably stands in for NVML symbol loading on builds that
// have no NVML library — confirm.
ncclResult_t wrapNvmlSymbols(void) {
return ncclSuccess;
}
// Stub: performs no initialization and always reports ncclSuccess.
ncclResult_t wrapNvmlInit(void) {
return ncclSuccess;
}
// Stub: there is nothing to shut down, so this always reports ncclSuccess.
ncclResult_t wrapNvmlShutdown(void) {
return ncclSuccess;
}
// Stub replacement for the NVML "handle by PCI bus id" lookup.
//
// pciBusId is ignored. Because the function reports success, *device is set to
// a value-initialized (zero) handle so that a caller passing an uninitialized
// local reads a deterministic value rather than indeterminate memory. The zero
// handle is a placeholder, not a real device handle.
//
// Returns ncclSuccess unconditionally; a NULL device pointer is tolerated.
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
  (void)pciBusId;
  if (device) *device = nvmlDevice_t();
  return ncclSuccess;
}
// Stub: ignores both arguments and always reports ncclSuccess.
// NOTE(review): *index is never written even though success is returned, so a
// caller that reads it afterwards sees whatever it held before — confirm no
// caller relies on the output.
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
return ncclSuccess;
}
// Stub: ignores both arguments and always reports ncclSuccess.
// NOTE(review): *pci is never written even though success is returned —
// confirm no caller reads the structure after calling this stub.
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
return ncclSuccess;
}
// Stub replacement for the NVML "minor number" query.
//
// device is ignored. Because the function reports success, *minorNumber is set
// to 0 so that a caller passing an uninitialized local reads a deterministic
// value rather than indeterminate memory. The 0 is a placeholder and does not
// identify a particular device.
//
// Returns ncclSuccess unconditionally; a NULL minorNumber pointer is tolerated.
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
  (void)device;
  if (minorNumber) *minorNumber = 0;
  return ncclSuccess;
}
// Stub: ignores all arguments and always reports ncclSuccess.
// NOTE(review): *isActive is never written even though success is returned —
// confirm no caller reads it after calling this stub.
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
return ncclSuccess;
}
// Stub: ignores all arguments and always reports ncclSuccess.
// NOTE(review): *pci is never written even though success is returned —
// confirm no caller reads the structure after calling this stub.
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
return ncclSuccess;
}
// Stub: ignores all arguments and always reports ncclSuccess.
// NOTE(review): *capResult is never written even though success is returned —
// confirm no caller reads it after calling this stub.
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult) {
return ncclSuccess;
}
+5
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -378,7 +379,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
*nrings = maxNrings;
} else {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
int defaultMinNrings = 1;
#else
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
#endif
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
if (minNrings > 0 && minNrings > *nrings) {
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
+2 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,7 +13,7 @@
ncclResult_t getCudaPath(int cudaDev, char** path) {
char busId[BUSID_SIZE];
CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+12 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,7 +21,7 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
nvmlDevice_t nvmlDevice;
unsigned int dev;
*nvmlDev = -1;
CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
@@ -50,7 +51,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
char hostname[1024];
getHostName(hostname, 1024, '.');
int cudaDev;
cudaGetDevice(&cudaDev);
hipGetDevice(&cudaDev);
char buffer[1024];
size_t len = 0;
@@ -96,6 +97,15 @@ uint64_t getHash(const char* string) {
return result;
}
/* Hash exactly n bytes starting at `string`.
 *
 * Unlike a C-string hash, the input is length-delimited: embedded NUL bytes
 * are hashed like any other byte and no terminator is required. The recurrence
 * is the DJB2 one (hash = hash * 33 + byte), started from the seed 9527.
 * When n <= 0 no byte is read and the seed itself is returned.
 */
uint64_t getnHash(const char* string, int n) {
  uint64_t hash = 9527;
  const char* cursor = string;
  for (int remaining = n; remaining > 0; --remaining) {
    // Multiplying by 33 is the same as (hash << 5) + hash in 64-bit arithmetic.
    hash = hash * 33 + *cursor++;
  }
  return hash;
}
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
+20 -19
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,15 +8,15 @@
#ifndef NCCL_H_
#define NCCL_H_
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#define NCCL_MAJOR ${nccl:Major}
#define NCCL_MINOR ${nccl:Minor}
#define NCCL_PATCH ${nccl:Patch}
#define NCCL_SUFFIX "${nccl:Suffix}"
#define NCCL_MAJOR ${NCCL_MAJOR}
#define NCCL_MINOR ${NCCL_MINOR}
#define NCCL_PATCH ${NCCL_PATCH}
#define NCCL_SUFFIX "${NCCL_SUFFIX}"
#define NCCL_VERSION_CODE ${nccl:Version}
#define NCCL_VERSION_CODE ${NCCL_VERSION}
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
#ifdef __cplusplus
@@ -142,9 +143,9 @@ typedef enum { ncclInt8 = 0, ncclChar = 0,
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
/*
* (deprecated) Broadcast (in-place)
@@ -156,9 +157,9 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc
* This operation is implicitly in place.
*/
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, hipStream_t stream);
/*
* Broadcast
@@ -170,9 +171,9 @@ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, hipStream_t stream);
/*
* All-Reduce
@@ -183,9 +184,9 @@ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/*
* Reduce-Scatter
@@ -200,10 +201,10 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
*/
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream);
hipStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream);
hipStream_t stream);
/*
* All-Gather
@@ -216,9 +217,9 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
*/
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*
* Group semantics
+2 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -141,7 +142,7 @@ void* persistentThread(void *comm_) {
int idleSpin = 0;
while (1) {
do {
if (*comm->abortFlag) return NULL;
if (LOAD(comm->abortFlag)) return NULL;
if (op == NULL) {
pthread_mutex_lock(&state->mutex);
op = state->ops;
+35 -20
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +11,7 @@
#include "net.h"
#include "param.h"
#include "topo.h"
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <assert.h>
#define NET_MAX_IFS 16
@@ -73,6 +74,7 @@ struct netRecvResources {
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
@@ -100,7 +102,7 @@ static ncclResult_t netDevices(int* ndev, short** distances) {
// Find distance with current GPU
int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
char line[1024];
sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
@@ -115,7 +117,7 @@ static ncclResult_t netDevices(int* ndev, short** distances) {
/* Determine if we can communicate with the peer */
ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
ret[0] = ncclNetTvalues[cudaDev];
if (ret[0] == NET_TVALUE_UNKNOWN) {
if (cudaDev >= NET_MAX_GPUS) {
@@ -243,6 +245,8 @@ end:
return dev;
}
extern bool useFineGrainVramPcie;
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
@@ -250,9 +254,14 @@ static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
*useGdr = 0;
int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (!useFineGrainVramPcie) {
INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
return ncclSuccess;
}
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
if (gdrReadParam == 0) return ncclSuccess;
@@ -289,7 +298,7 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
send->transportResources = resources;
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
resources->netDev = getDev(cudaDev, channelId);
NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
@@ -298,7 +307,7 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
}
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
resources->buffSize = buffSize;
@@ -314,7 +323,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
recv->transportResources = resources;
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
resources->netDev = getDev(cudaDev, channelId);
NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
@@ -323,7 +332,8 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
if (resources->useGdr) {
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
}
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
resources->buffSize = buffSize;
@@ -399,7 +409,7 @@ ncclResult_t netSendFree(void* transportResources) {
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
CUDACHECK(cudaFree(resources->devRecvMem));
CUDACHECK(hipFree(resources->devRecvMem));
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
@@ -412,7 +422,7 @@ ncclResult_t netRecvFree(void* transportResources) {
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
CUDACHECK(cudaFree(resources->devRecvMem));
CUDACHECK(hipFree(resources->devRecvMem));
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
@@ -439,7 +449,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
if (args->llMode) {
int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
int size = LOAD(sizesFifo+buffSlot);
if (size != -1) {
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
@@ -449,12 +459,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
volatile uint32_t *f2 = &lines[i].flag2;
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
}
if (ready) {
NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
@@ -462,14 +472,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
}
} else if (args->tail < *recvTail) {
} else if (args->tail < LOAD(recvTail)) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
int stepSize = args->channel->buffSize/NCCL_STEPS;
// Send through network
int buffSlot = args->tail%NCCL_STEPS;
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
args->tail += args->sliceSteps;
@@ -483,7 +493,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
args->head += args->sliceSteps;
resources->hostSendMem->head = args->head;
STORE(&resources->hostSendMem->head, args->head);
args->idle = 0;
}
}
@@ -518,7 +528,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
volatile uint64_t* sendHead = &resources->hostSendMem->head;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@@ -534,8 +544,13 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (done) {
args->head += args->sliceSteps;
if (args->llMode == 0) {
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
resources->hostRecvMem->tail = args->head;
if (resources->useGdr) {
ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
// Flush local HDP register after local read-back finishes
STORE(resources->curr_hdp_reg, 0x1);
TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", resources->curr_hdp_reg);
}
STORE(&resources->hostRecvMem->tail, args->head);
}
args->idle = 0;
}
+16 -11
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -196,12 +197,16 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
ncclResult_t ncclIbGdrSupport(int ibDev) {
static int moduleLoaded = -1;
if (moduleLoaded == -1) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
#else
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
#endif
}
if (moduleLoaded == 0) return ncclSystemError;
ncclResult_t ret = ncclSystemError;
void* ptr;
if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
if (hipMalloc(&ptr, sizeof(int)) == hipSuccess) {
struct ibv_mr* mr;
struct ibv_pd* pd;
if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
@@ -211,7 +216,7 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
}
wrap_ibv_dealloc_pd(pd);
}
cudaFree(ptr);
hipFree(ptr);
}
return ret;
}
@@ -220,7 +225,7 @@ ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
*supportedTypes = NCCL_PTR_HOST;
int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (ncclIbGdrSupport(dev) != ncclSuccess) {
@@ -620,7 +625,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
// Wait for the receiver to have posted the corresponding receive
volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
volatile uint32_t * readyPtr = &slot->ready;
if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
if (LOAD(readyPtr) == 0) { *request = NULL; return ncclSuccess; }
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
@@ -647,22 +652,22 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
__sync_synchronize(); // order the readyPtr load against rkey load below
// Sanity checks to catch user collective call count/size mismatches
// plus any potential programming errors
if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
if (size > LOAD(&slot->size) || LOAD(&slot->size) <= 0 || LOAD(&slot->addr) == 0 || LOAD(&slot->rkey) == 0 || LOAD(&slot->seq) != comm->fifoHead) {
WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
return ncclInternalError;
}
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr.wr.rdma.remote_addr = slot->addr;
wr.wr.rdma.rkey = slot->rkey;
wr.wr.rdma.remote_addr = LOAD(&slot->addr);
wr.wr.rdma.rkey = LOAD(&slot->rkey);
wr.imm_data = size; // Send the message size via imm_data
__sync_synchronize();
#endif
// We must clear slot->ready, but reset other fields to aid
// debugging and sanity checks
slot->ready = 0;
slot->addr = 0ULL;
slot->rkey = slot->size = slot->seq = 0;
STORE(&slot->ready, 0);
STORE(&slot->addr, 0);
STORE(&slot->rkey, 0); STORE(&slot->size, 0); STORE(&slot->seq, 0);
comm->fifoHead++;
struct ibv_send_wr* bad_wr;
+3 -2
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -175,12 +176,12 @@ void* persistentSocketThread(void *args_) {
}
if (idle) {
pthread_mutex_lock(&resource->threadLock);
while (mark == myQueue->next && *state != stop) { // no new tasks, wait
while (mark == myQueue->next && LOAD(state) != stop) { // no new tasks, wait
pthread_cond_wait(&resource->threadCond, &resource->threadLock);
}
pthread_mutex_unlock(&resource->threadLock);
}
if (*state == stop) return NULL;
if (LOAD(state) == stop) return NULL;
}
}
+82 -35
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,21 +11,26 @@
#include "transport.h"
#include "param.h"
#include <unistd.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#include <ctype.h>
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#include "nvlink_stub.h"
#else
#include "nvlink.h"
#endif
struct p2pConnectInfo {
int direct;
union {
void* directPtr;
cudaIpcMemHandle_t devIpc;
hipIpcMemHandle_t devIpc;
};
};
struct p2pSendResources {
struct ncclSendMem* devMem;
void* ipcPtr;
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
};
struct p2pRecvResources {
@@ -37,14 +43,16 @@ struct p2pRecvResources {
NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
extern bool useFineGrainVramPcie;
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
static int busIdToCudaDev(const char* busId) {
int ndev;
if (cudaGetDeviceCount(&ndev) != cudaSuccess)
if (hipGetDeviceCount(&ndev) != hipSuccess)
return -1;
for (int i = 0; i < ndev; i++) {
char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
if (hipDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != hipSuccess)
return -1;
if (strcmp(busId, devBusId) == 0) {
return i;
@@ -95,15 +103,38 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
// See if CUDA can do P2P
int p2p;
if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != hipSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
return ncclSuccess;
}
if (p2p == 0) return ncclSuccess;
// Check for NVLink/NVswitch
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
uint32_t link_type, hops;
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
p2p = 0;
return ncclSuccess;
}
static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
static unsigned long long link_status_print_once_mask = 0;
if (!(link_status_print_once_mask & (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev)))) {
INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", myInfo->cudaDev, peerInfo->cudaDev,
link_type_name[link_type], hops);
link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
}
int nvlinkp2p = 0;
if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
if (hops == 1)
nvlinkp2p = CONNECT_NVLINK;
} else {
if (!useFineGrainVramPcie)
return ncclSuccess;
}
#else
// Check for NVLink/NVswitch
int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
#endif
if (nvlinkp2p > 0) {
*ret = nvlinkp2p;
return ncclSuccess;
@@ -266,7 +297,11 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
}
// Duplicate the rings for direct NVLink
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
#else
compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
#endif
return compNrings;
}
@@ -464,13 +499,24 @@ end:
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct p2pSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int sendSize = sizeof(struct ncclSendMem);
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
uint32_t linktype, hops;
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
return ncclInternalError;
}
if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
}
else
resources->next_hdp_reg = 0;
struct p2pConnectInfo info;
if (myInfo->pidHash == peerInfo->pidHash) {
@@ -480,12 +526,12 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
} else {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == hipErrorPeerAccessAlreadyEnabled) {
hipGetLastError();
} else if (err != hipSuccess) {
WARN("failed to peer with device %d(=%d): %d %s",
peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
@@ -496,10 +542,10 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != hipSuccess) {
WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
@@ -520,7 +566,7 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
recv->transportResources = resources;
int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
struct p2pConnectInfo info;
if (myInfo->pidHash == peerInfo->pidHash) {
@@ -530,12 +576,12 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
} else {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == hipErrorPeerAccessAlreadyEnabled) {
hipGetLastError();
} else if (err != hipSuccess) {
WARN("failed to peer with device %d(=%d): %d %s",
peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
return ncclInternalError;
}
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
@@ -545,10 +591,10 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != hipSuccess) {
WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
return ncclInternalError;
}
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
@@ -569,11 +615,11 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
send->conn.direct = 1;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
if (err != cudaSuccess) {
if (err != hipSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, cudaGetErrorString(err));
err, hipGetErrorString(err));
return ncclUnhandledCudaError;
}
}
@@ -585,6 +631,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
send->conn.head = &resources->devMem->head;
send->conn.ptrExchange = &resources->devMem->ptrExchange;
send->conn.opCountLoc = &resources->devMem->opCount;
send->conn.next_hdp_reg = resources->next_hdp_reg;
return ncclSuccess;
}
@@ -599,11 +646,11 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
recv->conn.ptrExchange = &remDevMem->ptrExchange;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
remDevMem = (struct ncclSendMem*)resources->ipcPtr;
if (err != cudaSuccess) {
if (err != hipSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, cudaGetErrorString(err));
err, hipGetErrorString(err));
return ncclUnhandledCudaError;
}
}
@@ -620,8 +667,8 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
ncclResult_t p2pSendFree(void* resources) {
struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
if (sendRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
CUDACHECK(cudaFree(sendRes->devMem));
CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
CUDACHECK(hipFree(sendRes->devMem));
free(sendRes);
return ncclSuccess;
}
@@ -629,8 +676,8 @@ ncclResult_t p2pSendFree(void* resources) {
ncclResult_t p2pRecvFree(void* resources) {
struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
if (recvRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
CUDACHECK(cudaFree(recvRes->devMem));
CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
CUDACHECK(hipFree(recvRes->devMem));
free(recvRes);
return ncclSuccess;
}
+2 -1
Ver ficheiro
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +11,7 @@
#include "param.h"
#include "shm.h"
#include <unistd.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
struct shmConnectInfo {
uint64_t pidHash;
+72
Ver ficheiro
@@ -0,0 +1,72 @@
# Build rules for the RCCL unit tests.
#
# When BUILD_TESTS is ON this file:
#   1. checks that chrpath is available (needed for the post-install RPATH fix),
#   2. enables OpenMP for all C/C++ compilation and linking,
#   3. downloads googletest at configure time and adds it to the build,
#   4. builds and installs the UnitTests executable.
cmake_minimum_required(VERSION 2.8.12)

if(BUILD_TESTS)
  message(STATUS "Going to build unit tests (Installed in /test/UnitTests)")

  # chrpath is required to properly set rpath for the UnitTests executable
  find_program(CHRPATH chrpath)
  if(NOT CHRPATH)
    message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
  endif()

  # OpenMP is used to drive GPUs (one per thread)
  find_package(OpenMP REQUIRED)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")

  # Download and unpack googletest at configure time
  configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
  execute_process(
    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download)
  if(result)
    message(FATAL_ERROR "CMake step for googletest failed: ${result}")
  endif()
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download)
  if(result)
    message(FATAL_ERROR "Build step for googletest failed: ${result}")
  endif()

  # Add googletest directly to our build. This adds the following targets:
  # gtest, gtest_main, gmock and gmock_main.
  # EXCLUDE_FROM_ALL keeps googletest out of the default build and out of
  # this project's install set; gtest_main is still built because UnitTests
  # links against it.
  add_subdirectory("${CMAKE_BINARY_DIR}/googletest-src"
                   "${CMAKE_BINARY_DIR}/googletest-build"
                   EXCLUDE_FROM_ALL)

  # Collect source files for tests
  set(TEST_SOURCES
    test_AllGather.cpp
    test_AllReduce.cpp
    test_Broadcast.cpp
    test_Reduce.cpp
    test_ReduceScatter.cpp
    test_GroupCalls.cpp
    test_CombinedCalls.cpp
    test_AllReduceAbort.cpp
    test_BroadcastAbort.cpp
  )

  add_executable(UnitTests ${TEST_SOURCES})
  target_include_directories(UnitTests PRIVATE /opt/rocm)
  target_link_libraries(UnitTests PRIVATE gtest_main rccl)
  install(TARGETS UnitTests RUNTIME DESTINATION test)

  # HCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
  # remove RPATH. As a work-around, set the correct RPATH for the unit test executable
  # as a post-install step.
  # The chrpath binary located by find_program() above is used (rather than
  # relying on "chrpath" being on PATH when the install script runs).
  install(
    CODE
    "execute_process(COMMAND \"${CHRPATH}\" -r \"${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib\" \"${CMAKE_INSTALL_PREFIX}/test/UnitTests\")"
  )
else()
  message(STATUS "Not building unit tests")
endif()
+15
Ver ficheiro
@@ -0,0 +1,15 @@
# Template for a stand-alone helper project whose only job is to fetch the
# googletest sources. It declares no languages (NONE) because nothing is
# compiled here.
cmake_minimum_required(VERSION 2.8.2)
project(googletest-download NONE)
include(ExternalProject)
# Clone the pinned googletest release into the source directory below.
# Configure, build, install and test steps are all set to empty commands, so
# this external project only downloads the sources.
ExternalProject_Add(googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.8.1
SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src"
BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
+360
Ver ficheiro
@@ -0,0 +1,360 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef CORRECTNESSTEST_HPP
#define CORRECTNESSTEST_HPP
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "rccl.h"
#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
namespace CorrectnessTests
{
// Applies the reduction operator `op` to the operands A and B and returns the
// result: sum, product, maximum or minimum.
// Any other operator value is reported on stderr and terminates the process
// with a failure status, so the error cannot be mistaken for a passing run.
template <typename T>
T ReduceOp(ncclRedOp_t const op, T const A, T const B)
{
    switch (op)
    {
    case ncclSum:  return A + B;
    case ncclProd: return A * B;
    case ncclMax:  return std::max(A, B);
    case ncclMin:  return std::min(A, B);
    default:
        fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", op);
        exit(EXIT_FAILURE);
    }
}
// Returns the number of bytes per element for each supported datatype.
// An unsupported datatype is reported on stderr and terminates the process
// with a failure status.
static int DataTypeToBytes(ncclDataType_t const dataType)
{
    switch (dataType)
    {
    case ncclInt8:    return 1;
    case ncclUint8:   return 1;
    case ncclInt32:   return 4;
    case ncclUint32:  return 4;
    case ncclInt64:   return 8;
    case ncclUint64:  return 8;
    case ncclFloat16: return 2;
    case ncclFloat32: return 4;
    case ncclFloat64: return 8;
    default:
        fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", dataType);
        exit(EXIT_FAILURE);
    }
}
// Encapsulates all the memory used per devices for collectives, as well as reference results
struct Dataset
{
int numDevices; // Number of devices participating
size_t numElements; // Number of elements per array
ncclDataType_t dataType; // Data type of each input/output pointer
bool inPlace; // Whether or not output pointers are same as input pointers
std::vector<void *> inputs; // Input pointers (1 per device)
std::vector<void *> outputs; // Output pointers (1 per device)
// May be identical to input pointers for in-place tests
std::vector<void *> expected; // Expected output (1 per device)
size_t NumBytes() const
{
return numElements * DataTypeToBytes(dataType);
}
void Initialize(int const numDevices_,
size_t const numElements_,
ncclDataType_t const dataType_,
bool const inPlace_)
{
numDevices = numDevices_;
numElements = numElements_;
dataType = dataType_;
inPlace = inPlace_;
inputs.resize(numDevices);
outputs.resize(numDevices);
expected.resize(numDevices);
// Allocate per-device memory
size_t const numBytes = NumBytes();
for (int i = 0; i < numDevices; i++)
{
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipMalloc((void **)&inputs[i], numBytes));
if (inPlace)
outputs[i] = inputs[i];
else
HIP_CALL(hipMalloc((void **)&outputs[i], numBytes));
expected[i] = malloc(numBytes);
}
}
// Explicit memory release to avoid double-free from subDatasets
void Release()
{
for (int i = 0; i < outputs.size(); i++)
{
if (!inPlace) hipFree(outputs[i]);
hipFree(inputs[i]);
free(expected[i]);
}
outputs.clear();
}
// Creates a dataset by pointing to an existing dataset
// Primarily to allow for testing with different starting byte-alignments
void ExtractSubDataset(size_t const startElement,
size_t const lastElement,
Dataset& subDataset)
{
ASSERT_LE(startElement, lastElement);
ASSERT_LT(lastElement, numElements);
subDataset.numDevices = numDevices;
subDataset.numElements = lastElement - startElement + 1;
subDataset.dataType = dataType;
subDataset.inPlace = inPlace;
subDataset.inputs.resize(numDevices);
subDataset.outputs.resize(numDevices);
subDataset.expected.resize(numDevices);
size_t const byteOffset = (startElement * DataTypeToBytes(dataType));
for (int i = 0; i < numDevices; i++)
{
subDataset.inputs[i] = (int8_t *)inputs[i] + byteOffset;
subDataset.outputs[i] = (int8_t *)outputs[i] + byteOffset;
subDataset.expected[i] = (int8_t *)expected[i] + byteOffset;
}
}
};
// Parameter set for one test instance. The field order matters: the fixture's
// SetUp() unpacks it with std::tie as (op, dataType, numElements, numDevices, inPlace).
typedef std::tuple<ncclRedOp_t /* op */,
ncclDataType_t /* dataType */,
size_t /* numElements */,
int /* numDevices */,
bool /* inPlace */> TestTuple;
// Base class for each collective test
// - Each test is instantiated with a different TestTuple
class CorrectnessTest : public testing::TestWithParam<TestTuple>
{
protected:
// This code is called per test-tuple.
// Unpacks the test parameters, then creates one communicator and one stream
// per participating device. If fewer GPUs are available than the test needs,
// the test is marked as skipped by setting numDevices to 0.
void SetUp() override
{
    // Check for fine-grained env variable (otherwise will hang).
    // Exit with a failure status so a misconfigured run is not reported as
    // a successful one.
    if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
    {
        printf("Must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
        exit(EXIT_FAILURE);
    }

    // Make the test tuple parameters accessible
    std::tie(op, dataType, numElements, numDevices, inPlace) = GetParam();

    // Collect the number of available GPUs
    HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));

    // Only proceed with testing if there are enough GPUs
    if (numDevices > numDevicesAvailable)
    {
        fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n",
                numDevices, numDevicesAvailable);

        // Modify the number of devices so that tear-down doesn't occur
        // This is temporary until GTEST_SKIP() becomes available
        numDevices = 0;
        numDevicesAvailable = -1;
        return;
    }

    // Initialize communicators
    comms.resize(numDevices);
    NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));

    // Create streams
    streams.resize(numDevices);
    for (int i = 0; i < numDevices; i++)
    {
        HIP_CALL(hipSetDevice(i));
        HIP_CALL(hipStreamCreate(&streams[i]));
    }
}
// Runs once per test tuple after the test body: destroys the communicator and
// the stream of each of the numDevices devices (numDevices is 0 when SetUp()
// skipped the test, so nothing is destroyed in that case).
void TearDown() override
{
    for (int dev = 0; dev < numDevices; ++dev)
    {
        NCCL_CALL(ncclCommDestroy(comms[dev]));
        HIP_CALL(hipStreamDestroy(streams[dev]));
    }
}
// Fills every device's input buffer with a deterministic pattern and, for
// out-of-place datasets, zeroes the output buffer.
// - Element j of device i receives the value (i + j) % 6
// - The range is kept small to reduce the likelihood of overflow
// - All values are exactly representable as floating point numbers
// Terminates the process with a failure status if the host staging buffer
// cannot be allocated or if the dataset uses an unsupported data type.
void FillDatasetWithPattern(Dataset& dataset)
{
    // Host staging buffer, viewed through one typed pointer per supported data type
    int8_t* arrayI1 = (int8_t *)malloc(dataset.NumBytes());
    if (arrayI1 == NULL && dataset.NumBytes() != 0)
    {
        fprintf(stderr, "[ERROR] Unable to allocate host staging buffer\n");
        exit(EXIT_FAILURE);
    }
    uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
    int32_t*  arrayI4 = (int32_t  *)arrayI1;
    uint32_t* arrayU4 = (uint32_t *)arrayI1;
    int64_t*  arrayI8 = (int64_t  *)arrayI1;
    uint64_t* arrayU8 = (uint64_t *)arrayI1;
    float*    arrayF4 = (float    *)arrayI1;
    double*   arrayF8 = (double   *)arrayI1;

    // NOTE: Currently half-precision float tests are unsupported due to half being supported
    //       on GPU only and not host
    for (int i = 0; i < dataset.numDevices; i++)
    {
        // Build this device's pattern on the host
        for (int j = 0; j < dataset.numElements; j++)
        {
            int   valueI = (i + j) % 6;
            float valueF = (float)valueI;

            switch (dataset.dataType)
            {
            case ncclInt8:    arrayI1[j] = valueI; break;
            case ncclUint8:   arrayU1[j] = valueI; break;
            case ncclInt32:   arrayI4[j] = valueI; break;
            case ncclUint32:  arrayU4[j] = valueI; break;
            case ncclInt64:   arrayI8[j] = valueI; break;
            case ncclUint64:  arrayU8[j] = valueI; break;
            case ncclFloat32: arrayF4[j] = valueF; break;
            case ncclFloat64: arrayF8[j] = valueF; break;
            default:
                // Exit with a failure status so the error is not mistaken for a passing run
                fprintf(stderr, "[ERROR] Unsupported datatype\n");
                exit(EXIT_FAILURE);
            }
        }

        // Upload the pattern to the device's input buffer
        HIP_CALL(hipSetDevice(i));
        HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(), hipMemcpyHostToDevice));

        // Fills output data[i][j] with 0 (if not inplace)
        if (!dataset.inPlace)
            HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes()));
    }

    free(arrayI1);
}
// Waits for the collective to complete by synchronizing, device by device,
// on the stream that belongs to each of the numDevices devices.
void Synchronize() const
{
    for (int dev = 0; dev < numDevices; ++dev)
    {
        HIP_CALL(hipSetDevice(dev));
        HIP_CALL(hipStreamSynchronize(streams[dev]));
    }
}
// Compares every device's output buffer against the expected results stored in
// the dataset (each collective operation computes its own expected results).
// Stops at the first mismatching element, prints the expected and the actual
// value together with the device and element index, and raises a fatal
// GoogleTest assertion failure.
// Terminates the process with a failure status on an unsupported data type.
void ValidateResults(Dataset const& dataset) const
{
    // Host copy of one device's output. The vector owns the memory, so it is
    // released on every path out of this function.
    std::vector<int8_t> hostOutput(dataset.NumBytes());
    int8_t*   outputI1 = hostOutput.data();
    uint8_t*  outputU1 = (uint8_t  *)outputI1;
    int32_t*  outputI4 = (int32_t  *)outputI1;
    uint32_t* outputU4 = (uint32_t *)outputI1;
    int64_t*  outputI8 = (int64_t  *)outputI1;
    uint64_t* outputU8 = (uint64_t *)outputI1;
    float*    outputF4 = (float    *)outputI1;
    double*   outputF8 = (double   *)outputI1;

    bool isMatch = true;

    // Loop over each device's output and compare it to the expected output
    for (int i = 0; i < dataset.numDevices && isMatch; i++)
    {
        HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));

        int8_t*   expectedI1 = (int8_t   *)dataset.expected[i];
        uint8_t*  expectedU1 = (uint8_t  *)expectedI1;
        int32_t*  expectedI4 = (int32_t  *)expectedI1;
        uint32_t* expectedU4 = (uint32_t *)expectedI1;
        int64_t*  expectedI8 = (int64_t  *)expectedI1;
        uint64_t* expectedU8 = (uint64_t *)expectedI1;
        float*    expectedF4 = (float    *)expectedI1;
        double*   expectedF8 = (double   *)expectedI1;

        for (int j = 0; j < dataset.numElements && isMatch; j++)
        {
            // isMatch is known to be true here (loop condition), so plain assignment suffices
            switch (dataset.dataType)
            {
            case ncclInt8:    isMatch = (outputI1[j] == expectedI1[j]); break;
            case ncclUint8:   isMatch = (outputU1[j] == expectedU1[j]); break;
            case ncclInt32:   isMatch = (outputI4[j] == expectedI4[j]); break;
            case ncclUint32:  isMatch = (outputU4[j] == expectedU4[j]); break;
            case ncclInt64:   isMatch = (outputI8[j] == expectedI8[j]); break;
            case ncclUint64:  isMatch = (outputU8[j] == expectedU8[j]); break;
            case ncclFloat32: isMatch = (outputF4[j] == expectedF4[j]); break;
            case ncclFloat64: isMatch = (outputF8[j] == expectedF8[j]); break;
            default:
                // Exit with a failure status so the error is not mistaken for a passing run
                fprintf(stderr, "[ERROR] Unsupported datatype\n");
                exit(EXIT_FAILURE);
            }

            if (!isMatch)
            {
                // Report the first mismatch: expected value first, actual output second.
                // 64-bit values are cast to (unsigned) long long to match the format
                // specifier on every platform.
                switch (dataset.dataType)
                {
                case ncclInt8:
                    printf("Expected %d. Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j); break;
                case ncclUint8:
                    printf("Expected %u. Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break;
                case ncclInt32:
                    printf("Expected %d. Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break;
                case ncclUint32:
                    printf("Expected %u. Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break;
                case ncclInt64:
                    printf("Expected %lld. Output %lld on device %d[%d]\n",
                           (long long)expectedI8[j], (long long)outputI8[j], i, j); break;
                case ncclUint64:
                    printf("Expected %llu. Output %llu on device %d[%d]\n",
                           (unsigned long long)expectedU8[j], (unsigned long long)outputU8[j], i, j); break;
                case ncclFloat32:
                    printf("Expected %f. Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break;
                case ncclFloat64:
                    printf("Expected %lf. Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break;
                default:
                    // Unreachable: unsupported types already exited in the comparison switch
                    break;
                }
            }
        }
    }

    // Both loops stop at the first mismatch, so a single check after them is
    // equivalent to checking after every device.
    ASSERT_EQ(isMatch, true);
}
// Passed in parameters from TestTuple (unpacked from GetParam() in SetUp)
// Reduction operator
ncclRedOp_t op;
// Element data type
ncclDataType_t dataType;
// Number of elements
size_t numElements;
// Number of devices the test uses (SetUp resets this to 0 when too few GPUs are available)
int numDevices;
// In-place flag
bool inPlace;
// Device count reported by hipGetDeviceCount in SetUp (set to -1 when the test is skipped)
int numDevicesAvailable;
// Resized to numDevices in SetUp and filled by ncclCommInitAll
std::vector<ncclComm_t> comms;
// Resized to numDevices in SetUp; one stream is created per device
std::vector<hipStream_t> streams;
};
}
#endif
+111
Ver ficheiro
@@ -0,0 +1,111 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllGather.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Runs one all-gather over the whole dataset and compares every device's
// output against the host-computed reference.
TEST_P(AllGatherCorrectnessTest, Correctness)
{
    // Nothing to do when there are not enough GPUs, or when the elements
    // cannot be split evenly across the devices
    if (numDevicesAvailable < numDevices) return;
    if ((numElements % numDevices) != 0) return;

    // Allocate the buffers, fill the inputs and compute the reference output
    Dataset dataset;
    dataset.Initialize(numDevices, numElements, dataType, inPlace);
    FillDatasetWithPattern(dataset);
    ComputeExpectedResults(dataset);

    // Every device contributes one equally sized slice of its input buffer
    size_t const elemsPerDevice = dataset.numElements / dataset.numDevices;
    size_t const bytesPerDevice = dataset.NumBytes() / dataset.numDevices;

    // Launch the all-gather (1 thread per GPU)
    #pragma omp parallel for num_threads(numDevices)
    for (int dev = 0; dev < numDevices; dev++)
    {
        int8_t* const sendSlice = (int8_t *)dataset.inputs[dev] + (dev * bytesPerDevice);
        ncclAllGather(sendSlice, dataset.outputs[dev], elemsPerDevice,
                      dataType, comms[dev], streams[dev]);
    }

    // Wait for completion, then check and clean up
    Synchronize();
    ValidateResults(dataset);
    dataset.Release();
}
// Repeats the all-gather on sub-ranges of one dataset that start at odd
// element offsets (1, 3, ..., 11), so the buffers handed to the collective
// do not start at the beginning of the allocation.
TEST_P(AllGatherCorrectnessTest, Alignment)
{
    if (numDevicesAvailable < numDevices) return;
    if ((numElements % numDevices) != 0) return;

    // Allocate the full dataset once; each iteration works on a sub-range of it
    Dataset dataset;
    dataset.Initialize(numDevices, numElements, dataType, inPlace);

    // Loop over several offsets (so that device pointers are not aligned)
    for (int firstElement = 1; firstElement <= 11; firstElement += 2)
    {
        // Offsets past the end of the dataset are ignored
        if (!(firstElement < numElements)) continue;

        // Select last element so that total number of elements is multiple of numDevices
        size_t const usableElements = ((numElements - firstElement) / numDevices) * numDevices;
        int const lastElement = firstElement + usableElements - 1;
        if (lastElement >= numElements) break;

        Dataset subDataset;
        dataset.ExtractSubDataset(firstElement, lastElement, subDataset);

        // Fill the sub-range and compute its reference results
        FillDatasetWithPattern(subDataset);
        ComputeExpectedResults(subDataset);

        size_t const elemsPerDevice = subDataset.numElements / subDataset.numDevices;
        size_t const bytesPerDevice = subDataset.NumBytes() / subDataset.numDevices;

        // Launch the all-gather (1 thread per GPU)
        #pragma omp parallel for num_threads(numDevices)
        for (int dev = 0; dev < numDevices; dev++)
        {
            int8_t* const sendSlice = (int8_t *)subDataset.inputs[dev] + (dev * bytesPerDevice);
            ncclAllGather(sendSlice, subDataset.outputs[dev], elemsPerDevice,
                          dataType, comms[dev], streams[dev]);
        }

        // Wait for completion, then check this sub-range
        Synchronize();
        ValidateResults(subDataset);
    }

    dataset.Release();
}
// Parameter sweep for the AllGather tests: the cartesian product of the lists below
INSTANTIATE_TEST_CASE_P(
    AllGatherCorrectnessSweep,
    AllGatherCorrectnessTest,
    testing::Combine(
        // op: not used by these tests, a single placeholder value
        testing::Values(ncclSum),
        // dataType (ncclFloat16 is left out)
        testing::Values(ncclInt8, ncclUint8,
                        ncclInt32, ncclUint32,
                        ncclInt64, ncclUint64,
                        ncclFloat32, ncclFloat64),
        // numElements
        testing::Values(3072, 3145728),
        // numDevices
        testing::Values(2, 3, 4),
        // inPlace
        testing::Values(false, true)));
} // namespace
+32
Ver ficheiro
@@ -0,0 +1,32 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLGATHER_HPP
#define TEST_ALLGATHER_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
// Fixture for the AllGather tests; supplies the host-side reference computation.
class AllGatherCorrectnessTest : public CorrectnessTest
{
public:
    // Computes the expected all-gather output on the host.
    // The dataset is split into numDevices equal byte slices; slice i of the
    // result is read from slice i of device i's input buffer. Every device's
    // expected buffer then receives a copy of the complete gathered result.
    static void ComputeExpectedResults(Dataset& dataset)
    {
        size_t const byteCount = dataset.NumBytes() / dataset.numDevices;

        // Temporary host buffer holding the gathered result
        int8_t* result = (int8_t *)malloc(dataset.NumBytes());

        // Collect slice i from device i
        for (int i = 0; i < dataset.numDevices; i++)
            HIP_CALL(hipMemcpy(result + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount),
                               byteCount, hipMemcpyDeviceToHost));

        // Every device is expected to end up with the full gathered buffer
        for (int i = 0; i < dataset.numDevices; i++)
            memcpy(dataset.expected[i], result, dataset.NumBytes());

        // Release the temporary buffer (it was leaked before)
        free(result);
    }
};
}
#endif
+60
Ver ficheiro
@@ -0,0 +1,60 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduce.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Runs one all-reduce over the whole dataset and compares every device's
// output against the host-computed reference.
TEST_P(AllReduceCorrectnessTest, Correctness)
{
    if (numDevicesAvailable < numDevices) return;

    // Allocate the buffers, fill the inputs and compute the reference output
    Dataset dataset;
    dataset.Initialize(numDevices, numElements, dataType, inPlace);
    FillDatasetWithPattern(dataset);
    ComputeExpectedResults(dataset, op);

    // Launch the reduction (1 thread per GPU)
    #pragma omp parallel for num_threads(numDevices)
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclAllReduce(dataset.inputs[dev],
                      dataset.outputs[dev],
                      numElements, dataType, op,
                      comms[dev], streams[dev]);
    }

    // Wait for the reduction to complete, then check and clean up
    Synchronize();
    ValidateResults(dataset);
    dataset.Release();
}
// Parameter sweep for the AllReduce test: the cartesian product of the lists below
INSTANTIATE_TEST_CASE_P(
    AllReduceCorrectnessSweep,
    AllReduceCorrectnessTest,
    testing::Combine(
        // op
        testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
        // dataType (ncclFloat16 is left out)
        testing::Values(ncclInt8, ncclUint8,
                        ncclInt32, ncclUint32,
                        ncclInt64, ncclUint64,
                        ncclFloat32, ncclFloat64),
        // numElements
        testing::Values(1024, 1048576),
        // numDevices
        testing::Values(2, 3, 4),
        // inPlace
        testing::Values(false, true)));
} // namespace
+76
Ver ficheiro
@@ -0,0 +1,76 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCE_HPP
#define TEST_ALLREDUCE_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
// Fixture for the AllReduce tests; supplies the host-side reference computation.
class AllReduceCorrectnessTest : public CorrectnessTest
{
public:
    // Computes the expected all-reduce output on the host.
    // Each device's input is first copied into its expected buffer, the buffers
    // are then reduced element-wise with `op` (via ReduceOp) into a temporary
    // array, and finally every expected buffer is overwritten with that result.
    // Terminates the process with a failure status if the temporary array cannot
    // be allocated or if the dataset uses an unsupported data type.
    static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
    {
        // Copy all inputs to expected arrays temporarily to perform reduction on host
        for (int i = 0; i < dataset.numDevices; i++)
            HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
                               dataset.NumBytes(), hipMemcpyDeviceToHost));

        // Allocate temporary host array to accumulate results
        int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
        if (resultI1 == NULL && dataset.NumBytes() != 0)
        {
            fprintf(stderr, "[ERROR] Unable to allocate host reduction buffer\n");
            exit(EXIT_FAILURE);
        }
        uint8_t*  resultU1 = (uint8_t  *)resultI1;
        int32_t*  resultI4 = (int32_t  *)resultI1;
        uint32_t* resultU4 = (uint32_t *)resultI1;
        int64_t*  resultI8 = (int64_t  *)resultI1;
        uint64_t* resultU8 = (uint64_t *)resultI1;
        float*    resultF4 = (float    *)resultI1;
        double*   resultF8 = (double   *)resultI1;

        // Initialize the result with the first device's array
        memcpy(resultI1, dataset.expected[0], dataset.NumBytes());

        // Perform reduction on the other device arrays
        for (int i = 1; i < dataset.numDevices; i++)
        {
            int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
            uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
            int32_t*  arrayI4 = (int32_t  *)arrayI1;
            uint32_t* arrayU4 = (uint32_t *)arrayI1;
            int64_t*  arrayI8 = (int64_t  *)arrayI1;
            uint64_t* arrayU8 = (uint64_t *)arrayI1;
            float*    arrayF4 = (float    *)arrayI1;
            double*   arrayF8 = (double   *)arrayI1;

            for (int j = 0; j < dataset.numElements; j++)
            {
                switch (dataset.dataType)
                {
                case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
                case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
                case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
                case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
                case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
                case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
                case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
                case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
                default:
                    // Exit with a failure status so the error is not mistaken for a passing run
                    fprintf(stderr, "[ERROR] Unsupported datatype\n");
                    exit(EXIT_FAILURE);
                }
            }
        }

        // Copy results into expected arrays
        for (int i = 0; i < dataset.numDevices; i++)
            memcpy(dataset.expected[i], resultI1, dataset.NumBytes());

        free(resultI1);
    }
};
}
#endif
+150
Ver ficheiro
@@ -0,0 +1,150 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduceAbort.hpp"
#include "../include/core.h"
#include <omp.h>
// Number of grouped collective calls issued before the test starts polling for completion
#define NUM_ITER 8
// Value stored in the fake op-count. Parenthesized so the macro still means
// NUM_ITER + 1 when it is expanded inside a larger expression.
#define FAKE_OP_COUNT (NUM_ITER + 1)
namespace CorrectnessTests
{
// Evaluates the HIP call `cmd` once. If the result is not hipSuccess, prints the
// error code together with the source line and file to std::cerr and terminates
// the process with exit(-1).
#define HIPCHECK(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
// Atomic load from the address VAR using __ATOMIC_SEQ_CST ordering
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
// Atomic store of SRC to the address DST using __ATOMIC_SEQ_CST ordering
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
// Exercises the abort path of all-reduce:
// 1. On GPU 0, the device-side opCountRem and head pointers of the ring's send
//    connector are redirected to host-allocated fake counters.
// 2. NUM_ITER grouped all-reduce calls are enqueued on every device.
// 3. The streams are polled until all report completion; once an asynchronous
//    NCCL error is reported or 10 seconds have elapsed, ncclCommAbort is called
//    on every communicator.
// The test performs no result validation.
TEST_P(AllReduceAbortTest, Correctness) {
    if (numDevices > numDevicesAvailable) return;

    // Prepare input / output buffers
    Dataset dataset;
    dataset.Initialize(numDevices, numElements, dataType, inPlace);
    FillDatasetWithPattern(dataset);

    int gpu = 0; // GPU number to trigger abort
    ncclComm_t comm = comms[gpu];
    HIPCHECK(hipSetDevice(gpu));
    hipStream_t stream;
    HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));

    // Compute where the send connector's opCountRem and head pointers sit
    // relative to the start of the host-side peers array, then apply the same
    // offsets to the device-side array (devPeers)
    struct ncclChannel* channel = comm->channels;
    struct ncclRing *ring = &channel->ring;
    struct ncclConnector* send = &channel->peers[ring->next].send;
    size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers;
    size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers;
    uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset;
    uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset;
    uint64_t *real_opCount, *fake_opCount, *fake_o;
    uint64_t *real_head, *fake_head, *fake_h;

    // get original opCount and head
    HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipStreamSynchronize(stream));

    // allocate and install fakes (each fake holds a single uint64_t counter)
    HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t), hipHostMallocMapped));
    HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
    *fake_opCount = FAKE_OP_COUNT;
    HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t), hipHostMallocMapped));
    HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
    *fake_head = 0;
    HIPCHECK(hipStreamSynchronize(stream));

    // read back fakes to confirm
    HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipStreamSynchronize(stream));

    // Perform a number of iterations and introduce abort
    for (int j = 0; j < NUM_ITER; j++) {
        // Start a group call
        ncclGroupStart();
        for (int i = 0; i < numDevices; i++) {
            ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
                          numElements, dataType, op, comms[i], streams[i]);
        }
        // Signal end of group call
        ncclGroupEnd();
    }

    // Wait for reduction to complete
    auto start = std::chrono::high_resolution_clock::now();
    int remaining = numDevices;
    // done[i] becomes 1 once the stream of device i has reported completion
    int* done = (int*)calloc(numDevices, sizeof(int));
    bool timeout = false, abort_called = false;
    while (remaining) {
        int idle = 1;
        for (int i = 0; i < numDevices; i++) {
            if (done[i]) continue;

            hipError_t hipErr = hipStreamQuery(streams[i]);
            if (hipErr == hipSuccess) {
                done[i] = 1;
                remaining--;
                idle = 0;
                continue;
            }

#if NCCL_MAJOR >= 2
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
            auto delta = std::chrono::high_resolution_clock::now() - start;
            double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
            if (deltaSec > 10.0 && !timeout) {
                std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl;
                timeout = true;
            }
            // Initialized so that a failing query is not read as garbage
            ncclResult_t ncclAsyncErr = ncclSuccess;
            ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
            if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
                // An asynchronous error happened (or the timeout expired).
                // Abort every communicator, once.
                std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
                for (int k = 0; k < numDevices; k++)
                    ncclCommAbort(comms[k]);
                abort_called = true;
                break;
            }
#endif
#endif
        }
        // We might want to let other threads (including NCCL threads) use the CPU.
        if (idle) pthread_yield();
    }

    // Release everything this test allocated (done and fake_head were leaked before)
    free(done);
    HIPCHECK(hipHostFree(fake_opCount));
    HIPCHECK(hipHostFree(fake_head));
    HIPCHECK(hipStreamDestroy(stream));
    dataset.Release();
}
// Parameter sweep for the AllReduce abort test: the cartesian product of the lists below
INSTANTIATE_TEST_CASE_P(
    AllReduceAbortSweep,
    AllReduceAbortTest,
    testing::Combine(
        // op
        testing::Values(ncclSum),
        // dataType
        testing::Values(ncclFloat32),
        // numElements
        testing::Values(1024, 1048576),
        // numDevices
        testing::Values(2, 4),
        // inPlace
        testing::Values(false)));
} // namespace
+20
Ver ficheiro
@@ -0,0 +1,20 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
// The include guard is unique to this header. It previously reused
// TEST_ALLREDUCE_HPP, so including this header together with
// test_AllReduce.hpp silently dropped whichever came second.
#ifndef TEST_ALLREDUCEABORT_HPP
#define TEST_ALLREDUCEABORT_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
    // Fixture for the all-reduce abort test; adds nothing to CorrectnessTest
    class AllReduceAbortTest : public CorrectnessTest
    {
    };
}
#endif
+69
Ver ficheiro
@@ -0,0 +1,69 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_Broadcast.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Broadcasts from every possible root in turn and compares every device's
// output against the host-computed reference.
TEST_P(BroadcastCorrectnessTest, Correctness)
{
    if (numDevicesAvailable < numDevices) return;

    // The buffers are allocated once and refilled for every root
    Dataset dataset;
    dataset.Initialize(numDevices, numElements, dataType, inPlace);

    for (int rootRank = 0; rootRank < numDevices; ++rootRank)
    {
        // Fill the inputs and compute the reference output for this root
        FillDatasetWithPattern(dataset);
        ComputeExpectedResults(dataset, rootRank);

        // Launch the broadcast (1 thread per GPU)
        #pragma omp parallel for num_threads(numDevices)
        for (int dev = 0; dev < numDevices; dev++)
        {
            ncclBroadcast(dataset.inputs[dev], dataset.outputs[dev],
                          numElements, dataType,
                          rootRank, comms[dev], streams[dev]);
        }

        // Wait for completion, then check this root's results
        Synchronize();
        ValidateResults(dataset);
    }

    dataset.Release();
}
// Parameter sweep for the Broadcast test: the cartesian product of the lists below
INSTANTIATE_TEST_CASE_P(
    BroadcastCorrectnessSweep,
    BroadcastCorrectnessTest,
    testing::Combine(
        // op: not used by this test, a single placeholder value
        testing::Values(ncclSum),
        // dataType (ncclFloat16 is left out)
        testing::Values(ncclInt8, ncclUint8,
                        ncclInt32, ncclUint32,
                        ncclInt64, ncclUint64,
                        ncclFloat32, ncclFloat64),
        // numElements
        testing::Values(1024, 1048576),
        // numDevices
        testing::Values(2, 3, 4),
        // inPlace
        testing::Values(false, true)));
} // namespace
+26
Ver ficheiro
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_BROADCAST_HPP
#define TEST_BROADCAST_HPP
#include "CorrectnessTest.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Fixture for the Broadcast tests; supplies the host-side reference computation.
class BroadcastCorrectnessTest : public CorrectnessTest
{
public:
    // Expected broadcast result: every device's expected buffer receives a
    // copy of the root device's input buffer.
    static void ComputeExpectedResults(Dataset& dataset, int const root)
    {
        for (int dev = 0; dev < dataset.numDevices; ++dev)
        {
            HIP_CALL(hipMemcpy(dataset.expected[dev],
                               dataset.inputs[root],
                               dataset.NumBytes(),
                               hipMemcpyDeviceToHost));
        }
    }
};
}
#endif
+153
Ver ficheiro
@@ -0,0 +1,153 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_BroadcastAbort.hpp"
#include "../include/core.h"
#include <omp.h>
// Number of grouped collective calls issued before the test starts polling for completion
#define NUM_ITER 8
// Value stored in the fake op-count. Parenthesized so the macro still means
// NUM_ITER + 1 when it is expanded inside a larger expression.
#define FAKE_OP_COUNT (NUM_ITER + 1)
namespace CorrectnessTests
{
// Evaluates the HIP call `cmd` once. If the result is not hipSuccess, prints the
// error code together with the source line and file to std::cerr and terminates
// the process with exit(-1).
#define HIPCHECK(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
// Atomic load from the address VAR using __ATOMIC_SEQ_CST ordering
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
// Atomic store of SRC to the address DST using __ATOMIC_SEQ_CST ordering
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
// Exercises the abort path of broadcast:
// 1. On GPU 0, the device-side opCountRem and head pointers of the ring's send
//    connector are redirected to host-allocated fake counters.
// 2. NUM_ITER grouped broadcast calls (root 0) are enqueued on every device.
// 3. The streams are polled until all report completion; once an asynchronous
//    NCCL error is reported or 10 seconds have elapsed, ncclCommAbort is called
//    on every communicator.
// The test performs no result validation.
TEST_P(BroadcastAbortTest, Correctness) {
    if (numDevices > numDevicesAvailable) return;

    // Prepare input / output buffers
    Dataset dataset;
    dataset.Initialize(numDevices, numElements, dataType, inPlace);
    FillDatasetWithPattern(dataset);

    int root = 0;
    int gpu = 0; // GPU number to trigger abort
    ncclComm_t comm = comms[gpu];
    HIPCHECK(hipSetDevice(gpu));
    hipStream_t stream;
    HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));

    // Compute where the send connector's opCountRem and head pointers sit
    // relative to the start of the host-side peers array, then apply the same
    // offsets to the device-side array (devPeers)
    struct ncclChannel* channel = comm->channels;
    struct ncclRing *ring = &channel->ring;
    struct ncclConnector* send = &channel->peers[ring->next].send;
    size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers;
    size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers;
    uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset;
    uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset;
    uint64_t *real_opCount, *fake_opCount, *fake_o;
    uint64_t *real_head, *fake_head, *fake_h;

    // get original opCount and head
    HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipStreamSynchronize(stream));

    // allocate and install fakes (each fake holds a single uint64_t counter)
    HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t), hipHostMallocMapped));
    HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
    *fake_opCount = FAKE_OP_COUNT;
    HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t), hipHostMallocMapped));
    HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
    *fake_head = 0;
    HIPCHECK(hipStreamSynchronize(stream));

    // read back fakes to confirm
    HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipStreamSynchronize(stream));

    // Perform a number of iterations and introduce abort
    for (int j = 0; j < NUM_ITER; j++) {
        // Start a group call
        ncclGroupStart();
        for (int i = 0; i < numDevices; i++) {
            ncclBroadcast(dataset.inputs[i],
                          dataset.outputs[i],
                          numElements, dataType,
                          root, comms[i], streams[i]);
        }
        // Signal end of group call
        ncclGroupEnd();
    }

    // Wait for the broadcasts to complete
    auto start = std::chrono::high_resolution_clock::now();
    int remaining = numDevices;
    // done[i] becomes 1 once the stream of device i has reported completion
    int* done = (int*)calloc(numDevices, sizeof(int));
    bool timeout = false, abort_called = false;
    while (remaining) {
        int idle = 1;
        for (int i = 0; i < numDevices; i++) {
            if (done[i]) continue;

            hipError_t hipErr = hipStreamQuery(streams[i]);
            if (hipErr == hipSuccess) {
                done[i] = 1;
                remaining--;
                idle = 0;
                continue;
            }

#if NCCL_MAJOR >= 2
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
            auto delta = std::chrono::high_resolution_clock::now() - start;
            double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
            if (deltaSec > 10.0 && !timeout) {
                std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl;
                timeout = true;
            }
            // Initialized so that a failing query is not read as garbage
            ncclResult_t ncclAsyncErr = ncclSuccess;
            ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
            if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
                // An asynchronous error happened (or the timeout expired).
                // Abort every communicator, once.
                std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
                for (int k = 0; k < numDevices; k++)
                    ncclCommAbort(comms[k]);
                abort_called = true;
                break;
            }
#endif
#endif
        }
        // We might want to let other threads (including NCCL threads) use the CPU.
        if (idle) pthread_yield();
    }

    // Release everything this test allocated (done and fake_head were leaked before)
    free(done);
    HIPCHECK(hipHostFree(fake_opCount));
    HIPCHECK(hipHostFree(fake_head));
    HIPCHECK(hipStreamDestroy(stream));
    dataset.Release();
}
// Parameter sweep for the Broadcast abort test: the cartesian product of the lists below
INSTANTIATE_TEST_CASE_P(
    BroadcastAbortSweep,
    BroadcastAbortTest,
    testing::Combine(
        // op
        testing::Values(ncclSum),
        // dataType
        testing::Values(ncclFloat32),
        // numElements
        testing::Values(1048576),
        // numDevices
        testing::Values(2, 4),
        // inPlace
        testing::Values(false)));
} // namespace
+20
Ver ficheiro
@@ -0,0 +1,20 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
// The include guard is unique to this header. It previously reused
// TEST_ALLREDUCE_HPP, so including this header together with
// test_AllReduce.hpp silently dropped whichever came second.
#ifndef TEST_BROADCASTABORT_HPP
#define TEST_BROADCASTABORT_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
    // Fixture for the broadcast abort test; adds nothing to CorrectnessTest
    class BroadcastAbortTest : public CorrectnessTest
    {
    };
}
#endif
+99
Ver ficheiro
@@ -0,0 +1,99 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_CombinedCalls.hpp"
#include "test_AllGather.hpp"
#include "test_AllReduce.hpp"
#include "test_Broadcast.hpp"
#include "test_Reduce.hpp"
#include "test_ReduceScatter.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Issues all five collectives back to back from each device's thread (without
// a group call), each on its own dataset, then validates every dataset.
TEST_P(CombinedCallsCorrectnessTest, Correctness)
{
    if (numDevicesAvailable < numDevices) return;

    // One dataset per collective in the combined operation
    std::vector<Dataset> datasets(5);
    for (Dataset& current : datasets)
    {
        current.Initialize(numDevices, numElements, dataType, inPlace);
        FillDatasetWithPattern(current);
    }

    Dataset& gatherData    = datasets[0];
    Dataset& allReduceData = datasets[1];
    Dataset& broadcastData = datasets[2];
    Dataset& reduceData    = datasets[3];
    Dataset& scatterData   = datasets[4];

    // Host-side reference results for each collective
    int const root = 0;
    AllGatherCorrectnessTest::ComputeExpectedResults(gatherData);
    AllReduceCorrectnessTest::ComputeExpectedResults(allReduceData, op);
    BroadcastCorrectnessTest::ComputeExpectedResults(broadcastData, root);
    ReduceCorrectnessTest::ComputeExpectedResults(reduceData, op, root);
    ReduceScatterCorrectnessTest::ComputeExpectedResults(scatterData, op);

    // Per-device slice size used by AllGather and ReduceScatter
    size_t const sliceBytes = gatherData.NumBytes() / numDevices;
    size_t const sliceElems = numElements / numDevices;

    // Enqueue all five collectives (1 thread per GPU)
    #pragma omp parallel for num_threads(numDevices)
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclAllGather((int8_t *)gatherData.inputs[dev] + (dev * sliceBytes),
                      gatherData.outputs[dev], sliceElems,
                      dataType, comms[dev], streams[dev]);

        ncclAllReduce(allReduceData.inputs[dev], allReduceData.outputs[dev],
                      numElements, dataType, op, comms[dev], streams[dev]);

        ncclBroadcast(broadcastData.inputs[dev], broadcastData.outputs[dev],
                      numElements, dataType,
                      root, comms[dev], streams[dev]);

        ncclReduce(reduceData.inputs[dev], reduceData.outputs[dev],
                   numElements, dataType, op,
                   root, comms[dev], streams[dev]);

        ncclReduceScatter(scatterData.inputs[dev],
                          (int8_t *)scatterData.outputs[dev] + (dev * sliceBytes),
                          sliceElems, dataType, op,
                          comms[dev], streams[dev]);
    }

    // Wait for all collectives to complete
    Synchronize();

    // Check, then release, each collective's dataset in order
    for (Dataset& current : datasets)
    {
        ValidateResults(current);
        current.Release();
    }
}
// Parameter sweep for the combined-calls test: the cartesian product of the lists below
INSTANTIATE_TEST_CASE_P(
    CombinedCallsCorrectnessSweep,
    CombinedCallsCorrectnessTest,
    testing::Combine(
        // op: passed to the reducing collectives in the test body
        testing::Values(ncclSum),
        // dataType (ncclFloat16 is left out)
        testing::Values(ncclInt8, ncclUint8,
                        ncclInt32, ncclUint32,
                        ncclInt64, ncclUint64,
                        ncclFloat32, ncclFloat64),
        // numElements
        testing::Values(3072, 3145728),
        // numDevices
        testing::Values(2, 3, 4),
        // inPlace
        testing::Values(false, true)));
} // namespace
+17
Ver ficheiro
@@ -0,0 +1,17 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_COMBINEDCALLS_HPP
#define TEST_COMBINEDCALLS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
// Fixture for the combined-calls test; adds nothing to CorrectnessTest
class CombinedCallsCorrectnessTest : public CorrectnessTest
{
};
}
#endif
+120
Ver ficheiro
@@ -0,0 +1,120 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_GroupCalls.hpp"
#include "test_AllGather.hpp"
#include "test_AllReduce.hpp"
#include "test_Broadcast.hpp"
#include "test_Reduce.hpp"
#include "test_ReduceScatter.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Issues all five collectives for all devices inside a single
// ncclGroupStart()/ncclGroupEnd() pair, each collective on its own dataset,
// then validates every dataset.
TEST_P(GroupCallsCorrectnessTest, Correctness)
{
    if (numDevicesAvailable < numDevices) return;

    // One dataset per collective in the group operation
    std::vector<Dataset> datasets(5);
    for (Dataset& current : datasets)
    {
        current.Initialize(numDevices, numElements, dataType, inPlace);
        FillDatasetWithPattern(current);
    }

    Dataset& gatherData    = datasets[0];
    Dataset& allReduceData = datasets[1];
    Dataset& broadcastData = datasets[2];
    Dataset& reduceData    = datasets[3];
    Dataset& scatterData   = datasets[4];

    // Host-side reference results for each collective
    int const root = 0;
    AllGatherCorrectnessTest::ComputeExpectedResults(gatherData);
    AllReduceCorrectnessTest::ComputeExpectedResults(allReduceData, op);
    BroadcastCorrectnessTest::ComputeExpectedResults(broadcastData, root);
    ReduceCorrectnessTest::ComputeExpectedResults(reduceData, op, root);
    ReduceScatterCorrectnessTest::ComputeExpectedResults(scatterData, op);

    // Start a group call
    ncclGroupStart();

    // Per-device slice size used by AllGather and ReduceScatter
    size_t const sliceBytes = gatherData.NumBytes() / numDevices;
    size_t const sliceElems = numElements / numDevices;

    // AllGather
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclAllGather((int8_t *)gatherData.inputs[dev] + (dev * sliceBytes),
                      gatherData.outputs[dev], sliceElems,
                      dataType, comms[dev], streams[dev]);
    }

    // AllReduce
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclAllReduce(allReduceData.inputs[dev], allReduceData.outputs[dev],
                      numElements, dataType, op, comms[dev], streams[dev]);
    }

    // Broadcast
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclBroadcast(broadcastData.inputs[dev], broadcastData.outputs[dev],
                      numElements, dataType,
                      root, comms[dev], streams[dev]);
    }

    // Reduce
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclReduce(reduceData.inputs[dev], reduceData.outputs[dev],
                   numElements, dataType, op,
                   root, comms[dev], streams[dev]);
    }

    // ReduceScatter
    for (int dev = 0; dev < numDevices; dev++)
    {
        ncclReduceScatter(scatterData.inputs[dev],
                          (int8_t *)scatterData.outputs[dev] + (dev * sliceBytes),
                          sliceElems, dataType, op,
                          comms[dev], streams[dev]);
    }

    // Signal end of group call
    ncclGroupEnd();

    // Wait for all collectives to complete
    Synchronize();

    // Check, then release, each collective's dataset in order
    for (Dataset& current : datasets)
    {
        ValidateResults(current);
        current.Release();
    }
}
// Instantiates GroupCallsCorrectnessTest over the cross product of the value lists below
INSTANTIATE_TEST_CASE_P(GroupCallsCorrectnessSweep,
GroupCallsCorrectnessTest,
testing::Combine(
// Reduction operator (passed to the AllReduce, Reduce and ReduceScatter calls of the group)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64),
// Number of elements (both values are divisible by every device count below)
testing::Values(3072, 3145728),
// Number of devices
testing::Values(2,3,4),
// In-place or not
testing::Values(false, true)));
} // namespace
+17
Ver ficheiro
@@ -0,0 +1,17 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_GROUPCALLS_HPP
#define TEST_GROUPCALLS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
// Fixture for the group-call correctness test; adds no members to CorrectnessTest
class GroupCallsCorrectnessTest : public CorrectnessTest {};
}
#endif
+68
Ver ficheiro
@@ -0,0 +1,68 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_Reduce.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Runs ncclReduce once per possible root and validates the outputs each time.
TEST_P(ReduceCorrectnessTest, Correctness)
{
    // Return without testing when more devices are requested than numDevicesAvailable
    bool const enoughDevices = (numDevices <= numDevicesAvailable);
    if (!enoughDevices) return;

    // A single dataset is allocated up front and reused for every root
    Dataset data;
    data.Initialize(numDevices, numElements, dataType, inPlace);

    int rootRank = 0;
    while (rootRank < numDevices)
    {
        // Refill the inputs and recompute the expected values for this root
        FillDatasetWithPattern(data);
        ComputeExpectedResults(data, op, rootRank);

        // One OpenMP thread per device issues that device's ncclReduce call
        #pragma omp parallel for num_threads(numDevices)
        for (int rank = 0; rank < numDevices; rank++)
        {
            ncclReduce(data.inputs[rank], data.outputs[rank],
                       numElements, dataType, op,
                       rootRank, comms[rank], streams[rank]);
        }

        // Block until the reduction has finished, then compare against expectations
        Synchronize();
        ValidateResults(data);

        ++rootRank;
    }

    data.Release();
}
// Instantiates ReduceCorrectnessTest over the cross product of the value lists below
INSTANTIATE_TEST_CASE_P(ReduceCorrectnessSweep,
ReduceCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4),
// In-place or not
testing::Values(false, true)));
} // namespace
+80
Ver ficheiro
@@ -0,0 +1,80 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_REDUCE_HPP
#define TEST_REDUCE_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
// Fixture for ncclReduce correctness tests; provides the host-side reference computation.
class ReduceCorrectnessTest : public CorrectnessTest
{
public:
    // Fills dataset.expected with the reference result of a reduce onto `root`.
    //
    // dataset - inputs are read from dataset.inputs; dataset.expected is overwritten
    // op      - reduction operator applied element-wise via ReduceOp
    // root    - index of the device whose expected array receives the reduced values
    //
    // For every non-root device, expected is set to the current contents of that
    // device's output buffer.
    // Terminates the process with a non-zero exit code if host memory cannot be
    // allocated or if dataset.dataType is not one of the handled types.
    static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root)
    {
        // Copy all inputs to expected arrays temporarily to perform reduction on host
        for (int i = 0; i < dataset.numDevices; i++)
            HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
                               dataset.NumBytes(), hipMemcpyDeviceToHost));

        // Allocate temporary host array to accumulate results
        int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
        if (!resultI1)
        {
            fprintf(stderr, "[ERROR] Unable to allocate host memory for expected results\n");
            exit(1);
        }

        // Typed views of the same accumulation buffer, one per supported datatype
        uint8_t*  resultU1 = (uint8_t  *)resultI1;
        int32_t*  resultI4 = (int32_t  *)resultI1;
        uint32_t* resultU4 = (uint32_t *)resultI1;
        int64_t*  resultI8 = (int64_t  *)resultI1;
        uint64_t* resultU8 = (uint64_t *)resultI1;
        float*    resultF4 = (float    *)resultI1;
        double*   resultF8 = (double   *)resultI1;

        // Initialize the result with the first device's array
        memcpy(resultI1, dataset.expected[0], dataset.NumBytes());

        // Perform reduction on the other device arrays
        for (int i = 1; i < dataset.numDevices; i++)
        {
            int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
            uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
            int32_t*  arrayI4 = (int32_t  *)arrayI1;
            uint32_t* arrayU4 = (uint32_t *)arrayI1;
            int64_t*  arrayI8 = (int64_t  *)arrayI1;
            uint64_t* arrayU8 = (uint64_t *)arrayI1;
            float*    arrayF4 = (float    *)arrayI1;
            double*   arrayF8 = (double   *)arrayI1;

            for (int j = 0; j < dataset.numElements; j++)
            {
                switch (dataset.dataType)
                {
                case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
                case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
                case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
                case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
                case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
                case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
                case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
                case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
                default:
                    fprintf(stderr, "[ERROR] Unsupported datatype\n");
                    // Non-zero status so the failure is not reported as a successful run
                    exit(1);
                }
            }
        }

        // Copy results into expected arrays
        for (int i = 0; i < dataset.numDevices; i++)
        {
            if (i == root)
                memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
            else
                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
        }

        free(resultI1);
    }
};
}
#endif
+67
Ver ficheiro
@@ -0,0 +1,67 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_ReduceScatter.hpp"
#include <omp.h>
namespace CorrectnessTests
{
// Runs a single ncclReduceScatter across all devices and validates the outputs.
TEST_P(ReduceScatterCorrectnessTest, Correctness)
{
    // Return without testing when the device count is too large or the element
    // count cannot be split into equal per-device slices
    if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
        return;

    // Prepare input / output / expected results
    Dataset data;
    data.Initialize(numDevices, numElements, dataType, inPlace);
    FillDatasetWithPattern(data);
    ComputeExpectedResults(data, op);

    // Size of one per-device slice, in elements and in bytes
    size_t const sliceElems = data.numElements / data.numDevices;
    size_t const sliceBytes = data.NumBytes() / data.numDevices;

    // One OpenMP thread per device issues that device's ncclReduceScatter call
    #pragma omp parallel for num_threads(numDevices)
    for (int rank = 0; rank < numDevices; rank++)
    {
        // Device `rank` receives into the rank-th slice of its output buffer
        int8_t* const recvPtr = (int8_t *)data.outputs[rank] + (rank * sliceBytes);
        ncclReduceScatter(data.inputs[rank], recvPtr,
                          sliceElems, dataType, op,
                          comms[rank], streams[rank]);
    }

    // Block until the collective has finished, then compare against expectations
    Synchronize();
    ValidateResults(data);

    data.Release();
}
// Instantiates ReduceScatterCorrectnessTest over the cross product of the value lists below
INSTANTIATE_TEST_CASE_P(ReduceScatterCorrectnessSweep,
ReduceScatterCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64),
// Number of elements (both values are divisible by every device count below)
testing::Values(3072, 3145728),
// Number of devices
testing::Values(2,3,4),
// In-place or not
testing::Values(false, true)));
} // namespace
+83
Ver ficheiro
@@ -0,0 +1,83 @@
/*************************************************************************
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_REDUCE_SCATTER_HPP
#define TEST_REDUCE_SCATTER_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ReduceScatterCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
// Copy results into expected arrays
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
for (int i = 0; i < dataset.numDevices; i++)
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
resultI1 + (i * byteCount), byteCount);
free(resultI1);
}
};
}
#endif
+16
Ver ficheiro
@@ -0,0 +1,16 @@
# Build rules for the TransferBench tool.
# HIP_PATH may be set by the caller; otherwise /opt/rocm/hip is used when it
# exists, falling back to ../../.. when it does not.
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc

EXE=TransferBench
CXXFLAGS = -O3 -fopenmp -I../../src/include -I.

# Every header below this directory is a prerequisite, so that editing either a
# .hpp or a .h file (e.g. copy_kernel.h) triggers a rebuild
HEADERS := $(shell find . -name "*.hpp" -o -name "*.h")

# all and clean do not name files
.PHONY: all clean

all: $(EXE)

# Only the .cpp file ($<) is passed to the compiler; headers are dependencies only
$(EXE): $(EXE).cpp $(HEADERS)
	$(HIPCC) $(CXXFLAGS) $< -o $@

clean:
	rm -f *.o $(EXE)
+313
Ver ficheiro
@@ -0,0 +1,313 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// This program measures simultaneous copy performance across multiple GPUs
// on the same node
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <set>
#include <hip/hip_runtime.h>
#include "copy_kernel.h"
#include "TransferBench.hpp"
// Entry point of TransferBench.
//
// argv[1] - path to a configuration file; each line describes one set of links
//           that are exercised simultaneously (see the usage text below)
// argv[2] - (optional) number of bytes to transfer per link
//
// For every configuration line the program allocates source / destination
// buffers, runs warm-up and timed iterations of either CopyKernel or
// hipMemcpyAsync on all links at once, validates the destination buffers and
// prints per-GPU and total bandwidth. Returns 0 on success; any error
// terminates the process through exit().
int main(int argc, char **argv)
{
    // Display usage
    if (argc <= 1)
    {
        printf("Usage: %s configFile <N>\n", argv[0]);
        printf("- configFile: file describing topologies to test\n");
        printf(" Each line should contain a single topology\n");
        printf(" L - number of links followed by L white-space separated triples (src, dst, # blocks)\n");
        printf(" For example:\n");
        printf(" 2 0 1 3 1 0 3\n");
        printf(" would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n");
        printf("- N: (Optional) Number of bytes to transfer per link.\n");
        printf(" If not specified, defaults to 2^28=256MB. Must be a multiple of 128 bytes\n");
        printf("Set env var USE_MEMCPY_ASYNC to use hipMemcpyAsync instead of copy kernel\n");
        exit(0);
    }

    // Parse number of bytes to use (or use default if not specified).
    // The value is kept signed until validated so that zero, negative and
    // non-numeric input (atoll yields 0) is rejected instead of wrapping around.
    long long const requestedBytes = argc > 2 ? atoll(argv[2]) : (1LL << 28);
    if (requestedBytes <= 0 || requestedBytes % 128)
    {
        printf("[ERROR] numBytesPerLink (%lld) must be a positive multiple of 128\n", requestedBytes);
        exit(1);
    }
    size_t const numBytesPerLink = (size_t)requestedBytes;
    size_t N = numBytesPerLink / sizeof(float);

    // Currently an environment variable is required in order to enable fine-grained VRAM allocations
    if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
    {
        printf("[ERROR] Currently you must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
        exit(1);
    }

    bool useMemcpy = (getenv("USE_MEMCPY_ASYNC") != nullptr);
    printf("Using %s\n", useMemcpy ? "hipMemcpyAsync (USE_MEMCPY_ASYNC found) [# of blocks to use will be ignored]" : "copy kernel (USE_MEMCPY_ASYNC not found)");

    // Collect the number of available GPUs on this machine
    int numDevices;
    HIP_CALL(hipGetDeviceCount(&numDevices));
    if (numDevices < 1)
    {
        printf("[ERROR] No GPU devices found\n");
        exit(1);
    }

    // Print header
    printf("%-*s(GB/s)", MAX_NAME_LEN - 6, "Configuration");
    for (int i = 0; i < numDevices; i++)
        printf(" GPU %02d", i);
    printf(" Total\n");
    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
    printf("\n");

    // Read configuration file
    FILE* fp = fopen(argv[1], "r");
    if (!fp)
    {
        printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
        exit(1);
    }

    // Track links that get used
    std::map<std::pair<int, int>, int> linkMap;

    char line[2048];
    while(fgets(line, 2048, fp))
    {
        // Parse links from configuration file
        std::vector<Link> links;
        ParseLinks(line, links);

        int const numLinks = links.size();
        if (numLinks == 0) continue;

        // Clear counters
        int linkCount[numDevices];
        for (int i = 0; i < numDevices; i++)
            linkCount[i] = 0;

        float* linkSrcMem[numLinks];
        float* linkDstMem[numLinks];
        hipStream_t streams[numLinks];
        hipEvent_t startEvents[numLinks];
        hipEvent_t stopEvents[numLinks];
        std::vector<BlockParam> cpuBlockParams[numLinks];
        BlockParam* gpuBlockParams[numLinks];

        char name[MAX_NAME_LEN+1] = {};
        for (int i = 0; i < numLinks; i++)
        {
            int const src = links[i].srcGpu;
            int const dst = links[i].dstGpu;
            int const numBlocks = links[i].numBlocksToUse;
            if (src < 0 || src >= numDevices ||
                dst < 0 || dst >= numDevices)
            {
                printf("[ERROR] Invalid link (%d to %d). Total devices: %d\n", src, dst, numDevices);
                exit(1);
            }

            // The block count is used as a divisor and as a grid size below
            if (numBlocks <= 0)
            {
                printf("[ERROR] Invalid number of blocks (%d) for link %d to %d\n", numBlocks, src, dst);
                exit(1);
            }

            // Append to the configuration name, never writing past the end of the
            // buffer (longer names are truncated)
            size_t const nameLen = strlen(name);
            snprintf(name + nameLen, sizeof(name) - nameLen, "%d->%d:%d ", src, dst, numBlocks);

            // Enable peer-to-peer access if this is the first time seeing this pair
            auto linkPair = std::make_pair(src, dst);
            linkMap[linkPair]++;
            if (linkMap[linkPair] == 1)
            {
                int canAccess;
                HIP_CALL(hipDeviceCanAccessPeer(&canAccess, src, dst));
                if (!canAccess)
                {
                    printf("[ERROR] Unable to enable peer access between device %d and %d\n", src, dst);
                    exit(1);
                }
                HIP_CALL(hipSetDevice(src));
                HIP_CALL(hipDeviceEnablePeerAccess(dst, 0));
            }

            // Count # of links / total blocks each GPU will be working on
            linkCount[src]++;

            // Allocate GPU memory on source GPU / streams / events
            HIP_CALL(hipSetDevice(src));
            HIP_CALL(hipStreamCreate(&streams[i]));
            HIP_CALL(hipEventCreate(&startEvents[i]));
            HIP_CALL(hipEventCreate(&stopEvents[i]));
            HIP_CALL(hipMalloc((void **)&linkSrcMem[i], numBytesPerLink));
            // One BlockParam per threadblock of this link (sized to match the copy below)
            HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numBlocks));
            CheckOrFill(N, linkSrcMem[i], false);

            // Allocate fine-grained GPU memory on destination GPU
            HIP_CALL(hipSetDevice(dst));
            HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));

            // Each block needs to know src/dst pointers and how many elements to transfer
            // Figure out the sub-array each block does for this link
            // NOTE: Have each sub-array to work on multiple of 32-floats (128-bytes),
            //       but divide as evenly as possible
            // NOTE: N is always a multiple of 32
            int blocksWithExtra = (N / 32) % numBlocks;
            int perBlockBaseN   = (N / 32) / numBlocks * 32;
            for (int j = 0; j < numBlocks; j++)
            {
                // The first blocksWithExtra blocks each take one extra 32-float chunk,
                // so block j starts after min(j, blocksWithExtra) extra chunks
                int const extraChunksBefore = (j < blocksWithExtra) ? j : blocksWithExtra;
                int const elemOffset = (j * perBlockBaseN) + extraChunksBefore * 32;

                BlockParam param;
                param.N   = perBlockBaseN + ((j < blocksWithExtra) ? 32 : 0);
                param.src = linkSrcMem[i] + elemOffset;
                param.dst = linkDstMem[i] + elemOffset;
                cpuBlockParams[i].push_back(param);
            }

            HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(),
                               sizeof(BlockParam) * numBlocks, hipMemcpyHostToDevice));
        }

        // Launch kernels (warmup iterations are not counted)
        int numWarmups = 3;
        int numIterations = 10;
        double totalCpuTime = 0;
        double totalGpuTime[numDevices];
        for (int i = 0; i < numDevices; i++) totalGpuTime[i] = 0.0;

        // Negative iteration numbers are the warm-up iterations
        for (int iteration = -numWarmups; iteration < numIterations; iteration++)
        {
            auto cpuStart = std::chrono::high_resolution_clock::now();

            #pragma omp parallel for num_threads(numLinks)
            for (int i = 0; i < numLinks; i++)
            {
                HIP_CALL(hipSetDevice(links[i].srcGpu));
                HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
                if (useMemcpy)
                {
                    HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i],
                                            numBytesPerLink, hipMemcpyDeviceToDevice,
                                            streams[i]));
                }
                else
                {
                    hipLaunchKernelGGL(CopyKernel,
                                       dim3(links[i].numBlocksToUse, 1, 1),
                                       dim3(BLOCKSIZE, 1, 1),
                                       0,
                                       streams[i],
                                       gpuBlockParams[i]);
                }
                HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
            }

            // Wait for every link to finish; failures are reported instead of ignored
            for (int i = 0; i < numLinks; i++)
                HIP_CALL(hipStreamSynchronize(streams[i]));

            auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
            double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();

            if (iteration >= 0)
            {
                totalCpuTime += deltaSec;
                for (int i = 0; i < numDevices; i++)
                {
                    // Multiple links running on the same device may be running simultaneously
                    // so try to figure out the first/last event across all links
                    float maxTime = 0.0f;
                    for (int j = 0; j < numLinks; j++)
                    {
                        if (links[j].srcGpu != i) continue;
                        for (int k = 0; k < numLinks; k++)
                        {
                            if (links[k].srcGpu != i) continue;
                            float gpuDeltaMsec;
                            HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
                            maxTime = std::max(maxTime, gpuDeltaMsec);
                        }
                    }
                    // Events report milliseconds; totals are kept in seconds
                    totalGpuTime[i] += maxTime / 1000.0;
                }
            }
        }

        // Validate that each link has transferred correctly
        for (int i = 0; i < numLinks; i++)
            CheckOrFill(N, linkDstMem[i], true);

        // Report timings
        printf("%-*s", MAX_NAME_LEN, name);
        for (int i = 0; i < numDevices; i++)
        {
            if (linkCount[i] == 0)
            {
                printf("%8.3f", 0.0);
            }
            else
            {
                totalGpuTime[i] /= (1.0 * numIterations);
                printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
            }
        }

        // Print off bandwidth (based on CPU wall-time timer)
        totalCpuTime /= numIterations;
        printf("%8.3f\n", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);

        // Release GPU memory
        for (int i = 0; i < numLinks; i++)
        {
            HIP_CALL(hipFree(linkSrcMem[i]));
            HIP_CALL(hipFree(linkDstMem[i]));
            HIP_CALL(hipFree(gpuBlockParams[i]));
            HIP_CALL(hipStreamDestroy(streams[i]));
            HIP_CALL(hipEventDestroy(startEvents[i]));
            HIP_CALL(hipEventDestroy(stopEvents[i]));
        }
    }
    fclose(fp);

    // Print link information
    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
    printf("\n");
    printf("Link topology:\n");
    uint32_t linkType;
    uint32_t hopCount;
    for (auto mapPair : linkMap)
    {
        int src = mapPair.first.first;
        int dst = mapPair.first.second;
        HIP_CALL(hipExtGetLinkTypeAndHopCount(src, dst, &linkType, &hopCount));
        printf("%d -> %d: %s [%u hop(s)]\n", src, dst,
               linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "HYPERTRANSPORT" :
               linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? "QPI" :
               linkType == HSA_AMD_LINK_INFO_TYPE_PCIE           ? "PCIE" :
               linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND      ? "INFINIBAND" :
               linkType == HSA_AMD_LINK_INFO_TYPE_XGMI           ? "XGMI" : "UNKNOWN",
               hopCount);
    }
    return 0;
}
+111
Ver ficheiro
@@ -0,0 +1,111 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Helper macro for catching HIP errors.
// Evaluates cmd exactly once; if the result is not hipSuccess, prints the HIP
// error string together with the source line / file and terminates the process
// with exit(-1).
// The status variable has a macro-specific name so that it cannot shadow an
// identifier (such as a caller's own `error` variable) used inside cmd.
#define HIP_CALL(cmd)                                                                     \
    do {                                                                                  \
        hipError_t hipCallStatus_ = (cmd);                                                \
        if (hipCallStatus_ != hipSuccess)                                                 \
        {                                                                                 \
            std::cerr << "Encountered HIP error (" << hipGetErrorString(hipCallStatus_)   \
                      << ") at line " << __LINE__ << " in file " << __FILE__ << "\n";     \
            exit(-1);                                                                     \
        }                                                                                 \
    } while (0)
// Width (in characters) of the configuration-name column and of the name buffer
#define MAX_NAME_LEN 64
// Threads per threadblock: used as the launch bound and block dimension of CopyKernel
#define BLOCKSIZE 256
// Unroll factor passed as the UNROLL template argument of Copy<>
#define COPY_UNROLL 4
// Each link is defined between a source GPU and destination GPU
// (one entry is produced by ParseLinks per triple in a configuration line)
struct Link
{
int srcGpu; // Source GPU (global memory source)
int dstGpu; // Destination GPU (fine-grained memory destination)
int numBlocksToUse; // Number of threadblocks to use for this link
};
// Each threadblock copies N floats from src to dst
// (CopyKernel indexes an array of these by blockIdx.x)
struct BlockParam
{
int N; // Number of floats this threadblock copies
float* src; // First float to read
float* dst; // First float to write
};
// GPU copy kernel.
// blockParams holds one entry per threadblock; each block looks up its own
// entry by block index and copies that entry's N floats from src to dst.
__global__ void __launch_bounds__(BLOCKSIZE)
CopyKernel(BlockParam* blockParams)
{
    // Work description for this threadblock
    BlockParam const& work = blockParams[blockIdx.x];

    float const* __restrict__ source = work.src;
    float*       __restrict__ target = work.dst;
    int const                 count  = work.N;

    Copy<COPY_UNROLL, BLOCKSIZE>(target, source, count);
}
// Helper function to parse a link of link definitions
void ParseLinks(char const* line, std::vector<Link>& links)
{
links.clear();
int numLinks = 0;
std::istringstream iss;
iss.clear();
iss.str(line);
iss >> numLinks;
links.resize(numLinks);
if (iss.fail()) return;
for (int i = 0; i < numLinks; i++)
iss >> links[i].srcGpu >> links[i].dstGpu >> links[i].numBlocksToUse;
}
// Helper function to either fill a device pointer with a deterministic
// reference pattern, or to check that it matches that pattern.
//
// N       - number of floats pointed to by devPtr (nothing is done when N <= 0)
// devPtr  - device buffer to fill or to verify
// doCheck - false: copy the reference pattern to devPtr
//           true : copy devPtr to the host and compare it element by element;
//                  the first mismatch is reported and the process exits with 1
void CheckOrFill(int N, float* devPtr, bool doCheck)
{
    if (N <= 0) return;

    // Reference pattern; element i holds i % 383 + 31
    std::vector<float> refBuffer(N);
    for (int i = 0; i < N; i++)
        refBuffer[i] = i % 383 + 31;

    if (doCheck)
    {
        // Host copy of the device data (released automatically on return)
        std::vector<float> hostBuffer(N);
        HIP_CALL(hipMemcpy(hostBuffer.data(), devPtr, N * sizeof(float), hipMemcpyDeviceToHost));
        for (int i = 0; i < N; i++)
        {
            if (refBuffer[i] != hostBuffer[i])
            {
                printf("[ERROR] Mismatch at element %d Ref: %f Actual: %f\n", i, refBuffer[i], hostBuffer[i]);
                exit(1);
            }
        }
    }
    else
    {
        HIP_CALL(hipMemcpy(devPtr, refBuffer.data(), N * sizeof(float), hipMemcpyHostToDevice));
    }
}
+310
Ver ficheiro
@@ -0,0 +1,310 @@
/*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COPY_KERNEL_H_
#define COPY_KERNEL_H_
#include <cstdio>
#include <cstdint>
// Define min for ssize_t
// Device-side overload for mixed (int, ssize_t) arguments; the result is returned as int
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
// 64-bit word used to move several elements at once
typedef uint64_t PackType;
// Applies FUNC to two packed 64-bit words.
// This generic version passes the packed words to FUNC as-is; the float
// specialization elsewhere in this file unpacks them into individual elements.
template<class FUNC, typename T>
struct MULTI {
__device__ PackType operator()(const PackType x, const PackType y) const
{
return FUNC()(x, y);
}
};
// Rounds x up to a multiple of a (x itself when already a multiple).
// The bit-mask arithmetic only gives that result when a is a power of two.
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
// Returns ptr rounded up to the next address that is a multiple of align (see ALIGNUP)
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
size_t ptrval = reinterpret_cast<size_t>(ptr);
return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}
// Reads one value of type T through a volatile pointer and returns it
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
}
// Writes val through a volatile pointer
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
// Element-wise copy / reduce over N elements of type T.
// Thread tid handles indices tid, tid + nthreads, tid + 2 * nthreads, ...
// - TWO_INPUTS : when true, the value read from src0 is combined with src1 via FUNC
// - TWO_OUTPUTS: when true, the resulting value is also stored to dest1
// src1 / dest1 are not accessed when the corresponding flag is false.
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__attribute__((noinline))
__device__ inline void ReduceCopy(
const int tid, const int nthreads,
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1,
volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1, const int N) {
for (int idx = tid; idx < N; idx += nthreads) {
T val = vFetch(src0+idx);
if (TWO_INPUTS) {
val = FUNC()(val, vFetch(src1+idx));
}
vStore(dest0+idx, val);
if (TWO_OUTPUTS) {
vStore(dest1+idx, val);
}
}
}
// Binary functor that ignores its second argument and returns the first unchanged
template<typename T>
struct FuncPassA {
__device__ T operator()(const T x, const T y) const {
return x;
}
};
// Binary functor returning the sum of its two arguments
template<typename T>
struct FuncSum {
__device__ T operator()(const T x, const T y) const {
return x + y;
}
};
// Specialization of MULTI for float: a PackType holds two floats, and FUNC is
// applied to each of the two float lanes independently.
template<class FUNC>
struct MULTI<FUNC, float> {
static_assert(sizeof(PackType) == 2 * sizeof(float),
"PackType must be twice the size of float.");
// Reinterprets a packed 64-bit word as two floats (a, b)
union converter {
PackType storage;
struct {
float a, b;
};
};
__device__ PackType operator()(const PackType x, const PackType y) const {
converter cx, cy, cr;
cx.storage = x;
cy.storage = y;
cr.a = FUNC()(cx.a, cy.a);
cr.b = FUNC()(cx.b, cy.b);
return cr.storage;
}
};
// Pack type used by the 128-bit load/store routines (two components, .x and .y)
typedef ulong2 Pack128;
// Applies MULTI<FUNC, T> to both components of two Pack128 values.
// The result is written back into x; y is left unmodified.
template<class FUNC, typename T>
struct MULTI128 {
__device__ void operator()(Pack128& x, Pack128& y) {
x.x = MULTI<FUNC, T>()(x.x, y.x);
x.y = MULTI<FUNC, T>()(x.y, y.y);
}
};
// Loads both components of the Pack128 at p into v
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
v.x = p->x;
v.y = p->y;
}
// Stores both components of v to the Pack128 at p
inline __device__ void Store128(Pack128* p, Pack128& v) {
p->x = v.x;
p->y = v.y;
}
// Element-wise reduce / copy from up to MAXSRCS sources to up to MAXDSTS destinations.
// Processes element indices [offset, offset + N); thread tid handles
// offset + tid, offset + tid + nthreads, ...
// The first MINSRCS sources / MINDSTS destinations are always used (unrolled
// loops); the remaining ones are used only up to nsrcs / ndsts.
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
const int offset, const int N) {
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
T val = vFetch(srcs[0]+idx);
#pragma unroll
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
#pragma unroll
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
}
}
// Number of threads treated as one warp by the 128-bit copy routines
#define WARP_SIZE 64
// Reduce / copy using Pack128 loads and stores.
// - w, nw, t  : warp index, number of warps, thread index within the warp
// - s, d      : source / destination element pointers (up to MAXSRCS / MAXDSTS)
// - elemOffset: element offset applied to every pointer before packing
// - Npack     : number of Pack128 values to process
// Per loop iteration a thread handles UNROLL Pack128 values spaced WARP_SIZE
// apart, then advances by nw * UNROLL * WARP_SIZE.
// Note that packed pointers are computed for all MAXSRCS / MAXDSTS slots, so
// every s[i] / d[i] is read even when i >= nsrcs / ndsts (but not dereferenced).
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
const Pack128* srcs[MAXSRCS];
for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
Pack128* dsts[MAXDSTS];
for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
while (offset < Npack) {
Pack128 vals[UNROLL];
// Load and reduce
for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
for (int i=1; i<MINSRCS; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
// Store
for (int i = 0; i < MINDSTS; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
// Advance every pointer to this thread's next group of packs
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
offset += inc;
}
}
// Returns the address of ptr modulo alignof(Pack128), i.e. its misalignment in bytes
template <typename T>
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
// Try to limit consecutive load/stores to 8.
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
// Reduces / copies N elements from the sources to the destinations in stages:
//   1.  preamble: element-wise path up to the first 128-bit aligned position
//       (or for all N elements when the pointers do not share one alignment)
//   2a. bulk: 128-bit path with AUTOUNROLL-way unrolling
//   2b. remaining whole Pack128 values, 128-bit path without unrolling
//   2c. tail: element-wise path for what is left
// NOTE(review): align is a byte remainder (see ptrAlign128) while Npreamble is
// used as an element count by ReduceCopyMulti - confirm this is intended for
// element types larger than one byte.
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
int N) {
int Nrem = N;
if (Nrem <= 0) return;
// alignDiff becomes non-zero if any used pointer's misalignment differs from srcs[0]'s
int alignDiff = 0;
int align = ptrAlign128(srcs[0]);
#pragma unroll
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
#pragma unroll
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
int Npreamble = alignDiff ? Nrem :
N < alignof(Pack128) ? N :
(alignof(Pack128) - align) % alignof(Pack128);
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
if (Npreamble) {
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
Nrem -= Npreamble;
if (Nrem == 0) return;
}
int offset = Npreamble;
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
int w = tid / WARP_SIZE; // Warp number
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
// Number of elements of type T held by one Pack128
const int packFactor = sizeof(Pack128) / sizeof(T);
// stage 2a: main loop
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
* (AUTOUNROLL * WARP_SIZE); // round down
int Nelem2a = Npack2a * packFactor;
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
Nrem -= Nelem2a;
if (Nrem == 0) return;
offset += Nelem2a;
// stage 2b: slightly less optimized for section when we don't have full
// unrolling
int Npack2b = Nrem / packFactor;
int Nelem2b = Npack2b * packFactor;
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
Nrem -= Nelem2b;
if (Nrem == 0) return;
offset += Nelem2b;
// stage 2c: tail
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
//
// Copies N elements of type T from src to dest (one source, one destination).
template<int UNROLL, int THREADS, typename T>
__device__ void Copy(volatile T * __restrict__ const dest,
                     const volatile T * __restrict__ const src, const int N) {
  // Pointer tables sized for the MAXSRCS = MAXDSTS = 2 instantiation below;
  // only slot 0 of each table is assigned
  const T* inputPtrs[2];
  T*       outputPtrs[2];
  outputPtrs[0] = (T*)dest;
  inputPtrs[0]  = (const T*)src;

  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(
      threadIdx.x, THREADS,
      /*nsrcs=*/1, inputPtrs,
      /*ndsts=*/1, outputPtrs,
      N);
}
// Copies N elements of type T from src to both dest0 and dest1
// (one source, two destinations).
template<int UNROLL, int THREADS, typename T>
__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
                           volatile T * __restrict__ const dest1,
                           const volatile T * __restrict__ const src, const int N) {
  // Only slot 0 of the source table is assigned; both destination slots are used
  const T* inputPtrs[2];
  T*       outputPtrs[2];
  outputPtrs[0] = (T*)dest0;
  outputPtrs[1] = (T*)dest1;
  inputPtrs[0]  = (const T*)src;

  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(
      threadIdx.x, THREADS,
      /*nsrcs=*/1, inputPtrs,
      /*ndsts=*/2, outputPtrs,
      N);
}
// Two sources, one destination over N elements through ReduceOrCopyMulti.
// NOTE(review): the combining functor passed here is FuncPassA<T>, whose
// definition is not visible from this part of the file. If it simply forwards
// its first operand, src1 never contributes to dest — confirm that is intended.
template<int UNROLL, int THREADS, typename T>
__device__ void Reduce(volatile T * __restrict__ const dest,
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1, const int N) {
  // Both source slots are used; only slot 0 of the destination table is set.
  T* outputs[2];
  const T* inputs[2];
  outputs[0] = (T*)dest;
  inputs[0] = (const T*)src0;
  inputs[1] = (const T*)src1;
  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(
      threadIdx.x, THREADS, 2, inputs, 1, outputs, N);
}
// Two sources, two destinations over N elements through ReduceOrCopyMulti.
// NOTE(review): the combining functor passed here is FuncPassA<T>, whose
// definition is not visible from this part of the file. If it simply forwards
// its first operand, src1 never contributes to the outputs — confirm.
template<int UNROLL, int THREADS, typename T>
__device__ void ReduceCopy(volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1,
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1, const int N) {
  // Every slot of both two-entry tables is populated.
  T* outputs[2];
  const T* inputs[2];
  outputs[0] = (T*)dest0;
  outputs[1] = (T*)dest1;
  inputs[0] = (const T*)src0;
  inputs[1] = (const T*)src1;
  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(
      threadIdx.x, THREADS, 2, inputs, 2, outputs, N);
}
#endif // COPY_KERNEL_H_
+4
Ver ficheiro
@@ -0,0 +1,4 @@
# Each line consists of L (# of links) followed by L white-space-separated triples of (srcGpu, dstGpu, #blocks)
# Single link between GPUs 0 and 1
1 0 1 1
+16
Ver ficheiro
@@ -0,0 +1,16 @@
# Build rules for the rccl_prim_test benchmark.
#
# HIP_PATH may be overridden by the caller. By default /opt/rocm/hip is used
# when it exists; otherwise a path relative to this directory is tried.
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc

EXE=rccl_prim_test
CXXFLAGS = -O3 -g -I/opt/rocm/rocrand/include

# all and clean name actions, not files; without .PHONY a stray file called
# "all" or "clean" would silently disable them.
.PHONY: all clean

all: $(EXE)

# The source includes copy_kernel.h, so the binary must be rebuilt when the
# header changes. Only the first prerequisite ($<, the .cpp) is compiled.
$(EXE): rccl_prim_test.cpp copy_kernel.h
	$(HIPCC) $(CXXFLAGS) $< -o $@

clean:
	rm -f *.o $(EXE)
+310
Ver ficheiro
@@ -0,0 +1,310 @@
/*************************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COPY_KERNEL_H_
#define COPY_KERNEL_H_
#include <cstdio>
#include <cstdint>
// min overload for a mixed (int, ssize_t) argument pair; the result is
// returned as int.
static __device__ int min(int a, ssize_t b) {
  if (a < b) return a;
  return b;
}
// 64-bit word handled as one packed unit by the MULTI functors.
typedef uint64_t PackType;

// Generic packed combiner: hands the two 64-bit words to FUNC as they are.
// T is not referenced by this primary template.
template<class FUNC, typename T>
struct MULTI {
  __device__ PackType operator()(const PackType x, const PackType y) const
  {
    FUNC combine;
    return combine(x, y);
  }
};
// Rounds x up to a multiple of a using a bit mask of (a - 1); the mask form
// is only meaningful when a is a power of two.
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))

// Returns ptr rounded up to the next address that is a multiple of align.
template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
  const size_t address = reinterpret_cast<size_t>(ptr);
  const size_t rounded = ALIGNUP(address, align);
  return reinterpret_cast<volatile T*>(rounded);
}
// Reads one element through a volatile-qualified pointer.
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
  const T loaded = *ptr;
  return loaded;
}

// Writes one element through a volatile-qualified pointer.
template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
  ptr[0] = val;
}
// Element-wise strided loop: the calling thread handles indices tid,
// tid + nthreads, tid + 2*nthreads, ... below N. Each element is read from
// src0, combined with the matching src1 element through FUNC when TWO_INPUTS
// is set, stored to dest0, and additionally stored to dest1 when TWO_OUTPUTS
// is set.
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
__attribute__((noinline))
__device__ inline void ReduceCopy(
    const int tid, const int nthreads,
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1,
    volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, const int N) {
  int pos = tid;
  while (pos < N) {
    T value = vFetch(src0 + pos);
    if (TWO_INPUTS) {
      value = FUNC()(value, vFetch(src1 + pos));
    }
    vStore(dest0 + pos, value);
    if (TWO_OUTPUTS) {
      vStore(dest1 + pos, value);
    }
    pos += nthreads;
  }
}
// Binary functor that returns its first operand unchanged; the second operand
// is accepted but not used.
template<typename T>
struct FuncPassA {
  __device__ T operator()(const T x, const T y) const {
    (void)y;
    return x;
  }
};
// Binary functor that returns the sum of its two operands.
template<typename T>
struct FuncSum {
  __device__ T operator()(const T x, const T y) const {
    const T total = x + y;
    return total;
  }
};
// float specialization: a 64-bit PackType is viewed as two floats and FUNC is
// applied to each float lane independently.
template<class FUNC>
struct MULTI<FUNC, float> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  // Overlays the packed word with its two float lanes.
  union converter {
    PackType storage;
    struct {
      float a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter lhs, rhs, out;
    lhs.storage = x;
    rhs.storage = y;
    // Lane a first, then lane b; both lanes of `out` are written before the
    // packed word is read back.
    out.a = FUNC()(lhs.a, rhs.a);
    out.b = FUNC()(lhs.b, rhs.b);
    return out.storage;
  }
};
// 128-bit pack made of two 64-bit halves (.x and .y).
typedef ulong2 Pack128;

// Combines two 128-bit packs in place: each 64-bit half of x is replaced by
// MULTI<FUNC, T> applied to the matching halves of x and y. y is only read.
template<class FUNC, typename T>
struct MULTI128 {
  __device__ void operator()(Pack128& x, Pack128& y) {
    MULTI<FUNC, T> combine64;
    x.x = combine64(x.x, y.x);
    x.y = combine64(x.y, y.y);
  }
};
// Loads one 128-bit pack from p into v, one 64-bit half at a time.
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
  v.x = (*p).x;
  v.y = (*p).y;
}

// Stores the 128-bit pack v to p, one 64-bit half at a time.
inline __device__ void Store128(Pack128* p, Pack128& v) {
  (*p).x = v.x;
  (*p).y = v.y;
}
// Element-wise multi-source / multi-destination loop.
// The calling thread handles element indices offset + tid,
// offset + tid + nthreads, ... below offset + N. For each index the value of
// srcs[0] is folded with the other active sources through FUNC and the result
// is written to every active destination. The first MINSRCS sources and
// MINDSTS destinations are always used; further entries are used up to
// nsrcs / ndsts, capped by MAXSRCS / MAXDSTS.
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
    const int offset, const int N) {
  const int end = offset + N;
  for (int pos = offset + tid; pos < end; pos += nthreads) {
    T acc = vFetch(srcs[0] + pos);
    #pragma unroll
    for (int s = 1; s < MINSRCS; s++) acc = FUNC()(acc, vFetch(srcs[s] + pos));
    #pragma unroll 1
    for (int s = MINSRCS; s < MAXSRCS && s < nsrcs; s++) acc = FUNC()(acc, vFetch(srcs[s] + pos));

    #pragma unroll
    for (int d = 0; d < MINDSTS; d++) vStore(dsts[d] + pos, acc);
    #pragma unroll 1
    for (int d = MINDSTS; d < MAXDSTS && d < ndsts; d++) vStore(dsts[d] + pos, acc);
  }
}
// Number of threads treated as one warp by the 128-bit path below.
#define WARP_SIZE 64
// 128-bit multi-source / multi-destination loop.
//   w, nw      - index of the calling warp and number of participating warps
//   t          - index of the calling thread inside its warp
//   nsrcs / s  - number of active sources and their element pointers
//   ndsts / d  - number of active destinations and their element pointers
//   elemOffset - offset, in elements of T, applied to every s[i] / d[i]
//   Npack      - number of Pack128 units to process from that offset
// Each warp handles chunks of UNROLL * WARP_SIZE packs and advances by
// nw * UNROLL * WARP_SIZE packs per iteration.
// NOTE(review): the setup and advance loops touch all MAXSRCS / MAXDSTS slots
// of s and d, including slots at or beyond nsrcs / ndsts, so callers should
// leave no slot uninitialized.
// NOTE(review): with nw == 0 the increment is 0 and the while loop would not
// advance — presumably callers always pass at least WARP_SIZE threads; confirm.
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
const int elemOffset, const int Npack) {
// Packs skipped per iteration, and this thread's first pack index.
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
// Reinterpret the element pointers as 128-bit pack pointers positioned at this thread's first pack.
const Pack128* srcs[MAXSRCS];
for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
Pack128* dsts[MAXDSTS];
for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
// The guard only tests the first pack of each UNROLL group; the other packs of
// the group sit at offset + u*WARP_SIZE and are not range-checked here.
while (offset < Npack) {
Pack128 vals[UNROLL];
// Load and reduce
for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
// Sources that are always present (compile-time count MINSRCS).
for (int i=1; i<MINSRCS; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
// Optional sources, bounded by the run-time nsrcs.
#pragma unroll 1
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
// Store
for (int i = 0; i < MINDSTS; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
// Optional destinations, bounded by the run-time ndsts.
#pragma unroll 1
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
// Move every pointer and the pack index to this warp's next chunk.
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
offset += inc;
}
}
// Byte offset of ptr past the previous Pack128-aligned address
// (0 when ptr is already aligned to alignof(Pack128)).
template <typename T>
__device__ int ptrAlign128(T* ptr) {
  const uint64_t address = (uint64_t)ptr;
  return address % alignof(Pack128);
}
// Try to limit consecutive load/stores to 8.
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))

// Reduces (or copies) N elements of T from the active sources into the active
// destinations, picking the widest access pattern the pointers allow.
//
//   tid, nthreads - index of the calling thread and number of cooperating
//                   threads; nthreads is expected to be at least WARP_SIZE
//                   because the 128-bit stages derive the warp count from it
//   nsrcs, srcs   - number of active sources (MINSRCS..MAXSRCS) and pointers
//   ndsts, dsts   - number of active destinations (MINDSTS..MAXDSTS) and pointers
//   N             - number of elements; N <= 0 is a no-op
//
// Stages:
//   1.  element-wise preamble up to the first 128-bit boundary (or the whole
//       range when the pointers cannot all be aligned together)
//   2a. 128-bit accesses, AUTOUNROLL packs per thread per iteration
//   2b. 128-bit accesses, one pack per thread per iteration
//   2c. element-wise tail
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
    int N) {
  int Nrem = N;
  if (Nrem <= 0) return;

  // alignDiff becomes non-zero when any active pointer has a different byte
  // offset within a 128-bit pack than srcs[0].
  int alignDiff = 0;
  int align = ptrAlign128(srcs[0]);
  #pragma unroll
  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
  #pragma unroll
  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));

  // Distance to the next 128-bit boundary, in BYTES.
  const int alignBytes = (int)alignof(Pack128);
  const int padBytes = (alignBytes - align) % alignBytes;
  // The boundary can only be reached with whole elements when that distance is
  // a multiple of sizeof(T); otherwise stay on the element-wise path.
  if (padBytes % (int)sizeof(T)) alignDiff |= 1;

  // The preamble length is an ELEMENT count, so the byte distance is divided
  // by sizeof(T). Using the byte count directly would leave the pointers
  // misaligned for any T wider than one byte.
  int Npreamble = alignDiff ? Nrem :
    N < alignBytes ? N :
    padBytes / (int)sizeof(T);

  // stage 1: preamble: handle any elements up to the point of everything coming
  // into alignment
  if (Npreamble) {
    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
    Nrem -= Npreamble;
    if (Nrem == 0) return;
  }
  int offset = Npreamble;

  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
  // assuming the pointers we have are all 128-bit alignable.
  int w = tid / WARP_SIZE;       // Warp number
  int nw = nthreads / WARP_SIZE; // Number of warps
  int t = tid % WARP_SIZE;       // Thread (inside the warp)

  const int packFactor = sizeof(Pack128) / sizeof(T);

  // stage 2a: main loop
  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
      * (AUTOUNROLL * WARP_SIZE); // round down
  int Nelem2a = Npack2a * packFactor;

  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);

  Nrem -= Nelem2a;
  if (Nrem == 0) return;
  offset += Nelem2a;

  // stage 2b: slightly less optimized for section when we don't have full
  // unrolling
  int Npack2b = Nrem / packFactor;
  int Nelem2b = Npack2b * packFactor;

  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);

  Nrem -= Nelem2b;
  if (Nrem == 0) return;
  offset += Nelem2b;

  // stage 2c: tail
  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
// Copies N elements from src to dest (one source, one destination) through
// ReduceOrCopyMulti with the pass-through functor FuncPassA<T>.
template<int UNROLL, int THREADS, typename T>
__device__ void Copy(volatile T * __restrict__ const dest,
    const volatile T * __restrict__ const src, const int N) {
  const T* srcs[2];
  T* dsts[2];
  srcs[0] = (const T*)src;
  dsts[0] = (T*)dest;
  // Slot 1 is inactive (nsrcs = ndsts = 1), but ReduceCopy128bMulti computes
  // pointers from every MAXSRCS / MAXDSTS slot. Alias slot 0 so that no
  // indeterminate pointer value is ever read.
  srcs[1] = srcs[0];
  dsts[1] = dsts[0];
  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
      1, srcs, 1, dsts, N);
}
// Copies N elements from src to both dest0 and dest1 (one source, two
// destinations) through ReduceOrCopyMulti with FuncPassA<T>.
template<int UNROLL, int THREADS, typename T>
__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1,
    const volatile T * __restrict__ const src, const int N) {
  const T* srcs[2];
  T* dsts[2];
  srcs[0] = (const T*)src;
  // Source slot 1 is inactive (nsrcs = 1), but ReduceCopy128bMulti computes a
  // pointer from every MAXSRCS slot. Alias slot 0 so that no indeterminate
  // pointer value is ever read.
  srcs[1] = srcs[0];
  dsts[0] = (T*)dest0;
  dsts[1] = (T*)dest1;
  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
      1, srcs, 2, dsts, N);
}
// Writes the element-wise sum of src0 and src1 to dest over N elements
// (two sources, one destination).
template<int UNROLL, int THREADS, typename T>
__device__ void Reduce(volatile T * __restrict__ const dest,
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1, const int N) {
  const T* srcs[2];
  T* dsts[2];
  srcs[0] = (const T*)src0;
  srcs[1] = (const T*)src1;
  dsts[0] = (T*)dest;
  // Destination slot 1 is inactive (ndsts = 1), but ReduceCopy128bMulti
  // computes a pointer from every MAXDSTS slot. Alias slot 0 so that no
  // indeterminate pointer value is ever read.
  dsts[1] = dsts[0];
  // FuncSum actually combines both inputs. FuncPassA returns its first
  // operand only, which made this a plain copy of src0 and left the src1
  // loads without any effect on the result.
  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
      2, srcs, 1, dsts, N);
}
// Writes the element-wise sum of src0 and src1 to both dest0 and dest1 over
// N elements (two sources, two destinations).
template<int UNROLL, int THREADS, typename T>
__device__ void ReduceCopy(volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1,
    const volatile T * __restrict__ const src0,
    const volatile T * __restrict__ const src1, const int N) {
  const T* srcs[2];
  T* dsts[2];
  srcs[0] = (const T*)src0;
  srcs[1] = (const T*)src1;
  dsts[0] = (T*)dest0;
  dsts[1] = (T*)dest1;
  // FuncSum actually combines both inputs. FuncPassA returns its first
  // operand only, which made this a double copy of src0 and left the src1
  // loads without any effect on the result.
  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
      2, srcs, 2, dsts, N);
}
#endif // COPY_KERNEL_H_
+454
Ver ficheiro
@@ -0,0 +1,454 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file rccl_prim_test.cpp
*
* test performance of individual rccl primitives
*/
#include <algorithm> //std::find
#include <chrono> //std::chrono::high_resolution_clock
#include <cstdint> //uint32_t, uint64_t
#include <cstdio> //fprintf
#include <cstdlib> //atol, exit, malloc, free
#include <cstring>
#include <iostream> //cerr
#include <string> //std::string
#include <unistd.h> //usleep
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include "copy_kernel.h"
// Capacity of the fixed-size per-GPU and per-workgroup tables in this file.
#define MAX_GPU 8
#define MAX_WORKGROUPS 8
// Threads per block for every kernel launch; also the THREADS template
// argument handed to the copy/reduce primitives.
#define THREADS 256
// UNROLL template argument used for each primitive.
#define COPY_UNROLL 4
#define REDUCE_UNROLL 2
#define DOUBLECOPY_UNROLL 2
#define REDUCECOPY_UNROLL 2
// Per-GPU kernel arguments: one pointer set per workgroup plus sync state.
// main() fills a host copy of this struct and copies it to the device.
struct transfer_data_t {
float *dest0[MAX_WORKGROUPS]; // second half of the next GPU's fine-grained buffer (remote)
float *src0[MAX_WORKGROUPS]; // start of this GPU's fine-grained buffer (local)
float *dest1[MAX_WORKGROUPS]; // second half of this GPU's coarse-grained buffer (local)
float *src1[MAX_WORKGROUPS]; // start of this GPU's coarse-grained buffer (local)
int N; // number of floats processed per workgroup
int gpu; // index of the GPU that owns this struct
int ngpu; // number of participating GPUs
uint64_t *remOpCount; // device-visible pointer to the host-mapped per-GPU operation counters
};
// Per-GPU counters accumulated atomically by thread 0 of every block.
struct profiling_data_t {
uint64_t write_cycles; // sum of clock64() deltas measured around the primitive
uint64_t bytes_transferred; // sum of N * sizeof(float) over all blocks and launches
};
// Sequentially consistent atomic load from / store to the given address.
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
// Benchmarked primitives. The numeric values index flagSyncKerns (as op*2 + sync)
// and the operation name tables in main(), so the order must stay in step with them.
enum Ops {
OP_COPY,
OP_LOCALCOPY,
OP_DOUBLECOPY,
OP_REDUCE,
OP_REDUCECOPY,
NUM_OPS, // number of operations; not an operation itself
};
// Benchmark kernel for one primitive.
//   op   - which primitive to run (a value of enum Ops)
//   sync - non-zero: wait until every GPU has published opCount before starting
// Block `b` works on the b-th pointer set of transfer_data. Thread 0 of each
// block measures the cycles spent in the primitive and adds them, together
// with the byte count, to profiling_data.
template<int op, int sync>
__global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct profiling_data_t* profiling_data, uint64_t opCount) {
  const size_t lane = threadIdx.x;
  uint64_t start_cycles, end_cycles;
  const int block = blockIdx.x;
  const int count = transfer_data->N;
  const bool leader = (lane == 0);

  // signal self ready and wait until all GPUs are ready
  if (leader) {
    if (block == 0)
      STORE(&transfer_data->remOpCount[transfer_data->gpu], opCount);
    if (sync) {
      for (int peer = 0; peer < transfer_data->ngpu; peer++) {
        while (LOAD(&transfer_data->remOpCount[peer]) < opCount) {};
      }
    }
  }
  __syncthreads();

  if (leader) {
    start_cycles = clock64();
  }

  // op is a template argument, so only one case survives in each instantiation.
  switch (op) {
    case OP_COPY:
      Copy<COPY_UNROLL, THREADS, float>(transfer_data->dest0[block], transfer_data->src0[block], count);
      break;
    case OP_LOCALCOPY:
      Copy<COPY_UNROLL, THREADS, float>(transfer_data->dest1[block], transfer_data->src0[block], count);
      break;
    case OP_DOUBLECOPY:
      DoubleCopy<DOUBLECOPY_UNROLL, THREADS, float>(transfer_data->dest0[block], transfer_data->dest1[block], transfer_data->src0[block], count);
      break;
    case OP_REDUCE:
      Reduce<REDUCE_UNROLL, THREADS, float>(transfer_data->dest0[block], transfer_data->src0[block], transfer_data->src1[block], count);
      break;
    case OP_REDUCECOPY:
      ReduceCopy<REDUCECOPY_UNROLL, THREADS, float>(transfer_data->dest0[block], transfer_data->dest1[block], transfer_data->src0[block], transfer_data->src1[block], count);
      break;
    default:
      break;
  }

  __syncthreads();
  if (leader) {
    end_cycles = clock64();
    __atomic_fetch_add(&(profiling_data->write_cycles), end_cycles - start_cycles, __ATOMIC_SEQ_CST);
    __atomic_fetch_add(&(profiling_data->bytes_transferred), count * sizeof(float), __ATOMIC_SEQ_CST);
  }
}
// Signature shared by every flag_sync_kernel instantiation.
typedef void(*flag_sync_kernel_t)(struct transfer_data_t* transfer_data, struct profiling_data_t* profiling_data, uint64_t opCount);
// Kernel lookup table: entry op*2 + sync holds flag_sync_kernel<op, sync>,
// with op in enum Ops order and sync being 0 or 1.
static flag_sync_kernel_t const flagSyncKerns[NUM_OPS*2] = {
flag_sync_kernel<OP_COPY, 0>,
flag_sync_kernel<OP_COPY, 1>,
flag_sync_kernel<OP_LOCALCOPY, 0>,
flag_sync_kernel<OP_LOCALCOPY, 1>,
flag_sync_kernel<OP_DOUBLECOPY, 0>,
flag_sync_kernel<OP_DOUBLECOPY, 1>,
flag_sync_kernel<OP_REDUCE, 0>,
flag_sync_kernel<OP_REDUCE, 1>,
flag_sync_kernel<OP_REDUCECOPY, 0>,
flag_sync_kernel<OP_REDUCECOPY, 1>,
};
// Fills data[0..N) with a deterministic, finite pattern that depends on the
// element index and on `gpu`.
//   data - buffer of at least N floats
//   N    - number of elements to write
//   gpu  - seed that shifts the pattern
// Each thread starts at its global index and advances by the total number of
// launched threads, so any grid size covers the whole buffer.
__global__ void initTestDataKernel(float* data, const size_t N, const int gpu) {
  // size_t indices: an int index would overflow (and be compared with mixed
  // signedness against N) for buffers beyond 2^31 elements.
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; idx < N; idx += stride) {
    // "+ 1" keeps the denominator non-zero; without it gpu == 0 combined with
    // idx % 77 == 0 divided by zero and seeded the buffer with infinities.
    data[idx] = 1.0f/(float)(gpu*17 + (int)(idx%77) + 1);
  }
}
// Evaluates a HIP call once; on any result other than hipSuccess it prints the
// numeric error code with the source line and file to std::cerr and terminates
// the process with exit(-1).
#define HIPCHECK(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
static void setupPeers(uint32_t *info) {
int deviceCnt, dev;
HIPCHECK(hipGetDeviceCount(&deviceCnt));
HIPCHECK(hipGetDevice(&dev));
//! If gpus are not peer enabled, enable them
for (int i = 0; i < deviceCnt; i++) {
HIPCHECK(hipSetDevice(i));
for (int j = 0; j < deviceCnt; j++) {
if (i != j) {
int p2p;
HIPCHECK(hipDeviceCanAccessPeer(&p2p, i, j));
if (!p2p) {
printf("Cannot enable peer access between device %d and %d. You may use HIP_VISIBLE_DEVICES to limit GPUs.\n",
i, j);
exit(-1);
}
HIPCHECK(hipDeviceEnablePeerAccess(j, 0));
uint32_t linktype;
HIPCHECK(hipExtGetLinkTypeAndHopCount(i, j, &linktype, &info[i*deviceCnt+j]));
}
else
info[i*deviceCnt+j] = 0;
}
}
HIPCHECK(hipSetDevice(dev));
}
// Prints "Ring <id>: " followed by the first deviceCnt entries of ring on one line.
static void printRing(int id, int *ring, int deviceCnt) {
  printf("Ring %d: ", id);
  int pos = 0;
  while (pos < deviceCnt) {
    printf("%1d ", ring[pos]);
    ++pos;
  }
  printf("\n");
}
// Builds a ring order over all devices with a greedy nearest-neighbour walk.
//   info      - row-major deviceCnt x deviceCnt hop-count table
//   ring      - output: ring[0] is device 0, each following entry is the
//               not-yet-visited device with the fewest hops from the previous
//               one (lowest index wins ties)
//   deviceCnt - number of devices; must not exceed 64, the width of the
//               visited bit mask
static void findConnect(uint32_t *info, int *ring, int deviceCnt) {
  unsigned long long visited = 0;   // bit j set <=> device j is already in the ring
  int curr = 0;
  for (int n = 0; n < deviceCnt; n++) {
    visited |= (1ull << curr);
    ring[n] = curr;
    // Nothing left to pick after the last slot; stopping here avoids using a
    // "best" value that was never assigned.
    if (n + 1 == deviceCnt) break;

    int best = -1;
    int firstFree = -1;
    uint32_t hops = 99;
    for (int j = 0; j < deviceCnt; j++) {
      if (visited & (1ull << j)) continue;
      if (firstFree < 0) firstFree = j;
      if (info[curr*deviceCnt+j] < hops) {
        best = j;
        hops = info[curr*deviceCnt+j];
      }
    }
    // If every remaining device is 99 or more hops away, still make progress
    // with the first unvisited one instead of leaving the choice undefined.
    curr = (best >= 0) ? best : firstFree;
  }
}
// Returns the entry that follows `gpu` in the ring, wrapping around at the end.
// When `gpu` does not occur in the first deviceCnt entries, the search index
// ends at deviceCnt and the entry at (deviceCnt + 1) % deviceCnt is returned.
static int findNextGpu(int *ring, int gpu, int deviceCnt) {
  int pos = 0;
  while (pos < deviceCnt && ring[pos] != gpu)
    ++pos;
  const int next = (pos + 1) % deviceCnt;
  return ring[next];
}
// Prints the hop-count matrix, then builds and prints two rings:
//   ring_0 - order chosen by findConnect
//   ring_1 - entry 0 is device 0, the remaining entries are ring_0 walked
//            backwards
// info is the row-major deviceCnt x deviceCnt hop-count table.
static void setupRings(uint32_t *info, int *ring_0, int *ring_1) {
  int deviceCnt;
  HIPCHECK(hipGetDeviceCount(&deviceCnt));

  printf("Connection matrix:\n");
  for (int row = 0; row < deviceCnt; row++) {
    const uint32_t *hops = info + row*deviceCnt;
    for (int col = 0; col < deviceCnt; col++)
      printf("%2d ", hops[col]);
    printf("\n");
  }

  findConnect(info, ring_0, deviceCnt);
  printRing(0, ring_0, deviceCnt);

  ring_1[0] =0;
  for (int k = deviceCnt - 1; k >= 1; k--)
    ring_1[k] = ring_0[deviceCnt-k];
  printRing(1, ring_1, deviceCnt);
}
// Returns the argument that directly follows the first occurrence of `option`
// in [begin, end), or 0 when the option is absent or is the last argument.
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
  for (char** arg = begin; arg != end; ++arg) {
    if (*arg == option) {
      char** value = arg + 1;
      if (value != end)
        return *value;
      return 0;
    }
  }
  return 0;
}
// True when some argument in [begin, end) equals `option` exactly.
bool cmdOptionExists(char** begin, char** end, const std::string& option) {
  for (char** arg = begin; arg != end; ++arg) {
    if (*arg == option)
      return true;
  }
  return false;
}
static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
int main(int argc,char* argv[])
{
if (cmdOptionExists(argv, argv + argc, "-h")) {
printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|reduce|reducecopy|all -i iterations -n bytes -s 0|1\n");
exit(0);
}
int workgroups = 1;
char *wg = getCmdOption(argv, argv + argc, "-w");
if (wg)
workgroups = atol(wg);
printf("Benchmarking using %d workgroups\n", workgroups);
int iters = 10;
char *it = getCmdOption(argv, argv + argc, "-i");
if (it)
iters = atol(it);
printf("Benchmarking using %d iterations\n", iters);
uint64_t nBytes = 2097152;
char *nb = getCmdOption(argv, argv + argc, "-n");
if (nb)
nBytes = atol(nb);
printf("Benchmarking using %ld bytes\n", nBytes);
uint64_t N = nBytes/sizeof(float);
int sync = 0;
char *s = getCmdOption(argv, argv + argc, "-s");
if (s)
sync = atol(s);
if (sync) printf("Sync all GPUs before operation\n");
const char *ops[] = {"copy", "localcopy", "doublecopy", "reduce", "reducecopy", "all"};
char *prim = getCmdOption(argv, argv + argc, "-p");
int op = 5, begin_op, end_op;
if (prim) {
for (op = 0; op < sizeof(ops); op++)
if (!strcmp((const char *)prim, ops[op]))
break;
}
if (op < NUM_OPS ) {
begin_op = op;
end_op = op + 1;
} else {
begin_op = 0;
end_op = NUM_OPS;
printf("Benchmarking all ops\n");
}
uint32_t connection_info[MAX_GPU*MAX_GPU];
// Enable peer access
setupPeers(connection_info);
// clockwise and counter clockwise rings
int ring_0[MAX_GPU] = {-1, -1, -1, -1};
int ring_1[MAX_GPU] = {-1, -1, -1, -1};
setupRings(connection_info, ring_0, ring_1);
// data buffers
float *buff[MAX_GPU*MAX_WORKGROUPS], *buff_coarse[MAX_GPU*MAX_WORKGROUPS];
struct transfer_data_t h_transfer_data[MAX_GPU], *transfer_data[MAX_GPU];
struct profiling_data_t *profiling_data[MAX_GPU], *d_profiling_data[MAX_GPU];
hipStream_t stream[MAX_GPU];
int nGpu = 1;
HIPCHECK(hipGetDeviceCount(&nGpu));
uint64_t *remOpCount, *d_remOpCount;
HIPCHECK(hipHostMalloc((void**)&remOpCount, sizeof(uint64_t)*MAX_GPU, hipHostMallocMapped));
HIPCHECK(hipHostGetDevicePointer((void**)&d_remOpCount, (void*)remOpCount, 0));
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
hipDeviceProp_t prop;
HIPCHECK(hipGetDeviceProperties(&prop, i));
printf("# device %d [0x%02x] %s\n",
i, prop.pciBusID, prop.name);
//create stream
HIPCHECK(hipStreamCreate(&stream[i]));
profiling_data[i] = (struct profiling_data_t *)malloc(sizeof(struct profiling_data_t));
HIPCHECK(hipMalloc((void**) &d_profiling_data[i], sizeof(struct profiling_data_t)));
HIPCHECK(hipExtMallocWithFlags((void**) &transfer_data[i], sizeof(struct transfer_data_t), hipDeviceMallocFinegrained));
for (int j = 0; j < workgroups; j++) {
HIPCHECK(hipExtMallocWithFlags((void**) &buff[i*MAX_WORKGROUPS+j], 2*N*sizeof(float), hipDeviceMallocFinegrained));
HIPCHECK(hipMalloc((void**) &buff_coarse[i*MAX_WORKGROUPS+j], 2*N*sizeof(float)));
//randomize test data
hipLaunchKernelGGL(initTestDataKernel,
/*grid dim x,y,z*/ dim3(32, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ buff[i*MAX_WORKGROUPS+j], 2*N, 0);
hipLaunchKernelGGL(initTestDataKernel,
/*grid dim x,y,z*/ dim3(32, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ buff_coarse[i*MAX_WORKGROUPS+j], 2*N, 0);
}
}
for (int i = 0; i < nGpu; i ++) {
for (int j = 0; j < workgroups; j++) {
int next_gpu;
if (j%2)
next_gpu = findNextGpu(ring_1, i, nGpu);
else
next_gpu = findNextGpu(ring_0, i, nGpu);
//printf("GPU %d Ring %d -> Next GPU %d\n", i, j, next_gpu);
h_transfer_data[i].dest0[j] = buff[next_gpu*MAX_WORKGROUPS+j] + N;
h_transfer_data[i].dest1[j] = buff_coarse[i*MAX_WORKGROUPS+j] + N;
h_transfer_data[i].src0[j] = buff[i*MAX_WORKGROUPS+j];
h_transfer_data[i].src1[j] = buff_coarse[i*MAX_WORKGROUPS+j];
}
h_transfer_data[i].N = N;
h_transfer_data[i].gpu = i;
h_transfer_data[i].ngpu = nGpu;
h_transfer_data[i].remOpCount = d_remOpCount;
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpyAsync(transfer_data[i], &h_transfer_data[i],
sizeof(struct transfer_data_t), hipMemcpyHostToDevice,
stream[i]));
HIPCHECK(hipStreamSynchronize(stream[i]));
}
uint64_t opCount = 0;
for (int op = begin_op; op < end_op; op ++) {
const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "Reduce", "ReduceCopy"};
printf("[Testing %s]: \n", OpsName[op]);
// 2 warm up cycles
for (int i = 0; i < 2; i ++) {
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
//launch the kernel
hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
/*grid dim x,y,z*/ dim3(workgroups, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ transfer_data[i], d_profiling_data[i], opCount);
}
opCount++;
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipStreamSynchronize(stream[i]));
HIPCHECK(hipMemset(d_profiling_data[i], 0, sizeof(struct profiling_data_t)));
}
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iters; i ++) {
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
//launch the kernel
hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
/*grid dim x,y,z*/ dim3(workgroups, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ transfer_data[i], d_profiling_data[i], opCount);
}
opCount++;
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipStreamSynchronize(stream[i]));
}
auto delta = std::chrono::high_resolution_clock::now() - start;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
std::cout<<"***GPU to GPU Transfer Profiling Data***"<<std::endl;
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipMemcpyAsync(profiling_data[i], d_profiling_data[i],
sizeof(struct profiling_data_t), hipMemcpyDeviceToHost,
stream[i]));
HIPCHECK(hipStreamSynchronize(stream[i]));
#define RTC_CLOCK_FREQ 2.7E07
int next_gpu = findNextGpu(ring_0, i, nGpu);
uint32_t linktype;
uint32_t hopcount;
HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu , &linktype, &hopcount));
double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ)/(double)workgroups;
fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n",
i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9));
}
std::cout<<"***Application Level Transfer Profiling Data***"<<std::endl;
double speed = (double)(profiling_data[0]->bytes_transferred) / (deltaSec*1.0E9);
printf("Transfered %lu bytes in %f s. Throughput %f GB/s\n", profiling_data[0]->bytes_transferred, deltaSec, speed);
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipStreamDestroy(stream[i]));
HIPCHECK(hipFree((void*) transfer_data[i]));
for (int j = 0; j < workgroups; j++) {
HIPCHECK(hipFree((void*) buff[i*MAX_WORKGROUPS+j]));
HIPCHECK(hipFree((void*) buff_coarse[i*MAX_WORKGROUPS+j]));
}
HIPCHECK(hipFree((void*) d_profiling_data[i]));
free(profiling_data[i]);
}
printf("opCount: ");
for (int i = 0; i < nGpu; i++)
printf("%ld ", remOpCount[i]);
printf("\n");
HIPCHECK(hipHostFree((void*)remOpCount));
}