@@ -55,8 +55,16 @@ else()
|
||||
endif()
|
||||
|
||||
# Setup VERSION
|
||||
set(VERSION_STRING "2.6.0")
|
||||
rocm_setup_version(VERSION ${VERSION_STRING})
|
||||
set(VERSION_STRING "2.6.0.")
|
||||
|
||||
# Check if BUILD_NUMBER is defined in a Jenkins environment
|
||||
if($ENV{BUILD_NUMBER})
|
||||
string(CONCAT BUILD_VERSION ${VERSION_STRING} $ENV{BUILD_NUMBER})
|
||||
else()
|
||||
string(CONCAT BUILD_VERSION ${VERSION_STRING} "0")
|
||||
endif()
|
||||
|
||||
rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION)
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH
|
||||
/opt/rocm
|
||||
@@ -79,27 +87,12 @@ include_directories(src/collectives)
|
||||
include_directories(src/collectives/device)
|
||||
|
||||
set(CU_SOURCES
|
||||
src/bootstrap.cu
|
||||
src/collectives/all_gather.cu
|
||||
src/collectives/all_reduce.cu
|
||||
src/collectives/broadcast.cu
|
||||
src/collectives/reduce.cu
|
||||
src/collectives/reduce_scatter.cu
|
||||
src/collectives/device/functions.cu
|
||||
src/init.cu
|
||||
src/misc/enqueue.cu
|
||||
src/misc/group.cu
|
||||
src/misc/ibvwrap.cu
|
||||
src/misc/nvmlwrap_stub.cu
|
||||
src/misc/rings.cu
|
||||
src/misc/utils.cu
|
||||
src/ring.cu
|
||||
src/transport.cu
|
||||
src/transport/net.cu
|
||||
src/transport/net_ib.cu
|
||||
src/transport/net_socket.cu
|
||||
src/transport/p2p.cu
|
||||
src/transport/shm.cu)
|
||||
src/collectives/device/all_reduce.cu
|
||||
src/collectives/device/all_gather.cu
|
||||
src/collectives/device/reduce.cu
|
||||
src/collectives/device/broadcast.cu
|
||||
src/collectives/device/reduce_scatter.cu
|
||||
src/collectives/device/functions.cu)
|
||||
|
||||
set(CPP_SOURCES)
|
||||
foreach(filename ${CU_SOURCES})
|
||||
@@ -111,20 +104,34 @@ foreach(filename ${CU_SOURCES})
|
||||
list(APPEND CPP_SOURCES ${cpp_filename})
|
||||
endforeach(filename)
|
||||
|
||||
list(APPEND CPP_SOURCES src/collectives/device/all_gather_0.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_0.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_1.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_2.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/all_reduce_3.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/broadcast_0.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_0.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_1.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_2.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_3.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_0.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_1.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_2.cpp)
|
||||
list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_3.cpp)
|
||||
set(CC_SOURCES
|
||||
src/init.cc
|
||||
src/collectives/all_reduce.cc
|
||||
src/collectives/all_gather.cc
|
||||
src/collectives/reduce.cc
|
||||
src/collectives/broadcast.cc
|
||||
src/collectives/reduce_scatter.cc
|
||||
src/channel.cc
|
||||
src/misc/trees.cc
|
||||
src/misc/rings.cc
|
||||
src/misc/argcheck.cc
|
||||
src/misc/group.cc
|
||||
src/misc/utils.cc
|
||||
src/misc/ibvwrap.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/topo.cc
|
||||
src/transport/net.cc
|
||||
src/transport/net_ib.cc
|
||||
src/transport/net_socket.cc
|
||||
src/transport/p2p.cc
|
||||
src/transport/shm.cc
|
||||
src/transport.cc
|
||||
src/bootstrap.cc
|
||||
src/enqueue.cc)
|
||||
|
||||
foreach(filename ${CC_SOURCES})
|
||||
list(APPEND CPP_SOURCES ${filename})
|
||||
endforeach(filename)
|
||||
|
||||
add_library(rccl ${CPP_SOURCES})
|
||||
|
||||
@@ -132,18 +139,20 @@ if(TRACE)
|
||||
add_definitions(-DENABLE_TRACE)
|
||||
endif()
|
||||
|
||||
if(PROFILE)
|
||||
add_definitions(-DENABLE_PROFILING)
|
||||
endif()
|
||||
|
||||
target_link_libraries(rccl
|
||||
PRIVATE --amdgpu-target=gfx803
|
||||
PRIVATE --amdgpu-target=gfx900
|
||||
PRIVATE --amdgpu-target=gfx906
|
||||
PRIVATE --amdgpu-target=gfx908)
|
||||
PRIVATE --amdgpu-target=gfx906)
|
||||
|
||||
if("${HIP_COMPILER}" MATCHES "clang")
|
||||
target_compile_options(rccl
|
||||
PRIVATE --amdgpu-target=gfx803
|
||||
PRIVATE --amdgpu-target=gfx900
|
||||
PRIVATE --amdgpu-target=gfx906
|
||||
PRIVATE --amdgpu-target=gfx908
|
||||
PRIVATE -fgpu-rdc)
|
||||
target_link_libraries(rccl PRIVATE -fgpu-rdc)
|
||||
target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
|
||||
|
||||
поставляемый
+1
-1
@@ -80,7 +80,7 @@ rcclCI:
|
||||
sudo dpkg -i package/*.deb
|
||||
"""
|
||||
|
||||
|
||||
|
||||
//platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES
|
||||
# will be relative from the directory where doxygen is started.
|
||||
# This tag requires that the tag FULL_PATH_NAMES is set to YES.
|
||||
|
||||
STRIP_FROM_PATH =
|
||||
STRIP_FROM_PATH =
|
||||
|
||||
# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
|
||||
# path mentioned in the documentation of a class, which tells the reader which
|
||||
@@ -171,7 +171,7 @@ STRIP_FROM_PATH =
|
||||
# specify the list of include paths that are normally passed to the compiler
|
||||
# using the -I flag.
|
||||
|
||||
STRIP_FROM_INC_PATH =
|
||||
STRIP_FROM_INC_PATH =
|
||||
|
||||
# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
|
||||
# less readable) file names. This can be useful is your file systems doesn't
|
||||
@@ -238,13 +238,13 @@ TAB_SIZE = 4
|
||||
# "Side Effects:". You can put \n's in the value part of an alias to insert
|
||||
# newlines.
|
||||
|
||||
ALIASES =
|
||||
ALIASES =
|
||||
|
||||
# This tag can be used to specify a number of word-keyword mappings (TCL only).
|
||||
# A mapping has the form "name=value". For example adding "class=itcl::class"
|
||||
# will allow you to use the command class in the itcl::class meaning.
|
||||
|
||||
TCL_SUBST =
|
||||
TCL_SUBST =
|
||||
|
||||
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
|
||||
# only. Doxygen will then generate output that is more tailored for C. For
|
||||
@@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO
|
||||
# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
|
||||
# the files are not read by doxygen.
|
||||
|
||||
EXTENSION_MAPPING =
|
||||
EXTENSION_MAPPING =
|
||||
|
||||
# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
|
||||
# according to the Markdown format, which allows for more readable
|
||||
@@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES
|
||||
# sections, marked by \if <section_label> ... \endif and \cond <section_label>
|
||||
# ... \endcond blocks.
|
||||
|
||||
ENABLED_SECTIONS =
|
||||
ENABLED_SECTIONS =
|
||||
|
||||
# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
|
||||
# initial value of a variable or macro / define can have for it to appear in the
|
||||
@@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES
|
||||
# by doxygen. Whatever the program writes to standard output is used as the file
|
||||
# version. For an example see the documentation.
|
||||
|
||||
FILE_VERSION_FILTER =
|
||||
FILE_VERSION_FILTER =
|
||||
|
||||
# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
|
||||
# by doxygen. The layout file controls the global structure of the generated
|
||||
@@ -696,7 +696,7 @@ FILE_VERSION_FILTER =
|
||||
# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
|
||||
# tag is left empty.
|
||||
|
||||
LAYOUT_FILE =
|
||||
LAYOUT_FILE =
|
||||
|
||||
# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
|
||||
# the reference definitions. This must be a list of .bib files. The .bib
|
||||
@@ -706,7 +706,7 @@ LAYOUT_FILE =
|
||||
# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
|
||||
# search path. See also \cite for info how to create references.
|
||||
|
||||
CITE_BIB_FILES =
|
||||
CITE_BIB_FILES =
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to warning and progress messages
|
||||
@@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text"
|
||||
# messages should be written. If left blank the output is written to standard
|
||||
# error (stderr).
|
||||
|
||||
WARN_LOGFILE =
|
||||
WARN_LOGFILE =
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the input files
|
||||
@@ -858,7 +858,7 @@ RECURSIVE = NO
|
||||
# Note that relative paths are relative to the directory from which doxygen is
|
||||
# run.
|
||||
|
||||
EXCLUDE =
|
||||
EXCLUDE =
|
||||
|
||||
# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
|
||||
# directories that are symbolic links (a Unix file system feature) are excluded
|
||||
@@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO
|
||||
# Note that the wildcards are matched against the file with absolute path, so to
|
||||
# exclude all test directories for example use the pattern */test/*
|
||||
|
||||
EXCLUDE_PATTERNS =
|
||||
EXCLUDE_PATTERNS =
|
||||
|
||||
# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
|
||||
# (namespaces, classes, functions, etc.) that should be excluded from the
|
||||
@@ -885,13 +885,13 @@ EXCLUDE_PATTERNS =
|
||||
# Note that the wildcards are matched against the file with absolute path, so to
|
||||
# exclude all test directories use the pattern */test/*
|
||||
|
||||
EXCLUDE_SYMBOLS =
|
||||
EXCLUDE_SYMBOLS =
|
||||
|
||||
# The EXAMPLE_PATH tag can be used to specify one or more files or directories
|
||||
# that contain example code fragments that are included (see the \include
|
||||
# command).
|
||||
|
||||
EXAMPLE_PATH =
|
||||
EXAMPLE_PATH =
|
||||
|
||||
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
|
||||
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
|
||||
@@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO
|
||||
# that contain images that are to be included in the documentation (see the
|
||||
# \image command).
|
||||
|
||||
IMAGE_PATH =
|
||||
IMAGE_PATH =
|
||||
|
||||
# The INPUT_FILTER tag can be used to specify a program that doxygen should
|
||||
# invoke to filter for each input file. Doxygen will invoke the filter program
|
||||
@@ -928,7 +928,7 @@ IMAGE_PATH =
|
||||
# code is scanned, but not when the output code is generated. If lines are added
|
||||
# or removed, the anchors will not be placed correctly.
|
||||
|
||||
INPUT_FILTER =
|
||||
INPUT_FILTER =
|
||||
|
||||
# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
|
||||
# basis. Doxygen will compare the file name with each pattern and apply the
|
||||
@@ -937,7 +937,7 @@ INPUT_FILTER =
|
||||
# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
|
||||
# patterns match the file name, INPUT_FILTER is applied.
|
||||
|
||||
FILTER_PATTERNS =
|
||||
FILTER_PATTERNS =
|
||||
|
||||
# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
|
||||
# INPUT_FILTER) will also be used to filter the input files that are used for
|
||||
@@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO
|
||||
# *.ext= (so without naming a filter).
|
||||
# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
|
||||
|
||||
FILTER_SOURCE_PATTERNS =
|
||||
FILTER_SOURCE_PATTERNS =
|
||||
|
||||
# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
|
||||
# is part of the input, its contents will be placed on the main page
|
||||
@@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO
|
||||
# specified with INPUT and INCLUDE_PATH.
|
||||
# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
|
||||
|
||||
CLANG_OPTIONS =
|
||||
CLANG_OPTIONS =
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the alphabetical class index
|
||||
@@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5
|
||||
# while generating the index headers.
|
||||
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
|
||||
|
||||
IGNORE_PREFIX =
|
||||
IGNORE_PREFIX =
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the HTML output
|
||||
@@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION = .html
|
||||
# of the possible markers and block names see the documentation.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
HTML_HEADER =
|
||||
HTML_HEADER =
|
||||
|
||||
# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
|
||||
# generated HTML page. If the tag is left blank doxygen will generate a standard
|
||||
@@ -1144,7 +1144,7 @@ HTML_HEADER =
|
||||
# that doxygen normally uses.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
HTML_FOOTER =
|
||||
HTML_FOOTER =
|
||||
|
||||
# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
|
||||
# sheet that is used by each HTML page. It can be used to fine-tune the look of
|
||||
@@ -1156,7 +1156,7 @@ HTML_FOOTER =
|
||||
# obsolete.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
HTML_STYLESHEET =
|
||||
HTML_STYLESHEET =
|
||||
|
||||
# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
|
||||
# cascading style sheets that are included after the standard style sheets
|
||||
@@ -1169,7 +1169,7 @@ HTML_STYLESHEET =
|
||||
# list). For an example see the documentation.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
HTML_EXTRA_STYLESHEET =
|
||||
HTML_EXTRA_STYLESHEET =
|
||||
|
||||
# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
|
||||
# other source files which should be copied to the HTML output directory. Note
|
||||
@@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET =
|
||||
# files will be copied as-is; there are no commands or markers available.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
HTML_EXTRA_FILES =
|
||||
HTML_EXTRA_FILES =
|
||||
|
||||
# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
|
||||
# will adjust the colors in the style sheet and background images according to
|
||||
@@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP = NO
|
||||
# written to the html output directory.
|
||||
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
|
||||
|
||||
CHM_FILE =
|
||||
CHM_FILE =
|
||||
|
||||
# The HHC_LOCATION tag can be used to specify the location (absolute path
|
||||
# including file name) of the HTML help compiler (hhc.exe). If non-empty,
|
||||
@@ -1316,7 +1316,7 @@ CHM_FILE =
|
||||
# The file has to be specified with full path.
|
||||
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
|
||||
|
||||
HHC_LOCATION =
|
||||
HHC_LOCATION =
|
||||
|
||||
# The GENERATE_CHI flag controls if a separate .chi index file is generated
|
||||
# (YES) or that it should be included in the master .chm file (NO).
|
||||
@@ -1329,7 +1329,7 @@ GENERATE_CHI = NO
|
||||
# and project file content.
|
||||
# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
|
||||
|
||||
CHM_INDEX_ENCODING =
|
||||
CHM_INDEX_ENCODING =
|
||||
|
||||
# The BINARY_TOC flag controls whether a binary table of contents is generated
|
||||
# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
|
||||
@@ -1360,7 +1360,7 @@ GENERATE_QHP = NO
|
||||
# the HTML output folder.
|
||||
# This tag requires that the tag GENERATE_QHP is set to YES.
|
||||
|
||||
QCH_FILE =
|
||||
QCH_FILE =
|
||||
|
||||
# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
|
||||
# Project output. For more information please see Qt Help Project / Namespace
|
||||
@@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER = doc
|
||||
# filters).
|
||||
# This tag requires that the tag GENERATE_QHP is set to YES.
|
||||
|
||||
QHP_CUST_FILTER_NAME =
|
||||
QHP_CUST_FILTER_NAME =
|
||||
|
||||
# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
|
||||
# custom filter to add. For more information please see Qt Help Project / Custom
|
||||
@@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME =
|
||||
# filters).
|
||||
# This tag requires that the tag GENERATE_QHP is set to YES.
|
||||
|
||||
QHP_CUST_FILTER_ATTRS =
|
||||
QHP_CUST_FILTER_ATTRS =
|
||||
|
||||
# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
|
||||
# project's filter section matches. Qt Help Project / Filter Attributes (see:
|
||||
# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
|
||||
# This tag requires that the tag GENERATE_QHP is set to YES.
|
||||
|
||||
QHP_SECT_FILTER_ATTRS =
|
||||
QHP_SECT_FILTER_ATTRS =
|
||||
|
||||
# The QHG_LOCATION tag can be used to specify the location of Qt's
|
||||
# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
|
||||
# generated .qhp file.
|
||||
# This tag requires that the tag GENERATE_QHP is set to YES.
|
||||
|
||||
QHG_LOCATION =
|
||||
QHG_LOCATION =
|
||||
|
||||
# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
|
||||
# generated, together with the HTML files, they form an Eclipse help plugin. To
|
||||
@@ -1540,7 +1540,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
|
||||
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
|
||||
# This tag requires that the tag USE_MATHJAX is set to YES.
|
||||
|
||||
MATHJAX_EXTENSIONS =
|
||||
MATHJAX_EXTENSIONS =
|
||||
|
||||
# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
|
||||
# of code that will be used on startup of the MathJax code. See the MathJax site
|
||||
@@ -1548,7 +1548,7 @@ MATHJAX_EXTENSIONS =
|
||||
# example see the documentation.
|
||||
# This tag requires that the tag USE_MATHJAX is set to YES.
|
||||
|
||||
MATHJAX_CODEFILE =
|
||||
MATHJAX_CODEFILE =
|
||||
|
||||
# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
|
||||
# the HTML output. The underlying search engine uses javascript and DHTML and
|
||||
@@ -1608,7 +1608,7 @@ EXTERNAL_SEARCH = NO
|
||||
# Searching" for details.
|
||||
# This tag requires that the tag SEARCHENGINE is set to YES.
|
||||
|
||||
SEARCHENGINE_URL =
|
||||
SEARCHENGINE_URL =
|
||||
|
||||
# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
|
||||
# search data is written to a file for indexing by an external tool. With the
|
||||
@@ -1624,7 +1624,7 @@ SEARCHDATA_FILE = searchdata.xml
|
||||
# projects and redirect the results back to the right project.
|
||||
# This tag requires that the tag SEARCHENGINE is set to YES.
|
||||
|
||||
EXTERNAL_SEARCH_ID =
|
||||
EXTERNAL_SEARCH_ID =
|
||||
|
||||
# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
|
||||
# projects other than the one defined by this configuration file, but that are
|
||||
@@ -1634,7 +1634,7 @@ EXTERNAL_SEARCH_ID =
|
||||
# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
|
||||
# This tag requires that the tag SEARCHENGINE is set to YES.
|
||||
|
||||
EXTRA_SEARCH_MAPPINGS =
|
||||
EXTRA_SEARCH_MAPPINGS =
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the LaTeX output
|
||||
@@ -1698,7 +1698,7 @@ PAPER_TYPE = a4
|
||||
# If left blank no extra packages will be included.
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
EXTRA_PACKAGES =
|
||||
EXTRA_PACKAGES =
|
||||
|
||||
# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
|
||||
# generated LaTeX document. The header should contain everything until the first
|
||||
@@ -1714,7 +1714,7 @@ EXTRA_PACKAGES =
|
||||
# to HTML_HEADER.
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
LATEX_HEADER =
|
||||
LATEX_HEADER =
|
||||
|
||||
# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
|
||||
# generated LaTeX document. The footer should contain everything after the last
|
||||
@@ -1725,7 +1725,7 @@ LATEX_HEADER =
|
||||
# Note: Only use a user-defined footer if you know what you are doing!
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
LATEX_FOOTER =
|
||||
LATEX_FOOTER =
|
||||
|
||||
# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
|
||||
# LaTeX style sheets that are included after the standard style sheets created
|
||||
@@ -1736,7 +1736,7 @@ LATEX_FOOTER =
|
||||
# list).
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
LATEX_EXTRA_STYLESHEET =
|
||||
LATEX_EXTRA_STYLESHEET =
|
||||
|
||||
# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
|
||||
# other source files which should be copied to the LATEX_OUTPUT output
|
||||
@@ -1744,7 +1744,7 @@ LATEX_EXTRA_STYLESHEET =
|
||||
# markers available.
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
LATEX_EXTRA_FILES =
|
||||
LATEX_EXTRA_FILES =
|
||||
|
||||
# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
|
||||
# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
|
||||
@@ -1844,14 +1844,14 @@ RTF_HYPERLINKS = NO
|
||||
# default style sheet that doxygen normally uses.
|
||||
# This tag requires that the tag GENERATE_RTF is set to YES.
|
||||
|
||||
RTF_STYLESHEET_FILE =
|
||||
RTF_STYLESHEET_FILE =
|
||||
|
||||
# Set optional variables used in the generation of an RTF document. Syntax is
|
||||
# similar to doxygen's config file. A template extensions file can be generated
|
||||
# using doxygen -e rtf extensionFile.
|
||||
# This tag requires that the tag GENERATE_RTF is set to YES.
|
||||
|
||||
RTF_EXTENSIONS_FILE =
|
||||
RTF_EXTENSIONS_FILE =
|
||||
|
||||
# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
|
||||
# with syntax highlighting in the RTF output.
|
||||
@@ -1896,7 +1896,7 @@ MAN_EXTENSION = .3
|
||||
# MAN_EXTENSION with the initial . removed.
|
||||
# This tag requires that the tag GENERATE_MAN is set to YES.
|
||||
|
||||
MAN_SUBDIR =
|
||||
MAN_SUBDIR =
|
||||
|
||||
# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
|
||||
# will generate one additional man file for each entity documented in the real
|
||||
@@ -1915,7 +1915,7 @@ MAN_LINKS = NO
|
||||
# captures the structure of the code including all documentation.
|
||||
# The default value is: NO.
|
||||
|
||||
GENERATE_XML = YES
|
||||
GENERATE_XML = YES
|
||||
|
||||
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
|
||||
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
|
||||
@@ -2009,7 +2009,7 @@ PERLMOD_PRETTY = YES
|
||||
# overwrite each other's variables.
|
||||
# This tag requires that the tag GENERATE_PERLMOD is set to YES.
|
||||
|
||||
PERLMOD_MAKEVAR_PREFIX =
|
||||
PERLMOD_MAKEVAR_PREFIX =
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the preprocessor
|
||||
@@ -2050,7 +2050,7 @@ SEARCH_INCLUDES = YES
|
||||
# preprocessor.
|
||||
# This tag requires that the tag SEARCH_INCLUDES is set to YES.
|
||||
|
||||
INCLUDE_PATH =
|
||||
INCLUDE_PATH =
|
||||
|
||||
# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
|
||||
# patterns (like *.h and *.hpp) to filter out the header-files in the
|
||||
@@ -2058,7 +2058,7 @@ INCLUDE_PATH =
|
||||
# used.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
INCLUDE_FILE_PATTERNS =
|
||||
INCLUDE_FILE_PATTERNS =
|
||||
|
||||
# The PREDEFINED tag can be used to specify one or more macro names that are
|
||||
# defined before the preprocessor is started (similar to the -D option of e.g.
|
||||
@@ -2068,7 +2068,7 @@ INCLUDE_FILE_PATTERNS =
|
||||
# recursively expanded use the := operator instead of the = operator.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
PREDEFINED =
|
||||
PREDEFINED =
|
||||
|
||||
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
|
||||
# tag can be used to specify a list of macro names that should be expanded. The
|
||||
@@ -2077,7 +2077,7 @@ PREDEFINED =
|
||||
# definition found in the source code.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
EXPAND_AS_DEFINED =
|
||||
EXPAND_AS_DEFINED =
|
||||
|
||||
# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
|
||||
# remove all references to function-like macros that are alone on a line, have
|
||||
@@ -2106,13 +2106,13 @@ SKIP_FUNCTION_MACROS = YES
|
||||
# the path). If a tag file is not located in the directory in which doxygen is
|
||||
# run, you must also specify the path to the tagfile here.
|
||||
|
||||
TAGFILES =
|
||||
TAGFILES =
|
||||
|
||||
# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
|
||||
# tag file that is based on the input files it reads. See section "Linking to
|
||||
# external documentation" for more information about the usage of tag files.
|
||||
|
||||
GENERATE_TAGFILE =
|
||||
GENERATE_TAGFILE =
|
||||
|
||||
# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
|
||||
# the class index. If set to NO, only the inherited external classes will be
|
||||
@@ -2161,14 +2161,14 @@ CLASS_DIAGRAMS = NO
|
||||
# the mscgen tool resides. If left empty the tool is assumed to be found in the
|
||||
# default search path.
|
||||
|
||||
MSCGEN_PATH =
|
||||
MSCGEN_PATH =
|
||||
|
||||
# You can include diagrams made with dia in doxygen documentation. Doxygen will
|
||||
# then run dia to produce the diagram and insert it in the documentation. The
|
||||
# DIA_PATH tag allows you to specify the directory where the dia binary resides.
|
||||
# If left empty dia is assumed to be found in the default search path.
|
||||
|
||||
DIA_PATH =
|
||||
DIA_PATH =
|
||||
|
||||
# If set to YES the inheritance and collaboration graphs will hide inheritance
|
||||
# and usage relations if the target is undocumented or is not a class.
|
||||
@@ -2217,7 +2217,7 @@ DOT_FONTSIZE = 10
|
||||
# the path where dot can find it using this tag.
|
||||
# This tag requires that the tag HAVE_DOT is set to YES.
|
||||
|
||||
DOT_FONTPATH =
|
||||
DOT_FONTPATH =
|
||||
|
||||
# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
|
||||
# each documented class showing the direct and indirect inheritance relations.
|
||||
@@ -2361,26 +2361,26 @@ INTERACTIVE_SVG = NO
|
||||
# found. If left blank, it is assumed the dot tool can be found in the path.
|
||||
# This tag requires that the tag HAVE_DOT is set to YES.
|
||||
|
||||
DOT_PATH =
|
||||
DOT_PATH =
|
||||
|
||||
# The DOTFILE_DIRS tag can be used to specify one or more directories that
|
||||
# contain dot files that are included in the documentation (see the \dotfile
|
||||
# command).
|
||||
# This tag requires that the tag HAVE_DOT is set to YES.
|
||||
|
||||
DOTFILE_DIRS =
|
||||
DOTFILE_DIRS =
|
||||
|
||||
# The MSCFILE_DIRS tag can be used to specify one or more directories that
|
||||
# contain msc files that are included in the documentation (see the \mscfile
|
||||
# command).
|
||||
|
||||
MSCFILE_DIRS =
|
||||
MSCFILE_DIRS =
|
||||
|
||||
# The DIAFILE_DIRS tag can be used to specify one or more directories that
|
||||
# contain dia files that are included in the documentation (see the \diafile
|
||||
# command).
|
||||
|
||||
DIAFILE_DIRS =
|
||||
DIAFILE_DIRS =
|
||||
|
||||
# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
|
||||
# path where java can find the plantuml.jar file. If left blank, it is assumed
|
||||
@@ -2388,12 +2388,12 @@ DIAFILE_DIRS =
|
||||
# generate a warning when it encounters a \startuml command in this case and
|
||||
# will not generate output for the diagram.
|
||||
|
||||
PLANTUML_JAR_PATH =
|
||||
PLANTUML_JAR_PATH =
|
||||
|
||||
# When using plantuml, the specified paths are searched for files specified by
|
||||
# the !include statement in a plantuml block.
|
||||
|
||||
PLANTUML_INCLUDE_PATH =
|
||||
PLANTUML_INCLUDE_PATH =
|
||||
|
||||
# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
|
||||
# that will be shown in the graph. If the number of nodes in a graph becomes
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
:maxdepth: 4
|
||||
:caption: Contents:
|
||||
|
||||
=======
|
||||
@@ -8,4 +8,4 @@ All API
|
||||
|
||||
.. doxygenindex::
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
:maxdepth: 4
|
||||
:caption: Contents:
|
||||
|
||||
===
|
||||
|
||||
@@ -7,10 +7,10 @@ Welcome to RCCL's documentation!
|
||||
==================================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
:maxdepth: 4
|
||||
:caption: Contents:
|
||||
|
||||
library
|
||||
library
|
||||
api
|
||||
allapi
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
:maxdepth: 4
|
||||
:caption: Contents:
|
||||
|
||||
======
|
||||
@@ -10,4 +10,4 @@ RCCL
|
||||
Introduction
|
||||
------------
|
||||
|
||||
The RCCL is an AMD port of NCCL.
|
||||
The RCCL is an AMD port of NCCL.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
FILES="
|
||||
./src/nccl.h.in
|
||||
./src/bootstrap.cu
|
||||
./src/collectives/all_gather.cu
|
||||
./src/collectives/all_reduce.cu
|
||||
./src/collectives/broadcast.cu
|
||||
./src/collectives/collectives.h
|
||||
./src/collectives/device/all_gather.cu
|
||||
./src/collectives/device/all_gather.h
|
||||
./src/collectives/device/all_reduce.cu
|
||||
./src/collectives/device/all_reduce.h
|
||||
./src/collectives/device/broadcast.cu
|
||||
./src/collectives/device/broadcast.h
|
||||
./src/collectives/device/common.h
|
||||
./src/collectives/device/common_kernel.h
|
||||
./src/collectives/device/functions.cu
|
||||
./src/collectives/device/ll_kernel.h
|
||||
./src/collectives/device/primitives.h
|
||||
./src/collectives/device/reduce.cu
|
||||
./src/collectives/device/reduce.h
|
||||
./src/collectives/device/reduce_kernel.h
|
||||
./src/collectives/device/reduce_scatter.cu
|
||||
./src/collectives/device/reduce_scatter.h
|
||||
./src/collectives/reduce.cu
|
||||
./src/collectives/reduce_scatter.cu
|
||||
./src/include/bootstrap.h
|
||||
./src/include/common_coll.h
|
||||
./src/include/core.h
|
||||
./src/include/debug.h
|
||||
./src/include/enqueue.h
|
||||
./src/include/group.h
|
||||
./src/include/ibvwrap.h
|
||||
./src/include/nccl_net.h
|
||||
./src/include/net.h
|
||||
./src/include/nvlink.h
|
||||
./src/include/nvmlwrap.h
|
||||
./src/include/param.h
|
||||
./src/include/ring.h
|
||||
./src/include/rings.h
|
||||
./src/include/shm.h
|
||||
./src/include/socket.h
|
||||
./src/include/topo.h
|
||||
./src/include/transport.h
|
||||
./src/include/utils.h
|
||||
./src/init.cu
|
||||
./src/misc/enqueue.cu
|
||||
./src/misc/group.cu
|
||||
./src/misc/ibvwrap.cu
|
||||
./src/misc/nvmlwrap.cu
|
||||
./src/misc/rings.cu
|
||||
./src/misc/utils.cu
|
||||
./src/ring.cu
|
||||
./src/transport.cu
|
||||
./src/transport/net.cu
|
||||
./src/transport/net_ib.cu
|
||||
./src/transport/net_socket.cu
|
||||
./src/transport/p2p.cu
|
||||
./src/transport/shm.cu
|
||||
"
|
||||
|
||||
# Rewrite CUDA runtime identifiers to their HIP equivalents, in place, in
# every file listed in $FILES.
#
# The expressions are applied in order to each line, and the patterns are
# unanchored, so a rule whose pattern is a prefix of a later pattern would
# rewrite that longer identifier first. This only matters where the two
# replacements disagree: cudaFreeHost must map to hipHostFree, so its rule is
# placed BEFORE the shorter cudaFree rule. The other prefix pairs
# (cudaHostAlloc/cudaHostAllocMapped, cudaMemcpy/cudaMemcpyAsync, ...) give the
# same result in either order.
#
# $FILES is deliberately left unquoted so it splits into one path per word;
# each individual path is quoted when handed to sed.
for f in $FILES
do
  sed -i \
    -e 's@cuda_runtime.h@hip/hip_runtime_api.h@g' \
    -e 's@cuda_fp16.h@hip/hip_fp16.h@g' \
    -e 's/cudaDeviceCanAccessPeer/hipDeviceCanAccessPeer/g' \
    -e 's/cudaDeviceEnablePeerAccess/hipDeviceEnablePeerAccess/g' \
    -e 's/cudaDeviceGetPCIBusId/hipDeviceGetPCIBusId/g' \
    -e 's/cudaErrorPeerAccessAlreadyEnabled/hipErrorPeerAccessAlreadyEnabled/g' \
    -e 's/cudaError_t/hipError_t/g' \
    -e 's/cudaEventCreateWithFlags/hipEventCreateWithFlags/g' \
    -e 's/cudaEventDestroy/hipEventDestroy/g' \
    -e 's/cudaEventDisableTiming/hipEventDisableTiming/g' \
    -e 's/cudaEventRecord/hipEventRecord/g' \
    -e 's/cudaEvent_t/hipEvent_t/g' \
    -e 's/cudaFreeHost/hipHostFree/g' \
    -e 's/cudaFree/hipFree/g' \
    -e 's/cudaGetDevice/hipGetDevice/g' \
    -e 's/cudaGetErrorString/hipGetErrorString/g' \
    -e 's/cudaGetLastError/hipGetLastError/g' \
    -e 's/cudaHostAlloc/hipHostMalloc/g' \
    -e 's/cudaHostAllocMapped/hipHostMallocMapped/g' \
    -e 's/cudaHostGetDevicePointer/hipHostGetDevicePointer/g' \
    -e 's/cudaHostRegister/hipHostRegister/g' \
    -e 's/cudaHostRegisterMapped/hipHostRegisterMapped/g' \
    -e 's/cudaHostUnregister/hipHostUnregister/g' \
    -e 's/cudaIpcCloseMemHandle/hipIpcCloseMemHandle/g' \
    -e 's/cudaIpcGetMemHandle/hipIpcGetMemHandle/g' \
    -e 's/cudaIpcMemHandle_t/hipIpcMemHandle_t/g' \
    -e 's/cudaIpcMemLazyEnablePeerAccess/hipIpcMemLazyEnablePeerAccess/g' \
    -e 's/cudaIpcOpenMemHandle/hipIpcOpenMemHandle/g' \
    -e 's/cudaMalloc/hipMalloc/g' \
    -e 's/cudaMemcpy/hipMemcpy/g' \
    -e 's/cudaMemcpyAsync/hipMemcpyAsync/g' \
    -e 's/cudaMemcpyDefault/hipMemcpyDefault/g' \
    -e 's/cudaMemcpyDeviceToDevice/hipMemcpyDeviceToDevice/g' \
    -e 's/cudaMemoryTypeDevice/hipMemoryTypeDevice/g' \
    -e 's/cudaMemset/hipMemset/g' \
    -e 's/cudaPointerAttributes/hipPointerAttribute_t/g' \
    -e 's/cudaPointerGetAttributes/hipPointerGetAttributes/g' \
    -e 's/cudaSetDevice/hipSetDevice/g' \
    -e 's/cudaStreamCreateWithFlags/hipStreamCreateWithFlags/g' \
    -e 's/cudaStreamDestroy/hipStreamDestroy/g' \
    -e 's/cudaStreamNonBlocking/hipStreamNonBlocking/g' \
    -e 's/cudaStreamWaitEvent/hipStreamWaitEvent/g' \
    -e 's/cudaStream_t/hipStream_t/g' \
    -e 's/cudaSuccess/hipSuccess/g' \
    "$f"
done
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -16,7 +16,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc
|
||||
|
||||
CUDA_LIB ?= $(CUDA_HOME)/lib64
|
||||
CUDA_INC ?= $(CUDA_HOME)/include
|
||||
CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
|
||||
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
@@ -36,15 +36,16 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
|
||||
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
|
||||
|
||||
# Include Volta support if we're using CUDA9 or above
|
||||
ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
|
||||
else
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
|
||||
endif
|
||||
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
|
||||
|
||||
CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
||||
CXXFLAGS += -Wall -Wno-sign-compare
|
||||
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
|
||||
CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
|
||||
CXXFLAGS += -I $(CUDA_INC)
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||
# Use addprefix so that we can specify more than one path
|
||||
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
|
||||
@@ -68,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
|
||||
endif
|
||||
|
||||
ifneq ($(VERBOSE), 0)
|
||||
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
|
||||
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
|
||||
CXXFLAGS += -Wall -Wextra
|
||||
else
|
||||
.SILENT:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 3
|
||||
NCCL_PATCH := 7
|
||||
NCCL_MINOR := 4
|
||||
NCCL_PATCH := 8
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
Name: libnccl
|
||||
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
|
||||
Release: ${pkg:Revision}
|
||||
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
|
||||
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
|
||||
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
|
||||
|
||||
Group: Development/Libraries
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
|
||||
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
|
||||
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
|
||||
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
|
||||
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
|
||||
$< > $@
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
|
||||
NCCL_MINOR=${nccl:Minor}
|
||||
NCCL_PATCH=${nccl:Patch}
|
||||
NCCL_SUFFIX=${nccl:Suffix}
|
||||
NCCL_BUILD=${pkg:Revision}
|
||||
|
||||
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
|
||||
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
|
||||
|
||||
tar --exclude build \
|
||||
--exclude ".git*" \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -9,41 +9,48 @@ include ../makefiles/version.mk
|
||||
|
||||
##### src files
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
|
||||
misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
|
||||
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
|
||||
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
|
||||
misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
|
||||
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
|
||||
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
|
||||
|
||||
##### lib files
|
||||
LIBNAME := libnccl.so
|
||||
STATICLIBNAME := libnccl_static.a
|
||||
##### pkgconfig files
|
||||
PKGCONFIGFILE := nccl.pc
|
||||
##### dirs
|
||||
BUILDDIR ?= $(abspath ../build)
|
||||
INCDIR := $(BUILDDIR)/include
|
||||
LIBDIR := $(BUILDDIR)/lib
|
||||
OBJDIR := $(BUILDDIR)/obj
|
||||
PKGDIR := $(BUILDDIR)/lib/pkgconfig
|
||||
##### target files
|
||||
CUDARTLIB ?= cudart_static
|
||||
INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
|
||||
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
|
||||
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
|
||||
STATICLIBTARGET := $(STATICLIBNAME)
|
||||
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
|
||||
PKGTARGET := $(PKGCONFIGFILE)
|
||||
LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
|
||||
DEPFILES := $(LIBOBJ:%.o=%.d)
|
||||
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
|
||||
LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
|
||||
|
||||
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
|
||||
|
||||
|
||||
##### rules
|
||||
build : lib staticlib
|
||||
|
||||
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
|
||||
lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
|
||||
|
||||
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
|
||||
|
||||
devicelib: $(INCDIR)/nccl.h
|
||||
$(DEVICELIB): ALWAYS_REBUILD
|
||||
$(MAKE) -C collectives/device
|
||||
|
||||
# Empty target to force rebuild
|
||||
ALWAYS_REBUILD:
|
||||
|
||||
-include $(DEPFILES)
|
||||
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
|
||||
|
||||
@@ -51,7 +58,7 @@ $(INCDIR)/nccl.h : nccl.h.in
|
||||
# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
|
||||
@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
|
||||
mkdir -p $(INCDIR)
|
||||
printf "Generating %-35s > %s\n" $< $@
|
||||
@printf "Generating %-35s > %s\n" $< $@
|
||||
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
|
||||
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
|
||||
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
|
||||
@@ -59,14 +66,14 @@ $(INCDIR)/nccl.h : nccl.h.in
|
||||
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
|
||||
$< > $@
|
||||
|
||||
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
|
||||
$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
|
||||
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
|
||||
mkdir -p $(LIBDIR)
|
||||
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
|
||||
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
|
||||
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
|
||||
|
||||
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
|
||||
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
|
||||
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
|
||||
mkdir -p $(LIBDIR)
|
||||
$(eval TMP := $(shell mktemp -d))
|
||||
@@ -75,6 +82,15 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
|
||||
ar cr $@ $(LIBOBJ) $(TMP)/*.o
|
||||
rm -Rf $(TMP)
|
||||
|
||||
$(PKGDIR)/nccl.pc : nccl.pc.in
|
||||
mkdir -p $(PKGDIR)
|
||||
@printf "Generating %-35s > %s\n" $< $@
|
||||
sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
|
||||
-e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
|
||||
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
|
||||
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
|
||||
$< > $@
|
||||
|
||||
$(INCDIR)/%.h : %.h
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(INCDIR)
|
||||
@@ -85,27 +101,34 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
|
||||
mkdir -p $(INCDIR)
|
||||
cp -f $< $@
|
||||
|
||||
$(OBJDIR)/%.o : %.cu
|
||||
$(PKGDIR)/%.pc : %.pc
|
||||
@printf "Grabbing %-35s > %s\n" $< $@
|
||||
mkdir -p $(PKGDIR)
|
||||
cp -f $< $@
|
||||
|
||||
$(OBJDIR)/%.o : %.cc
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p `dirname $@`
|
||||
$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
|
||||
@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
|
||||
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
|
||||
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
|
||||
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
|
||||
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
|
||||
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
|
||||
@rm -f $(@:%.o=%.d.tmp)
|
||||
|
||||
clean :
|
||||
rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
|
||||
rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
|
||||
$(MAKE) -C collectives/device clean
|
||||
|
||||
install : lib
|
||||
mkdir -p $(PREFIX)/lib
|
||||
mkdir -p $(PREFIX)/lib/pkgconfig
|
||||
mkdir -p $(PREFIX)/include
|
||||
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
|
||||
cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
|
||||
cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
|
||||
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
|
||||
|
||||
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
|
||||
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
|
||||
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
|
||||
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
|
||||
# as the BUILDDIR variable.
|
||||
|
||||
@@ -0,0 +1,467 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "bootstrap.h"
|
||||
#include "net.h"
|
||||
#include "socket.h"
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// Always use sockets for bootstrap
|
||||
// Opaque connection handle used by the bootstrap network: it carries the
// socket address that bootstrapNetConnect() connects to.
struct bootstrapNetHandle {
|
||||
union socketAddress connectAddr; // written by bootstrapNetListen() / bootstrapNetCreateHandle()
|
||||
};
|
||||
|
||||
// One bootstrap connection (listening, sending or receiving side).
struct bootstrapNetComm {
|
||||
int fd; // socket descriptor; bootstrapNetNewComm() initialises it to -1
|
||||
};
|
||||
|
||||
/* Init functions */
|
||||
// Interface names found by findInterfaces(), MAX_IF_NAME_SIZE bytes per entry.
static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
|
||||
// Socket address of each interface, indexed like bootstrapNetIfNames.
static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
|
||||
// Number of interfaces found; -1 means bootstrapNetInit() has not probed yet.
static int bootstrapNetIfs = -1;
|
||||
// Serialises the one-time interface discovery in bootstrapNetInit().
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
/* Discover the socket interfaces usable for bootstrap.
 *
 * The scan runs at most once successfully: bootstrapNetIfs stays at -1 until
 * a scan finds at least one interface, so a failed scan is retried (and
 * reported again) on the next call instead of being cached as "done".
 *
 * Returns ncclSuccess when at least one interface is available,
 * ncclInternalError when findInterfaces() reports none.
 */
ncclResult_t bootstrapNetInit() {
  if (bootstrapNetIfs == -1) {
    pthread_mutex_lock(&bootstrapNetLock);
    // Re-check under the lock: another thread may have completed the scan.
    if (bootstrapNetIfs == -1) {
      int found = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
      if (found <= 0) {
        WARN("Bootstrap : no socket interface found");
        // Do not leave the mutex held on the error path.
        pthread_mutex_unlock(&bootstrapNetLock);
        return ncclInternalError;
      }

      // Build a one-line summary " [i]name:addr ..." for the log.
      char line[1024];
      char addrline[1024];
      line[0] = '\0';
      for (int i=0; i<found; i++) {
        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
            socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
      }
      line[1023] = '\0';
      INFO(NCCL_INIT, "Bootstrap : Using%s", line);

      // Publish the count only once the tables are filled in.
      bootstrapNetIfs = found;
    }
    pthread_mutex_unlock(&bootstrapNetLock);
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Allocate a bootstrapNetComm via ncclCalloc and mark it as not yet
 * connected (fd == -1). The new object is returned through *comm. */
static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
  NCCLCHECK(ncclCalloc(comm, 1));
  struct bootstrapNetComm* fresh = *comm;
  fresh->fd = -1;
  return ncclSuccess;
}
|
||||
|
||||
/* Copy the address of discovered interface number `dev` into *addr.
 *
 * Returns ncclInternalError when `dev` is not a valid index into the
 * interface table (negative, or >= the number of discovered interfaces —
 * which also covers the "not probed yet" state where the count is -1).
 */
static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
  // A negative index would read before the start of bootstrapNetIfAddrs.
  if (dev < 0 || dev >= bootstrapNetIfs) return ncclInternalError;
  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
  return ncclSuccess;
}
|
||||
|
||||
/* Socket Interface Selection type */
|
||||
// Negative pseudo-device numbers accepted by bootstrapNetListen():
//   findSubnetIf - the handle holds a remote address; listen on a local
//                  interface in the same subnet.
//   dontCareIf   - the handle already holds the local address to listen on.
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
|
||||
|
||||
/* Create a listening bootstrap socket.
 *
 * dev >= 0            : listen on discovered interface `dev`; its address is
 *                       written into the handle.
 * dev == findSubnetIf : the handle holds a remote address on entry; a local
 *                       interface in the same subnet is selected and its
 *                       address is written back into the handle.
 * otherwise           : the handle already holds the local address to use.
 *
 * On success *listenComm receives the new bootstrapNetComm. On failure the
 * comm allocated here is released before returning the error.
 */
static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
  static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
  struct bootstrapNetComm* comm;
  ncclResult_t res;

  // if dev >= 0, listen based on dev
  if (dev >= 0) {
    NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
  } else if (dev == findSubnetIf) {
    // handle stores a remote address
    // need to find a local addr that is in the same network as the remote addr
    union socketAddress localAddr;
    char ifName[MAX_IF_NAME_SIZE];
    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
      WARN("NET/Socket : No usable listening interface found");
      return ncclSystemError;
    }
    // pass the local address back
    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
  } // Otherwise, handle stores a local address

  NCCLCHECK(bootstrapNetNewComm(&comm));
  NCCLCHECKGOTO(createListenSocket(&comm->fd, &handle->connectAddr), res, fail);
  *listenComm = comm;
  return ncclSuccess;

fail:
  // Release what was allocated above; close the descriptor only if the
  // helper stored one before failing.
  if (comm->fd != -1) close(comm->fd);
  free(comm);
  return res;
}
|
||||
|
||||
/* Open a bootstrap connection to the address stored in opaqueHandle.
 *
 * `dev` is accepted for interface symmetry with bootstrapNetListen() but is
 * not used here. On success *sendComm receives the connected comm; on
 * failure the comm allocated here is released before the error is returned.
 */
static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
  struct bootstrapNetComm* comm;
  ncclResult_t res;

  NCCLCHECK(bootstrapNetNewComm(&comm));
  NCCLCHECKGOTO(connectAddress(&comm->fd, &handle->connectAddr), res, fail);
  *sendComm = comm;
  return ncclSuccess;

fail:
  // connectAddress() may or may not have stored a descriptor before failing;
  // fd is still -1 if it did not.
  if (comm->fd != -1) close(comm->fd);
  free(comm);
  return res;
}
|
||||
|
||||
/* Accept one incoming connection on a listening bootstrap comm.
 *
 * The connection is accepted first and the comm object allocated afterwards,
 * so that neither the comm nor the accepted descriptor leaks when one of the
 * two steps fails. On success *recvComm receives the new comm.
 */
static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
  // The peer address is not used after accept(); it is only a scratch buffer.
  struct sockaddr_in sockaddr;
  socklen_t socklen = sizeof(struct sockaddr_in);
  int fd = -1;
  // NOTE(review): SYSCHECKVAL is expected to return from this function when
  // accept() fails — nothing has been allocated yet at this point.
  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", fd);

  struct bootstrapNetComm* rComm;
  ncclResult_t res = bootstrapNetNewComm(&rComm);
  if (res != ncclSuccess) {
    close(fd);
    return res;
  }
  rComm->fd = fd;
  *recvComm = rComm;
  return ncclSuccess;
}
|
||||
|
||||
/* Close the socket of a bootstrap comm and free the comm object.
 * A NULL comm is accepted and ignored. Always returns ncclSuccess. */
static ncclResult_t bootstrapNetClose(void* opaqueComm) {
  if (opaqueComm == NULL) return ncclSuccess;

  struct bootstrapNetComm* netComm = (struct bootstrapNetComm*)opaqueComm;
  close(netComm->fd);
  free(netComm);
  return ncclSuccess;
}
|
||||
|
||||
/* Close and free a sending bootstrap comm (see bootstrapNetClose). */
static ncclResult_t bootstrapNetCloseSend(void* sendComm) {
  NCCLCHECK(bootstrapNetClose(sendComm));
  return ncclSuccess;
}
|
||||
/* Close and free a receiving bootstrap comm (see bootstrapNetClose). */
static ncclResult_t bootstrapNetCloseRecv(void* recvComm) {
  NCCLCHECK(bootstrapNetClose(recvComm));
  return ncclSuccess;
}
|
||||
/* Close and free a listening bootstrap comm (see bootstrapNetClose). */
static ncclResult_t bootstrapNetCloseListen(void* listenComm) {
  NCCLCHECK(bootstrapNetClose(listenComm));
  return ncclSuccess;
}
|
||||
|
||||
// Additional sync functions
|
||||
/* Send one length-prefixed message: first `size` itself as an int, then
 * `size` bytes of payload from `data`. */
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
  const int sock = ((struct bootstrapNetComm*)sendComm)->fd;
  // Header: payload length
  NCCLCHECK(socketSend(sock, &size, sizeof(int)));
  // Payload
  NCCLCHECK(socketSend(sock, data, size));
  return ncclSuccess;
}
|
||||
/* Receive one length-prefixed message (counterpart of bootstrapNetSend).
 *
 * Reads the int length header, then that many payload bytes into `data`.
 * `size` is the capacity of `data`.
 *
 * Returns ncclInternalError when the announced length is negative or larger
 * than `size`; in both cases no payload byte is read.
 */
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
  int recvSize;
  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
  // The length comes from the peer: never forward a negative byte count.
  if (recvSize < 0) {
    WARN("Invalid message size received : %d bytes", recvSize);
    return ncclInternalError;
  }
  if (recvSize > size) {
    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
    return ncclInternalError;
  }
  // 0 <= recvSize <= size is guaranteed here, so no clamping is needed.
  NCCLCHECK(socketReceive(comm->fd, data, recvSize));
  return ncclSuccess;
}
|
||||
|
||||
/* Fill a bootstrap handle from a textual address: `str` is parsed by
 * GetSocketAddrFromString() into the handle's connectAddr. */
ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
  union socketAddress* target = &((struct bootstrapNetHandle*) opaqueHandle)->connectAddr;
  NCCLCHECK(GetSocketAddrFromString(target, str));
  return ncclSuccess;
}
|
||||
|
||||
// Bootstrap view of an ncclUniqueId: the public id buffer is cast to this
// struct (bootstrapGetUniqueId() static_asserts that it fits).
struct extId {
|
||||
ncclNetHandle_t extHandleRoot; // address of the root's listening socket
|
||||
void* extListenComm; // root's listening comm, set by bootstrapCreateRoot()
|
||||
uint64_t hostHash; // getHostHash() of the host that created the root
|
||||
pid_t pid; // getpid() of the creator, or -1 when the id came from NCCL_COMM_ID
|
||||
int fd; // not referenced in the code visible here
|
||||
pthread_t boostrapThread; // thread running bootstrapRoot()
|
||||
};
|
||||
|
||||
// Check-in message each rank sends to the bootstrap root.
struct extInfo {
|
||||
int rank; // sender's rank
|
||||
int nranks; // total rank count as seen by the sender
|
||||
ncclNetHandle_t extHandleListenRoot; // where the root connects back to this rank
|
||||
ncclNetHandle_t extHandleListen; // this rank's listen handle, forwarded to its ring predecessor
|
||||
};
|
||||
|
||||
#include <sys/resource.h>
|
||||
|
||||
/* Raise the soft limit on open file descriptors (RLIMIT_NOFILE) to the
 * current hard limit. */
static ncclResult_t setFilesLimit() {
  struct rlimit nofile;
  SYSCHECK(getrlimit(RLIMIT_NOFILE, &nofile), "getrlimit");
  // soft limit := hard limit
  nofile.rlim_cur = nofile.rlim_max;
  SYSCHECK(setrlimit(RLIMIT_NOFILE, &nofile), "setrlimit");
  return ncclSuccess;
}
|
||||
|
||||
static void *bootstrapRoot(void* commId) {
|
||||
struct extInfo info;
|
||||
struct extId* id = (struct extId*)commId;
|
||||
ncclNetHandle_t *rankHandles = NULL;
|
||||
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
|
||||
ncclNetHandle_t zero = { 0 }; // for sanity checking
|
||||
void* tmpComm;
|
||||
ncclResult_t res;
|
||||
setFilesLimit();
|
||||
|
||||
TRACE(NCCL_INIT, "BEGIN");
|
||||
/* Receive addresses from all ranks */
|
||||
int nranks = 0, c = 0;
|
||||
do {
|
||||
NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
|
||||
|
||||
if (c == 0) {
|
||||
nranks = info.nranks;
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
|
||||
NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
|
||||
}
|
||||
|
||||
if (nranks != info.nranks) {
|
||||
WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
|
||||
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Save the connection handle for that rank
|
||||
memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
|
||||
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
|
||||
|
||||
++c;
|
||||
} while (c < nranks);
|
||||
TRACE(NCCL_INIT, "COLLECTED HANDLES");
|
||||
|
||||
// Send the connect handle for the next rank in the AllGather ring
|
||||
for (int r=0; r<nranks; ++r) {
|
||||
int next = (r+1) % nranks;
|
||||
void *tmpSendComm;
|
||||
NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
|
||||
}
|
||||
TRACE(NCCL_INIT, "SENT OUT HANDLES");
|
||||
|
||||
out:
|
||||
bootstrapNetCloseListen(id->extListenComm);
|
||||
free(commId);
|
||||
if (rankHandles) free(rankHandles);
|
||||
if (rankHandlesRoot) free(rankHandlesRoot);
|
||||
|
||||
TRACE(NCCL_INIT, "DONE");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Start the bootstrap root for the given unique id.
 *
 * Opens the root's listening socket (on interface 0, or on the address
 * already stored in the id when idFromEnv is true), then spawns a thread
 * running bootstrapRoot() on a private heap copy of the id. That thread owns
 * and frees the copy.
 *
 * If the copy cannot be allocated or the thread cannot be created, the
 * listening socket and the copy are released and an error is returned
 * (ncclSystemError for a pthread_create failure).
 */
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
  struct extId* id = (struct extId*)commId;
  ncclUniqueId* threadIdCopy = NULL;
  ncclResult_t res;
  int err;

  id->hostHash = getHostHash();
  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));

  NCCLCHECKGOTO(ncclCalloc(&threadIdCopy, 1), res, fail);
  // Snapshot taken after listen, so the copy carries the listen comm and handle.
  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));

  err = pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
  if (err != 0) {
    WARN("Bootstrap : failed to create root thread : %s", strerror(err));
    res = ncclSystemError;
    goto fail;
  }
  return ncclSuccess;

fail:
  // No thread took ownership: release everything acquired above.
  free(threadIdCopy);
  bootstrapNetCloseListen(id->extListenComm);
  id->extListenComm = NULL;
  return res;
}
|
||||
|
||||
/* Produce a unique id for a new communicator.
 *
 * Without NCCL_COMM_ID in the environment: record this process' pid in the
 * id and start a bootstrap root for it.
 * With NCCL_COMM_ID: parse it as the root address, store it in the id and
 * mark the id with pid == -1; no root is started here.
 *
 * Returns ncclInvalidArgument when NCCL_COMM_ID cannot be parsed.
 */
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
  extId* rootId = (extId*)out;

  const char* commIdEnv = getenv("NCCL_COMM_ID");
  if (commIdEnv == NULL) {
    // Default path: this process hosts the root.
    rootId->pid = getpid();
    NCCLCHECK(bootstrapCreateRoot(out, false));
    return ncclSuccess;
  }

  // Root address supplied by the user.
  if (bootstrapNetCreateHandle(&rootId->extHandleRoot, commIdEnv) != 0) {
    WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
    return ncclInvalidArgument;
  }
  rootId->pid = -1;
  return ncclSuccess;
}
|
||||
|
||||
// Node of the singly linked list of connections that were accepted but not
// yet consumed (see unexpectedEnqueue / unexpectedDequeue).
struct unexConn {
|
||||
int peer; // rank the connection came from
|
||||
void* comm; // the accepted comm
|
||||
struct unexConn* next; // next node, NULL at the tail
|
||||
};
|
||||
|
||||
// Per-communicator bootstrap state created by bootstrapInit().
struct extState {
|
||||
void* extBstrapListenComm; // this rank's listening comm
|
||||
void* extBstrapRingRecvComm; // connection accepted from the previous rank in the ring
|
||||
void* extBstrapRingSendComm; // connection to the next rank in the ring
|
||||
ncclNetHandle_t* peerBstrapHandles; // listen handle of every rank, indexed by rank
|
||||
struct unexConn* unexpectedConnections; // head of the pending-connection list
|
||||
int rank; // this rank
|
||||
int nranks; // number of ranks
|
||||
int dev; // device argument passed to bootstrapNetListen/Connect (0 or findSubnetIf)
|
||||
};
|
||||
|
||||
/* Join the bootstrap network as `rank` out of `nranks`.
 *
 * Steps, in order:
 *  1. allocate the per-communicator state and return it through *commState;
 *  2. open two listening sockets: one for ring traffic, one for the root's
 *     reply;
 *  3. check in with the root (rank, nranks and both listen handles);
 *  4. receive from the root the handle of the next rank in the ring;
 *  5. connect to the next rank and accept the previous one;
 *  6. all-gather every rank's listen handle into peerBstrapHandles.
 *
 * An id with a negative pid means it was built from NCCL_COMM_ID: the root
 * address is then used to pick a local interface in the same subnet.
 */
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
  struct extId* rootId = (struct extId*)commId;
  const bool fromEnv = rootId->pid < 0;

  struct extState* st;
  NCCLCHECK(ncclCalloc(&st, 1));
  st->rank = rank;
  st->nranks = nranks;
  *commState = st;

  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);

  struct extInfo checkIn = { 0 };
  checkIn.rank = rank;
  checkIn.nranks = nranks;

  if (fromEnv) {
    // Seed both handles with the root (remote) address so that listen can
    // select a local interface in the same subnet.
    memcpy(&checkIn.extHandleListen, &rootId->extHandleRoot, sizeof(ncclNetHandle_t));
    memcpy(&checkIn.extHandleListenRoot, &rootId->extHandleRoot, sizeof(ncclNetHandle_t));
    st->dev = findSubnetIf;
  } else {
    st->dev = 0;
  }

  // listen writes the chosen local address back into each handle
  void* rootReplyListenComm;
  NCCLCHECK(bootstrapNetListen(st->dev, &checkIn.extHandleListen, &st->extBstrapListenComm));
  NCCLCHECK(bootstrapNetListen(st->dev, &checkIn.extHandleListenRoot, &rootReplyListenComm));

  // With many ranks, delay each rank by `rank` milliseconds so the root is
  // not hit by every connection at once.
  if (nranks > 128) {
    long msec = rank;
    struct timespec delay;
    delay.tv_sec = msec / 1000;
    delay.tv_nsec = 1000000 * (msec % 1000);
    TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
    (void) nanosleep(&delay, NULL);
  }

  // Check in with the root
  void* toRoot;
  NCCLCHECK(bootstrapNetConnect(st->dev, rootId->extHandleRoot, &toRoot));
  NCCLCHECK(bootstrapNetSend(toRoot, &checkIn, sizeof(checkIn)));
  NCCLCHECK(bootstrapNetCloseSend(toRoot));

  // The root answers with the handle of our successor in the ring
  void* fromRoot;
  ncclNetHandle_t nextHandle;
  NCCLCHECK(bootstrapNetAccept(rootReplyListenComm, &fromRoot));
  NCCLCHECK(bootstrapNetRecv(fromRoot, &nextHandle, sizeof(nextHandle)));
  NCCLCHECK(bootstrapNetCloseRecv(fromRoot));
  NCCLCHECK(bootstrapNetCloseListen(rootReplyListenComm));

  // Ring links: connect to the successor, accept the predecessor
  NCCLCHECK(bootstrapNetConnect(st->dev, nextHandle, &st->extBstrapRingSendComm));
  NCCLCHECK(bootstrapNetAccept(st->extBstrapListenComm, &st->extBstrapRingRecvComm));

  // Gather every rank's listen handle, starting with our own slot
  NCCLCHECK(ncclCalloc(&st->peerBstrapHandles, nranks));
  memcpy(st->peerBstrapHandles+rank, checkIn.extHandleListen, sizeof(ncclNetHandle_t));
  NCCLCHECK(bootstrapAllGather(st, st->peerBstrapHandles, sizeof(ncclNetHandle_t)));

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

  return ncclSuccess;
}
|
||||
|
||||
/* Ring all-gather over the bootstrap connections.
 *
 * allData is an array of nranks slices of `size` bytes; on entry the slice
 * at index `rank` holds this rank's contribution. In each of the nranks-1
 * steps the slice obtained in the previous step is sent to the ring
 * successor and a new slice is received from the predecessor.
 */
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
  struct extState* st = (struct extState*)commState;
  char* base = (char*)allData;
  const int me = st->rank;
  const int count = st->nranks;

  TRACE(NCCL_INIT, "rank %d nranks %d size %d", me, count, size);

  int step = 0;
  while (step < count-1) {
    // Step `step`: forward slice (me-step), receive slice (me-step-1), both modulo count
    size_t sendIdx = (me - step + count) % count;
    size_t recvIdx = (me - step - 1 + count) % count;

    NCCLCHECK(bootstrapNetSend(st->extBstrapRingSendComm, base+sendIdx*size, size));
    NCCLCHECK(bootstrapNetRecv(st->extBstrapRingRecvComm, base+recvIdx*size, size));
    ++step;
  }

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", me, count, size);
  return ncclSuccess;
}
|
||||
|
||||
// Send `size` bytes of `data` to rank `peer` over a temporary connection.
//
// Connects to the peer's handle from state->peerBstrapHandles, sends this
// rank's id (an int) followed by the payload, then closes the connection.
// The temporary send comm is closed on the error paths as well, so a failed
// send no longer leaks the connection.
//
// Returns ncclSuccess, or the first error encountered.
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
  struct extState* state = (struct extState*)commState;
  void* tmpSendComm;
  // Declared before the first goto so no initialization is jumped over.
  ncclResult_t res = ncclSuccess;

  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
  // The sender's rank goes first, ahead of the payload.
  NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)), res, fail);
  NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, data, size), res, fail);
  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
  return ncclSuccess;

fail:
  // Best-effort cleanup: report the original send error, not the close result.
  bootstrapNetCloseSend(tmpSendComm);
  return res;
}
|
||||
|
||||
// Append a (peer, comm) record to the tail of state->unexpectedConnections.
//
// The new node's `next` is never written here; like the original, this relies
// on ncclCalloc handing back zeroed memory (presumably it does — confirm).
// Returns ncclSuccess, or the allocation error from ncclCalloc.
ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
  struct unexConn* node;
  NCCLCHECK(ncclCalloc(&node, 1));
  node->peer = peer;
  node->comm = comm;

  // Walk the chain of `next` links until we reach the empty slot at the end
  // (which is the list head itself when the list is empty).
  struct unexConn** tail = &state->unexpectedConnections;
  while (*tail != NULL) tail = &(*tail)->next;
  *tail = node;
  return ncclSuccess;
}
|
||||
|
||||
void* unexpectedDequeue(struct extState* state, int peer) {
|
||||
struct unexConn* elem = state->unexpectedConnections;
|
||||
struct unexConn* prev = NULL;
|
||||
while (elem) {
|
||||
if (elem->peer == peer) {
|
||||
if (prev == NULL) {
|
||||
state->unexpectedConnections = elem->next;
|
||||
} else {
|
||||
prev->next = elem->next;
|
||||
}
|
||||
void* comm = elem->comm;
|
||||
free(elem);
|
||||
return comm;
|
||||
}
|
||||
prev = elem;
|
||||
elem = elem->next;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// We can't know who we'll receive from, so we need to receive everything at once
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
|
||||
void* tmpRecvComm;
|
||||
|
||||
// Search unexpected connections first
|
||||
if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Then look for new connections
|
||||
while (1) {
|
||||
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
|
||||
int newPeer;
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
|
||||
if (newPeer == peer) {
|
||||
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
|
||||
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Unexpected connection. Save for later.
|
||||
NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
|
||||
}
|
||||
}
|
||||
|
||||
// Tear down the bootstrap state: close the listen comm and both ring comms,
// then free the peer handle table and the state itself.
//
// If connections are still parked on the unexpected list, nothing is closed
// or freed and ncclInternalError is returned.
ncclResult_t bootstrapClose(void* commState) {
  struct extState* st = (struct extState*)commState;

  if (st->unexpectedConnections == NULL) {
    NCCLCHECK(bootstrapNetCloseListen(st->extBstrapListenComm));
    NCCLCHECK(bootstrapNetCloseSend(st->extBstrapRingSendComm));
    NCCLCHECK(bootstrapNetCloseRecv(st->extBstrapRingRecvComm));

    free(st->peerBstrapHandles);
    free(st);
    return ncclSuccess;
  }

  WARN("Unexpected connections are not empty.\n");
  return ncclInternalError;
}
|
||||
@@ -1,249 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "bootstrap.h"
|
||||
#include "net.h"
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
// Always use sockets for bootstrap
|
||||
ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
|
||||
|
||||
// Thin forwarding wrappers around the ncclBootstrapNet function table.
// Each one calls the matching entry through NCCLCHECK and returns ncclSuccess
// when the call succeeds.

// Forward to ncclBootstrapNet->listen.
static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) {
  NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm));
  return ncclSuccess;
}

// Forward to ncclBootstrapNet->connect.
static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) {
  NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm));
  return ncclSuccess;
}

// Forward to ncclBootstrapNet->accept.
static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) {
  NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm));
  return ncclSuccess;
}

// Forward to ncclBootstrapNet->test (polls an asynchronous request).
static ncclResult_t bootstrapTest(void* request, int* done, int* size) {
  NCCLCHECK(ncclBootstrapNet->test(request, done, size));
  return ncclSuccess;
}

// Forward to ncclBootstrapNet->closeSend.
static ncclResult_t bootstrapCloseSend(void* sendComm) {
  NCCLCHECK(ncclBootstrapNet->closeSend(sendComm));
  return ncclSuccess;
}

// Forward to ncclBootstrapNet->closeRecv.
static ncclResult_t bootstrapCloseRecv(void* recvComm) {
  NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm));
  return ncclSuccess;
}

// Forward to ncclBootstrapNet->closeListen.
static ncclResult_t bootstrapCloseListen(void* listenComm) {
  NCCLCHECK(ncclBootstrapNet->closeListen(listenComm));
  return ncclSuccess;
}
|
||||
|
||||
// Additional sync functions based on async + test for bootstrap, using host ptrs.
|
||||
// Blocking send of `size` bytes from host memory: posts an isend on the
// bootstrap network and polls bootstrapTest until the request completes.
static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
  void* req;
  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &req));

  // The request is always tested at least once.
  int finished = 0;
  do {
    NCCLCHECK(bootstrapTest(req, &finished, NULL));
  } while (!finished);
  return ncclSuccess;
}
|
||||
// Blocking receive of `size` bytes into host memory: posts an irecv on the
// bootstrap network and polls bootstrapTest until the request completes.
static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
  void* req;
  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &req));

  for (int finished = 0; !finished; ) {
    NCCLCHECK(bootstrapTest(req, &finished, NULL));
  }
  return ncclSuccess;
}
|
||||
|
||||
struct extId {
|
||||
ncclNetHandle_t extHandleRoot;
|
||||
void* extListenComm;
|
||||
uint64_t hostHash;
|
||||
pid_t pid;
|
||||
int fd;
|
||||
pthread_t boostrapThread;
|
||||
};
|
||||
|
||||
struct extInfo {
|
||||
int rank;
|
||||
int nranks;
|
||||
ncclNetHandle_t extHandleListenFromRoot;
|
||||
ncclNetHandle_t extHandleRing;
|
||||
};
|
||||
|
||||
#include <sys/resource.h>
|
||||
|
||||
// Raise this process's soft RLIMIT_NOFILE (open file descriptors) to the
// current hard limit. Failures are reported through SYSCHECK.
static ncclResult_t setFilesLimit() {
  struct rlimit nofile;
  SYSCHECK(getrlimit(RLIMIT_NOFILE, &nofile), "getrlimit");
  // Soft limit := hard limit.
  nofile.rlim_cur = nofile.rlim_max;
  SYSCHECK(setrlimit(RLIMIT_NOFILE, &nofile), "setrlimit");
  return ncclSuccess;
}
|
||||
|
||||
static void *bootstrapRoot(void* commId) {
|
||||
struct extInfo info;
|
||||
struct extId* id = (struct extId*)commId;
|
||||
ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
|
||||
ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
|
||||
ncclNetHandle_t zero = { 0 }; // for sanity checking
|
||||
void* tmpComm;
|
||||
ncclResult_t res;
|
||||
setFilesLimit();
|
||||
|
||||
/* Receive addresses from all ranks */
|
||||
int nranks = 0, c = 0;
|
||||
do {
|
||||
NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
|
||||
|
||||
if (c == 0) {
|
||||
extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
|
||||
extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
|
||||
if (extHandleBstrap == NULL || extHandleRing == NULL) {
|
||||
WARN("Bootstrap thread : failed to allocate memory");
|
||||
goto out;
|
||||
}
|
||||
nranks = info.nranks;
|
||||
}
|
||||
|
||||
if (nranks != info.nranks) {
|
||||
WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
|
||||
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
|
||||
goto out;
|
||||
}
|
||||
|
||||
// Save the connection handle for connecting back to the ranks
|
||||
memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
|
||||
// Save the connection handle for the AllGather ring
|
||||
memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
|
||||
|
||||
++c;
|
||||
} while (c < nranks);
|
||||
|
||||
// Send the connect handle for the next rank in the AllGather ring
|
||||
for (int r=0; r<nranks; ++r) {
|
||||
int next = (r+1) % nranks;
|
||||
void *tmpSendComm;
|
||||
NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
|
||||
NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
|
||||
NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
|
||||
}
|
||||
|
||||
out:
|
||||
bootstrapCloseListen(id->extListenComm);
|
||||
free(commId);
|
||||
free(extHandleBstrap);
|
||||
free(extHandleRing);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Open the root's listening endpoint and start the bootstrapRoot thread.
//
// The listen handle and comm are stored in *commId (viewed as struct extId).
// The thread receives its own heap copy of the id, which it frees on exit.
//
// Returns ncclSuccess, an error from listen/allocation, or ncclSystemError
// when the thread cannot be created. In that last case the id copy and the
// listen comm are released here, because no thread exists to do it.
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
  struct extId* id = (struct extId*)commId;
  id->hostHash = getHostHash();
  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));

  ncclUniqueId* threadIdCopy;
  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));

  // pthread_create returns an error number (not -1/errno) on failure.
  int err = pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
  if (err != 0) {
    WARN("Bootstrap : failed to create root thread (error %d)", err);
    free(threadIdCopy);
    bootstrapCloseListen(id->extListenComm);
    return ncclSystemError;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Fill *out with a bootstrap id.
//
// Without NCCL_COMM_ID in the environment: record our pid and start a root
// (listen socket + thread) in this process.
// With NCCL_COMM_ID: build the root handle from that address string, set
// pid to -1 to mark the id as coming from the environment, and start no root.
//
// Returns ncclInvalidArgument when NCCL_COMM_ID cannot be parsed.
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
  extId* id = (extId*)out;

  char* commIdEnv = getenv("NCCL_COMM_ID");
  if (commIdEnv == NULL) {
    id->pid = getpid();
    NCCLCHECK(bootstrapCreateRoot(out, false));
    return ncclSuccess;
  }

  if (ncclSocketCreateHandle(&id->extHandleRoot, commIdEnv) != 0) {
    WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
    return ncclInvalidArgument;
  }
  id->pid = -1;
  return ncclSuccess;
}
|
||||
|
||||
struct extState {
|
||||
void* extBstrapRingRecvComm;
|
||||
void* extBstrapRingSendComm;
|
||||
ncclNetHandle_t extBstrapRootHandle;
|
||||
int rank;
|
||||
int nranks;
|
||||
int dev;
|
||||
};
|
||||
|
||||
// Join the bootstrap: check in with the root, learn the next rank's ring
// handle, and establish this rank's send/recv ring connections.
//
// On return *commState points at the newly allocated extState. It is assigned
// before any network call, so it is set even if a later step fails.
ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
  struct extId* id = (struct extId*)commId;
  // pid < 0 marks an id that was built from NCCL_COMM_ID.
  bool idFromEnv = id->pid < 0;

  struct extState* state;
  NCCLCHECK(ncclCalloc(&state, 1));
  state->rank = rank;
  state->nranks = nranks;
  *commState = state;

  struct extInfo info = { 0 };
  info.rank = rank;
  info.nranks = nranks;

  // Pass the remote (root) address to listen via info when the id comes from
  // the environment; listen then returns the local address through the same
  // fields (interface type 'findSubnetIf').
  if (idFromEnv) {
    memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
    memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
  }
  state->dev = idFromEnv ? findSubnetIf : 0;

  void* rootListenComm;  // comm on which we accept the root's connection
  void* ringListenComm;  // comm on which we accept the previous ring rank
  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &rootListenComm));
  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &ringListenComm)); // AllGather Ring

  memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));

  // Tell the root where we are listening.
  void* toRoot;
  NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &toRoot));
  NCCLCHECK(bootstrapSend(toRoot, &info, sizeof(info)));
  NCCLCHECK(bootstrapCloseSend(toRoot));

  // The root answers with the handle of our "next" rank in the ring.
  ncclNetHandle_t nextRingHandle;
  void* fromRoot;
  NCCLCHECK(bootstrapAccept(rootListenComm, &fromRoot));
  NCCLCHECK(bootstrapRecv(fromRoot, &nextRingHandle, sizeof(nextRingHandle)));
  NCCLCHECK(bootstrapCloseRecv(fromRoot));

  // Connect to the next rank, then accept the previous rank's connection.
  NCCLCHECK(bootstrapConnect(state->dev, nextRingHandle, &state->extBstrapRingSendComm));
  NCCLCHECK(bootstrapAccept(ringListenComm, &state->extBstrapRingRecvComm));

  // Both listen comms are no longer needed once the ring is connected.
  NCCLCHECK(bootstrapCloseListen(ringListenComm));
  NCCLCHECK(bootstrapCloseListen(rootListenComm));

  return ncclSuccess;
}
|
||||
|
||||
// Ring-based AllGather over the bootstrap ring connections.
//
// allData is treated as nranks consecutive slices of `size` bytes. The first
// step sends this rank's own slice, so the caller must have filled slice
// `rank` before calling.
//
// Slice indices are held in size_t so the byte offset slice*size is computed
// in size_t rather than int and cannot overflow for large buffers.
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
  struct extState* state = (struct extState*)commState;
  char* data = (char*)allData;
  int rank = state->rank;
  int nranks = state->nranks;

  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);

  /* Simple ring based AllGather
   * At each step i receive data from (rank-i-1) from left
   * and send previous step's data from (rank-i) to right
   */
  for (int i=0; i<nranks-1; i++) {
    // Index arithmetic stays in int (values are in [0, nranks)); only the
    // offset multiplication below is widened.
    size_t rslice = (rank - i - 1 + nranks) % nranks;
    size_t sslice = (rank - i + nranks) % nranks;

    // Send slice to the right
    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
    // Recv slice from the left
    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
  }

  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
  return ncclSuccess;
}
|
||||
|
||||
// Close both ring connections and free the bootstrap state.
// If a close fails, NCCLCHECK returns its error and the state is not freed.
ncclResult_t bootstrapClose(void* commState) {
  struct extState* st = (struct extState*)commState;

  // Send side first, then receive side.
  NCCLCHECK(bootstrapCloseSend(st->extBstrapRingSendComm));
  NCCLCHECK(bootstrapCloseRecv(st->extBstrapRingRecvComm));

  free(st);
  return ncclSuccess;
}
|
||||
@@ -0,0 +1,57 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "channel.h"
|
||||
#include "param.h"
|
||||
|
||||
NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
|
||||
|
||||
// Initialise channel `channelid` of `comm`: record its id and buffer size and
// allocate the per-rank tables and the operation list.
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* ch = comm->channels+channelid;
  ch->id = channelid;

  // Intermediate buffer size comes from the Buffsize parameter.
  ch->buffSize = ncclParamBuffsize();

  // Ring index -> user rank table (device copy and host copy).
  NCCLCHECK(ncclCudaCalloc(&ch->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&ch->ring.userRanks, comm->nRanks));

  // Per-peer communication structures (device copy and host copy).
  NCCLCHECK(ncclCudaCalloc(&ch->devPeers, comm->nRanks));
  NCCLCHECK(ncclCalloc(&ch->peers, comm->nRanks));
  // Point every peer's send and recv side back at the owning communicator.
  for (size_t r=0; r<comm->nRanks; ++r) {
    struct ncclPeer* peer = ch->peers+r;
    peer->send.comm = comm;
    peer->recv.comm = comm;
  }

  // Per-channel operation list, NCCL_MAX_OPS entries.
  NCCLCHECK(ncclCudaHostAlloc((void**)&ch->collectives, (void**)&ch->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
  return ncclSuccess;
}
|
||||
|
||||
// Release what initChannel allocated for `channel`, plus any transport
// resources attached to its peers. nRanks is the number of entries in
// channel->peers. Returns at the first failing step.
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  // Operation list.
  NCCLCHECK(ncclCudaHostFree(channel->collectives));

  // Ring index -> rank tables: host copy, then device copy.
  free(channel->ring.userRanks);
  CUDACHECK(hipFree(channel->ring.devUserRanks));

  // Transport resources, only for sides that actually have some.
  struct ncclPeer* peer = channel->peers;
  for (int left=nRanks; left>0; left--, peer++) {
    if (peer->send.transportResources) {
      NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
    }
    if (peer->recv.transportResources) {
      NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
    }
  }

  // Peer structures: device copy, then host copy.
  CUDACHECK(hipFree(channel->devPeers));
  free(channel->peers);

  return ncclSuccess;
}
|
||||
@@ -0,0 +1,19 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollAllGather, "AllGather",
|
||||
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
|
||||
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "common_coll.h"
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Enqueue body for AllGather.
// With a single rank the operation is a device-to-device copy (skipped when
// in place). Otherwise the ring proxies are saved and the kernel is queued
// with the data expressed as nbytes of ncclInt8.
ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);

  // Single-rank fast path.
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff) {
      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
    }
    return ncclSuccess;
  }

  NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
  NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
|
||||
ncclSum, 0, comm, stream);
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
|
||||
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
|
||||
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "common_coll.h"
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Enqueue body for AllReduce.
// With a single rank the operation is a device-to-device copy (skipped when
// in place). Otherwise the ring proxies are saved and the kernel is queued
// with the caller's count and datatype.
ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);

  // Single-rank fast path.
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff) {
      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
    }
    return ncclSuccess;
  }

  NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
  NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
|
||||
return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
|
||||
op, 0, comm, stream);
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
|
||||
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
|
||||
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
/* Deprecated original "in place" function, similar to MPI */
|
||||
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
|
||||
}
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "common_coll.h"
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Enqueue body for Broadcast.
// With a single rank the operation is a device-to-device copy (skipped when
// in place). Otherwise the proxies for the pattern rooted at `root` are saved
// and the kernel is queued with the data expressed as nbytes of ncclInt8.
ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  size_t nbytes = count*ncclTypeSize(datatype);
  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);

  // Single-rank fast path.
  if (comm->nRanks == 1) {
    if (sendbuff != recvbuff) {
      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
    }
    return ncclSuccess;
  }

  NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
  NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
  return ncclSuccess;
}
|
||||
|
||||
/* Deprecated original "in place" function, similar to MPI */
|
||||
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
|
||||
ncclSum, root, comm, stream);
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
|
||||
ncclSum, root, comm, stream);
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -8,9 +9,7 @@
|
||||
#ifndef NCCL_COLLECTIVES_H_
|
||||
#define NCCL_COLLECTIVES_H_
|
||||
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
|
||||
|
||||
#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
|
||||
#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
|
||||
|
||||
#define NCCL_COLL_NAME(coll, op, dtype) \
|
||||
coll##_##op##_##dtype
|
||||
@@ -19,13 +18,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
|
||||
coll##Kernel_##op##_##dtype
|
||||
|
||||
/* Declare all collective operations */
|
||||
#define DECL_COLL4(coll, op, dtype) \
|
||||
#define DECL_COLL5(coll, op, dtype) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
|
||||
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
|
||||
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
|
||||
|
||||
#define DECL_COLL4(coll, op, dtype) \
|
||||
DECL_COLL5(coll, op, dtype) \
|
||||
DECL_COLL5(coll##LL, op, dtype)
|
||||
|
||||
#define DECL_COLL3(coll, op, dtype) \
|
||||
DECL_COLL4(coll##LL, op, dtype) \
|
||||
DECL_COLL4(coll, op, dtype)
|
||||
DECL_COLL4(coll##Ring, op, dtype)
|
||||
|
||||
#define DECL_COLL2(coll, op) \
|
||||
DECL_COLL3(coll, op, i8) \
|
||||
@@ -53,15 +55,22 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
|
||||
|
||||
DECL_ALL_COLLS
|
||||
|
||||
#define ALLREDUCE_SUBSTEPS 2
|
||||
#define ALLREDUCE_BUFCHUNKS 2
|
||||
#define ALLGATHER_SUBSTEPS 2
|
||||
#define ALLGATHER_BUFCHUNKS 2
|
||||
#define REDUCESCATTER_SUBSTEPS 2
|
||||
#define REDUCESCATTER_BUFCHUNKS 2
|
||||
#define BROADCAST_SUBSTEPS 8
|
||||
#define BROADCAST_BUFCHUNKS 2
|
||||
#define REDUCE_SUBSTEPS 8
|
||||
#define REDUCE_BUFCHUNKS 2
|
||||
// CHUNKSIZE must be a multiple of SLICESIZE
|
||||
//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
|
||||
//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
|
||||
//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
|
||||
//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
|
||||
//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
|
||||
//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
|
||||
#define ALLREDUCE_SLICESTEPS 4
|
||||
#define ALLREDUCE_CHUNKSTEPS 4
|
||||
#define ALLGATHER_SLICESTEPS 4
|
||||
#define ALLGATHER_CHUNKSTEPS 4
|
||||
#define REDUCESCATTER_SLICESTEPS 4
|
||||
#define REDUCESCATTER_CHUNKSTEPS 4
|
||||
#define BROADCAST_SLICESTEPS 1
|
||||
#define BROADCAST_CHUNKSTEPS 1
|
||||
#define REDUCE_SLICESTEPS 1
|
||||
#define REDUCE_CHUNKSTEPS 1
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device
|
||||
|
||||
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
|
||||
|
||||
LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
|
||||
$(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
|
||||
$(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
|
||||
$(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
|
||||
$(OBJDIR)/functions.o
|
||||
|
||||
LIBSRCFILES += functions.cu
|
||||
|
||||
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
|
||||
DEPENDFILES := $(DEPFILES:%.d=%.dep)
|
||||
DEPENDFILES:= $(DEPFILES:%.d=%.dep)
|
||||
STATICLIB := $(OBJDIR)/colldevice.a
|
||||
DEVOBJ := $(OBJDIR)/devlink.o
|
||||
RULESFILE := $(OBJDIR)/Makefile.rules
|
||||
|
||||
NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
|
||||
|
||||
@@ -33,6 +28,16 @@ all: $(STATICLIB)
|
||||
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
|
||||
all_deps: $(DEPENDFILES)
|
||||
|
||||
# Auto-generating the rules per op/reduction/datatype/algorithm
|
||||
$(RULESFILE) :
|
||||
@printf "Generating %-35s > %s\n" rules $@
|
||||
@mkdir -p $(OBJDIR)
|
||||
@./gen_rules.sh $(OBJDIR) > $@
|
||||
|
||||
-include $(RULESFILE)
|
||||
|
||||
LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o
|
||||
|
||||
-include $(DEPFILES)
|
||||
|
||||
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
|
||||
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
|
||||
mkdir -p `dirname $@`
|
||||
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
|
||||
|
||||
$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p `dirname $@`
|
||||
$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
|
||||
|
||||
$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p `dirname $@`
|
||||
$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
|
||||
|
||||
$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p `dirname $@`
|
||||
$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
|
||||
|
||||
$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p `dirname $@`
|
||||
$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
|
||||
|
||||
# ... and create the device-side linked object with all those.
|
||||
$(DEVOBJ) : $(LIBOBJ)
|
||||
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,6 +11,4 @@
|
||||
|
||||
#define UNROLL 4
|
||||
|
||||
#if NCCL_OP == 0
|
||||
IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
|
||||
#endif
|
||||
|
||||
@@ -1,81 +1,44 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Increase Step and poffset/noffset for buffer sync
|
||||
#define NEXT_STEP \
|
||||
step++; \
|
||||
poffset = noffset; \
|
||||
noffset += sliceSize; \
|
||||
if (noffset == buffSize) noffset = 0;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int bid = args->bid;
|
||||
__shared__ T* sharedNextOutput;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
int prevdirect = 0;
|
||||
int nextdirect = 0;
|
||||
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->next_hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int buffSize = ring->buffSize / sizeof(T);
|
||||
const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
|
||||
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
|
||||
|
||||
if (tid == 0) {
|
||||
// Update in case we skipped some collectives
|
||||
STORE(ring->recv.conn.opCount, args->opCount);
|
||||
// Wait for next to be ready
|
||||
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
|
||||
waitOpCountNext.wait(args->opCount);
|
||||
if (prevdirect) {
|
||||
*ring->recv.conn.ptrExchange = args->ThisOutput;
|
||||
}
|
||||
if (nextdirect) {
|
||||
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
|
||||
while (LOAD(ptr) == nullptr);
|
||||
sharedNextOutput = (T*)LOAD(ptr);
|
||||
STORE(ptr, nullptr);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
uint64_t step = 0ULL;
|
||||
int poffset, noffset = 0;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
|
||||
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
|
||||
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int maxOffset = min(chunkSize, size-chunkOffset);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
@@ -83,130 +46,53 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisInput + chunkOffset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
prims.directSend(thisInput+chunkOffset, offset, nelem);
|
||||
} else {
|
||||
Prims::DoubleCopy(tid, nthreads,
|
||||
thisInput + chunkOffset,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
|
||||
}
|
||||
|
||||
NEXT_STEP; // Increases step, poffset, noffset
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
if (prevdirect) {
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
}
|
||||
Prims::Copy(tid, nthreads,
|
||||
NULL,
|
||||
NULL,
|
||||
0, 0,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
} else {
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
Prims::DoubleCopy(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ring->devUserRanks[1];
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
Prims::Copy(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
|
||||
STORE(ring->send.conn.head, 0ULL);
|
||||
STORE(ring->recv.conn.tail, 0ULL);
|
||||
__threadfence_system();
|
||||
STORE(ring->recv.conn.opCount, args->opCount+1);
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
#include "ll_kernel.h"
|
||||
|
||||
#define NEXT_STEP_LL \
|
||||
poffset = noffset; \
|
||||
pflag = nflag; \
|
||||
noffset += NCCL_LL_SLICE_LINES; \
|
||||
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
|
||||
nflag++; \
|
||||
step++;
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int llNthreads = args->nThreads;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
|
||||
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
|
||||
volatile int * sizesFifo = ring->send.conn.llFifo;
|
||||
uint64_t sendHead = sendHeadPtr[0];
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
typedef LLPrimitives<T, FUNC> LL;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nRings*chunkSize;
|
||||
|
||||
uint64_t step = ring->send.conn.llStep;
|
||||
uint32_t pflag, nflag = step + 1;
|
||||
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
|
||||
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
@@ -216,57 +102,35 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
ssize_t offset;
|
||||
int maxOffset = min(chunkSize, size-chunkOffset);
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
WAIT_NEXT;
|
||||
if (thisInput + chunkOffset == thisOutput + offset) { // In place
|
||||
LL::ReduceCopy(
|
||||
thisInput + chunkOffset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, nflag, llNthreads);
|
||||
LLprims.send(thisInput+chunkOffset, nelem);
|
||||
} else {
|
||||
LL::ReduceCopy(
|
||||
thisInput + chunkOffset,
|
||||
thisOutput + offset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, nflag, llNthreads);
|
||||
LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
|
||||
}
|
||||
POST_SIZE;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, pflag, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
ACK_PREV;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: final store
|
||||
rankDest = ring->devUserRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LL::ReduceCopy(
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
maxOffset, pflag, llNthreads);
|
||||
ACK_PREV;
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
FIFO_CLEANING_AND_SAVE_STEP(nflag);
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,12 +11,7 @@
|
||||
|
||||
#define UNROLL 4
|
||||
|
||||
#if NCCL_OP == 0
|
||||
IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
|
||||
#elif NCCL_OP == 1
|
||||
IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
|
||||
#elif NCCL_OP == 2
|
||||
IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
|
||||
#elif NCCL_OP == 3
|
||||
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
|
||||
#endif
|
||||
IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
|
||||
@@ -1,243 +1,181 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Increase Step and poffset/noffset for buffer sync
|
||||
#define NEXT_STEP \
|
||||
step++; \
|
||||
poffset = noffset; \
|
||||
noffset += sliceSize; \
|
||||
if (noffset == buffSize) noffset = 0;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int bid = args->bid;
|
||||
__shared__ T* sharedNextOutput;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
int prevdirect = 0;
|
||||
int nextdirect = 0;
|
||||
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->next_hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
const int buffSize = ring->buffSize / sizeof(T);
|
||||
const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
|
||||
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
|
||||
|
||||
if (tid == 0) {
|
||||
// Update in case we skipped some collectives
|
||||
STORE(ring->recv.conn.opCount, args->opCount);
|
||||
// Wait for next to be ready
|
||||
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
|
||||
waitOpCountNext.wait(args->opCount);
|
||||
if (prevdirect) {
|
||||
*ring->recv.conn.ptrExchange = args->ThisOutput;
|
||||
}
|
||||
if (nextdirect) {
|
||||
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
|
||||
while (LOAD(ptr) == nullptr);
|
||||
sharedNextOutput = (T*)LOAD(ptr);
|
||||
STORE(ptr, nullptr);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
uint64_t step = 0ULL;
|
||||
int poffset, noffset = 0;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws, wr;
|
||||
if (tid == 0) clk = clock64();
|
||||
#endif
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
|
||||
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
|
||||
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
|
||||
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
int maxOffset;
|
||||
int nelem;
|
||||
int slice;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
slice = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
offset = chunkOffset + slice * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisInput + offset,
|
||||
nextOutput + noffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
|
||||
NEXT_STEP; // Increases step, poffset, noffset
|
||||
INIT_COUNTER;
|
||||
prims.send(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
offset = chunkOffset + slice * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
Prims::Reduce(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisInput + offset,
|
||||
nextOutput + noffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
INIT_COUNTER;
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvReduceSend);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
slice = ring->devUserRanks[0];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
offset = chunkOffset + slice * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
Prims::ReduceCopy(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisInput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
thisOutput + offset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
INIT_COUNTER;
|
||||
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvReduceCopySend);
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
if (prevdirect) {
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
slice = ring->devUserRanks[nranks - j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + slice * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
}
|
||||
Prims::Copy(tid, nthreads,
|
||||
NULL,
|
||||
NULL,
|
||||
0, 0,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
} else {
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
slice = ring->devUserRanks[nranks - j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
|
||||
Prims::DoubleCopy(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
slice = ring->devUserRanks[1];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
Prims::Copy(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
INIT_COUNTER;
|
||||
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvCopySend);
|
||||
}
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
// Wait for next to have consumed all data before we reset the flag
|
||||
waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
|
||||
STORE(ring->send.conn.head, 0ULL);
|
||||
STORE(ring->recv.conn.tail, 0ULL);
|
||||
__threadfence_system();
|
||||
STORE(ring->recv.conn.opCount, args->opCount+1);
|
||||
// Make final copy from buffer to dest.
|
||||
slice = ring->devUserRanks[1];
|
||||
offset = chunkOffset + slice * realChunkSize;
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
// Final wait/copy.
|
||||
INIT_COUNTER;
|
||||
prims.directRecv(thisOutput+offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecv);
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "ll_kernel.h"
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = args->lastChunkSize;
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
#define NEXT_STEP_LL \
|
||||
poffset = noffset; \
|
||||
pflag = nflag; \
|
||||
noffset += NCCL_LL_SLICE_LINES; \
|
||||
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
|
||||
nflag++; \
|
||||
step++;
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
do {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
|
||||
do {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
prims.send(thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int llNthreads = args->nThreads;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
|
||||
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
|
||||
volatile int * sizesFifo = ring->send.conn.llFifo;
|
||||
uint64_t sendHead = sendHeadPtr[0];
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
typedef LLPrimitives<T, FUNC> LL;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nRings*nranks*chunkSize;
|
||||
|
||||
uint64_t step = ring->send.conn.llStep;
|
||||
uint32_t pflag, nflag = step + 1;
|
||||
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
|
||||
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
|
||||
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
@@ -247,89 +185,100 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
/////////////// begin AllReduce steps ///////////////
|
||||
ssize_t offset;
|
||||
int maxOffset;
|
||||
int nelem;
|
||||
int slice;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
slice = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
prevInput + poffset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, pflag, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
ACK_PREV;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
slice = ring->devUserRanks[0];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, pflag, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
ACK_PREV;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
slice = ring->devUserRanks[nranks - j];
|
||||
slice = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, pflag, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
ACK_PREV;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
slice = ring->devUserRanks[1];
|
||||
offset = chunkOffset + slice * chunkSize;
|
||||
maxOffset = min(chunkSize, size-offset);
|
||||
nelem = min(chunkSize, size-offset);
|
||||
|
||||
// Here we need to copy from buffer to this output.
|
||||
LL::ReduceCopy(
|
||||
prevInput + poffset,
|
||||
thisOutput + offset,
|
||||
maxOffset, pflag, llNthreads);
|
||||
ACK_PREV;
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
}
|
||||
|
||||
FIFO_CLEANING_AND_SAVE_STEP(nflag);
|
||||
}
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nThreads;
|
||||
const int bid = args->bid;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
do {
|
||||
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Up
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
|
||||
do {
|
||||
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
// Down
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (tree->up == -1) {
|
||||
LLprims.send(thisOutput+offset, nelem);
|
||||
} else if (tree->down[0] == -1) {
|
||||
LLprims.recv(thisOutput+offset, nelem);
|
||||
} else {
|
||||
LLprims.recvCopySend(thisOutput+offset, nelem);
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
}
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 1
|
||||
#include "device/all_reduce.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 2
|
||||
#include "device/all_reduce.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 3
|
||||
#include "device/all_reduce.cu"
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,6 +11,4 @@
|
||||
|
||||
#define UNROLL 4
|
||||
|
||||
#if NCCL_OP == 0
|
||||
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
|
||||
#endif
|
||||
IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
|
||||
@@ -1,184 +1,101 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Increase Step and boffset for buffer sync
|
||||
#define NEXT_STEP \
|
||||
step++; \
|
||||
boffset += sliceSize; \
|
||||
if (boffset == buffSize) boffset = 0;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int bid = args->bid;
|
||||
__shared__ T* sharedNextOutput;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
int prevdirect = 0;
|
||||
int nextdirect = 0;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws, wr;
|
||||
if (tid == 0) clk = clock64();
|
||||
#endif
|
||||
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->next_hdp_reg);
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
|
||||
typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
INIT_COUNTER;
|
||||
prims.send(thisInput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.copySend(thisInput+offset, thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(copySend);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
INIT_COUNTER;
|
||||
prims.recv(thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recv);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.recvCopySend(thisOutput+offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvCopySend);
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Tree variant of the broadcast kernel. The body is empty, so a call returns
// immediately without touching args.
// NOTE(review): presumably a placeholder so a tree entry exists next to the
// ring kernel — confirm against the dispatch table.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int buffSize = ring->buffSize / sizeof(T);
|
||||
const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
|
||||
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
|
||||
if (tid == 0) {
|
||||
// Update in case we skipped some collectives
|
||||
STORE(ring->recv.conn.opCount, args->opCount);
|
||||
if (nextRank != root) {
|
||||
// Wait for next to be ready
|
||||
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
|
||||
waitOpCountNext.wait(args->opCount);
|
||||
}
|
||||
if (rank != root && prevdirect) {
|
||||
*ring->recv.conn.ptrExchange = args->ThisOutput;
|
||||
}
|
||||
if (nextRank != root && nextdirect) {
|
||||
void* volatile* ptr = &(ring->devMemSend->ptrExchange);
|
||||
while (LOAD(ptr) == nullptr);
|
||||
sharedNextOutput = (T*)LOAD(ptr);
|
||||
STORE(ptr, nullptr);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
uint64_t step = 0ULL;
|
||||
int boffset = 0;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
|
||||
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
|
||||
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int maxOffset = min(chunkSize, size-offset);
|
||||
|
||||
if (rank == root) {
|
||||
if (thisInput == thisOutput) {
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisInput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
} else {
|
||||
Prims::DoubleCopy(tid, nthreads,
|
||||
thisInput + offset,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
if (prevdirect) maxOffset = 0; // Only wait for signals
|
||||
Prims::Copy(tid, nthreads,
|
||||
prevInput + boffset,
|
||||
thisOutput + offset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
} else {
|
||||
if (prevdirect) {
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
} else {
|
||||
Prims::DoubleCopy(tid, nthreads,
|
||||
prevInput + boffset,
|
||||
thisOutput + offset,
|
||||
nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
}
|
||||
}
|
||||
NEXT_STEP; // Increases step, boffset
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
if (nextRank != root) {
|
||||
// Wait for next to have consumed data before resetting the flag
|
||||
waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
|
||||
STORE(ring->send.conn.head, 0ULL);
|
||||
}
|
||||
STORE(ring->recv.conn.tail, 0ULL);
|
||||
__threadfence_system();
|
||||
STORE(ring->recv.conn.opCount, args->opCount+1);
|
||||
}
|
||||
}
|
||||
|
||||
#include "ll_kernel.h"
|
||||
|
||||
#define NEXT_STEP_LL \
|
||||
boffset += NCCL_LL_SLICE_LINES; \
|
||||
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
|
||||
flag++; \
|
||||
step++;
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int llNthreads = args->nThreads;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
|
||||
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
|
||||
volatile int * sizesFifo = ring->send.conn.llFifo;
|
||||
uint64_t sendHead = sendHeadPtr[0];
|
||||
const int rank = comm->rank;
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int root = args->root;
|
||||
|
||||
typedef LLPrimitives<T, FUNC> LL;
|
||||
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nRings*chunkSize;
|
||||
|
||||
uint64_t step = ring->send.conn.llStep;
|
||||
uint32_t flag = step + 1;
|
||||
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
|
||||
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
@@ -186,46 +103,21 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int maxOffset = min(chunkSize, size-offset);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (rank == root) {
|
||||
WAIT_NEXT;
|
||||
if (thisInput == thisOutput) {
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
nextOutput + boffset,
|
||||
maxOffset, flag, llNthreads);
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else {
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
thisOutput + offset,
|
||||
nextOutput + boffset,
|
||||
maxOffset, flag, llNthreads);
|
||||
LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
|
||||
}
|
||||
POST_SIZE;
|
||||
NEXT_STEP_LL;
|
||||
} else if (nextRank == root) {
|
||||
LL::ReduceCopy(
|
||||
prevInput + boffset,
|
||||
thisOutput + offset,
|
||||
maxOffset, flag, llNthreads);
|
||||
NEXT_STEP_LL;
|
||||
ACK_PREV;
|
||||
LLprims.recv(thisOutput + offset, nelem);
|
||||
} else {
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
prevInput + boffset,
|
||||
thisOutput + offset,
|
||||
nextOutput + boffset,
|
||||
maxOffset, flag, flag, llNthreads);
|
||||
POST_SIZE;
|
||||
NEXT_STEP_LL;
|
||||
ACK_PREV;
|
||||
LLprims.recvCopySend(thisOutput + offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// We need everyone to acknowledge data even if they didn't receive anything
|
||||
// so that the next collective can start right away.
|
||||
ACK_PREV;
|
||||
|
||||
FIFO_CLEANING_AND_SAVE_STEP(flag);
|
||||
}
|
||||
|
||||
// Tree + low-latency variant of the broadcast kernel. The body is empty, so a
// call returns immediately without touching args.
// NOTE(review): presumably a placeholder so a tree-LL entry exists next to the
// ring-LL kernel — confirm against the dispatch table.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 0
|
||||
#include "device/broadcast.cu"
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -8,18 +9,38 @@
|
||||
#ifndef NCCL_DEVICE_COMMON_H_
|
||||
#define NCCL_DEVICE_COMMON_H_
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "../collectives.h"
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "nccl.h"
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if abort == 1
|
||||
// all CTA's threads enter the barrier and do a popc on their predicates being True
|
||||
// If any of the thread's predicate was True, all the threads call exit()
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
// HIP build: exitIfAbortBarrier is a statement macro.
//   abort      - when non-zero, the calling thread atomically adds 1 to *abortCount.
//   abortCount - pointer to the counter that is incremented and then re-read.
// After __syncthreads(), any thread that reads a non-zero *abortCount executes
// s_endpgm and then `return`s from the function the macro was expanded in.
// The expansion is three separate statements and contains a bare `return`, so
// it must be used as a full statement inside a void function, never as the
// unbraced body of an if/else.
#define exitIfAbortBarrier(abort, abortCount) \
|
||||
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
|
||||
__syncthreads(); \
|
||||
if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
|
||||
#else
|
||||
// Non-HIP build: inline-PTX implementation.
//   abort - the thread's predicate is set when this equals 1.
// bar.red.popc on barrier 13 yields the number of participating threads whose
// predicate is set; a non-zero count makes the thread execute `exit`.
// NOTE(review): this variant takes one argument while the HIP macro above takes
// two (abort, abortCount); a two-argument call would not compile here.
static inline __device__ void exitIfAbortBarrier(int abort) {
|
||||
uint32_t popc;
|
||||
// Open a PTX scope so the predicate register declared below stays local.
asm ("{");
|
||||
asm volatile (" .reg .pred barr_pred;");
|
||||
// barr_pred = (abort == 1)
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
|
||||
// popc = population count of barr_pred across barrier 13
asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
|
||||
asm ("}");
|
||||
// Any set predicate terminates this thread.
if (popc) { asm volatile ("exit;"); }
|
||||
}
|
||||
#endif
|
||||
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype) \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
@@ -64,20 +85,13 @@ typedef void(*ncclKern_t)(struct CollectiveArgs* args);
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclKern_t = void (*)(struct CollectiveArgs*);
|
||||
using ncclFunc_t = void (*)(struct CollectiveArgs*);
|
||||
|
||||
static const __device__ constexpr ncclKern_t ncclFuncs[]{
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
#endif
|
||||
static const __device__ constexpr ncclFunc_t ncclFuncs[]{
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
@@ -88,82 +102,89 @@ static const __device__ constexpr ncclKern_t ncclFuncs[]{
|
||||
|
||||
template<unsigned short f, unsigned short l>
|
||||
struct Caller {
|
||||
static
|
||||
__device__ void call(ncclColl* const c) noexcept
|
||||
static __device__ __host__
|
||||
void call(ncclColl* const c) noexcept
|
||||
{
|
||||
constexpr unsigned short m = f + (l - f) / 2;
|
||||
|
||||
return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
|
||||
return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
|
||||
}
|
||||
};
|
||||
|
||||
template<unsigned short f>
|
||||
struct Caller<f, f + 1>{
|
||||
static
|
||||
__device__ void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
|
||||
static __device__ __host__
|
||||
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
|
||||
};
|
||||
|
||||
inline
|
||||
__device__
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept
|
||||
{
|
||||
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
|
||||
if (c->funcIndex < 72) {
|
||||
if (c->funcIndex % 2) ncclBroadcastLL_copy_i8(&c->args);
|
||||
else ncclBroadcast_copy_i8(&c->args);
|
||||
if (c->funcIndex % 2) ncclBroadcastRingLL_copy_i8(&c->args);
|
||||
else ncclBroadcastRing_copy_i8(&c->args);
|
||||
}
|
||||
else if (c->funcIndex < 144) Caller<72, 144>::call(c);
|
||||
else if (c->funcIndex < 216) {
|
||||
if (c->funcIndex % 2) ncclAllGatherLL_copy_i8(&c->args);
|
||||
else ncclAllGather_copy_i8(&c->args);
|
||||
if (c->funcIndex % 2) ncclAllGatherRingLL_copy_i8(&c->args);
|
||||
else ncclAllGatherRing_copy_i8(&c->args);
|
||||
}
|
||||
else Caller<216, 360>::call(c);
|
||||
}
|
||||
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
|
||||
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
|
||||
int* d = (int*)dst;
|
||||
int* s = (int*)src;
|
||||
__syncthreads();
|
||||
// When aggregation is effective, if some threads have aborted inside the LL kernel,
|
||||
// make sure the rest of the threads abort as well
|
||||
exitIfAbortBarrier(0, abortCount);
|
||||
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
|
||||
__syncthreads();
|
||||
}
|
||||
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
|
||||
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
|
||||
static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
|
||||
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
|
||||
if (tid == 0) hostColl->active = 0;
|
||||
}
|
||||
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
|
||||
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
|
||||
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
|
||||
}
|
||||
|
||||
#if NCCL_OP == 0
|
||||
/* Kernels with the first operation inlined */
|
||||
#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
|
||||
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
|
||||
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
int tid = threadIdx.x; \
|
||||
int bid = blockIdx.x; \
|
||||
__shared__ struct ncclColl localColl; \
|
||||
__shared__ uint32_t abortCount; \
|
||||
if (tid == 0) abortCount = 0; \
|
||||
__syncthreads(); \
|
||||
\
|
||||
struct ncclComm* comm = firstColl.args.comm; \
|
||||
struct ncclRing* ring = comm->rings+bid; \
|
||||
struct ncclDevComm* comm = firstColl.args.comm; \
|
||||
struct ncclChannel* channel = comm->channels+bid; \
|
||||
struct ncclColl* c; \
|
||||
channel->abortCount = &abortCount; \
|
||||
if (bid == 0) { \
|
||||
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
|
||||
c = &firstColl; \
|
||||
} else { \
|
||||
c = &localColl; \
|
||||
load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
|
||||
load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
|
||||
} \
|
||||
while (1) { \
|
||||
if (tid < c->nThreads) { \
|
||||
if (tid < c->args.nThreads) { \
|
||||
if (c->funcIndex == fIndex) { \
|
||||
coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
|
||||
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
|
||||
} else { \
|
||||
NCCL_CALL_FUNCTIONS(c); \
|
||||
} \
|
||||
} \
|
||||
int nextIndex = c->nextIndex; \
|
||||
if (tid == 0) ring->collFifoHead = nextIndex; \
|
||||
if (tid == 0) channel->collFifoHead = nextIndex; \
|
||||
\
|
||||
if (c->active == 2) { \
|
||||
return; \
|
||||
@@ -171,15 +192,21 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
\
|
||||
/* Load next collective operation*/ \
|
||||
c = &localColl; /* for bid 0 */ \
|
||||
load_coll(c, ring->devCollectives+nextIndex, tid); \
|
||||
load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
|
||||
#endif
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
|
||||
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
|
||||
|
||||
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
|
||||
IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
|
||||
IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
|
||||
IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
|
||||
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0)
|
||||
|
||||
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
|
||||
@@ -192,4 +219,6 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
|
||||
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
|
||||
|
||||
#define COLL_UNROLL 2
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -8,25 +8,25 @@
|
||||
#ifndef NCCL_COMMON_KERNEL_H_
|
||||
#define NCCL_COMMON_KERNEL_H_
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
// Define min for ssize_t
|
||||
// Mixed-width minimum: compares an int against an ssize_t and returns the
// smaller value converted to int.
static __device__ int min(int a, ssize_t b) {
  if (a < b) {
    return a;
  }
  return static_cast<int>(b);
}
|
||||
|
||||
typedef uint64_t PackType;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
template<class FUNC, typename T>
|
||||
struct MULTI {
|
||||
__device__ PackType operator()(const PackType x, const PackType y) const
|
||||
{
|
||||
return FUNC()(x, y);
|
||||
}
|
||||
__device__ PackType operator()(const PackType x, const PackType y) const
|
||||
{
|
||||
return FUNC()(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
#else
|
||||
@@ -205,15 +205,7 @@ struct MULTI<FUNC, int64_t> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
|
||||
#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
|
||||
|
||||
template<typename T>
|
||||
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
|
||||
size_t ptrval = reinterpret_cast<size_t>(ptr);
|
||||
return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
|
||||
}
|
||||
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
template<typename T> inline __device__
|
||||
T vFetch(const volatile T* ptr) {
|
||||
@@ -225,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
|
||||
*ptr = val;
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
|
||||
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
|
||||
template<> inline __device__
|
||||
half vFetch<half>(const volatile half* ptr) {
|
||||
half r;
|
||||
@@ -251,26 +243,6 @@ void vStore<half>(volatile half* ptr, const half val) {
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
|
||||
__attribute__((noinline))
|
||||
__device__ inline void ReduceCopy(
|
||||
const int tid, const int nthreads,
|
||||
const volatile T * __restrict__ const src0,
|
||||
const volatile T * __restrict__ const src1,
|
||||
volatile T * __restrict__ const dest0,
|
||||
volatile T * __restrict__ const dest1, const int N) {
|
||||
for (int idx = tid; idx < N; idx += nthreads) {
|
||||
T val = vFetch(src0+idx);
|
||||
if (TWO_INPUTS) {
|
||||
val = FUNC()(val, vFetch(src1+idx));
|
||||
}
|
||||
vStore(dest0+idx, val);
|
||||
if (TWO_OUTPUTS) {
|
||||
vStore(dest1+idx, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef ulong2 Pack128;
|
||||
|
||||
template<class FUNC, typename T>
|
||||
@@ -281,8 +253,8 @@ struct MULTI128 {
|
||||
}
|
||||
};
|
||||
|
||||
inline __device__ void Fetch128(Pack128& v, Pack128* p) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
v.x = p->x;
|
||||
v.y = p->y;
|
||||
#else
|
||||
@@ -290,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, Pack128* p) {
|
||||
#endif
|
||||
}
|
||||
inline __device__ void Store128(Pack128* p, Pack128& v) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
p->x = v.x;
|
||||
p->y = v.y;
|
||||
#else
|
||||
@@ -298,67 +270,104 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#define WARP_SIZE 32
|
||||
template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
|
||||
__attribute__((noinline))
|
||||
__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
|
||||
Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
|
||||
const int N) {
|
||||
Pack128 t0[UNROLL];
|
||||
Pack128 t1[UNROLL];
|
||||
const Pack128* src0_end = src0 + N;
|
||||
const int inc = nw * UNROLL * WARP_SIZE;
|
||||
const int offset = w * UNROLL * WARP_SIZE + t;
|
||||
src0 += offset; if (TWO_INPUTS) src1 += offset;
|
||||
dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
|
||||
template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
|
||||
const int offset, const int N) {
|
||||
for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
|
||||
T val = vFetch(srcs[0]+idx);
|
||||
#pragma unroll
|
||||
for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
|
||||
#pragma unroll 1
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
|
||||
|
||||
while (src0 < src0_end) {
|
||||
#pragma unroll
|
||||
for (int u = 0; u < UNROLL; ++u) {
|
||||
Fetch128(t0[u], src0+u*WARP_SIZE);
|
||||
if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u = 0; u < UNROLL; ++u) {
|
||||
if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
|
||||
Store128(dest0+u*WARP_SIZE, t0[u]);
|
||||
if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
|
||||
}
|
||||
src0 += inc; if (TWO_INPUTS) src1 += inc;
|
||||
dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
|
||||
#pragma unroll 1
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
|
||||
}
|
||||
}
|
||||
|
||||
template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
|
||||
__attribute__((noinline))
|
||||
__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
|
||||
volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
|
||||
const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
|
||||
#define WARP_SIZE 64
|
||||
|
||||
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
|
||||
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
|
||||
const int elemOffset, const int Npack) {
|
||||
const int inc = nw * UNROLL * WARP_SIZE;
|
||||
int offset = w * UNROLL * WARP_SIZE + t;
|
||||
|
||||
const Pack128* srcs[MAXSRCS];
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
|
||||
Pack128* dsts[MAXDSTS];
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
|
||||
|
||||
while (offset < Npack) {
|
||||
Pack128 vals[UNROLL];
|
||||
// Load and reduce
|
||||
for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
|
||||
|
||||
for (int i=1; i<MINSRCS; i++) {
|
||||
Pack128 vals2[UNROLL];
|
||||
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
|
||||
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
|
||||
}
|
||||
#pragma unroll 1
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
|
||||
Pack128 vals2[UNROLL];
|
||||
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
|
||||
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
|
||||
}
|
||||
|
||||
// Store
|
||||
for (int i = 0; i < MINDSTS; i++) {
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
#pragma unroll 1
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
|
||||
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
|
||||
}
|
||||
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
|
||||
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
|
||||
offset += inc;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
|
||||
|
||||
// Try to limit consecutive load/stores to 8.
|
||||
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
|
||||
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
|
||||
|
||||
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
|
||||
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
|
||||
int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
|
||||
int N) {
|
||||
int Nrem = N;
|
||||
if (Nrem <= 0) return;
|
||||
|
||||
int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
|
||||
int alignDiff = 0;
|
||||
int align = ptrAlign128(srcs[0]);
|
||||
#pragma unroll
|
||||
for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
|
||||
for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
|
||||
#pragma unroll
|
||||
for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
|
||||
for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
|
||||
|
||||
// stage 0: check if we'll be able to use the fast, 128-bit aligned path.
|
||||
// If not, we'll just use the slow preamble path for the whole operation
|
||||
bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
|
||||
(!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
|
||||
(!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
|
||||
|
||||
if (!alignable) {
|
||||
Npreamble = Nrem;
|
||||
}
|
||||
int Npreamble = alignDiff ? Nrem :
|
||||
N < alignof(Pack128) ? N :
|
||||
(alignof(Pack128) - align) % alignof(Pack128);
|
||||
|
||||
// stage 1: preamble: handle any elements up to the point of everything coming
|
||||
// into alignment
|
||||
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
|
||||
|
||||
Nrem -= Npreamble;
|
||||
if (Nrem == 0) return;
|
||||
|
||||
dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
|
||||
src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
|
||||
if (Npreamble) {
|
||||
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
|
||||
Nrem -= Npreamble;
|
||||
if (Nrem == 0) return;
|
||||
}
|
||||
int offset = Npreamble;
|
||||
|
||||
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
|
||||
// assuming the pointers we have are all 128-bit alignable.
|
||||
@@ -366,35 +375,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
|
||||
int nw = nthreads / WARP_SIZE; // Number of warps
|
||||
int t = tid % WARP_SIZE; // Thread (inside the warp)
|
||||
|
||||
const int PackFactor = sizeof(Pack128) / sizeof(T);
|
||||
const int packFactor = sizeof(Pack128) / sizeof(T);
|
||||
|
||||
// stage 2a: main loop
|
||||
int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
|
||||
* (UNROLL * nthreads); // round down
|
||||
int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
|
||||
* (AUTOUNROLL * WARP_SIZE); // round down
|
||||
int Nelem2a = Npack2a * packFactor;
|
||||
|
||||
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
|
||||
ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
|
||||
|
||||
int Ndone2a = Nalign2a * PackFactor;
|
||||
Nrem -= Ndone2a;
|
||||
Nrem -= Nelem2a;
|
||||
if (Nrem == 0) return;
|
||||
dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
|
||||
src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
|
||||
offset += Nelem2a;
|
||||
|
||||
// stage 2b: slightly less optimized for section when we don't have full
|
||||
// UNROLLs
|
||||
// unrolling
|
||||
|
||||
int Nalign2b = Nrem / PackFactor;
|
||||
int Npack2b = Nrem / packFactor;
|
||||
int Nelem2b = Npack2b * packFactor;
|
||||
|
||||
ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
|
||||
ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
|
||||
|
||||
int Ndone2b = Nalign2b * PackFactor;
|
||||
Nrem -= Ndone2b;
|
||||
Nrem -= Nelem2b;
|
||||
if (Nrem == 0) return;
|
||||
dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
|
||||
src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
|
||||
offset += Nelem2b;
|
||||
|
||||
// stage 2c: tail
|
||||
ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
|
||||
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "collectives.h"
|
||||
#include "common.h"
|
||||
|
||||
|
||||
|
||||
// Workaround for https://reviews.llvm.org/D55580
|
||||
__device__ void ncclWorkaroundClangD55580() {}
|
||||
|
||||
Исполняемый файл
+28
@@ -0,0 +1,28 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
dir=$1
|
||||
|
||||
targets="GENOBJS := \\\\\n"
|
||||
|
||||
for base in all_reduce all_gather broadcast reduce reduce_scatter; do
|
||||
opn=0
|
||||
for op in sum prod min max; do
|
||||
dtn=0
|
||||
for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do
|
||||
echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
|
||||
echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
|
||||
echo " mkdir -p ${dir}"
|
||||
echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
|
||||
echo ""
|
||||
targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
|
||||
dtn=$(($dtn + 1))
|
||||
done
|
||||
opn=$(($opn + 1))
|
||||
done
|
||||
done
|
||||
echo -e "$targets"
|
||||
@@ -1,186 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_LL_KERNEL_H_
|
||||
#define NCCL_LL_KERNEL_H_
|
||||
|
||||
static __device__ __attribute__((noinline)) uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
do {
|
||||
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
|
||||
} while (i4[1] != flag || i4[3] != flag);
|
||||
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
|
||||
return val64;
|
||||
#else
|
||||
uint32_t data1, flag1, data2, flag2;
|
||||
do {
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
return val64;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = val & 0xffffffff;
|
||||
i4[1] = flag;
|
||||
i4[2] = (val >> 32);
|
||||
i4[3] = flag;
|
||||
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
|
||||
#else
|
||||
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Using memcpy handles misaligned pointers.
|
||||
static __device__ uint64_t readAL(uint64_t* src) {
|
||||
uint64_t val;
|
||||
memcpy((char*)&val, (char*)src, sizeof(uint64_t));
|
||||
return val;
|
||||
}
|
||||
static __device__ void storeAL(uint64_t* dst, uint64_t val) {
|
||||
memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
|
||||
}
|
||||
|
||||
template <typename T, class FUNC>
|
||||
class LLPrimitives {
|
||||
private:
|
||||
template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
|
||||
__attribute__((noinline))
|
||||
static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
|
||||
if (size <= 0) return;
|
||||
size_t size64 = size * sizeof(T) / sizeof(uint64_t);
|
||||
uint64_t* src1A = (uint64_t*)src1;
|
||||
uint64_t* dst1A = (uint64_t*)dst1;
|
||||
int offset = threadIdx.x;
|
||||
// Do multiples of 64 bits
|
||||
#pragma unroll 1
|
||||
for (; offset < size64; offset += nthreads) {
|
||||
uint64_t val;
|
||||
if (HAS_SRC1) {
|
||||
val = readAL(src1A+offset);
|
||||
if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
|
||||
} else if (HAS_SRC2) {
|
||||
val = readLL(src2+offset, iflag);
|
||||
}
|
||||
if (HAS_DST1) storeAL(dst1A+offset, val);
|
||||
if (HAS_DST2) storeLL(dst2+offset, val, oflag);
|
||||
}
|
||||
// Finish last word
|
||||
int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
|
||||
int sizeRem = size - sizeDone;
|
||||
if (threadIdx.x == 0 && sizeRem) {
|
||||
const T* src1B = src1 + sizeDone;
|
||||
T* dst1B = dst1 + sizeDone;
|
||||
|
||||
uint64_t lastVal;
|
||||
T* vals = (T*)&lastVal;
|
||||
|
||||
if (HAS_SRC2) {
|
||||
uint64_t lastVal2 = readLL(src2+size64, iflag);
|
||||
T* src2B = (T*)&lastVal2;
|
||||
for (int offset = 0; offset < sizeRem; offset++) {
|
||||
vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
|
||||
}
|
||||
} else if (HAS_SRC1) {
|
||||
for (int offset = 0; offset < sizeRem; offset++) {
|
||||
vals[offset] = src1B[offset];
|
||||
}
|
||||
}
|
||||
if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
|
||||
if (HAS_DST1) {
|
||||
for (int offset = 0; offset < sizeRem; offset++) {
|
||||
dst1B[offset] = vals[offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
public:
|
||||
static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
|
||||
return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
|
||||
}
|
||||
|
||||
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
|
||||
return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
|
||||
}
|
||||
|
||||
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
|
||||
return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
|
||||
}
|
||||
|
||||
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
|
||||
return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
|
||||
}
|
||||
|
||||
static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
|
||||
return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
|
||||
}
|
||||
|
||||
static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
|
||||
return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
|
||||
}
|
||||
|
||||
static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
|
||||
return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
|
||||
}
|
||||
};
|
||||
|
||||
// Common macros
|
||||
|
||||
#define STEP_TO_SLOT(step) \
|
||||
(step % NCCL_LL_CHUNKS)
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#define SYNC __syncthreads()
|
||||
#else
|
||||
#define SYNC asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads))
|
||||
#endif
|
||||
|
||||
#define WAIT_NEXT \
|
||||
if (tid == 0) { \
|
||||
while (sendHead + NCCL_LL_CHUNKS <= step) { \
|
||||
sendHead = LOAD(sendHeadPtr); \
|
||||
} \
|
||||
} \
|
||||
SYNC;
|
||||
|
||||
#define POST_SIZE \
|
||||
if (tid == 0 && sizesFifo) { STORE(sizesFifo + step % NCCL_LL_CHUNKS, (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T))); }
|
||||
|
||||
#define ACK_PREV \
|
||||
SYNC; \
|
||||
if (tid == 0) STORE(recvHeadPtr,step);
|
||||
|
||||
#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
|
||||
if (step > LOAD(&ring->send.conn.llLastCleaning) + NCCL_LL_CLEAN_FREQ) { \
|
||||
/* Reset all flags */ \
|
||||
static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
|
||||
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
|
||||
const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
|
||||
for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
|
||||
prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
|
||||
} \
|
||||
__threadfence_system(); \
|
||||
/* Restart from the same slot, only make sure sender waits for data to be reset */ \
|
||||
step += NCCL_LL_CHUNKS; \
|
||||
ACK_PREV; \
|
||||
while (LOAD(sendHeadPtr) < step); \
|
||||
{ if (tid == 0) STORE(&ring->send.conn.llLastCleaning, step); }\
|
||||
} \
|
||||
STORE(&ring->send.conn.llStep, step); \
|
||||
} while (0);
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -10,229 +10,635 @@
|
||||
|
||||
#include <type_traits>
|
||||
#include "reduce_kernel.h" // for reduction funcs
|
||||
#include "common.h"
|
||||
|
||||
#define SPINS_BEFORE_CHECK_ABORT 1000000
|
||||
|
||||
/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
|
||||
*
|
||||
* In order to reduce the reptetion of template arguments, the operations
|
||||
* are bundled as static methods of the Primitives class.
|
||||
*
|
||||
* Each primitive operation copies/reduces a contiguous buffer and syncs
|
||||
* an optional set of flags against a sub-step counter. The sync value is
|
||||
* based on the step parameter. Sync flags must be of type WaitFlag or
|
||||
* PostFlag. The primitive routines wait for all WaitFlag args to attain
|
||||
* at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
|
||||
* corresponding substep by previous step) before executing the transfer.
|
||||
* After each substep is transfered, all PostFlag arguments get updated to
|
||||
* the value SUBSTEPS*step+substep+1.
|
||||
*/
|
||||
// Unroll unconditionally the first send/recv since nsend/nrecv should be at
|
||||
// least 1 if SEND/RECV is set.
|
||||
#define FOR_SEND(func, ...) do { \
|
||||
if (SEND) { \
|
||||
/* Send to far first, then close */ \
|
||||
for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \
|
||||
func(0, ##__VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
class WaitFlag {
|
||||
volatile uint64_t * const flag;
|
||||
const int shift;
|
||||
public:
|
||||
__device__
|
||||
WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
|
||||
__device__
|
||||
void wait(uint64_t val) { while ((LOAD(flag) + shift) < val) /*SPIN*/; }
|
||||
};
|
||||
|
||||
|
||||
class PostFlag {
|
||||
volatile uint64_t * const flag;
|
||||
const int shift;
|
||||
volatile int * const fifo;
|
||||
const int fifo_size;
|
||||
uint32_t * hdp_reg;
|
||||
public:
|
||||
__device__
|
||||
PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
|
||||
: flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
|
||||
// remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
|
||||
__device__
|
||||
void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
|
||||
__device__
|
||||
void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
|
||||
};
|
||||
|
||||
|
||||
// Helper to check if any argument is of type T.
|
||||
// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
|
||||
template<typename T> __device__
|
||||
bool AnyAre() { return false; }
|
||||
|
||||
template<typename T, typename FIRST_T, typename... TAIL_Ts>
|
||||
__device__
|
||||
bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
|
||||
return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
|
||||
}
|
||||
|
||||
|
||||
// Wait on all WaitFlags, ignore PostFlags
|
||||
__device__
|
||||
static void WaitOnFlags(uint64_t val) { }
|
||||
|
||||
template <typename... TAIL_Ts> __device__
|
||||
static void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
|
||||
flag.wait(val);
|
||||
WaitOnFlags(val, tail...);
|
||||
}
|
||||
|
||||
template <typename... TAIL_Ts> __device__
|
||||
static void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
|
||||
WaitOnFlags(val, tail...);
|
||||
}
|
||||
|
||||
|
||||
// Post all PostFlags, ignore WaitFlags
|
||||
__device__
|
||||
static void PostToFlags(uint64_t val) { }
|
||||
|
||||
template <typename... TAIL_Ts> __device__
|
||||
static void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
|
||||
PostToFlags(val, tail...);
|
||||
}
|
||||
|
||||
template <typename... TAIL_Ts> __device__
|
||||
static void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
|
||||
flag.post(val);
|
||||
PostToFlags(val, tail...);
|
||||
}
|
||||
|
||||
|
||||
// Post sizes for PostFlags, ignore WaitFlags
|
||||
__device__
|
||||
static void PostSizeToFlags(uint64_t step, int size) { }
|
||||
|
||||
template <typename... TAIL_Ts> __device__
|
||||
static void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
|
||||
PostSizeToFlags(step, size, tail...);
|
||||
}
|
||||
|
||||
template <typename... TAIL_Ts> __device__
|
||||
static void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
|
||||
flag.postSize(step, size);
|
||||
PostSizeToFlags(step, size, tail...);
|
||||
}
|
||||
|
||||
|
||||
// Create pointer arithmetic syntax that doesn't break for std::nullptr_t
|
||||
template <typename Tptr> __device__
|
||||
static Tptr ptradd(Tptr ptr, int i) {
|
||||
return ptr + i;
|
||||
}
|
||||
|
||||
__device__
|
||||
static std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// use different unroll numbers for all primitives for best throughput
|
||||
#define COPY_UNROLL 4
|
||||
#define REDUCE_UNROLL 2
|
||||
#define DOUBLECOPY_UNROLL 2
|
||||
#define REDUCECOPY_UNROLL 2
|
||||
#define FOR_RECV(func, ...) do { \
|
||||
if (RECV) { \
|
||||
/* Recv from close first, then far */ \
|
||||
func(0, ##__VA_ARGS__); \
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Implementation of primitive types
|
||||
template <int, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
|
||||
class Primitives {
|
||||
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
|
||||
class ncclPrimitives {
|
||||
private:
|
||||
template <int UNROLL,
|
||||
typename SRC2_T, // either T* or std::nullptr_t
|
||||
typename DST2_T, // either T* or std::nullptr_t
|
||||
typename... SYNC_Ts> // either WaitFunc or PostFunc
|
||||
static __device__ __attribute__((noinline)) void
|
||||
GenericOp(const int tid, const int nthreads,
|
||||
const T* src1,
|
||||
const SRC2_T src2,
|
||||
T* dst1,
|
||||
DST2_T dst2,
|
||||
int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
const int stepSize;
|
||||
struct ncclConnInfo* recvConn[NRECV];
|
||||
struct ncclConnInfo* sendConn[NSEND];
|
||||
volatile uint64_t* waitPtr;
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
uint64_t sendConnHead[NSEND];
|
||||
const T* recvDirectBuff[NRECV];
|
||||
T* sendDirectBuff[NSEND];
|
||||
const T* recvBuff[NRECV];
|
||||
T* sendBuff[NSEND];
|
||||
struct ncclDevComm* comm;
|
||||
uint32_t* abortCount;
|
||||
|
||||
enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
|
||||
enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
|
||||
static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
|
||||
"src2 must be of type T* or std::nullptr_t");
|
||||
static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
|
||||
"dst2 must be of type T* or std::nullptr_t");
|
||||
__device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
|
||||
__device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
|
||||
__device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
|
||||
__device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
|
||||
|
||||
using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
|
||||
|
||||
int sliceSize = len / SUBSTEPS;
|
||||
int sliceOffset = 0;
|
||||
|
||||
#pragma unroll 1
|
||||
for (int sub=0; sub<SUBSTEPS; ++sub) {
|
||||
int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
|
||||
if (tid < nthreads) {
|
||||
if (AnyAre<WaitFlag>(flags...)) {
|
||||
if (tid == 0) {
|
||||
WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
__syncthreads();
|
||||
__device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
#endif
|
||||
}
|
||||
ReduceOrCopy
|
||||
<
|
||||
UNROLL,
|
||||
OpType,
|
||||
T,
|
||||
!std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
|
||||
!std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1
|
||||
>
|
||||
(
|
||||
tid, nthreads,
|
||||
ptradd(dst1, sliceOffset),
|
||||
ptradd(dst2, sliceOffset),
|
||||
ptradd(src1, sliceOffset),
|
||||
ptradd(src2, sliceOffset),
|
||||
realSize
|
||||
);
|
||||
if (AnyAre<PostFlag>(flags...)) {
|
||||
__syncthreads();
|
||||
if(tid == 0)
|
||||
PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
|
||||
__threadfence_system();
|
||||
if(tid == 0)
|
||||
PostToFlags(SUBSTEPS*step + sub + 1, flags...);
|
||||
}
|
||||
|
||||
uint32_t mismatch = 0;
|
||||
const uint64_t opCount;
|
||||
|
||||
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
|
||||
if (mismatch) {
|
||||
// In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
|
||||
STORE(comm->fatalDevError, ncclDevAssertedMismatch);
|
||||
} else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
|
||||
mismatch += 1;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
|
||||
spins++;
|
||||
if (spins == SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = LOAD(comm->abortFlag);
|
||||
checkMismatch(remoteOpCount);
|
||||
spins = 0;
|
||||
}
|
||||
return abort;
|
||||
}
|
||||
|
||||
__device__ void waitRecv(int i) {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
recvStep[i] += SLICESTEPS;
|
||||
if (tid == i) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t t0 = clock64();
|
||||
#endif
|
||||
while (LOAD(waitPtr) < recvStep[i]) {
|
||||
if (checkAbort(recvConn[i]->opCountRem)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void waitSend(int i) {
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
sendStep[i] += SLICESTEPS;
|
||||
if (tid == WARP_SIZE+i) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = comm->devProf;
|
||||
uint64_t t0 = clock64();
|
||||
#endif
|
||||
while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
|
||||
sendConnHead[i] = LOAD(waitPtr);
|
||||
if (checkAbort(sendConn[i]->opCountRem)) break;
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
__atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
inline __device__ void postRecv(int i) {
|
||||
STORE(recvConn[i]->head, recvStep[i]);
|
||||
}
|
||||
|
||||
inline __device__ void postSend(int i) {
|
||||
if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1);
|
||||
STORE(sendConn[i]->tail, sendStep[i]);
|
||||
}
|
||||
|
||||
__device__ void postSendSize(int i, int size) {
|
||||
if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size);
|
||||
}
|
||||
|
||||
template <int DIRECTRECV>
|
||||
__device__ const T* directRecvPtr(int i, int directOffset) {
|
||||
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
|
||||
}
|
||||
|
||||
template <int DIRECTSEND>
|
||||
__device__ T* directSendPtr(int i, int directOffset) {
|
||||
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
|
||||
}
|
||||
|
||||
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
|
||||
__device__ void
|
||||
GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
|
||||
int offset = 0;
|
||||
int sliceSize = stepSize * SLICESTEPS;
|
||||
|
||||
const T* srcs[RECV*NRECV+SRC];
|
||||
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
|
||||
if (RECV) {
|
||||
if (SRC) srcs[1] = recvPtr(0);
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
|
||||
}
|
||||
|
||||
T* dsts[SEND*NSEND+DST];
|
||||
dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
|
||||
if (SEND) {
|
||||
if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
|
||||
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
|
||||
}
|
||||
|
||||
#pragma unroll 1
|
||||
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
|
||||
int realSize = max(0, min(sliceSize, nelem-offset));
|
||||
FOR_SEND(waitSend);
|
||||
FOR_RECV(waitRecv);
|
||||
if (realSize > 0) {
|
||||
barrier();
|
||||
if (DIRECTRECV && recvDirectBuff[0]) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (SEND) {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
|
||||
}
|
||||
} else {
|
||||
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
|
||||
}
|
||||
}
|
||||
sliceOffset += sliceSize;
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T));
|
||||
if (SEND) __threadfence_system();
|
||||
if (tid == 0) FOR_SEND(postSend);
|
||||
if (tid == 0) FOR_RECV(postRecv);
|
||||
}
|
||||
for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
|
||||
for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
|
||||
offset += sliceSize;
|
||||
}
|
||||
|
||||
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
  // Register `conn` as receive connection i and cache its buffer and step.
  recvConn[i] = conn;
  recvBuff[i] = (const T*)LOAD(&conn->buff);
  recvStep[i] = LOAD(&conn->step);
  // Align the step to a whole chunk (SLICESPERCHUNK*SLICESTEPS steps).
  recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
  // Return credits in case the rounding moved the step forward.
  if (tid == 0) STORE(conn->head, recvStep[i]);

  // Thread i is the one that keeps the tail pointer to wait on for this
  // connection; it also records the current opCount.
  if (tid == i) {
    waitPtr = LOAD(&conn->tail);
    STORE(conn->opCountLoc, opCount);
  }

  // Direct receive is used only when the caller supplies a buffer AND the
  // connection advertises `direct`; thread 0 publishes the buffer address
  // through ptrExchange.
  recvDirectBuff[i] = NULL;
  if (directBuff && conn->direct) {
    recvDirectBuff[i] = directBuff;
    if (tid == 0) STORE(conn->ptrExchange, directBuff);
  }

  nrecv++;
}
|
||||
|
||||
__device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
  // Register `conn` as send connection i and cache its buffer and step.
  sendConn[i] = conn;
  sendBuff[i] = (T*)LOAD(&conn->buff);
  sendStep[i] = LOAD(&conn->step);
  // Align the step to a whole chunk (SLICESPERCHUNK*SLICESTEPS steps).
  sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);

  // Thread WARP_SIZE+i keeps the head pointer to wait on for this connection,
  // takes an initial snapshot of it and records the current opCount.
  if (tid == WARP_SIZE+i) {
    waitPtr = LOAD(&conn->head);
    sendConnHead[i] = LOAD(waitPtr);
    STORE(conn->opCountLoc, opCount);
  }

  sendDirectBuff[i] = NULL;
  if (directBuff && conn->direct) {
    void* volatile* ptr = conn->ptrExchange;
    // Spin until a non-NULL buffer address appears in ptrExchange.
    do {
      sendDirectBuff[i] = (T*)(LOAD(ptr));
    } while (sendDirectBuff[i] == NULL);
    // All threads must have read the address before thread 0 clears the slot.
    __syncthreads();
    if (tid == 0) STORE(ptr, NULL);
  }

  nsend++;
}
|
||||
|
||||
__device__ void saveRecvConn(int i) {
  // Only thread i persists the state of receive connection i.
  if (tid != i) return;

  // Write the step back, make it visible system-wide, then bump the local
  // operation counter by one.
  STORE(&recvConn[i]->step, recvStep[i]);
  __threadfence_system();
  __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
}
|
||||
|
||||
__device__ void saveSendConn(int i) {
  // Only thread WARP_SIZE+i persists the state of send connection i.
  if (tid != WARP_SIZE+i) return;

  // Write the step back, make it visible system-wide, then bump the local
  // operation counter by one.
  STORE(&sendConn[i]->step, sendStep[i]);
  __threadfence_system();
  __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
}
|
||||
|
||||
public:
|
||||
template <typename... SYNC_Ts>
|
||||
static __device__ void
|
||||
Copy(const int tid, const int nthreads, const T* src, T* dst,
|
||||
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
|
||||
GenericOp<COPY_UNROLL>(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
|
||||
__device__
ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
  : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
  abortCount = channel->abortCount;
  // Block-wide sync so the connection steps are up to date before they are read below.
  __syncthreads();

  // Load connections for every peer slot until the first negative peer id.
  // Direct buffers are disabled here: 0 is passed instead of directBuff.
  for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) {
    loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
  }
  for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) {
    loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
  }
}
|
||||
|
||||
template <typename... SYNC_Ts>
|
||||
static __device__ void
|
||||
DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
|
||||
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
|
||||
GenericOp<DOUBLECOPY_UNROLL>(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
|
||||
__device__ void
send(const T* src, int nelem) {
  // Local source -> send connections. No receive, no local destination.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/0, /*SEND*/1, /*SRC*/1, /*DST*/0>(
      src, NULL, nelem, 0);
}
|
||||
__device__ void
directSend(const T* src, int directOffset, int nelem) {
  // Same as send(), but the send side may target a direct buffer at directOffset.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/1, /*RECV*/0, /*SEND*/1, /*SRC*/1, /*DST*/0>(
      src, NULL, nelem, directOffset);
}
|
||||
|
||||
template <typename... SYNC_Ts>
|
||||
static __device__ void
|
||||
Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
|
||||
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
|
||||
GenericOp<REDUCE_UNROLL>(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
|
||||
__device__ void
recv(T* dst, int nelem) {
  // Receive connections -> local destination. No local source, nothing sent.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/1, /*SEND*/0, /*SRC*/0, /*DST*/1>(
      NULL, dst, nelem, 0);
}
|
||||
__device__ void
directRecv(T* dst, int directOffset, int nelem) {
  // Same as recv(), but the receive side may read from a direct buffer at directOffset.
  GenericOp</*DIRECTRECV*/1, /*DIRECTSEND*/0, /*RECV*/1, /*SEND*/0, /*SRC*/0, /*DST*/1>(
      NULL, dst, nelem, directOffset);
}
|
||||
|
||||
template <typename... SYNC_Ts>
|
||||
static __device__ void
|
||||
ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
|
||||
int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
|
||||
GenericOp<REDUCECOPY_UNROLL>(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
|
||||
__device__ void
copySend(const T* src, T* dst, int nelem) {
  // Local source -> local destination AND send connections.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/0, /*SEND*/1, /*SRC*/1, /*DST*/1>(
      src, dst, nelem, 0);
}
|
||||
__device__ void
directCopySend(const T* src, T* dst, int directOffset, int nelem) {
  // Same as copySend(), but the send side may target a direct buffer at directOffset.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/1, /*RECV*/0, /*SEND*/1, /*SRC*/1, /*DST*/1>(
      src, dst, nelem, directOffset);
}
|
||||
|
||||
__device__ void
recvCopySend(T* dst, int nelem) {
  // Receive connections -> local destination AND send connections. No local source.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/1, /*SEND*/1, /*SRC*/0, /*DST*/1>(
      NULL, dst, nelem, 0);
}
|
||||
__device__ void
directRecvCopySend(T* dst, int directOffset, int nelem) {
  // Same as recvCopySend(), with direct buffers allowed on both the receive
  // and the send side at directOffset.
  GenericOp</*DIRECTRECV*/1, /*DIRECTSEND*/1, /*RECV*/1, /*SEND*/1, /*SRC*/0, /*DST*/1>(
      NULL, dst, nelem, directOffset);
}
|
||||
|
||||
__device__ void
recvReduceCopy(const T* src, T* dst, int nelem) {
  // Local source combined with received data -> local destination. Nothing sent.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/1, /*SEND*/0, /*SRC*/1, /*DST*/1>(
      src, dst, nelem, 0);
}
|
||||
|
||||
__device__ void
recvReduceSend(const T* src, int nelem) {
  // Local source combined with received data -> send connections. No local destination.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/1, /*SEND*/1, /*SRC*/1, /*DST*/0>(
      src, NULL, nelem, 0);
}
|
||||
|
||||
__device__ void
recvReduceCopySend(const T* src, T* dst, int nelem) {
  // Local source combined with received data -> local destination AND send connections.
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/0, /*RECV*/1, /*SEND*/1, /*SRC*/1, /*DST*/1>(
      src, dst, nelem, 0);
}
|
||||
__device__ void
directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
  // Same as recvReduceCopySend(); despite the name, only the send side is
  // allowed to use a direct buffer (DIRECTRECV stays 0).
  GenericOp</*DIRECTRECV*/0, /*DIRECTSEND*/1, /*RECV*/1, /*SEND*/1, /*SRC*/1, /*DST*/1>(
      src, dst, nelem, directOffset);
}
|
||||
|
||||
__device__ ~ncclPrimitives() {
  // Persist the step counters of every loaded connection for the next
  // collective. Each save helper selects the single thread that does the write.
  for (int i=0; i<NRECV && i<nrecv; i++) {
    saveRecvConn(i);
  }
  for (int i=0; i<NSEND && i<nsend; i++) {
    saveSendConn(i);
  }
}
|
||||
};
|
||||
|
||||
#endif // end include guard
|
||||
template <typename T, class FUNC, int NRECV, int NSEND>
|
||||
class ncclLLPrimitives {
|
||||
private:
|
||||
const int tid;
|
||||
const int nthreads;
|
||||
int nrecv = 0;
|
||||
int nsend = 0;
|
||||
struct ncclConnInfo* recvConn[NRECV];
|
||||
struct ncclConnInfo* sendConn[NSEND];
|
||||
volatile uint64_t* waitPtr;
|
||||
volatile uint64_t* postPtr;
|
||||
volatile int* fifoPtr;
|
||||
uint64_t recvStep[NRECV];
|
||||
uint64_t sendStep[NSEND];
|
||||
uint64_t sendConnHead;
|
||||
union ncclLLFifoLine* recvBuff[NRECV];
|
||||
union ncclLLFifoLine* sendBuff[NSEND];
|
||||
struct ncclDevComm* comm;
|
||||
uint32_t* abortCount;
|
||||
|
||||
// Line offset of the current receive slice: step wrapped to NCCL_STEPS slots,
// each slot being NCCL_LL_SLICE_LINES lines.
__device__ int recvOffset(int i) {
  const uint64_t slot = recvStep[i]%NCCL_STEPS;
  return slot*NCCL_LL_SLICE_LINES;
}
// Line offset of the current send slice (same layout as recvOffset).
__device__ int sendOffset(int i) {
  const uint64_t slot = sendStep[i]%NCCL_STEPS;
  return slot*NCCL_LL_SLICE_LINES;
}
// First FIFO line of the current receive slice for connection i.
__device__ union ncclLLFifoLine* recvPtr(int i) {
  return recvBuff[i]+recvOffset(i);
}
// First FIFO line of the current send slice for connection i.
__device__ union ncclLLFifoLine* sendPtr(int i) {
  return sendBuff[i]+sendOffset(i);
}
// Flag value expected in lines received at the current step (derived from step+1).
__device__ uint32_t recvFlag(int i) {
  return NCCL_LL_FLAG(recvStep[i]+1);
}
// Flag value written into lines sent at the current step (derived from step+1).
__device__ uint32_t sendFlag(int i) {
  return NCCL_LL_FLAG(sendStep[i]+1);
}
|
||||
|
||||
// exitIfAbortLocalBarrier is compiled only on the non-HIP path: the HIP branch
// of this #if is intentionally empty.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
// Exit If Abort Barrier : make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if the `abort` member == 1
|
||||
// all CTA's threads enter the barrier and do a popc on their predicates being True
|
||||
// If any of the thread's predicate was True, all the threads call exit()
|
||||
__device__ void exitIfAbortLocalBarrier() {
|
||||
// Number of participating threads whose predicate was set.
uint32_t popc;
|
||||
// The braces open/close a PTX scope so that barr_pred stays local to this sequence.
asm ("{");
|
||||
asm volatile (" .reg .pred barr_pred;");
|
||||
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
|
||||
// Barrier id 14, nthreads participants; the reduction result lands in popc.
asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
|
||||
asm ("}");
|
||||
if (popc) {
|
||||
// Make sure threads not participating in the operation get the abort and all threads exit
|
||||
exitIfAbortBarrier(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
__device__ void barrier() {
  // Rendezvous point used between the phases of an LL operation.
#if !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
  // Non-HIP path: named barrier 1 restricted to nthreads participants.
  asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
#else
  // HIP path: plain block-wide barrier.
  __syncthreads();
#endif
}
|
||||
|
||||
uint32_t mismatch = 0;
|
||||
const uint64_t opCount;
|
||||
|
||||
__device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
  if (mismatch > 20) {
    // The peer has been seen ahead of our opCount more than 20 times while we
    // are still waiting for credit of the current op, so it is _most likely_ a
    // mismatch. LL does not use __threadfence_system, so the error cannot be
    // asserted; it is only reported through fatalDevError.
    STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
    return;
  }
  // Count one more observation of the peer being ahead of this operation.
  if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
    ++mismatch;
  }
}
|
||||
|
||||
uint32_t spins = 0;
|
||||
uint32_t abort = 0;
|
||||
|
||||
__device__ int checkAbort(volatile uint64_t* remoteOpCount) {
  // Called once per spin iteration. The abort flag and the mismatch heuristic
  // are only refreshed every SPINS_BEFORE_CHECK_ABORT calls; in between, the
  // cached `abort` value is returned.
  spins++;
  if (spins != SPINS_BEFORE_CHECK_ABORT) {
    return abort;
  }
  abort = LOAD(comm->abortFlag);
  checkMismatch(remoteOpCount);
  spins = 0;
  return abort;
}
|
||||
|
||||
__device__ void waitSend(int i, int nbytes) {
  // Every thread resets its spin/mismatch bookkeeping; only thread
  // WARP_SIZE+i actually waits on send connection i.
  spins = 0;
  mismatch = 0;
  if (tid != WARP_SIZE+i) return;

  // Spin until the cached head leaves room for one more step within the
  // NCCL_STEPS window, or until an abort is observed.
  while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
    sendConnHead = LOAD(waitPtr);
    if (checkAbort(sendConn[i]->opCountRem)) break;
  }

  if (fifoPtr) {
    // On a cleanup step the whole slice is written (see postSend), so the
    // full slice size is advertised instead of nbytes.
    const bool cleanStep = (sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK;
    int size = cleanStep ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
    STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);
  }
}
|
||||
|
||||
__device__ void postRecv(int i) {
  // All threads advance their copy of the receive step; only thread i
  // writes the new value through postPtr.
  ++recvStep[i];
  if (tid == i) STORE(postPtr, recvStep[i]);
}
|
||||
|
||||
__device__ void postSend(int i, int offset) {
  // LL cleanup: on steps where all NCCL_LL_CLEAN_MASK bits are set, stamp the
  // flag on the rest of the slice (from `offset` on, strided by nthreads, with
  // zero data) so that stale flags cannot cause data corruption when the flag
  // value loops over.
  const bool cleanStep = (sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK;
  if (cleanStep) {
    for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
      storeLL(sendPtr(i)+o, 0, sendFlag(i));
    }
  }
  ++sendStep[i];
}
|
||||
|
||||
// Read one 64-bit payload from line `offset` of the current receive slice of
// connection i. The 16-byte line is loaded as four 32-bit words laid out as
// {data low, flag, data high, flag}; the load is repeated until both flag words
// equal recvFlag(i) or checkAbort() reports an abort. In the abort case the
// value returned is built from whatever was last loaded.
__device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
|
||||
union ncclLLFifoLine* src = recvPtr(i) + offset;
|
||||
uint32_t flag = recvFlag(i);
|
||||
// Only used by the non-HIP branch below.
uint32_t data1, flag1, data2, flag2;
|
||||
// Reset the polling bookkeeping consumed by checkAbort()/checkMismatch().
spins = 0;
|
||||
mismatch = 0;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
do {
|
||||
// Single 128-bit load of the whole line, followed by a wait and an L1 invalidate.
asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
|
||||
if (i4[1] == flag && i4[3] == flag) break;
|
||||
} while (!checkAbort(recvConn[i]->opCountRem));
|
||||
// Reassemble the payload from the two data words.
uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
|
||||
#else
|
||||
do {
|
||||
// Single volatile 128-bit load of the whole line.
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
|
||||
if (checkAbort(recvConn[i]->opCountRem)) break;
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
// Reassemble the payload from the two data words.
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
#endif
|
||||
return val64;
|
||||
}
|
||||
|
||||
// Write one 64-bit payload plus its flag into the 16-byte line `dst` as four
// 32-bit words {val low, flag, val high, flag}, using a single 128-bit store.
// readLL() checks both flag words before accepting the data.
__device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
// Low half of the payload.
i4[0] = val & 0xffffffff;
|
||||
i4[1] = flag;
|
||||
// High half of the payload.
i4[2] = (val >> 32);
|
||||
i4[3] = flag;
|
||||
// Single 128-bit store of the whole line, followed by a wait and an L1 invalidate.
asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
|
||||
"s_waitcnt vmcnt(0)\n"
|
||||
"buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
|
||||
#else
|
||||
// Single volatile 128-bit store of the whole line.
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Using memcpy handles misaligned pointers.
|
||||
__device__ uint64_t readAL(uint64_t* src) {
  // Byte-wise copy into a local word, so `src` does not have to be 8-byte aligned.
  uint64_t word;
  memcpy(&word, src, sizeof(word));
  return word;
}
|
||||
|
||||
__device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
  // Byte-wise copy of the first nbytes bytes of `val`, so `dst` does not have
  // to be 8-byte aligned and a partial trailing word can be written.
  const void* from = &val;
  memcpy(dst, from, nbytes);
}
|
||||
|
||||
template <int RECV, int SEND, int SRC, int DST>
|
||||
__device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
|
||||
uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
|
||||
FOR_SEND(waitSend, nbytes*2);
|
||||
barrier();
|
||||
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
|
||||
uint64_t* srcPack = (uint64_t*)srcPtr;
|
||||
uint64_t* dstPack = (uint64_t*)dstPtr;
|
||||
int offset = tid;
|
||||
// Do multiples of 64 bits
|
||||
#pragma unroll 1
|
||||
for (; offset<npack; offset+=nthreads) {
|
||||
// Recv : local, then intra-node, then inter-node
|
||||
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
|
||||
if (RECV) {
|
||||
if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
|
||||
for (int i=1; i<NRECV && i<nrecv; i++) {
|
||||
val = MULTI<FUNC, T>()(readLL(i, offset), val);
|
||||
}
|
||||
}
|
||||
|
||||
// Send : inter-node, then intra-node, then local
|
||||
if (SEND) {
|
||||
for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
|
||||
storeLL(sendPtr(0)+offset, val, sendFlag(0));
|
||||
}
|
||||
if (DST) {
|
||||
if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
|
||||
// Last incomplete word
|
||||
storeAL(dstPack+offset, val, nbytes & 0x7);
|
||||
} else {
|
||||
storeAL(dstPack+offset, val, sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
#else
|
||||
exitIfAbortLocalBarrier();
|
||||
#endif
|
||||
FOR_RECV(postRecv);
|
||||
FOR_SEND(postSend, offset);
|
||||
}
|
||||
|
||||
__device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
  // Register `conn` as receive connection i and cache its LL buffer and step.
  recvConn[i] = conn;
  recvBuff[i] = conn->llBuff;
  recvStep[i] = conn->step;

  // Thread i keeps the head pointer it will post to and records the opCount.
  if (tid == i) {
    postPtr = conn->head;
    STORE(conn->opCountLoc, opCount);
  }
  nrecv++;
}
|
||||
|
||||
__device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
  // Register `conn` as send connection i and cache its LL buffer and step.
  sendConn[i] = conn;
  sendBuff[i] = conn->llBuff;
  sendStep[i] = conn->step;

  // Thread WARP_SIZE+i keeps the head pointer to wait on and the size FIFO,
  // takes an initial snapshot of the head, and records the opCount.
  if (tid == WARP_SIZE+i) {
    waitPtr = conn->head;
    fifoPtr = conn->fifo;
    sendConnHead = LOAD(waitPtr);
    STORE(conn->opCountLoc, opCount);
  }
  nsend++;
}
|
||||
|
||||
__device__ void saveRecvConn(int i) {
  // Only thread i persists the state of receive connection i.
  if (tid != i) return;

  // Write the step back, bump the local operation counter, then fence at block scope.
  recvConn[i]->step = recvStep[i];
  __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
  __threadfence_block();
}
|
||||
|
||||
__device__ void saveSendConn(int i) {
  // Only thread WARP_SIZE+i persists the state of send connection i.
  if (tid != WARP_SIZE+i) return;

  // Write the step back, bump the local operation counter, then fence at block scope.
  sendConn[i]->step = sendStep[i];
  __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
  __threadfence_block();
}
|
||||
|
||||
public:
|
||||
__device__
ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
  : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
  abortCount = channel->abortCount;
  // Rendezvous so the connection steps are up to date before they are read below.
  barrier();

  // Load connections for every peer slot until the first negative peer id.
  for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) {
    loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
  }
  for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) {
    loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
  }
}
|
||||
|
||||
__device__ void send(const T* src, int nelem) {
  // Local source -> send connections.
  LLGenericOp</*RECV*/0, /*SEND*/1, /*SRC*/1, /*DST*/0>(src, NULL, nelem);
}
|
||||
|
||||
__device__ void recv(T* dst, int nelem) {
  // Receive connections -> local destination.
  LLGenericOp</*RECV*/1, /*SEND*/0, /*SRC*/0, /*DST*/1>(NULL, dst, nelem);
}
|
||||
|
||||
__device__ void recvReduceSend(const T* src, int nelem) {
  // Local source combined with received data -> send connections.
  LLGenericOp</*RECV*/1, /*SEND*/1, /*SRC*/1, /*DST*/0>(src, NULL, nelem);
}
|
||||
|
||||
__device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
  // Local source combined with received data -> local destination.
  LLGenericOp</*RECV*/1, /*SEND*/0, /*SRC*/1, /*DST*/1>(src, dst, nelem);
}
|
||||
|
||||
__device__ void copySend(const T* src, T* dst, int nelem) {
  // Local source -> local destination AND send connections.
  LLGenericOp</*RECV*/0, /*SEND*/1, /*SRC*/1, /*DST*/1>(src, dst, nelem);
}
|
||||
|
||||
__device__ void recvCopySend(T* dst, int nelem) {
  // Receive connections -> local destination AND send connections.
  LLGenericOp</*RECV*/1, /*SEND*/1, /*SRC*/0, /*DST*/1>(NULL, dst, nelem);
}
|
||||
|
||||
__device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
  // Local source combined with received data -> local destination AND send connections.
  LLGenericOp</*RECV*/1, /*SEND*/1, /*SRC*/1, /*DST*/1>(src, dst, nelem);
}
|
||||
|
||||
__device__ ~ncclLLPrimitives() {
  // Persist the step counters of every loaded connection for the next
  // operation. Each save helper selects the single thread that does the write.
  for (int i=0; i<NRECV && i<nrecv; i++) {
    saveRecvConn(i);
  }
  for (int i=0; i<NSEND && i<nsend; i++) {
    saveSendConn(i);
  }
}
|
||||
};
|
||||
|
||||
// Optional per-primitive profiling. Both macros expand to nothing unless
// ENABLE_PROFILING is defined. They expect `tid`, `t0`, `ws`, `wr`, `devProf`,
// `nelem` and `T` to be in scope at the expansion site.
#ifdef ENABLE_PROFILING
|
||||
// INIT_COUNTER: on thread 0, record the start time in t0 and snapshot this
// block's wait_send_cycle / wait_recv_cycle counters into ws / wr.
#define INIT_COUNTER \
|
||||
if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
|
||||
wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }
|
||||
|
||||
// ACCUMULATE_COUNTER(prim): on thread 0, atomically add to devProf->prim_cycle
// the cycles elapsed since INIT_COUNTER minus the growth of the two wait
// counters over the same interval, and add nelem*sizeof(T) to devProf->prim_byte.
#define ACCUMULATE_COUNTER(prim) \
|
||||
if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
|
||||
+ ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
|
||||
+ wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
|
||||
__ATOMIC_SEQ_CST); \
|
||||
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
|
||||
#else
|
||||
// Profiling disabled: both macros are no-ops.
#define INIT_COUNTER
|
||||
#define ACCUMULATE_COUNTER(prim)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,12 +11,7 @@
|
||||
|
||||
#define UNROLL 4
|
||||
|
||||
#if NCCL_OP == 0
|
||||
IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
|
||||
#elif NCCL_OP == 1
|
||||
IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
|
||||
#elif NCCL_OP == 2
|
||||
IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
|
||||
#elif NCCL_OP == 3
|
||||
IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
|
||||
#endif
|
||||
|
||||
@@ -1,153 +1,82 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Increase Step and boffset for buffer sync
|
||||
#define NEXT_STEP \
|
||||
step++; \
|
||||
boffset += sliceSize; \
|
||||
if (boffset == buffSize) boffset = 0;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int bid = args->bid;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->next_hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int buffSize = ring->buffSize / sizeof(T);
|
||||
const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
|
||||
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
|
||||
if (tid == 0) {
|
||||
// Update in case we skipped some collectives
|
||||
STORE(ring->recv.conn.opCount, args->opCount);
|
||||
|
||||
if (rank != root) {
|
||||
// Wait for next to be ready
|
||||
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
|
||||
waitOpCountNext.wait(args->opCount);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
uint64_t step = 0ULL;
|
||||
int boffset = 0;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
|
||||
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
|
||||
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
int maxOffset = min(chunkSize, size-offset);
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t offset = gridOffset + bid*realChunkSize;
|
||||
int nelem = min(realChunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisInput + offset,
|
||||
nextOutput + boffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
prims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
Prims::Reduce(tid, nthreads,
|
||||
prevInput + boffset,
|
||||
thisInput + offset,
|
||||
thisOutput + offset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
Prims::Reduce(tid, nthreads,
|
||||
prevInput + boffset,
|
||||
thisInput + offset,
|
||||
nextOutput + boffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
NEXT_STEP; // Increases step, boffset
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
if (rank != root) {
|
||||
// Wait for next to have consumed data before resetting the flag
|
||||
waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
|
||||
STORE(ring->send.conn.head, 0ULL);
|
||||
}
|
||||
STORE(ring->recv.conn.tail, 0ULL);
|
||||
__threadfence_system();
|
||||
STORE(ring->recv.conn.opCount, args->opCount+1);
|
||||
}
|
||||
}
|
||||
|
||||
#include "ll_kernel.h"
|
||||
|
||||
#define NEXT_STEP_LL \
|
||||
boffset += NCCL_LL_SLICE_LINES; \
|
||||
if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
|
||||
flag++; \
|
||||
step++;
|
||||
// Tree variant of the reduce kernel. The body is empty, so calling it does
// nothing.
// NOTE(review): presumably present only so the name exists for every algorithm — confirm.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int llNthreads = args->nThreads;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
|
||||
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
|
||||
volatile int * sizesFifo = ring->send.conn.llFifo;
|
||||
uint64_t sendHead = sendHeadPtr[0];
|
||||
const int nranks = comm->nRanks;
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
|
||||
typedef LLPrimitives<T, FUNC> LL;
|
||||
|
||||
const ssize_t size = args->N;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nRings*chunkSize;
|
||||
|
||||
uint64_t step = ring->send.conn.llStep;
|
||||
uint32_t flag = step + 1;
|
||||
int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
|
||||
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
@@ -155,39 +84,17 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
|
||||
}
|
||||
ssize_t offset = gridOffset + bid*chunkSize;
|
||||
|
||||
int maxOffset = min(chunkSize, size-offset);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
if (prevRank == root) {
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
nextOutput + boffset,
|
||||
maxOffset, flag, llNthreads);
|
||||
POST_SIZE;
|
||||
NEXT_STEP_LL;
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
} else if (rank == root) {
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
prevInput + boffset,
|
||||
thisOutput + offset,
|
||||
maxOffset, flag, llNthreads);
|
||||
NEXT_STEP_LL;
|
||||
ACK_PREV;
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
|
||||
} else {
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
prevInput + boffset,
|
||||
nextOutput + boffset,
|
||||
maxOffset, flag, flag, llNthreads);
|
||||
POST_SIZE;
|
||||
NEXT_STEP_LL;
|
||||
ACK_PREV;
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
// We need everyone to acknowledge data even if they didn't receive anything
|
||||
// so that the next collective can start right away.
|
||||
ACK_PREV;
|
||||
|
||||
FIFO_CLEANING_AND_SAVE_STEP(flag);
|
||||
}
|
||||
|
||||
// Tree + LL variant of the reduce kernel. The body is empty, so calling it
// does nothing.
// NOTE(review): presumably present only so the name exists for every algorithm — confirm.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 0
|
||||
#include "device/reduce.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 1
|
||||
#include "device/reduce.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 2
|
||||
#include "device/reduce.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 3
|
||||
#include "device/reduce.cu"
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -19,7 +19,7 @@ struct FuncNull {
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
//we really don't need any specializations and we don't need
|
||||
//to break things into uint32_t
|
||||
@@ -164,30 +164,31 @@ struct FuncMin {
|
||||
}
|
||||
};
|
||||
|
||||
#define MASK0 0x00ff00ff
#define MASK1 0xff00ff00
// Add four packed 8-bit lanes of x and y independently (each lane wraps modulo
// 256). Works for both signed and unsigned 8-bit values.
static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
  // Lanes 0 and 2 are summed in one word, lanes 1 and 3 in another. The empty
  // byte above each kept lane absorbs that lane's carry, and the final masks
  // discard it so that it never reaches a neighbouring lane.
  const uint32_t evenLanes = (x & MASK0) + (y & MASK0);
  const uint32_t oddLanes = (x & MASK1) + (y & MASK1);
  return (evenLanes & MASK0) | (oddLanes & MASK1);
}
|
||||
|
||||
template<>
|
||||
struct FuncSum<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
|
||||
int32_t rv;
|
||||
asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
|
||||
"vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
|
||||
"vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
|
||||
"vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
cy.storage = y;
|
||||
cr.a.x = cx.a.x + cy.a.x;
|
||||
cr.a.y = cx.a.y + cy.a.y;
|
||||
cr.a.z = cx.a.z + cy.a.z;
|
||||
cr.a.w = cx.a.w + cy.a.w;
|
||||
return cr.storage;
|
||||
return addChar4(x, y);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
|
||||
@@ -196,28 +197,16 @@ struct FuncSum<int8_t> {
|
||||
};
|
||||
template<>
|
||||
struct FuncSum<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
|
||||
int32_t rv;
|
||||
asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
|
||||
"vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
|
||||
"vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
|
||||
"vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
cy.storage = y;
|
||||
cr.a.x = cx.a.x + cy.a.x;
|
||||
cr.a.y = cx.a.y + cy.a.y;
|
||||
cr.a.z = cx.a.z + cy.a.z;
|
||||
cr.a.w = cx.a.w + cy.a.w;
|
||||
return cr.storage;
|
||||
return addChar4(x, y);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
|
||||
@@ -227,22 +216,6 @@ struct FuncSum<uint8_t> {
|
||||
|
||||
static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
|
||||
/* This can be used both for signed and unsigned 8-bit multiplication */
|
||||
#if (__CUDA_ARCH__ >= 300)
|
||||
uint32_t rv;
|
||||
asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
|
||||
" vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
|
||||
" vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
|
||||
" shl.b32 t3, t3, 16;\n\t"
|
||||
" shl.b32 t2, t2, 16;\n\t"
|
||||
" vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
|
||||
" shl.b32 t1, t1, 8;\n\t"
|
||||
" vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
|
||||
" and.b32 t1, t1, 0xff00ff00;\n\t"
|
||||
" and.b32 t0, t0, 0x00ff00ff;\n\t"
|
||||
" or.b32 %0, t0, t1;\n\t"
|
||||
"}" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
@@ -252,7 +225,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
|
||||
cr.a.z = cx.a.z * cy.a.z;
|
||||
cr.a.w = cx.a.w * cy.a.w;
|
||||
return cr.storage;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
@@ -278,17 +250,12 @@ template<>
|
||||
struct FuncMax<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
|
||||
int32_t rv;
|
||||
asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
|
||||
"vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
|
||||
"vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
|
||||
"vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
@@ -298,6 +265,7 @@ struct FuncMax<int8_t> {
|
||||
cr.a.z = max(cx.a.z, cy.a.z);
|
||||
cr.a.w = max(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
|
||||
@@ -308,17 +276,12 @@ template<>
|
||||
struct FuncMax<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
|
||||
int32_t rv;
|
||||
asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
|
||||
"vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
|
||||
"vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
|
||||
"vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
@@ -328,6 +291,7 @@ struct FuncMax<uint8_t> {
|
||||
cr.a.z = max(cx.a.z, cy.a.z);
|
||||
cr.a.w = max(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
|
||||
@@ -339,17 +303,12 @@ template<>
|
||||
struct FuncMin<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
|
||||
int32_t rv;
|
||||
asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
|
||||
"vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
|
||||
"vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
|
||||
"vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
@@ -359,6 +318,7 @@ struct FuncMin<int8_t> {
|
||||
cr.a.z = min(cx.a.z, cy.a.z);
|
||||
cr.a.w = min(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
|
||||
@@ -369,17 +329,12 @@ template<>
|
||||
struct FuncMin<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
|
||||
return rv;
|
||||
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
|
||||
int32_t rv;
|
||||
asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
|
||||
"vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
|
||||
"vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
|
||||
"vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
|
||||
return rv;
|
||||
#else
|
||||
converter cx, cy, cr;
|
||||
cx.storage = x;
|
||||
@@ -389,6 +344,7 @@ struct FuncMin<uint8_t> {
|
||||
cr.a.z = min(cx.a.z, cy.a.z);
|
||||
cr.a.w = min(cx.a.w, cy.a.w);
|
||||
return cr.storage;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
|
||||
@@ -480,6 +436,6 @@ struct FuncMin<half> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
#endif // REDUCE_KERNEL_H_
|
||||
|
||||
@@ -11,12 +11,7 @@
|
||||
|
||||
#define UNROLL 4
|
||||
|
||||
#if NCCL_OP == 0
|
||||
IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
|
||||
#elif NCCL_OP == 1
|
||||
IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
|
||||
#elif NCCL_OP == 2
|
||||
IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
|
||||
#elif NCCL_OP == 3
|
||||
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
|
||||
#endif
|
||||
IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
|
||||
@@ -1,166 +1,93 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "devcomm.h"
|
||||
#include "primitives.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Increase Step and poffset/noffset for buffer sync
|
||||
#define NEXT_STEP \
|
||||
step++; \
|
||||
poffset = noffset; \
|
||||
noffset += sliceSize; \
|
||||
if (noffset == buffSize) noffset = 0;
|
||||
|
||||
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = blockDim.x;
|
||||
const int bid = args->bid;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
|
||||
WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
|
||||
WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
|
||||
PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
|
||||
PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->next_hdp_reg);
|
||||
|
||||
typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
|
||||
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
const ssize_t size = args->N;
|
||||
const int nranks = comm->nRanks;
|
||||
const int buffSize = ring->buffSize / sizeof(T);
|
||||
const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
|
||||
const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
|
||||
|
||||
if (tid == 0) {
|
||||
// Update in case we skipped some collectives
|
||||
STORE(ring->recv.conn.opCount, args->opCount);
|
||||
// Wait for next to be ready
|
||||
WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
|
||||
waitOpCountNext.wait(args->opCount);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
uint64_t step = 0ULL;
|
||||
int poffset, noffset = 0;
|
||||
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
|
||||
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
|
||||
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
|
||||
T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
|
||||
ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*chunkSize;
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
|
||||
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
|
||||
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int maxOffset = min(chunkSize, size-chunkOffset);
|
||||
int nelem = min(realChunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
Prims::Copy(tid, nthreads,
|
||||
thisInput + offset,
|
||||
nextOutput + noffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext,
|
||||
postReadyToNext);
|
||||
|
||||
NEXT_STEP; // Increases step, poffset, noffset
|
||||
prims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
Prims::Reduce(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisInput + offset,
|
||||
nextOutput + noffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitDoneFromNext, waitReadyFromPrev,
|
||||
postReadyToNext, postDoneToPrev);
|
||||
|
||||
NEXT_STEP;
|
||||
prims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
Prims::Reduce(tid, nthreads,
|
||||
prevInput + poffset,
|
||||
thisInput + offset,
|
||||
thisOutput + chunkOffset,
|
||||
sliceSize, maxOffset,
|
||||
step,
|
||||
waitReadyFromPrev,
|
||||
postDoneToPrev);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
|
||||
STORE(ring->send.conn.head, 0ULL);
|
||||
STORE(ring->recv.conn.tail, 0ULL);
|
||||
__threadfence_system();
|
||||
STORE(ring->recv.conn.opCount, args->opCount+1);
|
||||
prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
#include "ll_kernel.h"
|
||||
|
||||
#define NEXT_STEP_LL \
|
||||
poffset = noffset; \
|
||||
pflag = nflag; \
|
||||
noffset += NCCL_LL_SLICE_LINES; \
|
||||
if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
|
||||
nflag++; \
|
||||
step++;
|
||||
// "Tree" variant of the ReduceScatter device kernel.
// The body is empty, so calling this kernel performs no work.
// NOTE(review): presumably a placeholder so that a Tree symbol exists next to
// the Ring one for every collective -- confirm against the kernel tables.
template<int UNROLL, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
|
||||
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int llNthreads = args->nThreads;
|
||||
struct ncclComm* comm = args->comm;
|
||||
struct ncclRing* ring = comm->rings+blockIdx.x;
|
||||
volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
|
||||
volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
|
||||
volatile int * sizesFifo = ring->send.conn.llFifo;
|
||||
uint64_t sendHead = sendHeadPtr[0];
|
||||
const int nthreads = args->nThreads;
|
||||
struct ncclDevComm* comm = args->comm;
|
||||
struct ncclChannel* channel = comm->channels+blockIdx.x;
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
|
||||
typedef LLPrimitives<T, FUNC> LL;
|
||||
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
|
||||
|
||||
const ssize_t size = args->N;
|
||||
//const int rank = comm->rank;
|
||||
const int nranks = comm->nRanks;
|
||||
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
|
||||
const ssize_t loopSize = args->nRings*chunkSize;
|
||||
|
||||
uint64_t step = ring->send.conn.llStep;
|
||||
uint32_t pflag, nflag = step + 1;
|
||||
int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
|
||||
const ssize_t loopSize = args->nChannels*chunkSize;
|
||||
|
||||
// Compute pointers
|
||||
const T * __restrict__ thisInput = (const T*)args->ThisInput;
|
||||
T * __restrict__ thisOutput = (T*)args->ThisOutput;
|
||||
union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
|
||||
union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
if (size-gridOffset < loopSize) {
|
||||
@@ -170,37 +97,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
|
||||
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
ssize_t offset;
|
||||
int maxOffset = min(chunkSize, size-chunkOffset);
|
||||
int nelem = min(chunkSize, size-chunkOffset);
|
||||
int rankDest;
|
||||
|
||||
// step 0: push data to next GPU
|
||||
rankDest = ring->devUserRanks[nranks-1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.send(thisInput+offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ring->devUserRanks[nranks-j];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
WAIT_NEXT;
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
prevInput + poffset,
|
||||
nextOutput + noffset,
|
||||
maxOffset, pflag, nflag, llNthreads);
|
||||
POST_SIZE;
|
||||
ACK_PREV;
|
||||
|
||||
NEXT_STEP_LL;
|
||||
LLprims.recvReduceSend(thisInput+offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
@@ -208,13 +119,10 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
|
||||
rankDest = ring->devUserRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
LL::ReduceCopy(
|
||||
thisInput + offset,
|
||||
prevInput + poffset,
|
||||
thisOutput + chunkOffset,
|
||||
maxOffset, pflag, llNthreads);
|
||||
ACK_PREV;
|
||||
LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
|
||||
}
|
||||
|
||||
FIFO_CLEANING_AND_SAVE_STEP(nflag);
|
||||
}
|
||||
|
||||
// "Tree" + "LL" variant of the ReduceScatter device kernel.
// The body is empty, so calling this kernel performs no work; the first
// template parameter is named UNUSED and is not referenced.
// NOTE(review): presumably a placeholder mirroring ncclReduceScatterRingLLKernel
// -- confirm against the kernel tables.
template<int UNUSED, class FUNC, typename T>
|
||||
__attribute__((noinline))
|
||||
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 0
|
||||
#include "device/reduce_scatter.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 1
|
||||
#include "device/reduce_scatter.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 2
|
||||
#include "device/reduce_scatter.cu"
|
||||
@@ -1,8 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#define NCCL_OP 3
|
||||
#include "device/reduce_scatter.cu"
|
||||
@@ -0,0 +1,19 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Public entry point for the Reduce collective.
//
// The function does no work of its own: it packs the caller's arguments into
// an ncclInfo descriptor and returns whatever ncclEnqueueCheck() returns.
//
//   sendbuff / recvbuff : source and destination buffers, forwarded as-is
//   count               : element count, forwarded as-is
//   datatype / op       : element type and reduction operator
//   root                : root rank, forwarded as-is
//   comm / stream       : communicator and HIP stream for the operation
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
// Descriptor layout: collective id, name string, the user arguments, then
// the REDUCE_CHUNKSTEPS / REDUCE_SLICESTEPS constants for this collective.
struct ncclInfo info = { ncclCollReduce, "Reduce",
|
||||
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
|
||||
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "common_coll.h"
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
size_t nbytes = count*ncclTypeSize(datatype);
|
||||
INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||
if (comm->nRanks == 1) {
|
||||
if (sendbuff != recvbuff)
|
||||
CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
|
||||
} else {
|
||||
NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
|
||||
NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
|
||||
op, root, comm, stream);
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
// Public entry point for the ReduceScatter collective.
//
// Packs the caller's arguments into an ncclInfo descriptor and returns the
// result of ncclEnqueueCheck(). ReduceScatter takes no root parameter, so the
// root slot of the descriptor is filled with 0.
//
//   sendbuff / recvbuff : source and destination buffers, forwarded as-is
//   recvcount           : forwarded as the descriptor's count field
//   datatype / op       : element type and reduction operator
//   comm / stream       : communicator and HIP stream for the operation
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
// Descriptor layout: collective id, name string, the user arguments (root
// fixed to 0), then the REDUCESCATTER_CHUNKSTEPS / REDUCESCATTER_SLICESTEPS
// constants for this collective.
struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
|
||||
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
|
||||
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
|
||||
return ncclEnqueueCheck(&info);
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "core.h"
|
||||
#include "common_coll.h"
|
||||
#include "enqueue.h"
|
||||
#include "collectives.h"
|
||||
|
||||
ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
size_t nbytes = count*ncclTypeSize(datatype);
|
||||
INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
|
||||
if (comm->nRanks == 1) {
|
||||
if (sendbuff != recvbuff)
|
||||
CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
|
||||
} else {
|
||||
NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
|
||||
NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
|
||||
return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
|
||||
op, 0, comm, stream);
|
||||
}
|
||||
@@ -0,0 +1,441 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "enqueue.h"
|
||||
#include "checks.h"
|
||||
#include "param.h"
|
||||
|
||||
#include "collectives/collectives.h"
|
||||
|
||||
// Macro family that expands one collective name into the list of kernel
// symbols used to initialise the kernel table.
//
// Only generate inline kernels for LL
// NCCL_FUNC5 emits the same LL kernel name twice, so each
// (collective, op, dtype) combination contributes two table entries that both
// refer to the LL kernel.
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype)
|
||||
|
||||
// NCCL_FUNC4 appends "Ring" to the collective name before delegating, so the
// generated symbols are the <coll>RingLL kernels.
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_FUNC5(coll##Ring, op, dtype)
|
||||
|
||||
// Must be consistent with ncclDataType_t
// NCCL_FUNCS3A lists one entry per data type (nine in total).
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
// NCCL_FUNCS3B also has nine slots, but every slot names the i8 kernel, so
// all data types resolve to the same symbol for collectives built with it.
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
|
||||
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
// NCCL_FUNCS2A has four slots (one per reduction op), all naming the "sum"
// kernels; NCCL_FUNCS2B has four slots, all naming the "copy" kernels.
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum), \
|
||||
NCCL_FUNCS3A(coll, sum)
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
|
||||
// Signature shared by every entry in the kernel table: a function taking one
// ncclColl by value and returning nothing.
typedef void(*ncclKern_t)(struct ncclColl);
|
||||
// Must be consistent with the ncclFuncSet enum
// Table of kernel entry points, sized for every (collective, op, type)
// combination times two. Entries are laid out collective by collective in the
// order listed below; setupLaunch() selects one with coll->funcIndex.
// Broadcast and AllGather use the "copy" kernels (NCCL_FUNCS2B); Reduce,
// ReduceScatter and AllReduce use the "sum" kernels (NCCL_FUNCS2A).
static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
};
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Launch system : synchronization and CUDA kernel launch */
|
||||
/*****************************************************************************/
|
||||
|
||||
// Launches one kernel per device from an array of launch descriptors.
//
//   paramsList : array of numDevices launch descriptors
//   cudaDevs   : device ordinal for each descriptor (used by the fallback path only)
//   numDevices : number of entries in paramsList / cudaDevs
//   cgMode     : bit 0 set   -> single hipExtLaunchMultiKernelMultiDevice call
//                bit 0 clear -> launch each kernel individually, device by device
//
// Returns ncclSuccess when every call in the chosen path completes.
// NOTE(review): failures are handled by CUDACHECK, which presumably returns an
// error code from this function -- confirm in the macro definition.
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
|
||||
if (cgMode & 0x01) {
|
||||
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
|
||||
// These flags are to reduce the latency of using this API
|
||||
0));
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Fallback path: remember the current device so it can be restored after the
// per-device launches below.
int savedDev;
|
||||
CUDACHECK(hipGetDevice(&savedDev));
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
hipLaunchParams* params = paramsList+i;
|
||||
CUDACHECK(hipSetDevice(cudaDevs[i]));
|
||||
// params->args is treated as a pointer to a pointer to the ncclColl that is
// passed by value as the kernel's single argument.
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
|
||||
}
|
||||
// NOTE(review): if a CUDACHECK inside the loop exits early, this restore is
// presumably skipped and the caller's current device stays changed -- confirm.
CUDACHECK(hipSetDevice(savedDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Prepares the launch descriptor for the operations queued on `comm`.
//
// Steps, in order:
//   1. clamp the grid's x dimension to comm->nChannels;
//   2. for each channel covered by the grid, store 2 in the `active` field of
//      the last queued slot, at index
//      (collStart + collCount - 1) % NCCL_MAX_OPS;
//   3. copy channel 0's first queued operation into comm->args and store 0 in
//      that slot's `active` field;
//   4. set params->func to the kernel selected by that operation's funcIndex.
//
// Always returns ncclSuccess.
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
|
||||
|
||||
// Set active = 2 for the last operation
|
||||
for (int r=0; r<params->gridDim.x; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
|
||||
}
|
||||
|
||||
// Find the first operation, choose the kernel accordingly and pass it
|
||||
// as the first argument.
|
||||
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
|
||||
memcpy(&comm->args, coll, sizeof(struct ncclColl));
|
||||
// As we pass that coll directly, we can free it immediately.
|
||||
STORE(&coll->active, 0);
|
||||
|
||||
// funcIndex is read from the queue slot itself; the memcpy above already
// captured the same value in comm->args.
params->func = (void *)ncclKerns[coll->funcIndex];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Arrival side of the intra-process CPU barrier.
//
// The counter for the current phase lives at comm->intraBarrier[comm->intraPhase].
// Each caller tries to bump it by one with a compare-and-swap:
//   - counter already >= intraRanks : warn and return ncclInvalidUsage;
//   - caller would be the final one (counter + 1 == intraRanks): the counter
//     is NOT incremented here; the other phase's counter is zeroed, *isLast
//     is set to 1 and the function returns;
//   - otherwise: increment succeeds, *isLast is set to 0.
//
// Returns ncclSuccess, or ncclInvalidUsage as described above.
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
|
||||
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
|
||||
int val = LOAD(ptr);
|
||||
bool done = false;
|
||||
while (done == false) {
|
||||
if (val >= comm->intraRanks) {
|
||||
WARN("Trying to launch too many collectives");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
if (val+1 == comm->intraRanks) {
|
||||
// Reset the barrier.
|
||||
comm->intraBarrier[comm->intraPhase^1] = 0;
|
||||
*isLast = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
done = __sync_bool_compare_and_swap(ptr, val, val+1);
|
||||
// On CAS failure the counter is not re-read: the next attempt simply uses
// val + 1 as the expected value.
// NOTE(review): this presumably relies on competing arrivals each advancing
// the counter by exactly one -- confirm.
val++;
|
||||
}
|
||||
*isLast = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Increment the current phase counter once with a single compare-and-swap.
// Returns ncclInternalError if the counter changed between the load and the
// swap.
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
  volatile int* counter = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  const int seen = LOAD(counter);
  const bool bumped = __sync_bool_compare_and_swap(counter, seen, seen+1);
  if (!bumped) {
    WARN("Trying to launch too many collectives");
    return ncclInternalError;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Wait until the current phase counter reaches intraRanks, then flip the
// phase for the next barrier.
//
// The wait yields the CPU with sched_yield(), the same call saveKernel uses
// in this file, instead of the non-standard and deprecated pthread_yield().
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  while (LOAD(ptr) < comm->intraRanks) sched_yield();
  comm->intraPhase ^= 1;
  return ncclSuccess;
}
|
||||
|
||||
// Prepare this rank's launch, pick the stream it will run on, and enter the
// intra-process barrier. The rank that enters last performs the grouped
// launch (GROUP mode only) and then increments the barrier counter.
// Does nothing for single-rank communicators.
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
  if (comm->nRanks == 1) return ncclSuccess;

  hipLaunchParams* launch = comm->myParams;
  NCCLCHECK(setupLaunch(comm, launch));

  // The internal stream is used for GROUP launches when requested
  // (groupCudaStream) or when the user stream is NULL.
  const bool onInternalStream =
      comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL);
  if (onInternalStream) {
    // Make the internal stream wait for the work already in the user stream.
    CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
    CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
    launch->stream = comm->groupStream;
  } else {
    if (comm->userStream != launch->stream) {
      // The stream differs from the previous launch: order the new stream
      // after the last recorded doneEvent.
      CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
    }
    launch->stream = comm->userStream;
  }

  int isLast = 0;
  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
  if (!isLast) return ncclSuccess;

  if (comm->launchMode == ncclComm::GROUP) {
    // Last rank in: launch the operations of all intra-process ranks.
    NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
  }
  NCCLCHECK(ncclCpuBarrierLast(comm));
  return ncclSuccess;
}
|
||||
|
||||
// Leave the intra-process barrier, launch this rank's kernel when in
// PARALLEL mode, reset the per-channel queues and start the network proxies.
// Does nothing for single-rank communicators.
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
  if (comm->nRanks == 1) return ncclSuccess;

  // The CG mode cannot be printed before the first barrier has happened.
  // Rank 0 reports it while bit 0x10 is set and clears that bit first.
  if (comm->rank == 0 && (*comm->intraCGMode & 0x10)) {
    *comm->intraCGMode ^= 0x10;
    const bool groupLaunch = (comm->launchMode == ncclComm::GROUP);
    INFO(NCCL_INIT,"Launch mode %s%s%s",
        groupLaunch ? "Group" : "Parallel",
        *comm->intraCGMode ? "/CGMD" : "",
        (groupLaunch && comm->groupCudaStream) ? "/Stream" : "");
  }

  NCCLCHECK(ncclCpuBarrierOut(comm));

  hipLaunchParams *launch = comm->myParams;
  if (comm->launchMode == ncclComm::PARALLEL) {
    hipLaunchKernelGGL(((void (*)(struct ncclColl))launch->func),
        launch->gridDim, launch->blockDim, launch->sharedMem, launch->stream,
        **((struct ncclColl **)(launch->args)));
  }

  // The network proxies are started right after the kernel launch. Per the
  // original note: no CUDA call may sit between the launch and
  // transportStartProxy (a hipFree there could deadlock), and starting the
  // proxies after the launch appears to give better latency.
  for (int c = 0; c < launch->gridDim.x; ++c) {
    struct ncclChannel* ch = &comm->channels[c];
    ch->collStart = ch->collFifoTail;
    ch->collCount = 0;
  }
  launch->blockDim.x = 0;
  launch->gridDim.x = 0;
  NCCLCHECK(transportStartProxy(comm));
  return ncclSuccess;
}
|
||||
|
||||
// Record doneEvent on the stream the kernel was launched on and, when that
// was the internal stream, make the user stream wait for it. Also clears
// userStreamSet so the next operation can choose a stream again.
ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
  hipLaunchParams *launch = comm->myParams;

  // Event marking the end of the kernel just enqueued.
  CUDACHECK(hipEventRecord(comm->doneEvent, launch->stream));

  // Same condition as in ncclBarrierEnqueue for using the internal stream.
  const bool onInternalStream =
      comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL);
  if (onInternalStream) {
    // Order the user stream after the internal stream.
    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
  }

  comm->userStreamSet = false;
  return ncclSuccess;
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Enqueueing system : computation of kernel and proxy operations parameters */
|
||||
/*****************************************************************************/
|
||||
|
||||
// Select info->pattern from the collective type:
//   Broadcast                 -> PipelineFrom
//   Reduce                    -> PipelineTo
//   AllGather / ReduceScatter -> Ring
//   AllReduce                 -> TreeUpDown when nBytes <= comm->treeThreshold,
//                                RingTwice otherwise
// Any other collective yields ncclInternalError.
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  if (info->coll == ncclCollBroadcast) {
    info->pattern = ncclPatternPipelineFrom;
    return ncclSuccess;
  }
  if (info->coll == ncclCollReduce) {
    info->pattern = ncclPatternPipelineTo;
    return ncclSuccess;
  }
  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) {
    info->pattern = ncclPatternRing;
    return ncclSuccess;
  }
  if (info->coll == ncclCollAllReduce) {
    if (info->nBytes <= info->comm->treeThreshold) {
      info->pattern = ncclPatternTreeUpDown;
    } else {
      info->pattern = ncclPatternRingTwice;
    }
    return ncclSuccess;
  }

  WARN("Unknown collective %d", info->coll);
  return ncclInternalError;
}
|
||||
|
||||
// Fill info->nstepsPerLoop and info->nchunksPerLoop from info->pattern:
//   tree and pipeline patterns : 1 step,            1 chunk
//   Ring                       : nRanks-1 steps,    nRanks chunks
//   RingTwice                  : 2*(nRanks-1) steps, nRanks chunks
// An unknown pattern yields ncclInternalError.
static ncclResult_t getLoopInfo(struct ncclInfo* info) {
  switch (info->pattern) {
    case ncclPatternTreeUp:
    case ncclPatternTreeDown:
    case ncclPatternTreeUpDown:
    case ncclPatternPipelineFrom:
    case ncclPatternPipelineTo:
      info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
    case ncclPatternRing:
      info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
    case ncclPatternRingTwice:
      info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
    default:
      // No trailing newline: no other WARN call in this file carries one.
      WARN("Unknown pattern %d", info->pattern);
      return ncclInternalError;
  }
  return ncclSuccess;
}
|
||||
|
||||
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
|
||||
// Compute thresholds and limits that users can override
|
||||
ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
|
||||
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
|
||||
|
||||
// First compute nThreads
|
||||
int nt = NCCL_LL_MIN_NTHREADS;
|
||||
while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
|
||||
|
||||
// Then compute nChannels
|
||||
int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
|
||||
if (nc == 0) nc = 1;
|
||||
if (nc > info->comm->nChannels) nc = info->comm->nChannels;
|
||||
|
||||
// Check if we have a fixed LL threshold, otherwise compute it.
|
||||
int perThreadThreshold = info->comm->threadThreshold;
|
||||
if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
|
||||
ssize_t llThreshold = info->comm->llThreshold >= 0 ?
|
||||
info->comm->llThreshold :
|
||||
nc*nt*info->nchunksPerLoop*perThreadThreshold;
|
||||
|
||||
if (info->nBytes <= llThreshold) {
|
||||
*llMode = 1;
|
||||
*nChannels = nc;
|
||||
*nThreads = nt;
|
||||
} else {
|
||||
*llMode = 0;
|
||||
*nChannels = info->comm->nChannels;
|
||||
*nThreads = info->comm->nThreads;
|
||||
}
|
||||
}
|
||||
|
||||
// Translate a user request (info) into the kernel arguments (coll) and the
// proxy arguments (proxyArgs).
//
// Steps: select pattern and loop shape, copy the user arguments, choose
// LL mode / channel count / thread count, pick the kernel index, derive
// lastChunkSize (tree non-LL and LL cases only), and compute the number of
// proxy steps.
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
  // Sets info->pattern, info->nstepsPerLoop and info->nchunksPerLoop.
  NCCLCHECK(getPatternInfo(info));
  NCCLCHECK(getLoopInfo(info));

  // User-provided arguments, forwarded as-is.
  coll->args.root = info->root;
  coll->args.N = info->count;
  coll->args.ThisInput = info->sendbuff;
  coll->args.ThisOutput = info->recvbuff;
  coll->args.comm = info->comm->devComm;
  coll->args.opCount = info->comm->opCount;

  // LL mode, number of channels, number of threads.
  int useLL;
  getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &useLL);

  int useTree = info->pattern >= ncclPatternTreeUp ? 1 : 0;
  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, useLL, useTree);

  // LL and tree modes always use one step per chunk and per slice.
  int bytesPerStep = ( useLL ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
  int chunkSteps = (useLL|useTree) ? 1 : info->chunkSteps;
  int sliceSteps = (useLL|useTree) ? 1 : info->sliceSteps;
  int chunkSize = bytesPerStep*chunkSteps;

  // lastChunkSize
  if (useTree == 1 && useLL == 0) {
    if (info->pattern == ncclPatternTreeUpDown) {
      // Halve the chunk size while the number of chunks per channel is small
      // relative to the tree depth, with a floor that drops at each stage.
      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
    }
    // In this mode lastChunkSize carries the chunk size, in elements.
    coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
  } else if (useLL == 1) {
    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
    const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
    // Bytes left over after the last full loop, split across all chunks,
    // aligned to nThreads*8 bytes, then converted to elements.
    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
    ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
    coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
  }

  // Number of steps for the proxies; the byte count is doubled in LL mode.
  size_t nBytes = useLL ? info->nBytes*2 : info->nBytes;

  int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
  proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
  proxyArgs->sliceSteps = sliceSteps;
  proxyArgs->chunkSteps = chunkSteps;
  proxyArgs->llMode = useLL;
  proxyArgs->opCount = info->comm->opCount;
  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, useLL, coll->args.nChannels, coll->args.nThreads,
      nLoops, proxyArgs->nsteps, info->comm);
  return ncclSuccess;
}
|
||||
|
||||
// Queue one collective on the communicator.
//
// Single-rank communicators degenerate to an asynchronous device-to-device
// copy (skipped when the operation is in place). Otherwise the operation is
// computed once and a copy is appended to the FIFO of coll.args.nChannels
// channels, chosen round-robin from myParams->gridDim.x, with the matching
// proxy arguments saved per channel.
//
// Returns ncclInvalidUsage when streams are mixed within a group or a channel
// already holds NCCL_MAX_OPS operations.
static ncclResult_t saveKernel(struct ncclInfo* info) {
  if (info->comm->nRanks == 1) {
    if (info->sendbuff != info->recvbuff)
      CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
    return ncclSuccess;
  }

  struct ncclColl coll;
  struct ncclProxyArgs proxyArgs;
  // Zero both descriptors: computeColl does not write every field of coll
  // (e.g. lastChunkSize on the non-LL ring path) and the whole struct is
  // copied into the channel FIFO below.
  memset(&coll, 0, sizeof(struct ncclColl));
  memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
  NCCLCHECK(computeColl(info, &coll, &proxyArgs));

  // The launch uses the largest thread count of all queued operations.
  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
  if (info->comm->userStreamSet == false) {
    info->comm->userStream = info->stream;
    info->comm->userStreamSet = true;
  } else if (info->stream != info->comm->userStream) {
    WARN("Error : mixing different streams within a group call is not supported.");
    return ncclInvalidUsage;
  }
  for (int bid=0; bid<coll.args.nChannels; bid++) {
    struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);

    if (channel->collCount == NCCL_MAX_OPS) {
      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
      return ncclInvalidUsage;
    }

    // Proxy
    proxyArgs.channel = channel;
    NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));

    info->comm->myParams->gridDim.x++;

    // Wait for the tail slot to be released (active == 0) before reusing it.
    int opIndex = channel->collFifoTail;
    struct ncclColl* c = channel->collectives+opIndex;
    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
    while (LOAD(activePtr) != 0) sched_yield();

    memcpy(c, &coll, sizeof(struct ncclColl));

    c->args.bid = bid;
    STORE(&c->active, 1);
    opIndex = (opIndex+1)%NCCL_MAX_OPS;
    c->nextIndex = opIndex;
    channel->collFifoTail = opIndex;
    channel->collCount++;
  }
  info->comm->opCount++;
  return ncclSuccess;
}
|
||||
|
||||
|
||||
// Entry point for enqueuing a collective.
//
// Outside async mode the operation is validated, queued, launched and its
// events are recorded before returning. In async mode it is only validated,
// registered and queued; the result is also reported through
// ncclAsyncErrCheck.
// Returns ncclInvalidArgument when info->comm is NULL.
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
  if (info->comm == NULL) return ncclInvalidArgument;

  INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
      info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
      info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);

  if (!ncclAsyncMode()) {
    // Immediate path: check, queue, launch.
    NCCLCHECK(ArgsCheck(info));
    NCCLCHECK(saveKernel(info));
    NCCLCHECK(ncclBarrierEnqueue(info->comm));
    NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
    NCCLCHECK(ncclEnqueueEvents(info->comm));
    return ncclSuccess;
  }

  // Asynchronous path.
  ncclResult_t ret = ncclSuccess;
  int savedDev = -1;
  if (info->comm->checkPointers) {
    // Switch to the communicator's device while the arguments are checked;
    // the previous device is restored at `end`.
    CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
    CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
  }
  // Check arguments
  NCCLCHECKGOTO(ArgsCheck(info), ret, end);
  // Original note: always register comm even in case of error to make sure
  // ncclGroupEnd cleans it up.
  // NOTE(review): a failing ArgsCheck above jumps past this registration —
  // confirm that is intended.
  NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
  NCCLCHECKGOTO(saveKernel(info), ret, end);
end:
  if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
  ncclAsyncErrCheck(ret);
  return ret;
}
|
||||
@@ -0,0 +1,54 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ALLOC_H_
|
||||
#define NCCL_ALLOC_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "checks.h"
|
||||
#include <sys/mman.h>
|
||||
|
||||
// Allocate `size` bytes of mapped host memory (hipHostMallocMapped), zero
// them, and return the same address through both *ptr and *devPtr.
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
  void* const hostMem = *ptr;
  memset(hostMem, 0, size);
  *devPtr = hostMem;
  return ncclSuccess;
}
|
||||
|
||||
// Release host memory with hipHostFree.
// Returns ncclUnhandledCudaError (via CUDACHECK) if the HIP call fails.
static inline ncclResult_t ncclCudaHostFree(void* ptr)
{
  CUDACHECK(hipHostFree(ptr));
  return ncclSuccess;
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
void* p = malloc(nelem*sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
return ncclSystemError;
|
||||
}
|
||||
memset(p, 0, nelem*sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
if (isFineGrain)
|
||||
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ARGCHECK_H_
|
||||
#define NCCL_ARGCHECK_H_
|
||||
|
||||
#include "core.h"
|
||||
|
||||
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
|
||||
ncclResult_t ArgsCheck(struct ncclInfo* info);
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,9 +9,12 @@
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
ncclResult_t bootstrapNetInit();
|
||||
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
|
||||
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
|
||||
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
|
||||
ncclResult_t bootstrapClose(void* commState);
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_CHANNEL_H_
|
||||
#define NCCL_CHANNEL_H_
|
||||
#include "core.h"
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,73 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_CHECKS_H_
|
||||
#define NCCL_CHECKS_H_
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
// Check CUDA calls
|
||||
// Evaluate a HIP runtime call once. On any status other than hipSuccess, log
// the error string and return ncclUnhandledCudaError from the enclosing
// function.
#define CUDACHECK(cmd) do {                              \
    hipError_t e = cmd;                                  \
    if (hipSuccess != e) {                               \
      WARN("Cuda failure '%s'", hipGetErrorString(e));   \
      return ncclUnhandledCudaError;                     \
    }                                                    \
  } while (false)
|
||||
|
||||
// Same check as CUDACHECK, but on failure store ncclUnhandledCudaError into
// `res` and jump to `label` instead of returning.
#define CUDACHECKGOTO(cmd, res, label) do {              \
    hipError_t e = cmd;                                  \
    if (hipSuccess != e) {                               \
      WARN("Cuda failure '%s'", hipGetErrorString(e));   \
      res = ncclUnhandledCudaError;                      \
      goto label;                                        \
    }                                                    \
  } while (false)
|
||||
|
||||
#include <errno.h>
|
||||
// Check system calls
|
||||
// Run a system call through SYSCHECKVAL, discarding its return value.
// `name` must be a string literal (it is pasted into the log messages).
#define SYSCHECK(call, name) do {        \
    int retval;                          \
    SYSCHECKVAL(call, name, retval);     \
  } while (false)
|
||||
|
||||
// Run a system call (retried by SYSCHECKSYNC on transient errors) and leave
// its result in `retval`. If the final result is -1, log strerror(errno) and
// return ncclSystemError from the enclosing function.
#define SYSCHECKVAL(call, name, retval) do {                     \
    SYSCHECKSYNC(call, name, retval);                            \
    if (retval == -1) {                                          \
      WARN("Call to " name " failed : %s", strerror(errno));     \
      return ncclSystemError;                                    \
    }                                                            \
  } while (false)
|
||||
|
||||
// Evaluate `call` into `retval`, repeating it for as long as it returns -1
// with errno set to EINTR, EWOULDBLOCK or EAGAIN. Each retry is logged.
#define SYSCHECKSYNC(call, name, retval) do {                                              \
    retval = call;                                                                         \
    if (!(retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN))) {  \
      break;                                                                               \
    }                                                                                      \
    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno));              \
  } while (true)
|
||||
|
||||
// Propagate errors up
|
||||
// Evaluate an expression yielding ncclResult_t. On anything other than
// ncclSuccess, log file/line/result (acts as a back trace) and return the
// result from the enclosing function.
//
// The macro deliberately does NOT end with a semicolon, like CUDACHECK: the
// caller's own `;` completes the statement, which keeps
// `if (x) NCCLCHECK(f()); else ...` well-formed.
#define NCCLCHECK(call) do { \
  ncclResult_t res = call; \
  if (res != ncclSuccess) { \
    /* Print the back trace*/ \
    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
    return res; \
  } \
} while (0)
|
||||
|
||||
// Evaluate an expression yielding ncclResult_t into `res`. On anything other
// than ncclSuccess, log file/line/result and jump to `label`.
//
// The macro deliberately does NOT end with a semicolon, like CUDACHECKGOTO:
// the caller's own `;` completes the statement.
#define NCCLCHECKGOTO(call, res, label) do { \
  res = call; \
  if (res != ncclSuccess) { \
    /* Print the back trace*/ \
    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
    goto label; \
  } \
} while (0)
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,117 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_COMM_H_
|
||||
#define NCCL_COMM_H_
|
||||
|
||||
#define MAXCHANNELS 16
|
||||
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
||||
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#define MEM_ALIGN 4096
|
||||
#define CUDA_IPC_MIN 2097152UL
|
||||
|
||||
struct ncclSendMem {
|
||||
union {
|
||||
struct {
|
||||
uint64_t head;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
void* ptrExchange;
|
||||
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
|
||||
uint64_t opCount;
|
||||
};
|
||||
char pad3[MEM_ALIGN];
|
||||
};
|
||||
};
|
||||
|
||||
struct ncclRecvMem {
|
||||
union {
|
||||
struct {
|
||||
uint64_t tail;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
uint64_t opCount;
|
||||
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
int sizesFifo[NCCL_STEPS];
|
||||
};
|
||||
char pad4[MEM_ALIGN];
|
||||
};
|
||||
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
|
||||
char buff[1]; // Actually larger than that
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
|
||||
void* bootstrap;
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
int nvmlDev; // my NVML device number
|
||||
|
||||
enum { GROUP, PARALLEL } launchMode;
|
||||
hipStream_t userStream;
|
||||
bool userStreamSet;
|
||||
hipEvent_t doneEvent;
|
||||
bool checkPointers;
|
||||
|
||||
// Counter to make sure collectives match (needed for bcast/reduce
|
||||
// where syncs are not symmetric).
|
||||
uint64_t opCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
int nThreads;
|
||||
|
||||
// Low-latency algorithm threshold
|
||||
ssize_t llThreshold;
|
||||
ssize_t threadThreshold;
|
||||
|
||||
// Tree algorithm threshold
|
||||
ssize_t treeThreshold;
|
||||
|
||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||
int groupCudaStream;
|
||||
hipStream_t groupStream;
|
||||
|
||||
// Whether there has been a fatal error in this communicator.
|
||||
ncclResult_t fatalError;
|
||||
|
||||
// Error reported by GPU
|
||||
volatile ncclDevError_t* fatalDevError;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
|
||||
// Device side of the communicator
|
||||
struct ncclDevComm *devComm;
|
||||
// Host copy of the devComm (to free CUDA allocs)
|
||||
struct ncclDevComm hostDevComm;
|
||||
|
||||
// Intra-process sync
|
||||
int intraRank;
|
||||
int intraRanks;
|
||||
int* intraBarrier;
|
||||
int intraPhase;
|
||||
|
||||
// Storage for deferred intra-process launch
|
||||
hipLaunchParams * intraParams;
|
||||
hipLaunchParams *myParams;
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclColl args;
|
||||
struct ncclColl* argsptr;
|
||||
|
||||
// Global proxy thread
|
||||
pthread_t proxyThread;
|
||||
struct ncclProxyState proxyState;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -1,196 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef COMMON_COLL_H_
|
||||
#define COMMON_COLL_H_
|
||||
|
||||
#include "core.h"
|
||||
#include "enqueue.h"
|
||||
#include "collectives/collectives.h"
|
||||
|
||||
// Validate a buffer pointer: it must be known to the runtime with a non-NULL
// device pointer, and if it is device memory it must live on the
// communicator's device. Returns ncclInvalidArgument otherwise.
static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  hipPointerAttribute_t attr;
  const hipError_t status = hipPointerGetAttributes(&attr, pointer);
  if (status != hipSuccess || attr.devicePointer == NULL) {
    WARN("%s : %s is not a valid pointer", opname, ptrname);
    return ncclInvalidArgument;
  }

  // The attribute holding the memory type depends on the runtime version.
#if CUDART_VERSION >= 10000
  const bool isDeviceMem = (attr.type == hipMemoryTypeDevice);
#else
  const bool isDeviceMem = (attr.memoryType == hipMemoryTypeDevice);
#endif
  if (isDeviceMem && attr.device != comm->cudaDev) {
    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Reject NULL arguments: logs which argument of which operation was NULL and
// returns ncclInvalidArgument; returns ncclSuccess for any non-NULL pointer.
static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
  if (ptr != NULL) return ncclSuccess;

  WARN("%s : %s argument is NULL", opname, ptrname);
  return ncclInvalidArgument;
}
|
||||
|
||||
// Validate the arguments of a collective call.
//
// Checks, in order: comm is non-NULL, root is in [0, nRanks-1], type and op
// are valid enum values, and (only when comm->checkPointers is set) the
// send/receive buffers. sendbuff is skipped for non-root ranks of
// "Broadcast", recvbuff for non-root ranks of "Reduce".
// Returns ncclInvalidArgument on the first failed check.
static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
  NCCLCHECK(PtrCheck(comm, opname, "comm"));
  // First, the easy ones
  if (root < 0 || root >= comm->nRanks) {
    // The highest valid root is nRanks-1; report the range that is enforced.
    WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks-1);
    return ncclInvalidArgument;
  }
  if (type < 0 || type >= ncclNumTypes) {
    WARN("%s : invalid type %d", opname, type);
    return ncclInvalidArgument;
  }
  if (op < 0 || op >= ncclNumOps) {
    WARN("%s : invalid reduction operation %d", opname, op);
    return ncclInvalidArgument;
  }

  if (comm->checkPointers) {
    // Check CUDA device pointers
    if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
      NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
    }
    if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
      NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
// Size in bytes of one element of the given NCCL data type, or -1 for a
// type that is not listed.
static __inline__ int ncclTypeSize(ncclDataType_t type) {
  int bytes = -1;
  switch (type) {
    case ncclInt8:
    case ncclUint8:
      bytes = 1;
      break;
    case ncclFloat16:
      bytes = 2;
      break;
    case ncclInt32:
    case ncclUint32:
    case ncclFloat32:
      bytes = 4;
      break;
    case ncclInt64:
    case ncclUint64:
    case ncclFloat64:
      bytes = 8;
      break;
    default:
      break;
  }
  return bytes;
}
|
||||
|
||||
// In : comm, nbytes ; Out : nrings, nthreads, ll
|
||||
// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
|
||||
// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
|
||||
// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
|
||||
// This ensures we don't use a large number of rings with a small number of threads
|
||||
// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
|
||||
// we use NCCL_THREAD_THRESHOLD when we reach the max
|
||||
// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
|
||||
// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
|
||||
// In : comm, nbytes ; Out : *nrings, *nthreads, *ll
//
// 1. If the user fixed a total LL threshold (comm->llThreshold >= 0) and
//    nbytes exceeds it, use the non-LL settings and stop. If nbytes is within
//    it, LL is remembered as enforced.
// 2. Starting from NCCL_LL_MIN_NTHREADS, double the thread count until the
//    size fits in LL or the LL thread limit is reached.
// 3. If nothing fitted, try once with the LL thread limit; LL is then used
//    if the size fits or if step 1 enforced it.
static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
  *ll = 0;
  int llEnforced = 0; /* set when nbytes is within the user's NCCL_LL_THRESHOLD */
  if (comm->llThreshold >= 0) { /* user sets total LL threshold */
    if (nbytes > comm->llThreshold) { /* non-LL */
      *nthreads = comm->nThreads;
      *nrings = comm->nRings;
      return;
    }
    llEnforced = 1; /* user wants to use LL */
  }

  int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
  size_t nr;
  int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* user or platform limit */
  int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
  ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);

  while (nt < ll_max_nthreads && *ll == 0) {
    nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
    if (nr <= maxRings) { /* avoid using few threads but many rings */
      if (nr == 0) nr = 1;
      else if (nr > comm->nRings) nr = comm->nRings;
      *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
    }
    if (*ll == 0) {
      nt <<= 1;
    }
  }

  if (*ll == 1) {
    /* a smaller number of threads is enough for LL, stop here */
    *nthreads = nt;
    *nrings = (int)nr;
    return;
  }

  /* otherwise try the max number of LL threads */
  nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks));
  if (nr == 0) nr = 1;
  else if (nr > comm->nRings) nr = comm->nRings;
  *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
  if (*ll) {
    *nthreads = ll_max_nthreads;
    *nrings = (int)nr;
  } else {
    *nthreads = comm->nThreads;
    *nrings = comm->nRings;
  }
}
|
||||
|
||||
// Queue one collective on the communicator's rings.
//
// The ring count, thread count and LL mode come from ncclGetCollResource.
// One operation per block is appended to the FIFO of a ring chosen
// round-robin from myParams->gridDim.x.
// Returns ncclInvalidUsage when streams are mixed within a group or a ring
// already holds NCCL_MAX_OPS operations.
static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream, size_t nbytes, int loopFactor) {
  int llMode, nBlocks, nThreads;
  ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);

  // The launch uses the largest thread count of all queued operations.
  comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);

  if (!comm->userStreamSet) {
    comm->userStream = stream;
    comm->userStreamSet = true;
  } else if (stream != comm->userStream) {
    WARN("Error : mixing different streams within a group call is not supported.");
    return ncclInvalidUsage;
  }

  // lastChunkSize is only computed in LL mode (in elements).
  int lastChunkSize = 0;
  if (llMode == 1) {
    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
    const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
    lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
    ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
  }

  for (int bid = 0; bid < nBlocks; ++bid) {
    struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
    if (ring->collCount == NCCL_MAX_OPS) {
      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
      return ncclInvalidUsage;
    }

    comm->myParams->gridDim.x++;

    // Wait for the tail slot to be released (active == 0) before reusing it.
    int opIndex = ring->collFifoTail;
    struct ncclColl* c = ring->collectives+opIndex;
    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
    while (LOAD(activePtr) != 0) sched_yield();

    c->args.root = root;
    c->args.N = count;
    c->args.ThisInput = sendbuff;
    c->args.ThisOutput = recvbuff;
    c->args.comm = comm->devComm;
    c->args.opCount = comm->opCount;
    c->args.bid = bid;
    c->args.nRings = nBlocks;
    c->args.nThreads = nThreads;
    c->args.lastChunkSize = lastChunkSize;

    c->nThreads = nThreads;
    c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
    STORE(&c->active, 1);
    opIndex = (opIndex+1)%NCCL_MAX_OPS;
    c->nextIndex = opIndex;
    ring->collFifoTail = opIndex;
    ring->collCount++;
  }
  /*if (llMode == 0)*/ comm->opCount++;
  return ncclSuccess;
}
|
||||
|
||||
extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
|
||||
|
||||
#endif
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,313 +7,20 @@
|
||||
#ifndef NCCL_CORE_H_
|
||||
#define NCCL_CORE_H_
|
||||
|
||||
#define NCCL_MAX_OPS 2048
|
||||
|
||||
#include <pthread.h>
|
||||
#include <algorithm>
|
||||
#include "nccl.h"
|
||||
#include "transport.h"
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "alloc.h"
|
||||
#include "transport.h"
|
||||
#include "devcomm.h"
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "argcheck.h"
|
||||
#include <cstdio>
|
||||
#include <algorithm> // std::min/std::max
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define MAXRINGS 16
|
||||
#define MAXTHREADS 256
|
||||
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
||||
|
||||
// Rings / LL tuning
|
||||
#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
|
||||
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL for Volta and above
|
||||
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
|
||||
#define NCCL_LL_MAX_NTHREADS 256
|
||||
#define NCCL_LL_MIN_NTHREADS 256
|
||||
|
||||
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
#define ROUNDUP(x, y) \
|
||||
(DIVUP((x), (y))*(y))
|
||||
|
||||
#define ALIGN_SIZE(size, align) \
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
union ncclLLFifoLine {
|
||||
/* Flags have to be *after* data, because otherwise, an incomplete receive
|
||||
from the network may receive the flag but not the data.
|
||||
Note this is assuming that either we receive contiguous chunks of data
|
||||
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
|
||||
struct {
|
||||
uint32_t data1;
|
||||
uint32_t flag1;
|
||||
uint32_t data2;
|
||||
uint32_t flag2;
|
||||
};
|
||||
uint64_t v[2];
|
||||
int4 i4;
|
||||
};
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buff; // Local for recv, remote for send
|
||||
uint64_t *tail; // Local for recv, remote for send
|
||||
uint64_t *head; // Local for send, remote for recv
|
||||
uint64_t *opCount; // Local for recv, remote for send
|
||||
|
||||
int direct; // Direct communication
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
|
||||
int *fifo; // Size fifo for proxy
|
||||
|
||||
// Low latency mechanism
|
||||
char *llBuff; // Local for recv, remote for send
|
||||
uint64_t *llHead; // Local for send, remote for recv
|
||||
int *llFifo; // LL Size fifo for proxy
|
||||
uint64_t llStep; // Keep where we are
|
||||
uint64_t llLastCleaning;
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
struct transportProxyInfo* proxyInfo;
|
||||
struct ncclTransport* transport;
|
||||
void* transportResources; // Host-side resources
|
||||
struct ncclConnInfo conn;
|
||||
};
|
||||
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#define MEM_ALIGN 4096
|
||||
#define SIZES_FIFO_SIZE 16
|
||||
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
|
||||
|
||||
#define NCCL_LL_CHUNKS 8
|
||||
#define NUM_LINES_PER_THREAD 8
|
||||
#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
|
||||
#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
|
||||
#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
|
||||
#define NCCL_LL_CLEAN_FREQ 0x10000000
|
||||
|
||||
struct ncclSendMem {
|
||||
union {
|
||||
struct {
|
||||
uint64_t head;
|
||||
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
void* ptrExchange;
|
||||
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
|
||||
uint64_t llHead;
|
||||
};
|
||||
char pad3[MEM_ALIGN];
|
||||
};
|
||||
};
|
||||
|
||||
struct ncclRecvMem {
|
||||
union {
|
||||
struct {
|
||||
uint64_t tail;
|
||||
char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
uint64_t opCount;
|
||||
char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
|
||||
int sizesFifo[SIZES_FIFO_SIZE];
|
||||
int llSizesFifo[SIZES_FIFO_SIZE];
|
||||
};
|
||||
char pad5[MEM_ALIGN];
|
||||
};
|
||||
char llBuff[NCCL_LL_BUFF_SIZE];
|
||||
char buff[1]; // Actually larger than that
|
||||
};
|
||||
|
||||
struct ncclRing {
|
||||
union {
|
||||
struct {
|
||||
int id;
|
||||
int nthreads;
|
||||
// Per ring resources
|
||||
struct ncclSendMem* devMemSend; // CUDA-size resources
|
||||
struct ncclRecvMem* devMemRecv; // CUDA-size resources
|
||||
int buffSize;
|
||||
int devMemSendSize; // Keep the size for IPCs
|
||||
int devMemRecvSize; // Keep the size for IPCs
|
||||
struct ncclConnector send;
|
||||
struct ncclConnector recv;
|
||||
|
||||
// Maps an internal nccl index to user-specified rank order. This is necessary
|
||||
// since we need to know how the user expects data to be ordered across
|
||||
// devices. Ordered from current device.
|
||||
int* userRanks;
|
||||
int* devUserRanks;
|
||||
|
||||
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
|
||||
// allows software to explicitly initiate a flush read to HDP memory. See more
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclColl* collectives;
|
||||
struct ncclColl* devCollectives;
|
||||
int collStart;
|
||||
int collCount;
|
||||
int collFifoHead; // Only used by GPU
|
||||
int collFifoTail; // Only used by CPU
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
|
||||
|
||||
#pragma pack(push) /* push current alignment to stack */
|
||||
#pragma pack(4) /* set alignment to 4 bytes boundary */
|
||||
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclColl. */
|
||||
struct CollectiveArgs {
|
||||
struct ncclComm* comm;
|
||||
uint64_t opCount;
|
||||
|
||||
// local and remote input, output, and buffer
|
||||
const void * ThisInput;
|
||||
void * ThisOutput;
|
||||
|
||||
// general parameters
|
||||
size_t N;
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nRings;
|
||||
uint16_t nThreads;
|
||||
|
||||
int lastChunkSize;
|
||||
};
|
||||
struct ncclColl {
|
||||
union {
|
||||
struct {
|
||||
struct CollectiveArgs args;
|
||||
uint16_t nThreads;
|
||||
uint16_t funcIndex;
|
||||
uint16_t nextIndex;
|
||||
uint8_t active;
|
||||
};
|
||||
int data[0x10];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
|
||||
#pragma pack(pop) /* restore original alignment from stack */
|
||||
|
||||
struct ncclComm {
|
||||
struct ncclRing rings[MAXRINGS];
|
||||
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
int cudaDev; // my cuda device index
|
||||
|
||||
enum { GROUP, PARALLEL } launchMode;
|
||||
hipStream_t userStream;
|
||||
bool userStreamSet;
|
||||
hipEvent_t doneEvent;
|
||||
bool checkPointers;
|
||||
|
||||
// Counter to make sure collectives match (needed for bcast/reduce
|
||||
// where syncs are not symmetric).
|
||||
uint64_t opCount;
|
||||
|
||||
// Rings for collectives
|
||||
int nRings;
|
||||
int nThreads;
|
||||
|
||||
// Low-latency algorithm threshold
|
||||
ssize_t llThreshold;
|
||||
ssize_t threadThreshold;
|
||||
|
||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||
int groupCudaStream;
|
||||
hipStream_t groupStream;
|
||||
|
||||
// Device copy of the communicator
|
||||
struct ncclComm *devComm;
|
||||
|
||||
// Intra-process sync
|
||||
int intraRank;
|
||||
int intraRanks;
|
||||
int* intraBarrier;
|
||||
int intraPhase;
|
||||
|
||||
// Storage for deferred intra-process launch
|
||||
hipLaunchParams* intraParams;
|
||||
hipLaunchParams* myParams;
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclColl args;
|
||||
struct ncclColl* argsptr;
|
||||
};
|
||||
|
||||
// Convert volatile access to atomic
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
#else
|
||||
#define LOAD(VAR) *(VAR)
|
||||
#define STORE(DST, SRC) *(DST) = (SRC)
|
||||
#endif
|
||||
|
||||
// Check CUDA calls
|
||||
#define CUDACHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUDACHECKGOTO(cmd, res, label) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#include <errno.h>
|
||||
// Check system calls
|
||||
#define SYSCHECK(call, name) do { \
|
||||
int retval; \
|
||||
SYSCHECKVAL(call, name, retval); \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKVAL(call, name, retval) do { \
|
||||
SYSCHECKSYNC(call, name, retval); \
|
||||
if (retval == -1) { \
|
||||
WARN("Call to " name " failed : %s", strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKSYNC(call, name, retval) do { \
|
||||
retval = call; \
|
||||
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
|
||||
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
|
||||
} else { \
|
||||
break; \
|
||||
} \
|
||||
} while(true)
|
||||
|
||||
// Propagate errors up
|
||||
#define NCCLCHECK(call) do { \
|
||||
ncclResult_t res = call; \
|
||||
if (res != ncclSuccess) { \
|
||||
/* Print the back trace*/ \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
return res; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#define NCCLCHECKGOTO(call, res, label) do { \
|
||||
res = call; \
|
||||
if (res != ncclSuccess) { \
|
||||
/* Print the back trace*/ \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
#ifdef PROFAPI
|
||||
#define NCCL_API(ret, func, args...) \
|
||||
@@ -333,51 +39,27 @@ struct ncclComm {
|
||||
#endif // end PROFAPI
|
||||
|
||||
int ncclCudaCompCap();
|
||||
ncclResult_t ncclNvlinkGpu(int* nvlink);
|
||||
int64_t ncclTreeThreshold();
|
||||
|
||||
#include <sys/mman.h>
|
||||
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
|
||||
CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
|
||||
memset(*ptr, 0, size);
|
||||
*devPtr = *ptr;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
CUDACHECK(hipHostFree(ptr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
void* p = malloc(nelem*sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
return ncclSystemError;
|
||||
static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
switch (type) {
|
||||
case ncclInt8:
|
||||
case ncclUint8:
|
||||
return 1;
|
||||
case ncclFloat16:
|
||||
return 2;
|
||||
case ncclInt32:
|
||||
case ncclUint32:
|
||||
case ncclFloat32:
|
||||
return 4;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
case ncclFloat64:
|
||||
return 8;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
memset(p, 0, nelem*sizeof(T));
|
||||
*ptr = (T*)p;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
if (isFineGrain) {
|
||||
hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained);
|
||||
if (e != hipSuccess) {
|
||||
*ptr = 0;
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_CPUSET_H_
|
||||
#define NCCL_CPUSET_H_
|
||||
|
||||
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
|
||||
|
||||
// Convert a single hexadecimal digit character to its numeric value (0-15).
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'.
// Returns -1 for any other character; ncclStrToCpuset treats -1 as the
// signal to stop parsing.
static int hexToInt(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
  // Upper-case digits are valid hex too; previously they were rejected.
  if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
  return -1;
}
|
||||
|
||||
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
|
||||
|
||||
// Parse a CPU mask string into a cpu_set_t.
//
// str  : NUL-terminated text made of comma-separated groups of hex digits,
//        most significant group first (e.g. "0003ff,f0003fff"). Each group
//        fills one 32-bit word of the mask. Parsing stops at the first
//        character that is neither ',' nor a hex digit.
// mask : output; fully overwritten on success. Words not covered by the
//        string are cleared.
//
// Returns ncclSuccess, or ncclInvalidArgument when the string holds more
// groups than a cpu_set_t has 32-bit words (mask is left untouched then).
// NOTE(review): a group longer than 8 hex digits silently loses its high
// bits through the shift below — confirm inputs never exceed that.
ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
  uint32_t cpumasks[CPU_SET_N_U32];
  // Groups are stored from the end of cpumasks backwards, so that the last
  // group in the string (least significant) ends up at the lowest index used.
  int m = CPU_SET_N_U32-1;
  cpumasks[m] = 0;
  const size_t len = strlen(str);  // hoisted: str is not modified in the loop
  for (size_t o=0; o<len; o++) {
    char c = str[o];
    if (c == ',') {
      // Refuse to step before cpumasks[0]: too many groups for a cpu_set_t.
      if (m == 0) return ncclInvalidArgument;
      m--;
      cpumasks[m] = 0;
    } else {
      int v = hexToInt(c);
      if (v == -1) break;
      cpumasks[m] <<= 4;
      cpumasks[m] += v;
    }
  }
  // Clear the whole mask first so words above the parsed groups do not keep
  // whatever the caller's (possibly uninitialized) cpu_set_t contained.
  memset(mask, 0, sizeof(cpu_set_t));
  // Copy cpumasks to mask, least significant word first.
  for (int a=0; m<(int)CPU_SET_N_U32; a++,m++) {
    memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
  }
  return ncclSuccess;
}
|
||||
|
||||
// Render a cpu_set_t as hexadecimal text, highest byte first.
//
// Leading all-zero bytes are skipped. Once output has started, every byte is
// written as two lowercase hex digits, and a ',' follows each byte whose index
// is a non-zero multiple of 4 (i.e. between 32-bit groups). An all-zero mask
// yields the empty string.
//
// str receives the NUL-terminated result; no length is checked here, so the
// caller must supply a buffer that is large enough.
// Always returns ncclSuccess.
ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
  const uint8_t* bytes = (const uint8_t*)mask;
  int len = 0;
  int idx = (int)sizeof(cpu_set_t);
  while (idx-- > 0) {
    const uint8_t b = bytes[idx];
    // Nothing emitted yet and this byte is zero: still in the leading zeros.
    if (len == 0 && b == 0) continue;
    len += sprintf(str+len, "%02x", b);
    if (idx != 0 && idx%4 == 0) {
      len += sprintf(str+len, ",");
    }
  }
  str[len] = '\0';
  return ncclSuccess;
}
|
||||
|
||||
#endif
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -25,7 +24,8 @@ extern int ncclDebugLevel;
|
||||
extern uint64_t ncclDebugMask;
|
||||
extern pthread_mutex_t ncclDebugOutputLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen);
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
|
||||
|
||||
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||
|
||||
@@ -108,7 +108,7 @@ static inline void initDebug() {
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024);
|
||||
getHostName(hostname, 1024, '.');
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
|
||||
@@ -0,0 +1,259 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_DEVICE_H_
|
||||
#define NCCL_DEVICE_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include <stdint.h>
|
||||
|
||||
// Convert volatile access to atomic
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
#else
|
||||
#define LOAD(VAR) *(VAR)
|
||||
#define STORE(DST, SRC) *(DST) = (SRC)
|
||||
#endif
|
||||
|
||||
#define NCCL_MAX_OPS 2048
|
||||
#define NCCL_STEPS 8
|
||||
|
||||
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
|
||||
|
||||
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
#define ROUNDUP(x, y) \
|
||||
(DIVUP((x), (y))*(y))
|
||||
|
||||
#define ALIGN_SIZE(size, align) \
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
union ncclLLFifoLine {
|
||||
/* Flags have to be *after* data, because otherwise, an incomplete receive
|
||||
from the network may receive the flag but not the data.
|
||||
Note this is assuming that either we receive contiguous chunks of data
|
||||
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
|
||||
struct {
|
||||
uint32_t data1;
|
||||
uint32_t flag1;
|
||||
uint32_t data2;
|
||||
uint32_t flag2;
|
||||
};
|
||||
uint64_t v[2];
|
||||
int4 i4;
|
||||
};
|
||||
|
||||
#define MAXTHREADS 256
|
||||
#define NCCL_LL_MAX_NTHREADS MAXTHREADS
|
||||
#define NUM_LINES_PER_THREAD 8
|
||||
#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
|
||||
#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
|
||||
#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
|
||||
#ifdef DEBUG_LL
|
||||
#define NCCL_LL_CLEAN_MASK 0x00000ff8
|
||||
#define NCCL_LL_FLAG_MAX 0x00001000
|
||||
#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
|
||||
#else
|
||||
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
|
||||
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
|
||||
#endif
|
||||
// Make sure the clean mask will last for at least NCCL_STEPS
|
||||
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
|
||||
|
||||
struct ncclConnInfo {
|
||||
// Regular comm mechanism
|
||||
char *buff; // Local for recv, remote for send
|
||||
uint64_t *tail; // Local for recv, remote for send
|
||||
uint64_t *head; // Local for send, remote for recv
|
||||
uint64_t *opCountLoc; // opCount of local rank
|
||||
uint64_t *opCountRem; // opCount of remote rank
|
||||
|
||||
int direct; // Direct communication
|
||||
void **ptrExchange; // Pointer exchange for direct communication
|
||||
|
||||
int *fifo; // Size fifo for proxy
|
||||
|
||||
uint64_t step; // Keep where we are
|
||||
|
||||
// Low latency mechanism
|
||||
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
|
||||
uint64_t llLastCleaning;
|
||||
|
||||
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
|
||||
// allows software to explicitly initiate a flush read to HDP memory. See more
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
int connected;
|
||||
struct ncclProxyArgs *proxyAppend;
|
||||
struct ncclTransportComm* transportComm;
|
||||
void* transportResources; // Host-side resources
|
||||
struct ncclConnInfo conn;
|
||||
struct ncclComm *comm;
|
||||
};
|
||||
|
||||
struct ncclRing {
|
||||
// Shortcuts for userRanks[1] and userRanks[n-1]
|
||||
int prev;
|
||||
int next;
|
||||
|
||||
// Maps an internal nccl index to user-specified rank order. This is necessary
|
||||
// since we need to know how the user expects data to be ordered across
|
||||
// devices. Ordered from current device.
|
||||
int* userRanks;
|
||||
int* devUserRanks;
|
||||
};
|
||||
|
||||
|
||||
#define NCCL_MAX_TREE_ARITY 3
|
||||
// Tree-topology description; struct ncclChannel embeds one as its `tree` member.
// NOTE(review): the per-field meanings below are inferred from the names only —
// confirm against the code that fills this structure.
struct ncclTree {
|
||||
// presumably the depth of the tree (or of this rank within it) — TODO confirm
int depth;
|
||||
// presumably the rank of the parent node — TODO confirm
int up;
|
||||
// up to NCCL_MAX_TREE_ARITY entries; presumably the ranks of the child nodes — TODO confirm
int down[NCCL_MAX_TREE_ARITY];
|
||||
};
|
||||
|
||||
// A pair of connectors, one named for sending and one for receiving.
// struct ncclChannel refers to these through its `peers` and `devPeers` pointers.
// NOTE(review): presumably one ncclPeer exists per remote rank — confirm against the allocation code.
struct ncclPeer {
|
||||
struct ncclConnector send;
|
||||
struct ncclConnector recv;
|
||||
};
|
||||
|
||||
struct ncclDevComm;
|
||||
|
||||
#pragma pack(push) /* push current alignment to stack */
|
||||
#pragma pack(4) /* set alignment to 4 bytes boundary */
|
||||
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclColl. */
|
||||
struct CollectiveArgs {
|
||||
struct ncclDevComm* comm;
|
||||
uint64_t opCount;
|
||||
|
||||
// local and remote input, output, and buffer
|
||||
const void * ThisInput;
|
||||
void * ThisOutput;
|
||||
|
||||
// general parameters
|
||||
size_t N;
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint16_t nThreads;
|
||||
|
||||
int lastChunkSize;
|
||||
};
|
||||
struct ncclColl {
|
||||
union {
|
||||
struct {
|
||||
struct CollectiveArgs args;
|
||||
uint16_t funcIndex;
|
||||
uint16_t nextIndex;
|
||||
uint8_t active;
|
||||
};
|
||||
int data[0x10];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
|
||||
|
||||
struct ncclChannel {
|
||||
union {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
|
||||
int id;
|
||||
int nthreads;
|
||||
int buffSize;
|
||||
|
||||
// Communication structures
|
||||
struct ncclPeer* peers;
|
||||
struct ncclPeer* devPeers;
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclColl* collectives;
|
||||
struct ncclColl* devCollectives;
|
||||
int collStart;
|
||||
int collCount;
|
||||
int collFifoHead; // Only used by GPU
|
||||
int collFifoTail; // Only used by CPU
|
||||
|
||||
uint32_t* abortCount;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
||||
#pragma pack(pop) /* restore original alignment from stack */
|
||||
|
||||
#define MAXCHANNELS 16
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf {
|
||||
union {
|
||||
struct {
|
||||
uint64_t total_cycle;
|
||||
uint64_t wait_send_cycle[MAXCHANNELS];
|
||||
uint64_t wait_recv_cycle[MAXCHANNELS];
|
||||
// primitive cycles
|
||||
uint64_t send_cycle;
|
||||
uint64_t directSend_cycle;
|
||||
uint64_t recv_cycle;
|
||||
uint64_t directRecv_cycle;
|
||||
uint64_t copySend_cycle;
|
||||
uint64_t directCopySend_cycle;
|
||||
uint64_t recvCopySend_cycle;
|
||||
uint64_t directRecvCopySend_cycle;
|
||||
uint64_t recvReduceCopy_cycle;
|
||||
uint64_t recvReduceSend_cycle;
|
||||
uint64_t recvReduceCopySend_cycle;
|
||||
uint64_t directRecvReduceCopySend_cycle;
|
||||
// primitive bytes
|
||||
uint64_t send_byte;
|
||||
uint64_t directSend_byte;
|
||||
uint64_t recv_byte;
|
||||
uint64_t directRecv_byte;
|
||||
uint64_t copySend_byte;
|
||||
uint64_t directCopySend_byte;
|
||||
uint64_t recvCopySend_byte;
|
||||
uint64_t directRecvCopySend_byte;
|
||||
uint64_t recvReduceCopy_byte;
|
||||
uint64_t recvReduceSend_byte;
|
||||
uint64_t recvReduceCopySend_byte;
|
||||
uint64_t directRecvReduceCopySend_byte;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
// Error status type; struct ncclDevComm exposes a value of this type through
// its volatile `fatalDevError` pointer.
// NOTE(review): the two "mismatch" values presumably report collective-call
// mismatches detected by device code — confirm where they are set and read.
typedef enum {
|
||||
// no error
ncclDevSuccess,
|
||||
ncclDevAssertedMismatch,
|
||||
ncclDevSuspectedMismatch
|
||||
} ncclDevError_t;
|
||||
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile ncclDevError_t *fatalDevError;
|
||||
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf* devProf;
|
||||
#endif
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -11,12 +11,14 @@
|
||||
#include "core.h"
|
||||
#include "group.h"
|
||||
|
||||
typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
// Channels / LL tuning
|
||||
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
|
||||
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL
|
||||
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
|
||||
#define NCCL_THREAD_THRESHOLD_VEGA 8 // Per thread size before we switch to non-LL for VEGA
|
||||
#define NCCL_LL_MIN_NTHREADS 256
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
|
||||
void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
|
||||
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_INFO_H_
|
||||
#define NCCL_INFO_H_
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
typedef enum {
|
||||
ncclPatternRing,
|
||||
ncclPatternRingTwice,
|
||||
ncclPatternPipelineFrom,
|
||||
ncclPatternPipelineTo,
|
||||
ncclPatternTreeUp,
|
||||
ncclPatternTreeDown,
|
||||
ncclPatternTreeUpDown
|
||||
} ncclPattern_t;
|
||||
|
||||
// Used to pass NCCL call information between functions
|
||||
struct ncclInfo {
|
||||
ncclColl_t coll;
|
||||
const char* opName;
|
||||
// NCCL Coll Args
|
||||
const void* sendbuff;
|
||||
void* recvbuff;
|
||||
size_t count;
|
||||
ncclDataType_t datatype;
|
||||
ncclRedOp_t op;
|
||||
int root;
|
||||
ncclComm_t comm;
|
||||
hipStream_t stream;
|
||||
// Algorithm details
|
||||
int chunkSteps;
|
||||
int sliceSteps;
|
||||
// Computed later
|
||||
ncclPattern_t pattern;
|
||||
size_t nBytes;
|
||||
int nstepsPerLoop;
|
||||
int nchunksPerLoop;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -58,8 +58,51 @@ typedef struct {
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v1_t;
|
||||
|
||||
typedef ncclNet_v1_t ncclNet_t;
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Return the device path in /sys. NCCL will call free on this path.
|
||||
ncclResult_t (*pciPath)(int dev, char** path);
|
||||
// Return whether this device supports host pointers and/or CUDA pointers
|
||||
// as data from the current GPU. Supported types should be composed with
|
||||
// NCCL_PTR_HOST and NCCL_PTR_CUDA.
|
||||
ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||
// Finalize connection establishment after remote peer has called connectHandle
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v2_t;
|
||||
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
|
||||
typedef ncclNet_v2_t ncclNet_t;
|
||||
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -13,11 +13,6 @@
|
||||
extern ncclNet_t* ncclNet;
|
||||
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
/* Socket Interface Selection type */
|
||||
typedef enum { findSubnetIf = -1,
|
||||
dontCareIf = -2
|
||||
} ncclSocketIfSl_t;
|
||||
|
||||
// Translation to external API
|
||||
static const char* ncclNetName() { return ncclNet->name; }
|
||||
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
|
||||
@@ -26,15 +21,16 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(
|
||||
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
|
||||
extern ncclNet_t ncclNetIb;
|
||||
extern ncclNet_t ncclNetSocket;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -19,6 +19,7 @@
|
||||
enum ncclNvLinkDeviceType {
|
||||
ncclNvLinkDeviceGpu,
|
||||
ncclNvLinkDeviceSwitch,
|
||||
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
|
||||
};
|
||||
|
||||
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
|
||||
@@ -26,7 +27,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
|
||||
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
|
||||
char* rPath = realpath(classPath, NULL);
|
||||
int fd;
|
||||
SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
|
||||
if ((fd = open(rPath, O_RDONLY)) == -1) {
|
||||
// Could not find device. It might be because we're in a VM and
|
||||
// we don't see the whole machine. This is handled silently so
|
||||
// we don't want to print an INFO error.
|
||||
TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
free(rPath);
|
||||
char pciClass[9];
|
||||
strncpy(pciClass, "0x000000", 9);
|
||||
@@ -36,6 +43,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
|
||||
if (strcmp(pciClass, "0x068000") == 0) {
|
||||
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
|
||||
*type = ncclNvLinkDeviceSwitch;
|
||||
} else if (strcmp(pciClass, "0x068001") == 0) {
|
||||
// PCI device is of type "Bridge: IBM Device 04ea"
|
||||
*type = ncclNvLinkDeviceBridge;
|
||||
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|
||||
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
|
||||
*type = ncclNvLinkDeviceGpu;
|
||||
@@ -49,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
|
||||
/* Get the maximum number of NVLinks based on the GPU generation */
|
||||
static ncclResult_t getMaxNvlinks(int* maxLinks) {
|
||||
int cudaDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
int ccMajor;
|
||||
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
|
||||
CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
|
||||
// 6 for Volta, 4 for Pascal
|
||||
*maxLinks = (ccMajor > 6) ? 6 : 4;
|
||||
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
|
||||
@@ -68,18 +78,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
if (res != ncclSuccess) return 0;
|
||||
|
||||
for(int l=0; l<maxNvLinks; ++l) {
|
||||
// nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
|
||||
// report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
|
||||
// don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
|
||||
// the POWER CPU case, so it seems best to check this as well.
|
||||
// Check whether we can use this NVLink for P2P
|
||||
unsigned canP2P;
|
||||
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
|
||||
|
||||
// nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
|
||||
// if the links don't exist, or are disabled. So checking for that return
|
||||
// here would probably make the nvmlDeviceGetNvLinkCapability check above
|
||||
// redundant. Presumably, we still need to check the P2P capability above,
|
||||
// since even non-GPUs would possess PCI info.
|
||||
// Make sure the Nvlink is up. The previous call should have trained the link.
|
||||
nvmlEnableState_t isActive;
|
||||
if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
|
||||
|
||||
// Try to figure out what's on the other side of the NVLink
|
||||
nvmlPciInfo_t remoteProc;
|
||||
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
|
||||
|
||||
@@ -90,7 +97,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
p[c] = toupper(p[c]);
|
||||
}
|
||||
|
||||
if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
|
||||
if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
|
||||
links++;
|
||||
} else {
|
||||
// Make a lower case copy of the bus ID for calling ncclDeviceType
|
||||
@@ -102,11 +109,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
lowerId[c] = tolower(p[c]);
|
||||
}
|
||||
|
||||
// Determine if the remote side is NVswitch
|
||||
// Determine if the remote side is NVswitch or a GPU
|
||||
enum ncclNvLinkDeviceType type;
|
||||
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
|
||||
//TODO: we are making an assumption that all GPUs are connected to this switch
|
||||
//This assumption may change for future architectures
|
||||
ncclResult_t ret = ncclDeviceType(lowerId, &type);
|
||||
if (ret == ncclSuccess) {
|
||||
if (type == ncclNvLinkDeviceSwitch) {
|
||||
//TODO: we are making an assumption that all GPUs are connected to this switch
|
||||
//This assumption may change for future architectures
|
||||
nvswitch_links++;
|
||||
} else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
|
||||
links++;
|
||||
}
|
||||
} else {
|
||||
// The NVLink is up but we couldn't find the PCI device on the other
|
||||
// side. Assume it's an NVswitch outside a VM.
|
||||
if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
|
||||
nvswitch_links++;
|
||||
}
|
||||
}
|
||||
@@ -114,43 +131,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
|
||||
}
|
||||
|
||||
static int getNumNvlinks(const char* busId) {
|
||||
nvmlDevice_t nvmlDev;
|
||||
ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
|
||||
if (res != ncclSuccess) return 0;
|
||||
|
||||
int nvlinks = 0, nvswitch_links = 0;
|
||||
int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
|
||||
for(int l=0; l<maxNvLinks; ++l) {
|
||||
unsigned canP2P;
|
||||
nvmlEnableState_t isActive;
|
||||
if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
|
||||
wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
|
||||
nvlinks++;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlPciInfo_t remoteProc;
|
||||
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
|
||||
|
||||
// Make a lower case copy of the bus ID for calling ncclDeviceType
|
||||
// PCI system path is in lower case
|
||||
char* p = remoteProc.busId;
|
||||
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
|
||||
if (p[c] == 0) break;
|
||||
lowerId[c] = tolower(p[c]);
|
||||
}
|
||||
|
||||
// Determine if the remote side is NVswitch
|
||||
enum ncclNvLinkDeviceType type;
|
||||
if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
|
||||
//TODO: we are making an assumption that all GPUs are connected to this switch
|
||||
//This assumption may change for future architectures
|
||||
nvswitch_links++;
|
||||
}
|
||||
}
|
||||
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -8,13 +8,23 @@
|
||||
#ifndef NCCL_NVLINK_H_
|
||||
#define NCCL_NVLINK_H_
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "nvmlwrap.h"
|
||||
#include "topo.h"
|
||||
|
||||
#define CONNECT_NVLINK 0x10
|
||||
#define CONNECT_NVSWITCH 0x100
|
||||
|
||||
static int getNumNvlinks(const char* busId) {
|
||||
return 0;
|
||||
enum ncclNvLinkDeviceType {
|
||||
ncclNvLinkDeviceGpu,
|
||||
ncclNvLinkDeviceSwitch,
|
||||
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
|
||||
};
|
||||
|
||||
static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
int links = 0;
|
||||
return CONNECT_NVLINK*links;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -7,7 +7,7 @@
|
||||
#ifndef NCCL_NVMLWRAP_H_
|
||||
#define NCCL_NVMLWRAP_H_
|
||||
|
||||
#include "core.h"
|
||||
#include "nccl.h"
|
||||
|
||||
//#define NVML_DIRECT 1
|
||||
#ifdef NVML_DIRECT
|
||||
@@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
|
||||
NVMLCHECK(nvmlDeviceGetIndex(device, index));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
|
||||
NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
|
||||
NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
|
||||
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
|
||||
return ncclSuccess;
|
||||
@@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
|
||||
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
|
||||
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
|
||||
return ncclSuccess;
|
||||
}
|
||||
#else
|
||||
// Dynamically handle dependencies on NVML
|
||||
|
||||
@@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void);
|
||||
ncclResult_t wrapNvmlShutdown(void);
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
|
||||
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
|
||||
ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
|
||||
ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
|
||||
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
|
||||
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
|
||||
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
|
||||
nvmlNvLinkCapability_t capability, unsigned int *capResult);
|
||||
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
|
||||
|
||||
#endif // NVML_DIRECT
|
||||
|
||||
#endif // End include guard
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
|
||||
s++;
|
||||
strncpy(envValue, line+s, 1024);
|
||||
setenv(envVar, envValue, 0);
|
||||
char *str = getenv(envVar);
|
||||
}
|
||||
if (line) free(line);
|
||||
fclose(file);
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_RING_H_
|
||||
#define NCCL_RING_H_
|
||||
#include "core.h"
|
||||
|
||||
ncclResult_t initRing(struct ncclComm* comm, int ringid);
|
||||
ncclResult_t freeRing(struct ncclRing* ring);
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -9,14 +9,13 @@
|
||||
#define NCCL_RINGS_H_
|
||||
|
||||
static int getDefaultThreads() {
|
||||
// On Kepler, rings are doubled later.
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
return 256;
|
||||
#else
|
||||
#else // On Kepler, rings are doubled later.
|
||||
return ncclCudaCompCap() == 3 ? 128 : 256;
|
||||
#endif
|
||||
}
|
||||
|
||||
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
|
||||
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -18,8 +18,9 @@
|
||||
|
||||
#define MAX_IFS 16
|
||||
#define MAX_IF_NAME_SIZE 16
|
||||
#define SLEEP_INT 1000 // sleep interval in usec
|
||||
#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
|
||||
#define SLEEP_INT 1000 // connection retry sleep interval in usec
|
||||
#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
|
||||
#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
|
||||
|
||||
/* Common socket address storage structure for IPv4/IPv6 */
|
||||
union socketAddress {
|
||||
@@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
static inline short socketToPort(struct sockaddr *saddr) {
|
||||
static inline uint16_t socketToPort(struct sockaddr *saddr) {
|
||||
return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
|
||||
}
|
||||
|
||||
@@ -60,9 +61,12 @@ static inline int envSocketFamily(void) {
|
||||
}
|
||||
|
||||
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[1024];
|
||||
#endif
|
||||
struct netIf userIfs[MAX_IFS];
|
||||
bool searchNot = prefixList && prefixList[0] == '^';
|
||||
bool searchExact = prefixList && prefixList[0] == '=';
|
||||
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
|
||||
|
||||
int found = 0;
|
||||
@@ -89,7 +93,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
|
||||
}
|
||||
|
||||
// check against user specified interfaces
|
||||
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
|
||||
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -106,7 +110,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
|
||||
// Store the IP address
|
||||
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
|
||||
memcpy(addrs+found, interface->ifa_addr, salen);
|
||||
INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
|
||||
found++;
|
||||
}
|
||||
}
|
||||
@@ -159,7 +162,10 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
|
||||
}
|
||||
|
||||
static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
|
||||
char line[1024], line_a[1024];
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[1024];
|
||||
#endif
|
||||
char line_a[1024];
|
||||
int found = 0;
|
||||
struct ifaddrs *interfaces, *interface;
|
||||
getifaddrs(&interfaces);
|
||||
@@ -183,7 +189,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
|
||||
// Store the interface name
|
||||
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
|
||||
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
|
||||
found++;
|
||||
if (found == maxIfs) break;
|
||||
}
|
||||
@@ -336,8 +342,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
|
||||
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
|
||||
#endif
|
||||
|
||||
/* Put the socket in listen mode */
|
||||
SYSCHECK(listen(sockfd, 128), "listen");
|
||||
/* Put the socket in listen mode
|
||||
* NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
|
||||
*/
|
||||
SYSCHECK(listen(sockfd, 16384), "listen");
|
||||
*fd = sockfd;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -367,14 +375,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
||||
#endif
|
||||
|
||||
int ret;
|
||||
int retries = 0;
|
||||
int timedout_retries = 0;
|
||||
int refused_retries = 0;
|
||||
retry:
|
||||
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
|
||||
if (ret == 0) return ncclSuccess;
|
||||
if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
|
||||
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
|
||||
if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
||||
INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
@@ -382,12 +394,12 @@ retry:
|
||||
|
||||
#define NCCL_SOCKET_SEND 0
|
||||
#define NCCL_SOCKET_RECV 1
|
||||
static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
|
||||
static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
|
||||
int bytes = 0;
|
||||
char* data = (char*)ptr;
|
||||
do {
|
||||
if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
|
||||
if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
|
||||
if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
||||
if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
|
||||
if (op == NCCL_SOCKET_RECV && bytes == 0) {
|
||||
WARN("Net : Connection closed by remote peer");
|
||||
return ncclSystemError;
|
||||
@@ -405,9 +417,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
|
||||
return socketProgressOpt(op, fd, ptr, size, offset, 0);
|
||||
}
|
||||
|
||||
static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) {
|
||||
while (*offset < size)
|
||||
NCCLCHECK(socketProgress(op, fd, ptr, size, offset));
|
||||
NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -12,78 +11,35 @@
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
#define BUSID_SIZE (sizeof("0000:00:00.0"))
|
||||
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
|
||||
ncclResult_t getCudaPath(int cudaDev, char** path);
|
||||
|
||||
static bool isEPYC() {
|
||||
std::ifstream cpuinfo("/proc/cpuinfo");
|
||||
std::string line;
|
||||
int needed = 2;
|
||||
static bool vendor_id = true, cpu_family = false, initialized = false;
|
||||
if (initialized) return (vendor_id && cpu_family);
|
||||
while (std::getline(cpuinfo, line)) {
|
||||
if (line.compare(0, 9, "vendor_id") == 0) {
|
||||
if(line.find("AuthenticAMD") == std::string::npos)
|
||||
vendor_id = false;
|
||||
needed --;
|
||||
}
|
||||
if (line.compare(0, 10, "cpu family") == 0) {
|
||||
std::string family_str = line.substr(line.find(": ") + 2);
|
||||
if (std::stoi(family_str) >= 23)
|
||||
cpu_family = true;
|
||||
needed --;
|
||||
}
|
||||
if (!needed)
|
||||
break;
|
||||
}
|
||||
initialized = true;
|
||||
return (vendor_id && cpu_family);
|
||||
}
|
||||
static int getNumaId(char *path) {
|
||||
char npath[PATH_MAX];
|
||||
snprintf(npath, PATH_MAX, "%s/numa_node", path);
|
||||
npath[PATH_MAX-1] = '\0';
|
||||
|
||||
static ncclResult_t getCudaPath(int cudaDev, char** path) {
|
||||
char busId[BUSID_SIZE];
|
||||
CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
|
||||
for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
|
||||
char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
|
||||
memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
|
||||
memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
|
||||
*path = realpath(busPath, NULL);
|
||||
if (*path == NULL) {
|
||||
WARN("Could not find real path of %s", busPath);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
int numaId = -1;
|
||||
FILE *file = fopen(npath, "r");
|
||||
if (file == NULL) return -1;
|
||||
if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
|
||||
fclose(file);
|
||||
|
||||
return numaId;
|
||||
}
|
||||
|
||||
enum ncclPathDist {
|
||||
PATH_PIX = 0,
|
||||
PATH_PXB = 1,
|
||||
PATH_PHB = 2,
|
||||
PATH_SOC = 3
|
||||
PATH_PIX = 0,
|
||||
PATH_PXB = 1,
|
||||
PATH_PHB = 2,
|
||||
PATH_NODE = 3,
|
||||
PATH_SYS = 4,
|
||||
PATH_ARRAY_SIZE = 5
|
||||
};
|
||||
|
||||
static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
|
||||
extern const char* pathDists[PATH_ARRAY_SIZE];
|
||||
|
||||
static int pciDistance(char* path1, char* path2) {
|
||||
int score = 0;
|
||||
int depth = 0;
|
||||
int same = 1;
|
||||
for (int i=0; i<strlen(path1); i++) {
|
||||
if (path1[i] != path2[i]) same = 0;
|
||||
if (path1[i] == '/') {
|
||||
depth++;
|
||||
if (same == 1) score++;
|
||||
}
|
||||
}
|
||||
if (isEPYC() && score <= 3) return PATH_PHB;
|
||||
if (score <= 3) return PATH_SOC;
|
||||
if (score == 4) return PATH_PHB;
|
||||
if (score == depth-1) return PATH_PIX;
|
||||
return PATH_PXB;
|
||||
}
|
||||
int pciDistance(char* path1, char* path2);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,7 +8,9 @@
|
||||
#define NCCL_TRANSPORT_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "devcomm.h"
|
||||
#include <stdint.h>
|
||||
#include "nvmlwrap.h"
|
||||
|
||||
#define NTRANSPORTS 3
|
||||
|
||||
@@ -19,11 +21,13 @@ struct ncclRing;
|
||||
struct ncclConnector;
|
||||
struct ncclComm;
|
||||
|
||||
#define RANK_INFO_SIZE 64
|
||||
typedef char ncclTinfo_t[RANK_INFO_SIZE];
|
||||
|
||||
struct ncclInfo {
|
||||
ncclTinfo_t tinfo[NTRANSPORTS];
|
||||
struct ncclPeerInfo {
|
||||
int rank;
|
||||
int cudaDev;
|
||||
int nvmlDev;
|
||||
uint64_t hostHash;
|
||||
uint64_t pidHash;
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
};
|
||||
|
||||
// Used to hold the transport connection values
|
||||
@@ -34,18 +38,47 @@ struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
};
|
||||
|
||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||
|
||||
struct ncclProxyArgs;
|
||||
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
struct ncclProxyArgs {
|
||||
struct ncclRing* ring;
|
||||
int substeps;
|
||||
proxyProgressFunc_t progress;
|
||||
struct ncclChannel* channel;
|
||||
struct ncclConnector* connector;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int nsteps;
|
||||
uint64_t opCount;
|
||||
int llMode;
|
||||
bool needProxy;
|
||||
int active; // add component before this line -- it is left out during initialization
|
||||
int state; // add component before this line -- it is left out during initialization
|
||||
|
||||
// Internal state
|
||||
uint64_t head;
|
||||
uint64_t tail;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
int idle;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
};
|
||||
|
||||
struct ncclProxyPool;
|
||||
struct ncclProxyState {
|
||||
pthread_cond_t cond;
|
||||
pthread_mutex_t mutex;
|
||||
bool stop;
|
||||
struct ncclProxyArgs* ops;
|
||||
struct ncclProxyArgs* pool;
|
||||
struct ncclProxyPool* pools;
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
|
||||
ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
|
||||
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
|
||||
ncclResult_t (*free)(void*);
|
||||
ncclResult_t (*proxy)(struct ncclProxyArgs*);
|
||||
@@ -53,8 +86,7 @@ struct ncclTransportComm {
|
||||
|
||||
struct ncclTransport {
|
||||
const char name[4];
|
||||
ncclResult_t (*fillInfo)(ncclTinfo_t*, int, uint64_t);
|
||||
ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
|
||||
ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
|
||||
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
|
||||
struct ncclTransportComm send;
|
||||
struct ncclTransportComm recv;
|
||||
@@ -64,37 +96,17 @@ struct ncclTransport {
|
||||
|
||||
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
|
||||
|
||||
#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
|
||||
|
||||
struct transportProxyInfo {
|
||||
struct ncclComm* comm;
|
||||
pthread_t thread;
|
||||
threadFunc_t func;
|
||||
volatile int proxyReady;
|
||||
struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
|
||||
volatile uint64_t argsFifoHead;
|
||||
volatile uint64_t argsFifoTail;
|
||||
pthread_cond_t cond;
|
||||
pthread_mutex_t mutex;
|
||||
};
|
||||
|
||||
ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
|
||||
ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
|
||||
|
||||
enum proxyMode {
|
||||
proxyRing = 0,
|
||||
proxyFrom = 1,
|
||||
proxyTo = 2
|
||||
};
|
||||
|
||||
static int proxyPatternRing = proxyRing;
|
||||
static inline int proxyPatternFrom(int root) { return 1+root; }
|
||||
static inline int proxyPatternTo(int root) { return -1-root; }
|
||||
static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
|
||||
static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
|
||||
|
||||
ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
|
||||
ncclResult_t transportStartProxies(struct ncclComm* comm);
|
||||
ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
|
||||
ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
|
||||
ncclResult_t transportStartProxy(struct ncclComm* comm);
|
||||
ncclResult_t transportCreateProxy(struct ncclComm* comm);
|
||||
ncclResult_t transportDestroyProxy(struct ncclComm* comm);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -106,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
|
||||
}
|
||||
}
|
||||
|
||||
inline void transportProxyIdle(int idle) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_TREES_H_
|
||||
#define NCCL_TREES_H_
|
||||
|
||||
ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
|
||||
ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,7 +11,7 @@
|
||||
#include "nccl.h"
|
||||
#include <stdint.h>
|
||||
|
||||
ncclResult_t getHostName(char* hostname, int maxlen);
|
||||
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
uint64_t getnHash(const char* string, int n);
|
||||
uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
@@ -21,6 +22,6 @@ struct netIf {
|
||||
};
|
||||
|
||||
int parseStringList(const char* string, struct netIf* ifList, int maxList);
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
|
||||
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
|
||||
|
||||
#endif
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,970 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
#include "ring.h"
|
||||
#include "param.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "rings.h"
|
||||
#include "bootstrap.h"
|
||||
#include "transport.h"
|
||||
#include "common_coll.h"
|
||||
#include "group.h"
|
||||
#include "utils.h"
|
||||
#include "net.h"
|
||||
#include "topo.h"
|
||||
#include <numa.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sched.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#define STR2(v) #v
|
||||
#define STR(v) STR2(v)
|
||||
|
||||
int ncclDebugLevel;
|
||||
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
|
||||
pthread_mutex_t ncclDebugOutputLock;
|
||||
FILE *ncclDebugFile = stdout;
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
|
||||
#else
|
||||
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
|
||||
#endif
|
||||
|
||||
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||
|
||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||
|
||||
ncclNet_t* ncclNet = NULL;
|
||||
|
||||
// We define this as weak to let tests redefine their own
|
||||
#pragma weak ncclCudaCompCap
|
||||
int ncclCudaCompCap() {
|
||||
int cudaDev;
|
||||
if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
|
||||
int ccMajor;
|
||||
if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
|
||||
return ccMajor;
|
||||
}
|
||||
int ncclCudaFullCompCap() {
|
||||
int cudaDev;
|
||||
if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
|
||||
int ccMajor, ccMinor;
|
||||
if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
|
||||
if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
|
||||
return ccMajor*10+ccMinor;
|
||||
}
|
||||
|
||||
// Returns ncclInternalError if anything fails, causing that network to be ignored.
|
||||
ncclResult_t initNet(ncclNet_t* net) {
  // Initialize the backend, then ask how many devices it exposes.
  if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;

  int deviceCount;
  if (net->devices(&deviceCount) != ncclSuccess) return ncclInternalError;

  if (deviceCount > 0) return ncclSuccess;

  // A backend without devices is unusable.
  INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name);
  return ncclSystemError;
}
|
||||
|
||||
ncclResult_t initNetPlugin(ncclNet_t** net) {
  // Tries to load an external network plugin. Always returns ncclSuccess;
  // *net is only written when the plugin loads and initializes correctly.
  void* pluginHandle = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
  if (pluginHandle == NULL) {
    // dlopen is not guaranteed to set errno, but dlerror only yields a string,
    // so errno is consulted to try to give a more precise message.
    if (errno == ENOENT) {
      INFO(NCCL_INIT|NCCL_NET, "No network plugin found.");
    } else {
      INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror());
    }
    return ncclSuccess;
  }

  ncclNet_t* candidate = (ncclNet_t*) dlsym(pluginHandle, STR(NCCL_PLUGIN_SYMBOL));
  if (candidate == NULL) {
    INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
  } else if (initNet(candidate) == ncclSuccess) {
    // Keep the library loaded: the returned table lives inside it.
    *net = candidate;
    return ncclSuccess;
  }

  // Missing symbol or failed init: unload the library again.
  dlclose(pluginHandle);
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t initNet() {
  // The socket backend is always initialized because bootstrap relies on it.
  NCCLCHECK(initNet(&ncclNetSocket));

  // An external plugin, when present and working, takes precedence.
  NCCLCHECK(initNetPlugin(&ncclNet));
  if (ncclNet != NULL) {
    INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName());
    return ncclSuccess;
  }

  // Otherwise use IB if it initializes, falling back to sockets.
  ncclNet = (initNet(&ncclNetIb) == ncclSuccess) ? &ncclNetIb : &ncclNetSocket;
  INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName());
  return ncclSuccess;
}
|
||||
|
||||
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
|
||||
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
|
||||
|
||||
int ncclThreadThreshold(int minCompCap, int multiNode) {
|
||||
int threshold = ncclParamThreadThreshold();
|
||||
if (threshold == -2) { // user has not set this env variable
|
||||
threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
|
||||
// multiply by 2 if running on multiple nodes
|
||||
if (multiNode) {
|
||||
threshold *= 2;
|
||||
}
|
||||
}
|
||||
return threshold;
|
||||
}
|
||||
|
||||
bool useFineGrainVramPcie = false;
|
||||
|
||||
void parseHsaForceFineGrainVramPcie() {
|
||||
char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
|
||||
if (str && strlen(str) > 0) {
|
||||
errno = 0;
|
||||
int64_t v = strtoll(str, NULL, 0);
|
||||
if (errno || (v != 0 && v != 1)) {
|
||||
INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \
|
||||
} else {
|
||||
useFineGrainVramPcie = v;
|
||||
INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static bool initialized = false;
|
||||
static ncclResult_t ncclInit() {
  // One-time library initialization. The flag is tested once without the
  // lock and again with initLock held before the init steps run.
  if (!initialized) {
    pthread_mutex_lock(&initLock);
    if (!initialized) {
      initEnv();
      initDebug();
      initNet();
      // Pick up HSA_FORCE_FINE_GRAIN_PCIE from the environment, if set.
      parseHsaForceFineGrainVramPcie();
      initialized = true;
    }
    pthread_mutex_unlock(&initLock);
  }
  return ncclSuccess;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGetVersion, int* version);
|
||||
ncclResult_t ncclGetVersion(int* version) {
  // Writes NCCL_VERSION_CODE to *version; a NULL pointer is rejected.
  if (version != NULL) {
    *version = NCCL_VERSION_CODE;
    return ncclSuccess;
  }
  return ncclInvalidArgument;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
  // Library init comes first, then the output pointer is validated.
  NCCLCHECK(ncclInit());
  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
  // The id itself is produced by the bootstrap layer.
  ncclResult_t status = bootstrapGetUniqueId(out);
  return status;
}
|
||||
|
||||
static ncclResult_t commFree(ncclComm_t comm) {
  // Releases everything owned by a communicator; a NULL comm is a no-op.
  if (comm == NULL) return ncclSuccess;

  CUDACHECK(hipFree(comm->devComm));

  for (int r = 0; r < comm->nRings; r++) {
    NCCLCHECK(freeRing(&comm->rings[r]));
  }

  if (comm->doneEvent != NULL) {
    CUDACHECK(hipEventDestroy(comm->doneEvent));
  }

  // The group stream is only destroyed in GROUP launch mode.
  if (comm->launchMode == ncclComm::GROUP) {
    CUDACHECK(hipStreamDestroy(comm->groupStream));
  }

  // Whichever rank enters the barrier last frees the intra-process shared state.
  int lastOne;
  NCCLCHECK(ncclCpuBarrierIn(comm, &lastOne));
  if (lastOne) {
    free(comm->intraBarrier);
    free(comm->intraParams);
    free(comm->intraCudaDevs);
    free(comm->intraCGMode);
    free(comm->intraCC);
  }

  free(comm);
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  // Allocates and fills a host-side communicator for `rank` out of `ndev`.
  // On success *comret receives the new comm; on failure nothing is leaked
  // and *comret is left untouched.
  if (ndev < 1) {
    WARN("invalid device count (%d) requested", ndev);
    return ncclInvalidArgument;
  }
  if (rank >= ndev || rank < 0) {
    WARN("rank %d exceeds ndev=%d", rank, ndev);
    return ncclInvalidArgument;
  }

  // Query the device before anything is allocated so a failure here needs
  // no cleanup (the result used to be ignored).
  int cudaDev;
  CUDACHECK(hipGetDevice(&cudaDev));

  // Try to create a device object right away. If there is something wrong
  // with the device we're on (failure cause #1), better know it early.
  hipEvent_t doneEvent;
  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));

  struct ncclComm* comm;
  ncclResult_t allocRes = ncclCalloc(&comm, 1);
  if (allocRes != ncclSuccess) {
    // Do not leak the event created above.
    (void) hipEventDestroy(doneEvent);
    return allocRes;
  }

  INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
  comm->rank = rank;
  comm->nRanks = ndev;
  comm->cudaDev = cudaDev;
  comm->doneEvent = doneEvent;
  comm->llThreshold = ncclParamLlThreshold();
  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
  comm->groupCudaStream = ncclParamGroupCudaStream();
#else
  // Don't allow the user to overload the default setting in older CUDA builds
  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif

  comm->argsptr = &comm->args;

  *comret = comm;
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t devCommSetup(ncclComm_t comm) {
  // Allocate the device-side comm and copy the host comm into it.
  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));

  // Copy each ring's userRanks array (nRanks entries) to devUserRanks.
  for (int r = 0; r < comm->nRings; r++) {
    struct ncclRing* ring = comm->rings + r;
    NCCLCHECK(ncclCudaMemcpy(ring->devUserRanks, ring->userRanks, comm->nRanks));
  }
  return ncclSuccess;
}
|
||||
|
||||
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
|
||||
#else
|
||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
|
||||
#endif
|
||||
static void showVersion() {
|
||||
static int shown = 0;
|
||||
if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
|
||||
printf("%s\n", VERSION_STRING);
|
||||
fflush(stdout);
|
||||
if (ncclDebugFile != stdout)
|
||||
INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
|
||||
shown = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static ncclResult_t fillInfo(struct ncclInfo* info, int rank, uint64_t commHash) {
  // Each transport fills its own slot of the per-rank info record.
  for (int t = 0; t < NTRANSPORTS; t++) {
    struct ncclTransport* transport = ncclTransports + t;
    NCCLCHECK(transport->fillInfo(&info->tinfo[t], rank, commHash));
  }
  return ncclSuccess;
}
|
||||
|
||||
bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice);
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
ncclTvalue_t ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
|
||||
if (ret > 0) {
|
||||
cpu_set_t affinitySave;
|
||||
nvmlDevice_t nvmlDevice;
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
SetCpuAffinity(cudaDev, &nvmlDevice);
|
||||
NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
|
||||
*transportRet = transport;
|
||||
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("No transport found !");
|
||||
*transportRet = NULL;
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
  NCCLCHECK(initRing(comm, ringid));
  struct ncclRing* ring = comm->rings+ringid;

  // Locate this rank in the ring order...
  int shift = 0;
  while (shift < nranks && ringRanks[shift] != rank) shift++;

  // ...and rotate the order so that userRanks starts at that position.
  for (int pos = 0; pos < nranks; pos++) {
    ring->userRanks[pos] = ringRanks[(pos+shift)%nranks];
  }

  // After rotation the neighbours are the last and the second entries.
  const int prevRank = ring->userRanks[nranks-1];
  const int nextRank = ring->userRanks[1];

  // connect[0] is filled by the recv side (from prev), connect[1] by the send side (to next).
  NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prevRank, connect+0, &ring->recv.transport, ring));
  NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+nextRank, connect+1, &ring->send.transport, ring));
  NCCLCHECK(transportCreateProxy(0, ring, comm));
  NCCLCHECK(transportCreateProxy(1, ring, comm));
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
  // For every peer, record the index of the first transport whose
  // canConnect() yields a positive value, or -1 when none does.
  struct ncclInfo* mine = allInfo + rank;
  for (int peer = 0; peer < nranks; peer++) {
    ncclTvalue_t* value = connectValue + peer;
    connectTransport[peer] = -1;
    for (int t = 0; t < NTRANSPORTS; t++) {
      NCCLCHECK(ncclTransports[t].canConnect(value, mine->tinfo+t, allInfo[peer].tinfo+t));
      if (*value > 0) {
        connectTransport[peer] = t;
        break;
      }
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
// Exchanges the contents of two non-overlapping memory regions of `size` bytes.
// Swaps byte by byte so no temporary buffer is needed (the previous version
// used a variable-length array, which is not standard C++ and put `size`
// bytes on the stack). A non-positive size is a no-op.
static void swap(void* mem1, void* mem2, int size) {
  char* a = static_cast<char*>(mem1);
  char* b = static_cast<char*>(mem2);
  for (int i = 0; i < size; i++) {
    const char held = a[i];
    a[i] = b[i];
    b[i] = held;
  }
}
|
||||
|
||||
#define MAXWIDTH 64
|
||||
#define PREFIXLEN 15
|
||||
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
|
||||
void dumpMatrix(int* connectMatrix, int nranks) {
  // Logs an nranks x nranks int matrix: a header row of column indices
  // followed by one row per rank. At most MAXWIDTH columns are printed.
  char line[STRLENGTH+1];
  const int cols = (nranks < MAXWIDTH) ? nranks : MAXWIDTH;

  // Header: the 4-character row-label area stays blank.
  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  for (int col = 0; col < cols; col++) {
    sprintf(line+4+4*col, " %3d", col);
  }
  INFO(NCCL_INIT,"%s", line);

  for (int row = 0; row < nranks; row++) {
    memset(line, ' ', STRLENGTH);
    // Each sprintf overwrites the terminator left by the previous one.
    sprintf(line, "%3d ", row);
    for (int col = 0; col < cols; col++) {
      sprintf(line+4+4*col, " %3d", connectMatrix[row*nranks+col]);
    }
    INFO(NCCL_INIT,"%s", line);
  }
}
|
||||
|
||||
void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
  // Same layout as dumpMatrix, but cells are 5 characters wide and the
  // values are printed in octal after being narrowed to int.
  char line[STRLENGTH+1];
  const int cols = (nranks < MAXWIDTH) ? nranks : MAXWIDTH;

  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  for (int col = 0; col < cols; col++) {
    sprintf(line+4+5*col, " %4d", col);
  }
  INFO(NCCL_INIT,"%s", line);

  for (int row = 0; row < nranks; row++) {
    memset(line, ' ', STRLENGTH);
    sprintf(line, "%3d ", row);
    for (int col = 0; col < cols; col++) {
      sprintf(line+4+5*col, " %4o", (int)connectMatrix[row*nranks+col]);
    }
    INFO(NCCL_INIT,"%s", line);
  }
}
|
||||
|
||||
|
||||
void dumpLine(int* values, int nranks, const char* prefix) {
  // Logs `prefix` followed by up to MAXWIDTH values, 4 characters each.
  // Only the first PREFIXLEN characters of the prefix are copied, but the
  // values are placed after the prefix's full length.
  char line[STRLENGTH+1];
  const int offset = strlen(prefix);
  const int count = (nranks < MAXWIDTH) ? nranks : MAXWIDTH;

  memset(line, ' ', STRLENGTH);
  line[STRLENGTH] = '\0';
  strncpy(line, prefix, PREFIXLEN);
  for (int idx = 0; idx < count; idx++) {
    sprintf(line+offset+4*idx, " %3d", values[idx]);
  }
  INFO(NCCL_INIT,"%s", line);
}
|
||||
|
||||
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
  // For each ring, walks the `next` table starting from this rank, stores
  // the visiting order in `rings`, and checks the result is a full cycle.
  for (int r = 0; r < nrings; r++) {
    int* ringOrder = rings + r*nranks;
    const int* successor = next + r*nranks;

    int cursor = rank;
    for (int pos = 0; pos < nranks; pos++) {
      ringOrder[pos] = cursor;
      cursor = successor[cursor];
    }

    // Only rank 0 logs the ring.
    char prefix[30];
    sprintf(prefix, "Ring %02d : ", r);
    if (rank == 0) dumpLine(ringOrder, nranks, prefix);

    // After nranks hops we must be back where we started.
    if (cursor != rank) {
      WARN("Error : ring %d does not loop back to start (%d != %d)", r, cursor, rank);
      return ncclInternalError;
    }

    // Every rank has to appear somewhere in the ring.
    for (int wanted = 0; wanted < nranks; wanted++) {
      int pos = 0;
      while (pos < nranks && ringOrder[pos] != wanted) pos++;
      if (pos == nranks) {
        WARN("Error : ring %d does not contain rank %d", r, wanted);
        return ncclInternalError;
      }
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
void* waitForNonNullPtr(void* p) {
|
||||
volatile void** ptr = (volatile void**) p;
|
||||
while (LOAD(ptr) == NULL) sched_yield();
|
||||
return (void*)LOAD(ptr);
|
||||
}
|
||||
|
||||
ncclResult_t initParams(struct ncclComm* comm) {
  // This comm's launch params are its slot in the intraParams array.
  comm->myParams = comm->intraParams+comm->intraRank;
  hipLaunchParams* launch = comm->myParams;

  launch->args = (void **)&comm->argsptr;
  launch->stream = NULL;
  launch->sharedMem = 0;

  // x dimensions start at 0; y and z are fixed to 1.
  launch->blockDim.x = 0;
  launch->blockDim.z = 1;
  launch->blockDim.y = 1;
  launch->gridDim.x = 0;
  launch->gridDim.z = 1;
  launch->gridDim.y = 1;
  return ncclSuccess;
}
|
||||
|
||||
// Allocate/Set Intra Process Structures and set CG options
|
||||
ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
  // Sets up the structures shared between the comms of one process and
  // chooses the launch mode. Intra-rank 0 allocates the shared state; the
  // other ranks wait for rank 0's comm (comm0) to publish each pointer.
  comm->intraRank = rank;
  comm->intraRanks = ranks;
  comm->intraPhase = 0;

  if (rank != 0) {
    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
    comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
  } else {
    assert(comm == comm0);
    // Each object is fully initialized before its pointer is stored in comm.
    int* barrier;
    NCCLCHECK(ncclCalloc(&barrier, 2));
    barrier[1] = 0;
    barrier[0] = 0;
    comm->intraBarrier = barrier;
    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
    int* cgMode;
    NCCLCHECK(ncclCalloc(&cgMode, 1));
    *cgMode = 0x11;
    comm->intraCGMode = cgMode;
    int* compCap;
    NCCLCHECK(ncclCalloc(&compCap, 1));
    *compCap = ncclCudaFullCompCap();
    comm->intraCC = compCap;
  }
  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
  NCCLCHECK(initParams(comm));

  int cgMdLaunch = 1;

  // GROUP is the default; a single intra rank or NCCL_LAUNCH_MODE=PARALLEL
  // selects PARALLEL instead.
  const char* modeEnv = getenv("NCCL_LAUNCH_MODE");
  const bool parallel = comm->intraRanks == 1 || (modeEnv && strcmp(modeEnv, "PARALLEL") == 0);
  if (parallel) {
    comm->launchMode = ncclComm::PARALLEL;
  } else {
    comm->launchMode = ncclComm::GROUP;
  }

  if (comm->launchMode == ncclComm::GROUP) {
    CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
#if CUDART_VERSION >= 9000
    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
      // Check whether the GPU supports Cooperative Group Multi Device Launch
      (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
    }
#endif
  }

  // Disable cgMdLaunch if any rank does not support it
  if (cgMdLaunch == 0) {
    *comm->intraCGMode = 0x10;
  }
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
  // Per-rank transport initialization: exchanges per-rank info over the
  // bootstrap network, computes the rings, connects each ring to its
  // neighbours and sets up the intra-process state. Every rank must issue
  // the bootstrapAllGather calls below in the same order.
  int rank = comm->rank;
  int nranks = comm->nRanks;
  void* bootstrap;
  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
  NCCLCHECK(bootstrapInit(commId, rank, nranks, &bootstrap));

  // Gather every rank's transport info.
  struct ncclInfo* peerInfos;
  NCCLCHECK(ncclCalloc(&peerInfos, nranks));
  NCCLCHECK(fillInfo(peerInfos+rank, rank, commHash));
  NCCLCHECK(bootstrapAllGather(bootstrap, peerInfos, sizeof(struct ncclInfo)));

  // Build the nranks x nranks connectivity matrices; each rank fills its own row.
  int* connectTransport;
  ncclTvalue_t* connectValue;
  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));

  NCCLCHECK(fillConnect(peerInfos, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
  NCCLCHECK(bootstrapAllGather(bootstrap, connectTransport, nranks*(sizeof(int))));
  NCCLCHECK(bootstrapAllGather(bootstrap, connectValue, nranks*(sizeof(ncclTvalue_t))));

  // Compute this rank's view of the rings.
  int nrings;
  int* prev, *next;
  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
  comm->nThreads = getDefaultThreads();
  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
  free(connectTransport);
  free(connectValue);

  // Agree on the largest nThreads across ranks.
  int gathered[nranks];
  gathered[rank] = comm->nThreads;
  NCCLCHECK(bootstrapAllGather(bootstrap, gathered, sizeof(int)));
  for (int i=0; i<nranks; i++) {
    comm->nThreads = std::max(gathered[i], comm->nThreads);
  }
  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);

  // Agree on the smallest compute capability across ranks.
  int myCompCap = ncclCudaCompCap();
  int minCompCap = myCompCap;
  gathered[rank] = myCompCap;
  NCCLCHECK(bootstrapAllGather(bootstrap, gathered, sizeof(int)));
  for (int i=0; i<nranks; i++) {
    minCompCap = std::min(gathered[i], minCompCap);
  }
  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);

  // Agree on the smallest ring count across ranks.
  gathered[rank] = nrings;
  NCCLCHECK(bootstrapAllGather(bootstrap, gathered, sizeof(int)));
  for (int i=0; i<nranks; i++) {
    nrings = std::min(gathered[i], nrings);
  }

  // Exchange prev/next entries so every rank holds the complete rings.
  comm->nRings = nrings;
  for (int r=0; r<nrings; r++) {
    NCCLCHECK(bootstrapAllGather(bootstrap, prev+r*nranks, sizeof(int)));
    NCCLCHECK(bootstrapAllGather(bootstrap, next+r*nranks, sizeof(int)));
  }
  int *ringOrder;
  NCCLCHECK(ncclCalloc(&ringOrder, nranks*MAXRINGS));
  NCCLCHECK(buildRings(nrings, ringOrder, rank, nranks, prev, next));
  free(prev);
  free(next);

  // Connect with prev/next for each ring. Each rank owns two slots of the
  // handshake array: [2*rank] for its recv side, [2*rank+1] for its send side.
  struct ncclConnect *handshake;
  NCCLCHECK(ncclCalloc(&handshake, 2*nranks));
  for (int r=0; r<nrings; r++) {
    struct ncclRing *ring = comm->rings+r;
    NCCLCHECK(setupRing(comm, r, rank, nranks, ringOrder+r*nranks, peerInfos, handshake+2*rank));
    // We send to next's recv slot and receive from prev's send slot.
    int fromPrev = ring->userRanks[nranks-1]*2+1;
    int toNext = ring->userRanks[1]*2;
    NCCLCHECK(bootstrapAllGather(bootstrap, handshake, sizeof(struct ncclConnect)*2));
    NCCLCHECK(ring->send.transport->send.connect(handshake+toNext, &ring->send));
    NCCLCHECK(ring->recv.transport->recv.connect(handshake+fromPrev, &ring->recv));
  }
  free(handshake);
  free(ringOrder);
  free(peerInfos);

  // Intra-process barrier setup: gather host/pid hashes and comm pointers.
  struct rankInfo {
    uint64_t hostHash;
    uint64_t pidHash;
    struct ncclComm* comm;
  } procInfos[nranks];
  procInfos[rank].hostHash = getHostHash();
  procInfos[rank].pidHash = getPidHash();
  procInfos[rank].comm = comm;
  NCCLCHECK(bootstrapAllGather(bootstrap, procInfos, sizeof(struct rankInfo)));

  // Ranks with the same host and pid hash belong to this process; any rank
  // with a different host hash marks the job as multi-node.
  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
  int multiNode = 0;
  for (int r=0; r<nranks; r++) {
    const bool sameHost = procInfos[r].hostHash == procInfos[rank].hostHash;
    if (sameHost && (procInfos[r].pidHash == procInfos[rank].pidHash)) {
      if (intraRanks == 0) intraRank0 = r;
      if (r == rank) intraRank = intraRanks;
      intraRanks++;
    } else if (!sameHost) {
      multiNode = 1;
    }
  }
  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
      rank, procInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
  if (intraRank == -1 || intraRank0 == -1 || procInfos[intraRank0].comm == NULL) {
    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
        rank, procInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
    return ncclInternalError;
  }
  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, procInfos[intraRank0].comm));

  // Determine thread threshold across all GPUs
  comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);

  // Barrier
  bootstrapClose(bootstrap);
  return ncclSuccess;
}
|
||||
|
||||
bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
if (numa_available() < 0) {
|
||||
WARN("System does not support NUMA API!");
|
||||
return false;
|
||||
}
|
||||
char* cudaPath;
|
||||
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
|
||||
strcat(cudaPath, "/numa_node");
|
||||
int fd;
|
||||
SYSCHECKVAL(open(cudaPath, O_RDONLY), "open", fd);
|
||||
char numa_node[5];
|
||||
int len;
|
||||
SYSCHECKVAL(read(fd, numa_node, 4), "read", len);
|
||||
SYSCHECK(close(fd), "close");
|
||||
errno = 0;
|
||||
long node = strtol(numa_node, NULL, 10);
|
||||
if (errno == ERANGE || errno == EINVAL) {
|
||||
INFO(NCCL_ALL,"%s: Call to strtol returned %s", __func__, strerror(errno));
|
||||
free(cudaPath);
|
||||
return false;
|
||||
}
|
||||
numa_run_on_node(node);
|
||||
numa_set_preferred(node);
|
||||
free(cudaPath);
|
||||
return true;
|
||||
#else
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
if (hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != hipSuccess) return false;
|
||||
if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
|
||||
if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
|
||||
WARN("Failed to set CPU affinity");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Synchronous part of ncclCommInitRank: allocates the communicator for
 * `myrank`, sets up its transports and its device-side state.
 *
 * The CPU affinity of the calling thread is saved on entry, changed by
 * SetCpuAffinity while the communicator is being built, and restored on
 * every exit taken after that point.
 *
 * On success *newcomm holds the new communicator and ncclSuccess is returned.
 * On failure after NVML initialisation, *newcomm is set to NULL, a
 * communicator that was already allocated is released with commFree, NVML is
 * shut down if it is still initialised, and the error code is returned.
 */
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
  cpu_set_t affinitySave;
  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);

  NCCLCHECK(wrapNvmlSymbols());
  NCCLCHECK(wrapNvmlInit());

  // Everything that is read after a `goto` is declared before the first one.
  ncclResult_t res;
  ncclComm_t comm = NULL;
  bool nvmlActive = true;  // wrapNvmlInit succeeded and shutdown not attempted yet
  int cudaDev;
  nvmlDevice_t nvmlDevice;

  // Make sure all host memory allocation are close to the GPU
  CUDACHECKGOTO(hipGetDevice(&cudaDev), res, restore);
  SetCpuAffinity(cudaDev, &nvmlDevice);

  // If the allocation itself fails there is nothing to free yet.
  NCCLCHECKGOTO(commAlloc(&comm, nranks, myrank), res, restore);
  *newcomm = comm;
  NCCLCHECKGOTO(initTransportsRank(comm, &commId), res, cleanup);
  NCCLCHECKGOTO(devCommSetup(comm), res, cleanup);

  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  nvmlActive = false;  // do not attempt a second shutdown if this one fails
  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);

  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);

  return ncclSuccess;

cleanup:
  // Only reached once commAlloc has succeeded: release the communicator
  // instead of dropping the only pointer to it.
  commFree(comm);
restore:
  if (nvmlActive && wrapNvmlShutdown() != ncclSuccess)
    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
  *newcomm = NULL;
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
|
||||
char* env = getenv("NCCL_COMM_ID");
|
||||
if (env && myrank == 0) {
|
||||
NCCLCHECK(bootstrapCreateRoot(&commId, true));
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclInit());
|
||||
if (myrank == 0) showVersion();
|
||||
|
||||
INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
|
||||
|
||||
// Make sure the CUDA runtime is initialized.
|
||||
CUDACHECK(hipFree(NULL));
|
||||
|
||||
NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
|
||||
if (nranks < 1 || myrank < 0 || myrank >= nranks) {
|
||||
WARN("Invalid rank requested : %d/%d", myrank, nranks);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
if (ncclAsyncMode()) {
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
|
||||
} else {
|
||||
return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Set up the rings and transport connections of all communicators owned by
 * this process.
 *
 * comms  : one communicator per rank.
 * devs   : device index to make current while working on each rank.
 * nranks : number of entries in comms and devs.
 *
 * Steps: fill per-rank info, fill the rank-to-rank connect matrices, compute
 * the rings of every rank (keeping the smallest ring count, the largest
 * thread count and the smallest compute capability), build the final rings,
 * store nRings / nThreads / threadThreshold in each communicator, then set
 * up and connect the send and recv side of every ring on every rank.
 *
 * All scratch buffers are released through the single exit at `cleanup`,
 * so an error in any step no longer leaks the buffers allocated before it.
 */
static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
  // Declared and initialised up front so that `goto cleanup` never jumps
  // over an initialisation and free() always sees NULL or a valid pointer.
  ncclResult_t res = ncclSuccess;
  struct ncclInfo* allInfo = NULL;
  int* connectTransport = NULL;
  ncclTvalue_t* connectValue = NULL;
  int* prev = NULL;
  int* prevFinal = NULL;
  int* next = NULL;
  int* nextFinal = NULL;
  int* rings = NULL;
  int nrings = MAXRINGS;
  int nthreads = 0;
  int minCompCap = 0;
  int threadThreshold = 0;

  NCCLCHECKGOTO(ncclCalloc(&allInfo, nranks), res, cleanup);
  for (int rank=0; rank<nranks; rank++) {
    CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
    NCCLCHECKGOTO(fillInfo(allInfo+rank, rank, 0), res, cleanup);
  }

  // nranks x nranks matrices; row `rank` is filled by fillConnect.
  NCCLCHECKGOTO(ncclCalloc(&connectTransport, nranks*nranks), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&connectValue, nranks*nranks), res, cleanup);
  for (int rank=0; rank<nranks; rank++) {
    NCCLCHECKGOTO(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank), res, cleanup);
  }

  NCCLCHECKGOTO(ncclCalloc(&prev, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&prevFinal, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&next, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(ncclCalloc(&nextFinal, nranks*MAXRINGS), res, cleanup);

  // Start from the compute capability of the device that is current now;
  // the loop below lowers it to the minimum over all ranks.
  minCompCap = ncclCudaCompCap();
  for (int rank=0; rank<nranks; rank++) {
    CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
    int nringsRank;
    int nthreadsRank = getDefaultThreads();
    int rankCompCap = ncclCudaCompCap();
    NCCLCHECKGOTO(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next), res, cleanup);
    nrings = std::min(nrings, nringsRank);
    nthreads = std::max(nthreads, nthreadsRank);
    minCompCap = std::min(minCompCap, rankCompCap);
    // Keep this rank's prev/next entries for the rings retained so far.
    for (int ring=0; ring<nrings; ring++) {
      int index = ring*nranks+rank;
      prevFinal[index] = prev[index];
      nextFinal[index] = next[index];
    }
  }

  INFO(NCCL_INIT,"Using %d threads", nthreads);
  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);

  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXRINGS), res, cleanup);
  NCCLCHECKGOTO(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal), res, cleanup);

  // Determine thread threshold across all GPUs
  threadThreshold = ncclThreadThreshold(minCompCap, 0);

  for (int rank=0; rank<nranks; rank++) {
    comms[rank]->nRings = nrings;
    comms[rank]->nThreads = nthreads;
    comms[rank]->threadThreshold = threadThreshold;
  }

  for (int r=0; r<nrings; r++) {
    // Two connect slots per rank: slot 2*rank is used for recv.connect and
    // slot 2*rank+1 for send.connect below.
    struct ncclConnect connect[2*nranks];
    int* ringRanks = rings+r*nranks;
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
      NCCLCHECKGOTO(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank), res, cleanup);
    }
    // RingExchange connect information
    for (int rank=0; rank<nranks; rank++) {
      // Swap rank->prev and prevRank->next
      struct ncclRing *ring = comms[rank]->rings+r;
      int prevRank = ring->userRanks[nranks-1];
      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
      struct ncclConnect* rankPrevConnect = connect+2*rank;
      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
    }
    for (int rank=0; rank<nranks; rank++) {
      CUDACHECKGOTO(hipSetDevice(devs[rank]), res, cleanup);
      struct ncclRing *ring = comms[rank]->rings+r;
      NCCLCHECKGOTO(ring->send.transport->send.connect(connect+2*rank+1, &ring->send), res, cleanup);
      NCCLCHECKGOTO(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv), res, cleanup);
    }
  }

  res = ncclSuccess;

cleanup:
  // free(NULL) is a no-op, so buffers that were never allocated are fine.
  free(rings);
  free(nextFinal);
  free(prevFinal);
  free(next);
  free(prev);
  free(connectValue);
  free(connectTransport);
  free(allInfo);
  return res;
}
|
||||
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
NCCLCHECK(ncclInit());
|
||||
NCCLCHECK(wrapNvmlSymbols());
|
||||
NCCLCHECK(wrapNvmlInit());
|
||||
showVersion();
|
||||
|
||||
INFO(NCCL_INIT,"nranks %d", ndev);
|
||||
|
||||
NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
|
||||
if (ndev < 1) {
|
||||
WARN("Invalid device count requested : %d", ndev);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
ncclResult_t res;
|
||||
int savedDevice;
|
||||
int rank, cudaDev;
|
||||
ncclComm_t comm = NULL;
|
||||
nvmlDevice_t nvmlDevice;
|
||||
int ncclDevList[ndev];
|
||||
for (int i=0; i<ndev; i++) {
|
||||
ncclDevList[i] = devlist ? devlist[i] : i;
|
||||
}
|
||||
|
||||
hipGetDevice(&savedDevice);
|
||||
|
||||
for(rank=0; rank<ndev; ++rank)
|
||||
comms[rank] = NULL;
|
||||
|
||||
cpu_set_t affinitySave;
|
||||
sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
|
||||
for (rank=0; rank<ndev; ++rank) {
|
||||
cudaDev = ncclDevList[rank];
|
||||
CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
|
||||
|
||||
SetCpuAffinity(cudaDev, &nvmlDevice);
|
||||
|
||||
NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
|
||||
comms[rank] = comm;
|
||||
|
||||
NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
|
||||
}
|
||||
|
||||
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
|
||||
NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
|
||||
|
||||
for(rank=0; rank<ndev; ++rank) {
|
||||
cudaDev = ncclDevList[rank];
|
||||
CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
|
||||
NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
|
||||
}
|
||||
|
||||
res = ncclSuccess;
|
||||
goto final;
|
||||
|
||||
cleanup:
|
||||
for(rank=0; rank<ndev; ++rank) {
|
||||
if(comms[rank] != NULL) {
|
||||
commFree(comms[rank]);
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
if(wrapNvmlShutdown() != ncclSuccess)
|
||||
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
|
||||
hipSetDevice(savedDevice);
|
||||
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
|
||||
return res;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
int savedDevice;
|
||||
CUDACHECK(hipGetDevice(&savedDevice));
|
||||
int commDevice = comm->cudaDev;
|
||||
|
||||
if (savedDevice != commDevice) {
|
||||
CUDACHECK(hipSetDevice(commDevice));
|
||||
}
|
||||
|
||||
NCCLCHECK(commFree(comm));
|
||||
|
||||
if (savedDevice != commDevice)
|
||||
CUDACHECK(hipSetDevice(savedDevice));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
|
||||
const char* ncclGetErrorString(ncclResult_t code) {
|
||||
switch (code) {
|
||||
case ncclSuccess : return "no error";
|
||||
case ncclUnhandledCudaError : return "unhandled cuda error";
|
||||
case ncclSystemError : return "unhandled system error";
|
||||
case ncclInternalError : return "internal error";
|
||||
case ncclInvalidArgument : return "invalid argument";
|
||||
case ncclInvalidUsage : return "invalid usage";
|
||||
default : return "unknown result code";
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
|
||||
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
|
||||
NCCLCHECK(PtrCheck(count, "CommCount", "count"));
|
||||
*count = comm->nRanks;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
|
||||
NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
|
||||
NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
|
||||
*devid = comm->cudaDev;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
|
||||
NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
|
||||
NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
|
||||
*rank = comm->rank;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "argcheck.h"
|
||||
|
||||
/*
 * Check that `pointer` is usable as a buffer for communicator `comm`.
 *
 * The pointer is rejected when hipPointerGetAttributes fails or reports a
 * NULL device pointer, and when it is device memory that belongs to a
 * device other than comm->cudaDev. `ptrname` and `opname` are only used in
 * the warning text.
 *
 * Returns ncclSuccess or ncclInvalidArgument.
 */
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  hipPointerAttribute_t attr;
  const hipError_t queryStatus = hipPointerGetAttributes(&attr, pointer);
  const bool usable = (queryStatus == hipSuccess) && (attr.devicePointer != NULL);
  if (!usable) {
    WARN("%s : %s is not a valid pointer", opname, ptrname);
    return ncclInvalidArgument;
  }

  // The attribute field that carries the memory type depends on the runtime
  // version; select it once so that a single condition follows.
#if CUDART_VERSION >= 10000
  const bool isDeviceMemory = (attr.type == hipMemoryTypeDevice);
#else
  const bool isDeviceMemory = (attr.memoryType == hipMemoryTypeDevice);
#endif
  if (isDeviceMemory && attr.device != comm->cudaDev) {
    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Reject NULL pointer arguments.
 *
 * Returns ncclSuccess when `ptr` is not NULL; otherwise emits a warning
 * built from `opname` and `ptrname` and returns ncclInvalidArgument.
 */
ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
  if (ptr != NULL) {
    return ncclSuccess;
  }
  WARN("%s : %s argument is NULL", opname, ptrname);
  return ncclInvalidArgument;
}
|
||||
|
||||
/*
 * Validate the arguments of a collective call and normalise `info`.
 *
 * Checks, in order: the communicator pointer, the root rank, the datatype,
 * the reduction operation and, when comm->checkPointers is set, the send and
 * receive buffers.
 *
 * Side effects on `info`:
 *  - nBytes is set to count * ncclTypeSize(datatype);
 *  - for AllGather and Broadcast, count becomes the byte count and datatype
 *    becomes ncclInt8;
 *  - for AllGather and ReduceScatter, nBytes is multiplied by the number of
 *    ranks (count is per rank).
 *
 * Returns ncclSuccess, or ncclInvalidArgument on the first failed check.
 */
ncclResult_t ArgsCheck(struct ncclInfo* info) {
  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
  // First, the easy ones
  if (info->root < 0 || info->root >= info->comm->nRanks) {
    // The last valid root is nRanks-1; report the bound that is enforced.
    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks-1);
    return ncclInvalidArgument;
  }
  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
    WARN("%s : invalid type %d", info->opName, info->datatype);
    return ncclInvalidArgument;
  }
  // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
  info->nBytes = info->count * ncclTypeSize(info->datatype);
  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
    info->count = info->nBytes;
    info->datatype = ncclInt8;
  }
  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank

  if (info->op < 0 || info->op >= ncclNumOps) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
    return ncclInvalidArgument;
  }

  if (info->comm->checkPointers) {
    // Check CUDA device pointers
    // For Broadcast the send buffer is only checked on the root rank.
    if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
      NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
    }
    // For Reduce the receive buffer is only checked on the root rank.
    if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
      NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
    }
  }
  return ncclSuccess;
}
|
||||
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Ссылка в новой задаче
Block a user