diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 9e5f7f1f8b..b2c1bdf4f0 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -1,5 +1,4 @@ -# Doxyfile 1.8.10 -# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +# Doxyfile 1.8.17 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -18,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -39,20 +38,20 @@ PROJECT_NAME = "RCCL" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = v3.0.1.0 +PROJECT_NUMBER = v2.18.3 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" +PROJECT_BRIEF = "ROCm Collective Communications Library" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. 
The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. -PROJECT_LOGO = +PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is @@ -94,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -136,7 +143,7 @@ ABBREVIATE_BRIEF = "The $name class" \ # description. # The default value is: NO. -ALWAYS_DETAILED_SEC = NO +ALWAYS_DETAILED_SEC = YES # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those @@ -151,7 +158,7 @@ INLINE_INHERITED_MEMB = NO # shortest path that makes the file name unique will be used # The default value is: YES. -FULL_PATH_NAMES = YES +FULL_PATH_NAMES = NO # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand @@ -190,6 +197,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = NO +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. 
+# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -237,7 +254,12 @@ TAB_SIZE = 4 # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = @@ -253,7 +275,7 @@ TCL_SUBST = # members will be omitted, etc. # The default value is: NO. -OPTIMIZE_OUTPUT_FOR_C = NO +OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored @@ -275,28 +297,37 @@ OPTIMIZE_FOR_FORTRAN = NO OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. 
The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = in=C # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues.
@@ -304,6 +335,15 @@ EXTENSION_MAPPING = MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -329,7 +369,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -419,8 +459,6 @@ LOOKUP_CACHE_SIZE = 0 # Build related configuration options #--------------------------------------------------------------------------- -SHOW_NAMESPACES = NO - # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the @@ -429,7 +467,7 @@ SHOW_NAMESPACES = NO # normally produced when WARNINGS is set to YES. # The default value is: NO. -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. 
@@ -437,6 +475,12 @@ EXTRACT_ALL = NO EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -491,8 +535,8 @@ HIDE_UNDOC_MEMBERS = NO HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. +# declarations. If set to NO, these declarations will be included in the +# documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO @@ -515,7 +559,7 @@ INTERNAL_DOCS = NO # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# (including Cygwin) and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO @@ -702,7 +746,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references.
@@ -747,10 +791,17 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. -WARN_NO_PARAMDOC = NO +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = YES # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which @@ -778,12 +829,12 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ../../tools/topo_expl/include/nccl.h +INPUT = mainpage.txt ../../src/nccl.h.in # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. @@ -800,8 +851,10 @@ INPUT_ENCODING = UTF-8 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf, *.as and *.js. 
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen +# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ @@ -928,6 +981,10 @@ IMAGE_PATH = # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. INPUT_FILTER = @@ -937,6 +994,10 @@ INPUT_FILTER = # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. FILTER_PATTERNS = @@ -960,7 +1021,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -# USE_MDFILE_AS_MAINPAGE = ../README.md +USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing @@ -989,7 +1050,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. 
REFERENCED_BY_RELATION = NO @@ -1021,12 +1082,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1054,7 +1115,7 @@ VERBATIM_HEADERS = YES # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was -# compiled with the --with-libclang option. +# generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO @@ -1067,6 +1128,16 @@ CLANG_ASSISTED_PARSING = NO CLANG_OPTIONS = +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. 
+ +CLANG_DATABASE_PATH = + #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1185,7 +1256,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1221,6 +1292,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = NO +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consist of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1244,13 +1326,13 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard).
To create a documentation set, doxygen will generate a +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1289,7 +1371,7 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output @@ -1365,7 +1447,7 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1373,7 +1455,7 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. 
For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1382,7 +1464,7 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1390,7 +1472,7 @@ QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1398,7 +1480,7 @@ QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = @@ -1491,7 +1573,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. 
# @@ -1502,8 +1584,14 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1530,8 +1618,8 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest @@ -1573,7 +1661,7 @@ MATHJAX_CODEFILE = SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There +# implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH # setting. When disabled, doxygen will generate a PHP script for searching and # an index file used by the script. 
When EXTERNAL_SEARCH is enabled the indexing @@ -1592,7 +1680,7 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1605,7 +1693,7 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and +# Xapian (see: https://xapian.org/). See the section "External Indexing and # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. @@ -1657,21 +1745,35 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex; when enabling +# USE_PDFLATEX the default is pdflatex, and when in the latter case latex is +# chosen this is overwritten by pdflatex. For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES.
MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1792,12 +1894,28 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_BIB_STYLE = plain +# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated +# page will contain the date and time when the page was generated. Setting this +# to NO can help when comparing the output of multiple runs. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_TIMESTAMP = NO + +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. 
+ +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1837,9 +1955,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1848,8 +1966,8 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1935,6 +2053,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. 
+ +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1967,9 +2092,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2136,12 +2261,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2155,15 +2274,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = NO -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. 
Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2182,7 +2292,7 @@ HIDE_UNDOC_RELATIONS = YES # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO -# The default value is: NO. +# The default value is: YES. HAVE_DOT = NO @@ -2338,7 +2448,9 @@ DIRECTORY_GRAPH = YES # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). -# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, +# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, +# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, +# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. @@ -2391,6 +2503,11 @@ DIAFILE_DIRS = PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. diff --git a/docs/doxygen/mainpage.txt b/docs/doxygen/mainpage.txt new file mode 100644 index 0000000000..34c35a33c8 --- /dev/null +++ b/docs/doxygen/mainpage.txt @@ -0,0 +1,27 @@ +/*! \mainpage RCCL Documentation + +\tableofcontents + +\section intro_sec Introduction + +RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. 
It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. + +The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API. + +\section API RCCL API Contents +- @ref rccl_api_version +- @ref rccl_result_code +- @ref rccl_config_type +- @ref rccl_api_communicator +- @ref rccl_api_errcheck +- @ref rccl_api_comminfo +- @ref rccl_api_enumerations +- @ref rccl_api_custom_redop +- @ref rccl_collective_api +- @ref rccl_group_api +- @ref msccl_api + +\section Full RCCL API File +- nccl.h.in + +*/ diff --git a/src/nccl.h.in b/src/nccl.h.in index e4ebb92a21..eab3c83444 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -28,221 +28,320 @@ extern "C" { #endif -/*! @brief Opaque handle to communicator */ #include + +/*! @brief Opaque handle to communicator + @details A communicator contains information required to facilitate collective communications calls */ typedef struct ncclComm* ncclComm_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 -typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; +/*! @brief Opaque unique id used to initialize communicators + @details The ncclUniqueId must be passed to all participating ranks */ +typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array */} ncclUniqueId; -/*! @brief Error type */ -typedef enum { ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclRemoteError = 6, - ncclInProgress = 7, - ncclNumResults = 8 } ncclResult_t; +/*!
@defgroup rccl_result_code Result Codes + @details The various result codes that RCCL API calls may return + @{ */ + +/*! @brief Result type + @details Return codes aside from ncclSuccess indicate that a call has failed */ + typedef enum { + ncclSuccess = 0, /*!< No error */ + ncclUnhandledCudaError = 1, /*!< Unhandled HIP error */ + ncclSystemError = 2, /*!< Unhandled system error */ + ncclInternalError = 3, /*!< Internal Error - Please report to RCCL developers */ + ncclInvalidArgument = 4, /*!< Invalid argument */ + ncclInvalidUsage = 5, /*!< Invalid usage */ + ncclRemoteError = 6, /*!< Remote process exited or there was a network error */ + ncclInProgress = 7, /*!< RCCL operation in progress */ + ncclNumResults = 8 /*!< Number of result types */ + } ncclResult_t; +/*! @} */ #define NCCL_CONFIG_UNDEF_INT INT_MIN #define NCCL_CONFIG_UNDEF_PTR NULL #define NCCL_SPLIT_NOCOLOR -1 -/* Communicator configuration. Users can assign value to attributes to specify the - * behavior of a communicator. */ +/*! @defgroup rccl_config_type Communicator Configuration + @details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig + @{ */ + +/*! @brief Communicator configuration + @details Users can assign value to attributes to specify the behavior of a communicator */ typedef struct ncclConfig_v21700 { /* attributes that users should never touch. */ - size_t size; - unsigned int magic; - unsigned int version; + size_t size; /*!< Should not be touched */ + unsigned int magic; /*!< Should not be touched */ + unsigned int version; /*!< Should not be touched */ /* attributes that users are able to customize. 
*/ - int blocking; - int cgaClusterSize; - int minCTAs; - int maxCTAs; - const char *netName; - int splitShare; + int blocking; /*!< Whether calls should block */ + int cgaClusterSize; /*!< Cooperative group array cluster size */ + int minCTAs; /*!< Minimum number of cooperative thread arrays (blocks) */ + int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */ + const char *netName; /*!< Force NCCL to use a specific network */ + int splitShare; /*!< Allow communicators to share resources */ } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. - * Not initialized config will result in NCCL error. */ -#define NCCL_CONFIG_INITIALIZER { \ - sizeof(ncclConfig_t), /* size */ \ - 0xcafebeef, /* magic */ \ - NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - NCCL_CONFIG_UNDEF_INT, /* blocking */ \ - NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ - NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ - NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ - NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + * Not initialized config will result in an error. */ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ } +/*! @} */ -/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. - * - * @details This integer is coded with the MAJOR, MINOR and PATCH level of the - * NCCL library - */ +/*! @defgroup rccl_api_version Version Information + @details API call that returns RCCL version + @{ */ + +/*! @brief Return the RCCL_VERSION_CODE of RCCL in the supplied integer.
+ @details This integer is coded with the MAJOR, MINOR and PATCH level of RCCL. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] version Pointer to where version will be stored */ ncclResult_t ncclGetVersion(int *version); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclGetVersion(int *version); -/// @endcond +/*! @endcond */ +/*! @} */ -/*! @brief Generates an ID for ncclCommInitRank +/*! @defgroup rccl_api_communicator Communicator Initialization/Destruction + @details API calls that operate on communicators. + Communicators objects are used to launch collective communication + operations. Unique ranks between 0 and N-1 must be assigned to + each HIP device participating in the same Communicator. + Using the same HIP device for multiple ranks of the same Communicator + is not supported at this time. + @{ */ - @details - Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be - called once and the Id should be distributed to all ranks in the - communicator before calling ncclCommInitRank. +/*! @brief Generates an ID for ncclCommInitRank. + @details Generates an ID to be used in ncclCommInitRank. + ncclGetUniqueId should be called once by a single rank and the + ID should be distributed to all ranks in the communicator before + using it as a parameter for ncclCommInitRank. + @return Result code. See @ref rccl_result_code for more details. - @param[in] - uniqueId ncclUniqueId* - pointer to uniqueId - -*/ + @param[out] uniqueId Pointer to where uniqueId will be stored */ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); -/// @endcond +/*! @endcond */ -/*! @brief Create a new communicator (multi thread/process version) with a configuration - * set by users. */ +/*! @brief Create a new communicator with config. 
+ @details Create a new communicator (multi thread/process version) with a configuration + set by users. See @ref rccl_config_type for more details. + Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] comm Pointer to created communicator + @param[in] nranks Total number of ranks participating in this communicator + @param[in] commId UniqueId required for initialization + @param[in] rank Current rank to create communicator for. [0 to nranks-1] + @param[in] config Pointer to communicator configuration */ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); -/// @endcond +/*! @endcond */ -/*! @brief Creates a new communicator (multi thread/process version). +/*! @brief Creates a new communicator (multi thread/process version). + @details Rank must be between 0 and nranks-1 and unique within a communicator clique. + Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. ncclCommInitRank implicitly synchronizes with other ranks, + so it must be called by different threads/processes or use ncclGroupStart/ncclGroupEnd. + @return Result code. See @ref rccl_result_code for more details. - @details - rank must be between 0 and nranks-1 and unique within a communicator clique. - Each rank is associated to a CUDA device, which has to be set before calling - ncclCommInitRank. - ncclCommInitRank implicitly syncronizes with other ranks, so it must be - called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
- - @param[in] - comm ncclComm_t* - communicator struct pointer - */ + @param[out] comm Pointer to created communicator + @param[in] nranks Total number of ranks participating in this communicator + @param[in] commId UniqueId required for initialization + @param[in] rank Current rank to create communicator for */ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); -/// @endcond +/*! @endcond */ -/*! @brief Creates a clique of communicators (single process version). - * - * @details This is a convenience function to create a single-process communicator clique. - * Returns an array of ndev newly initialized communicators in comm. - * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). - * If devlist is NULL, the first ndev HIP devices are used. - * Order of devlist defines user-order of processors within the communicator. - * */ +/*! @brief Creates a clique of communicators (single process version). + @details This is a convenience function to create a single-process communicator clique. + Returns an array of ndev newly initialized communicators in comm. + comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). + If devlist is NULL, the first ndev HIP devices are used. + Order of devlist defines user-order of processors within the communicator. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] comm Pointer to array of created communicators + @param[in] ndev Total number of ranks participating in this communicator + @param[in] devlist Array of GPU device indices to create for */ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/// @endcond +/*! 
@endcond */ -/*! @brief Finalize a communicator. - * @details ncclCommFinalize flushes all issued communications, - * and marks communicator state as ncclInProgress. The state will change to ncclSuccess - * when the communicator is globally quiescent and related resources are freed; then, - * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator - * itself) without blocking. */ +/*! @brief Finalize a communicator. + @details ncclCommFinalize flushes all issued communications + and marks communicator state as ncclInProgress. The state will change to ncclSuccess + when the communicator is globally quiescent and related resources are freed; then, + calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + itself) without blocking. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to finalize */ ncclResult_t ncclCommFinalize(ncclComm_t comm); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommFinalize(ncclComm_t comm); -/// @endcond +/*! @endcond */ -/*! @brief Frees local resources associated with communicator object. */ +/*! @brief Frees local resources associated with communicator object. + @details Destroy all local resources associated with the passed in communicator object + @return Result code. See @ref rccl_result_code for more details. + @param[in] comm Communicator to destroy */ ncclResult_t ncclCommDestroy(ncclComm_t comm); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommDestroy(ncclComm_t comm); -/// @endcond +/*! @endcond */ -/*! @brief Frees resources associated with communicator object and aborts any operations - * that might still be running on the device. */ +/*! @brief Abort any in-progress calls and destroy the communicator object. + @details Frees resources associated with communicator object and aborts any operations + that might still be running on the device. + @return Result code. 
See @ref rccl_result_code for more details. + + @param[in] comm Communicator to abort and destroy */ ncclResult_t ncclCommAbort(ncclComm_t comm); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommAbort(ncclComm_t comm); -/// @endcond +/*! @endcond */ -/*! @brief Creates one or more communicators from an existing one. - * Ranks with the same color will end up in the same communicator. - * Within the new communicator, key will be used to order ranks. - * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group - * and will therefore return a NULL communicator. - * If config is NULL, the new communicator will inherit the original communicator's - * configuration*/ +/*! @brief Create one or more communicators from an existing one. + @details Creates one or more communicators from an existing one. + Ranks with the same color will end up in the same communicator. + Within the new communicator, key will be used to order ranks. + NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group + and will therefore return a NULL communicator. + If config is NULL, the new communicator will inherit the original communicator's configuration + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Original communicator object for this rank + @param[in] color Color to assign this rank + @param[in] key Key used to order ranks within the same new communicator + @param[out] newcomm Pointer to new communicator + @param[in] config Config file for new communicator. May be NULL to inherit from comm */ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); -/// @endcond +/*! @endcond */ +/*! @} */ -/* Returns a string for each error code. */ -/*! 
@brief Returns a string for each error code. */ +/*! @defgroup rccl_api_errcheck Error Checking Calls + @details API calls that check for errors + @{ */ + +/*! @brief Returns a string for each result code. + @details Returns a human-readable string describing the given result code. + @return String containing description of result code. + + @param[in] result Result code to get description for */ const char* ncclGetErrorString(ncclResult_t result); -/// @cond include_hidden +/*! @cond include_hidden */ const char* pncclGetErrorString(ncclResult_t result); -/// @endcond +/*! @endcond */ -/*! @brief Returns a human-readable message of the last error that occurred. - * comm is currently unused and can be set to NULL - */ +/*! @brief Returns a message describing the last error that occurred. + @details Returns a human-readable message of the last error that occurred. + @return String containing the last result + + @param[in] comm is currently unused and can be set to NULL */ const char* ncclGetLastError(ncclComm_t comm); -/// @cond include_hidden +/*! @cond include_hidden */ const char* pncclGetLastError(ncclComm_t comm); -/// @endcond +/*! @endcond */ -/* Checks whether the comm has encountered any asynchronous errors */ +/*! @brief Checks whether the comm has encountered any asynchronous errors + @details Query whether the provided communicator has encountered any asynchronous errors + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] asyncError Pointer to where result code will be stored */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); -/// @endcond +/*! @endcond */ +/*! @} */ -/*! @brief Gets the number of ranks in the communicator clique. */ +/*!
@defgroup rccl_api_comminfo Communicator Information + @details API calls that query communicator information + @{ */ + +/*! @brief Gets the number of ranks in the communicator clique. + @details Returns the number of ranks in the communicator clique (as set during initialization) + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] count Pointer to where number of ranks will be stored */ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); -/// @endcond +/*! @endcond */ -/*! @brief Returns the rocm device number associated with the communicator. */ +/*! @brief Get the ROCm device index associated with a communicator + @details Returns the ROCm device number associated with the provided communicator. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] device Pointer to where the associated ROCm device index will be stored */ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); -/// @endcond +/*! @endcond */ -/*! @brief Returns the user-ordered "rank" associated with the communicator. */ +/*! @brief Get the rank associated with a communicator + @details Returns the user-ordered "rank" associated with the provided communicator. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] comm Communicator to query + @param[out] rank Pointer to where the associated rank will be stored */ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); -/// @endcond +/*! @endcond */ +/*! @} */ -/*!
@brief Reduction operation selector */ -/* Reduction operation selector */ +/*! @defgroup rccl_api_enumerations API Enumerations + @details Enumerations used by collective communication calls + @{ */ + +/*! @brief Dummy reduction enumeration + @details Dummy reduction enumeration used to determine value for ncclMaxRedOp */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; -typedef enum { ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) + +/*! @brief Reduction operation selector + @details Enumeration used to specify the various reduction operations + ncclNumOps is the number of built-in ncclRedOp_t values and serves as + the least possible value for dynamic ncclRedOp_t values constructed by + ncclRedOpCreate functions. + + ncclMaxRedOp is the largest valid value for ncclRedOp_t and is defined + to be the largest signed value (since compilers are permitted to use + signed enums) that won't grow sizeof(ncclRedOp_t) when compared to previous + RCCL versions to maintain ABI compatibility. */ +typedef enum { ncclSum = 0, /*!< Sum */ + ncclProd = 1, /*!< Product */ + ncclMax = 2, /*!< Max */ + ncclMin = 3, /*!< Min */ + ncclAvg = 4, /*!< Average */ + ncclNumOps = 5, /*!< Number of built-in reduction ops */ + ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) /*!< Largest value for ncclRedOp_t */ } ncclRedOp_t; -/*! @brief Data types */ +/*! 
@brief Data types + @details Enumeration of the various supported datatype */ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclUint8 = 1, ncclInt32 = 2, ncclInt = 2, @@ -254,338 +353,446 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclFloat64 = 8, ncclDouble = 8, ncclBfloat16 = 9, ncclNumTypes = 10 } ncclDataType_t; +/*! @} */ -/*! @brief ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ +/*! @defgroup rccl_api_custom_redop Custom Reduction Operator + @details API calls relating to creation/destroying custom reduction operator + that pre-multiplies local source arrays prior to reduction + @{ */ + +/*! @brief Location and dereferencing logic for scalar arguments. + @details Enumeration specifying memory location of the scalar argument. + Based on where the value is stored, the argument will be dereferenced either + while the collective is running (if in device memory), or before the ncclRedOpCreate() + function returns (if in host memory). */ typedef enum { - /* ncclScalarDevice: The scalar is in device-visible memory and will be - * dereferenced while the collective is running. */ - ncclScalarDevice = 0, - - /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be - * dereferenced before the ncclRedOpCreate***() function returns. */ - ncclScalarHostImmediate = 1 + ncclScalarDevice = 0, /*!< Scalar is in device-visible memory */ + ncclScalarHostImmediate = 1 /*!< Scalar is in host-visible memory */ } ncclScalarResidence_t; -/*! @brief ncclRedOpCreatePreMulSum - * Creates a new reduction operator which pre-multiplies input values by a given - * scalar locally before reducing them with peer values via summation. For use - * only with collectives launched against *comm* and *datatype*. The - * *residence* argument indicates how/when the memory pointed to by *scalar* - * will be dereferenced. Upon return, the newly created operator's handle - * is stored in *op*. - */ +/*! 
@brief Create a custom pre-multiplier reduction operator + @details Creates a new reduction operator which pre-multiplies input values by a given + scalar locally before reducing them with peer values via summation. For use + only with collectives launched against *comm* and *datatype*. The + *residence* argument indicates how/when the memory pointed to by *scalar* + will be dereferenced. Upon return, the newly created operator's handle + is stored in *op*. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] op Pointer to where newly created custom reduction operator is to be stored + @param[in] scalar Pointer to scalar value. + @param[in] datatype Scalar value datatype + @param[in] residence Memory type of the scalar value + @param[in] comm Communicator to associate with this custom reduction operator */ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -/// @endcond +/*! @endcond */ -/*! @brief ncclRedOpDestroy - * @details Destroys the reduction operator *op*. The operator must have been created by - * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be - * destroyed as soon as the last NCCL function which is given that operator returns. - */ +/*! @brief Destroy custom reduction operator + @details Destroys the reduction operator *op*. The operator must have been created by + ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be + destroyed as soon as the last RCCL function which is given that operator returns. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] op Custom reduction operator is to be destroyed + @param[in] comm Communicator associated with this reduction operator */ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); -/// @endcond +/*! @endcond */ +/*! @} */ -/* - * Collective communication operations - * - * Collective communication operations must be called separately for each - * communicator in a communicator clique. - * - * They return when operations have been enqueued on the CUDA stream. - * - * Since they may perform inter-CPU synchronization, each call has to be done - * from a different thread or process, or need to use Group Semantics (see - * below). - */ +/*! @defgroup rccl_collective_api Collective Communication Operations + @details Collective communication operations must be called separately for each + communicator in a communicator clique. -/*! - * @brief Reduce - * - * @details Reduces data arrays of length count in sendbuff into recvbuff using op - * operation. - * recvbuff may be NULL on all calls except for root device. - * root is the rank (not the CUDA device) where data will reside after the - * operation is complete. - * - * In-place operation will happen if sendbuff == recvbuff. - */ + They return when operations have been enqueued on the HIP stream. + Since they may perform inter-CPU synchronization, each call has to be done + from a different thread or process, or need to use Group Semantics (see + below). + @{ */ + +/*! @brief Reduce + @details Reduces data arrays of length *count* in *sendbuff* into *recvbuff* using *op* + operation. + *recvbuff* may be NULL on all calls except for root device. + *root* is the rank (not the HIP device) where data will reside after the + operation is complete. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] sendbuff Local device data buffer to be reduced + @param[out] recvbuff Data buffer where result is stored (only for *root* rank). May be null for other ranks. + @param[in] count Number of elements in every send buffer + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator type + @param[in] root Rank where result data array will be stored + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief (deprecated) Broadcast (in-place) - * - * @details Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * This operation is implicitely in place. - */ +/*! @brief (Deprecated) Broadcast (in-place) + @details Copies *count* values from *root* to all other devices. + root is the rank (not the CUDA device) where data resides before the + operation is started. + This operation is implicitly in-place. + @return Result code. See @ref rccl_result_code for more details. + + @param[in,out] buff Input array on *root* to be copied to other ranks. Output array for all ranks. 
+ @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] root Rank owning buffer to be copied to others + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief Broadcast - * - * @details Copies count values from root to all other devices. - * root is the rank (not the HIP device) where data resides before the - * operation is started. - * - * In-place operation will happen if sendbuff == recvbuff. - */ +/*! @brief Broadcast + @details Copies *count* values from *sendbuff* on *root* to *recvbuff* on all devices. + *root* is the rank (not the HIP device) where data resides before the operation is started. + *sendbuff* may be NULL on ranks other than *root*. + In-place operation will happen if *sendbuff* == *recvbuff*. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to copy (if *root*). May be NULL for other ranks + @param[out] recvbuff Data array to store received array + @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] root Rank of broadcast root + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*!
@cond include_hidden */ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief All-Reduce - * - * @details Reduces data arrays of length count in sendbuff using op operation, and - * leaves identical copies of result on each recvbuff. - * - * In-place operation will happen if sendbuff == recvbuff. - */ +/*! @brief All-Reduce + @details Reduces data arrays of length *count* in *sendbuff* using *op* operation, and + leaves identical copies of result on each *recvbuff*. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to reduce + @param[out] recvbuff Data array to store reduced result array + @param[in] count Number of elements in data buffer + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! - * @brief Reduce-Scatter - * - * @details Reduces data in sendbuff using op operation and leaves reduced result - * scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the result. - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ +/*! 
@brief Reduce-Scatter + @details Reduces data in *sendbuff* using *op* operation and leaves reduced result + scattered over the devices so that *recvbuff* on rank i will contain the i-th + block of the result. + Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff* + should have a size of at least nranks*recvcount elements. + In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to reduce + @param[out] recvbuff Data array to store reduced result subarray + @param[in] recvcount Number of elements each rank receives + @param[in] datatype Data buffer element datatype + @param[in] op Reduction operator + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief All-Gather - * - * @details Each device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. - */ +/*! @brief All-Gather + @details Each device gathers *sendcount* values from other GPUs into *recvbuff*, + receiving data from rank i at offset i*sendcount. + Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + should have a size of at least nranks*sendcount elements. 
+ In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Input data array to send + @param[out] recvbuff Data array to store the gathered result + @param[in] sendcount Number of elements each rank sends + @param[in] datatype Data buffer element datatype + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief Send - * - * @details Send data from sendbuff to rank peer. - * Rank peer needs to call ncclRecv with the same datatype and the same count from this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations - * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ - * ncclGroupEnd section. - */ +/*! @brief Send + @details Send data from *sendbuff* to rank *peer*. + Rank *peer* needs to call ncclRecv with the same *datatype* and the same *count* + as this rank. + This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + need to progress concurrently to complete, they must be fused within a ncclGroupStart / + ncclGroupEnd section. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] sendbuff Data array to send + @param[in] count Number of elements to send + @param[in] datatype Data buffer element datatype + @param[in] peer Peer rank to send to + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief Receive - * - * @details Receive data from rank peer into recvbuff. - * Rank peer needs to call ncclSend with the same datatype and the same count to this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations - * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ - * ncclGroupEnd section. - */ +/*! @brief Receive + @details Receive data from rank *peer* into *recvbuff*. + Rank *peer* needs to call ncclSend with the same datatype and the same count + as this rank. + This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + ncclGroupEnd section. + @return Result code. See @ref rccl_result_code for more details. + + @param[out] recvbuff Data array to receive + @param[in] count Number of elements to receive + @param[in] datatype Data buffer element datatype + @param[in] peer Peer rank to receive from + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! 
@cond include_hidden */ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief Gather - * - * @details Root device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. - */ +/*! @brief Gather + @details Root device gathers *sendcount* values from other GPUs into *recvbuff*, + receiving data from rank i at offset i*sendcount. + Assumes recvcount is equal to nranks*sendcount, which means that *recvbuff* + should have a size of at least nranks*sendcount elements. + In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + *recvbuff* may be NULL on ranks other than *root*. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send + @param[out] recvbuff Data array to receive into on *root*. + @param[in] sendcount Number of elements to send per rank + @param[in] datatype Data buffer element datatype + @param[in] root Rank that receives data from all other ranks + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief Scatter - * - * @details Scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the data on root. 
- * - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ +/*! @brief Scatter + @details Scatters data from root over the devices so that recvbuff on rank i will contain the i-th + block of the data on root. + Assumes sendcount is equal to nranks*recvcount, which means that *sendbuff* + should have a size of at least nranks*recvcount elements. + In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send (on *root* rank). May be NULL on other ranks. + @param[out] recvbuff Data array to receive partial subarray into + @param[in] recvcount Number of elements to receive per rank + @param[in] datatype Data buffer element datatype + @param[in] root Rank that scatters data to all other ranks + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief All-To-All - * - * @details Device (i) send (j)th block of data to device (j) and be placed as (i)th - * block. Each block for sending/receiving has count elements, which means - * that recvbuff and sendbuff should have a size of nranks*count elements. - * - * In-place operation will happen if sendbuff == recvbuff. - */ +/*! @brief All-To-All + @details Device (i) sends the (j)th block of data to device (j), where it is placed as the (i)th + block. 
Each block for sending/receiving has *count* elements, which means + that *recvbuff* and *sendbuff* should have a size of nranks*count elements. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. + + @param[in] sendbuff Data array to send (contains blocks for each other rank) + @param[out] recvbuff Data array to receive (contains blocks from each other rank) + @param[in] count Number of elements to send between each pair of ranks + @param[in] datatype Data buffer element datatype + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief All-To-Allv - * - * @details Device (i) sends sendcounts[j] of data from offset sdispls[j] - * to device (j). In the same time, device (i) receives recvcounts[j] of data - * from device (j) to be placed at rdispls[j]. +/*! @brief All-To-Allv + @details Device (i) sends sendcounts[j] of data from offset sdispls[j] + to device (j). At the same time, device (i) receives recvcounts[j] of data + from device (j) to be placed at rdispls[j]. + sendcounts, sdispls, recvcounts and rdispls are all measured in the units + of datatype, not bytes. + In-place operation will happen if sendbuff == recvbuff. + @return Result code. See @ref rccl_result_code for more details. - * sendcounts, sdispls, recvcounts and rdispls are all measured in the units - * of datatype, not bytes. - * - * In-place operation will happen if sendbuff == recvbuff. 
- */ + @param[in] sendbuff Data array to send (contains blocks for each other rank) + @param[in] sendcounts Array containing number of elements to send to each participating rank + @param[in] sdispls Array of offsets into *sendbuff* for each participating rank + @param[out] recvbuff Data array to receive (contains blocks from each other rank) + @param[in] recvcounts Array containing number of elements to receive from each participating rank + @param[in] rdispls Array of offsets into *recvbuff* for each participating rank + @param[in] datatype Data buffer element datatype + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[], void *recvbuff, const size_t recvcounts[], const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[], void *recvbuff, const size_t recvcounts[], const size_t rdispls[], ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -/// @endcond +/*! @endcond */ -/*! @brief Opaque handle to MSCCL algorithm */ +/*! @} */ + +/*! @defgroup msccl_api MSCCL Algorithm + @details API calls relating to the optional MSCCL algorithm datapath + @{ */ + +/*! @brief Opaque handle to MSCCL algorithm */ typedef int mscclAlgoHandle_t; -/*! @brief MSCCL Load Algorithm - * - * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return - * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL - * scheduler instead of end users. - */ -ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); -ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +/*! 
@brief MSCCL Load Algorithm + @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return + its handle via mscclAlgoHandle. This API is expected to be called by MSCCL + scheduler instead of end users. + @return Result code. See @ref rccl_result_code for more details. -/*! @brief MSCCL Run Algorithm - * - * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter - * list merges all possible parameters required by different operations as this - * is a general-purposed API. This API is expected to be called by MSCCL - * scheduler instead of end users. - */ + @param[in] mscclAlgoFilePath Path to MSCCL algorithm file + @param[out] mscclAlgoHandle Returned handle to MSCCL algorithm + @param[in] rank Current rank */ +ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +/*! @cond include_hidden */ +ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +/*! @endcond */ + +/*! @brief MSCCL Run Algorithm + @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter + list merges all possible parameters required by different operations as this + is a general-purpose API. This API is expected to be called by MSCCL + scheduler instead of end users. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] sendBuff Data array to send + @param[in] sendCounts Array containing number of elements to send to each participating rank + @param[in] sDisPls Array of offsets into *sendBuff* for each participating rank + @param[out] recvBuff Data array to receive + @param[in] recvCounts Array containing number of elements to receive from each participating rank + @param[in] rDisPls Array of offsets into *recvBuff* for each participating rank + @param[in] count Number of elements + @param[in] dataType Data buffer element datatype + @param[in] root Root rank index + @param[in] peer Peer rank index + @param[in] op Reduction operator + @param[in] mscclAlgoHandle Handle to MSCCL algorithm + @param[in] comm Communicator group object to execute on + @param[in] stream HIP stream to execute collective on */ ncclResult_t mscclRunAlgo( const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream); +/*! @cond include_hidden */ ncclResult_t pmscclRunAlgo( const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream); +/*! @endcond */ -/*! @brief MSCCL Load Algorithm - * - * @details Unload MSCCL algorithm previous loaded using its handle. This API - * is expected to be called by MSCCL scheduler instead of end users. - */ +/*! @brief MSCCL Unload Algorithm + @details Unload MSCCL algorithm previously loaded using its handle. This API + is expected to be called by MSCCL scheduler instead of end users. + @return Result code. See @ref rccl_result_code for more details. 
+ + @param[in] mscclAlgoHandle Handle to MSCCL algorithm to unload +*/ ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); +/*! @cond include_hidden */ ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); +/*! @endcond */ +/*! @} */ -/* - * Group semantics - * - * When managing multiple GPUs from a single thread, and since NCCL collective - * calls may perform inter-CPU synchronization, we need to "group" calls for - * different ranks/devices into a single call. - * - * Grouping NCCL calls as being part of the same collective operation is done - * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all - * collective calls until the ncclGroupEnd call, which will wait for all calls - * to be complete. Note that for collective communication, ncclGroupEnd only - * guarantees that the operations are enqueued on the streams, not that - * the operation is effectively done. - * - * Both collective communication and ncclCommInitRank can be used in conjunction - * of ncclGroupStart/ncclGroupEnd, but not together. - * - * Group semantics also allow to fuse multiple operations on the same device - * to improve performance (for aggregated collective calls), or to permit - * concurrent progress of multiple send/receive operations. - */ -/*! @brief Group Start - * - * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into - * a single NCCL operation. Nothing will be started on the CUDA stream until - * ncclGroupEnd. - */ +/*! @defgroup rccl_group_api Group semantics + @details When managing multiple GPUs from a single thread, and since RCCL collective + calls may perform inter-CPU synchronization, we need to "group" calls for + different ranks/devices into a single call. + + Grouping RCCL calls as being part of the same collective operation is done + using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + collective calls until the ncclGroupEnd call, which will wait for all calls + to be complete. 
Note that for collective communication, ncclGroupEnd only + guarantees that the operations are enqueued on the streams, not that + the operation is effectively done. + + Both collective communication and ncclCommInitRank can be used in conjunction + with ncclGroupStart/ncclGroupEnd, but not together. + + Group semantics also allow fusing multiple operations on the same device + to improve performance (for aggregated collective calls), or to permit + concurrent progress of multiple send/receive operations. + @{ */ + +/*! @brief Group Start + @details Start a group call. All calls to RCCL until ncclGroupEnd will be fused into + a single RCCL operation. Nothing will be started on the HIP stream until + ncclGroupEnd. + @return Result code. See @ref rccl_result_code for more details. */ ncclResult_t ncclGroupStart(); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclGroupStart(); -/// @endcond +/*! @endcond */ -/*! @brief Group End - * - * End a group call. Start a fused NCCL operation consisting of all calls since - * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations - * need to be called after ncclGroupEnd. - */ +/*! @brief Group End + @details End a group call. Start a fused RCCL operation consisting of all calls since + ncclGroupStart. Operations on the HIP stream depending on the RCCL operations + need to be called after ncclGroupEnd. + @return Result code. See @ref rccl_result_code for more details. */ ncclResult_t ncclGroupEnd(); -/// @cond include_hidden +/*! @cond include_hidden */ ncclResult_t pncclGroupEnd(); -/// @endcond +/*! @endcond */ +/*! @} */ #ifdef __cplusplus } // end extern "C"