Files
rocm-systems/src/include/nvtx3/nvToolsExtCuda.h
T
Sylvain Jeaugey 178b6b7590 2.22.3-1
Rework core for NVIDIA Trusted Computing
 * Compress work structs so that they are shared between channels
 * Utilize the full amount of kernel argument space permitted (4k)
   before resorting to work fifo.
 * Rework the task preprocessing phase.
 * Use a separate abortDevFlag which is kept in sync with abortFlag
   using cudaMemcpy operations.
 * Rename src/include/align.h to src/include/bitops.h

Add lazy connection establishment for collective operations
 * Move buffer allocation and connection establishment to the first
   collective operation using that algorithm.
 * Accelerate init time and reduce memory usage.
 * Avoid allocating NVLS buffers if all calls are registered.
 * Compute algo/proto in ncclLaunchCollTasksInfo early on.
 * Connect peers in ncclCollPreconnectFunc if not connected already.
 * Also move shared buffer creation to the first send/recv call.

Accelerate intra-node NVLink detection
 * Make each rank only detect NVLinks attached to its GPU.
 * Fuse XMLs to reconstruct the full NVLink topology

Add init profiling to report time spend in different init phases.
 * Report timings of bootstrap, allgather, search, connect, etc.
 * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS.

Add support for PCI p2p on split PCI switches
 * Detect split PCI switches through a kernel module exposing
   switch information.
 * Update the topology XML and graph to add those inter-switch
   connections.

Add cost estimation API
 * Add a new ncclGroupEndSimulate primitive to return the estimated
   time a group would take.

Net/IB: Add separate traffic class for fifo messages
 * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages
   independently from NCCL_IB_TC.
   Merges PR #1194

Net/IB: Add support for IB router
 * Use flid instead of lid if subnets do not match
 * Warn if flid is 0

Optimizations and fixes for device network offload (unpack)
 * Double the default number of channels
 * Cache netDeviceType
 * Fix save/increment head logic to enable Tree support.

Support ncclGroupStart/End for ncclCommAbort/Destroy
 * Allow Abort/Destroy to be called within a group when managing
   multiple GPUs with a single process.

Improve Tuner API
 * Provide to the plugin the original cost table so that the plugin
   can leave unknown or disabled algo/proto combinations untouched.
 * Remove nvlsSupport and collnetSupport.

Do not print version to stdout when using a debug file
 * Also print version from all processes with INFO debug level.
   Fixes issue #1271

Fix clang warnings in NVTX headers
 * Update NVTX headers to the latest version
   Fixes issue #1270

Disable port fusion in heterogeneous systems
 * Do not fuse ports if a mix of multi-port and single port are detected.

Fix NVLS graphs search for dual NICs.
 * Fix NVLS graph search when we have more than one NIC per GPU.

Fix crash with collnetDirect
 * Add separate graph search for collnetDirect, testing alltoall paths
   and working similarly to the NVLS search.

Fix hang when nodes have different CPU types
 * Add the CPU type to the rank peer info.
 * Align all ranks on the CPU type after the first allgather.
 * Only use the aligned CPU type for all tuning operations.
   Fixes issue #1136
   Fixes issue #1184

Fix performance of registered send/recv operations
 * Allow for single full size operations
 * Add INFO to confirm the registration of send/recv buffers.

Move all sync ops to finalize stage
 * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has
   been called.

Improve error reporting during SHM segment creation

Improve support of various compilers
   Merges PR #1177
   Merges PR #1228

Allow net and tuner plugins to be statically linked
 * Search for ncclNet or ncclTuner symbols in the main binary.
   Merges PR #979

Plugin examples includes cleanup
 * Harmonize err.h and common.h usage.
 * Add mixed plugin with both net and tuner.
2024-06-19 01:57:16 -07:00

142 خطوط
4.5 KiB
C

/*
* Copyright 2009-2022 NVIDIA Corporation. All rights reserved.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
#include "nvToolsExt.h"
#include "cuda.h"
#ifndef NVTOOLSEXT_CUDA_V3
#define NVTOOLSEXT_CUDA_V3
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* ========================================================================= */
/** \name Functions for CUDA Resource Naming
*/
/** \addtogroup RESOURCE_NAMING
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
*
* This section covers the API functions that allow to annotate CUDA resources
* with user-provided names.
*
* @{
*/
/* ------------------------------------------------------------------------- */
/* \cond SHOW_HIDDEN
* \brief Used to build a non-colliding value for resource types separated class
* \version \NVTX_VERSION_2
*/
#define NVTX_RESOURCE_CLASS_CUDA 4
/** \endcond */
/* ------------------------------------------------------------------------- */
/** \brief Resource types for CUDA
*/
typedef enum nvtxResourceCUDAType_t
{
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
} nvtxResourceCUDAType_t;
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA device.
*
* Allows the user to associate a CUDA device with a user-provided name.
*
* \param device - The handle of the CUDA device to name.
* \param name - The name of the CUDA device.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA context.
*
* Allows the user to associate a CUDA context with a user-provided name.
*
* \param context - The handle of the CUDA context to name.
* \param name - The name of the CUDA context.
*
* \par Example:
* \code
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
* if ( CUDA_SUCCESS != status )
* goto Error;
* nvtxNameCuContext(cuContext, "CTX_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA stream.
*
* Allows the user to associate a CUDA stream with a user-provided name.
*
* \param stream - The handle of the CUDA stream to name.
* \param name - The name of the CUDA stream.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
/** \brief Annotates a CUDA event.
*
* Allows the user to associate a CUDA event with a user-provided name.
*
* \param event - The handle of the CUDA event to name.
* \param name - The name of the CUDA event.
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
/* ========================================================================= */
#ifdef UNICODE
#define nvtxNameCuDevice nvtxNameCuDeviceW
#define nvtxNameCuContext nvtxNameCuContextW
#define nvtxNameCuStream nvtxNameCuStreamW
#define nvtxNameCuEvent nvtxNameCuEventW
#else
#define nvtxNameCuDevice nvtxNameCuDeviceA
#define nvtxNameCuContext nvtxNameCuContextA
#define nvtxNameCuStream nvtxNameCuStreamA
#define nvtxNameCuEvent nvtxNameCuEventA
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifndef NVTX_NO_IMPL
#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
#include "nvtxDetail/nvtxImplCuda_v3.h"
#undef NVTX_IMPL_GUARD_CUDA
#endif /*NVTX_NO_IMPL*/
#endif /* NVTOOLSEXT_CUDA_V3 */