Use rocprofiler-sdk for RCCL-API tracing (#126)
- Add support for RCCL API tracing through rocprofiler-sdk. - Refactor the comm_data code to use the SDK RCCL_API callbacks. - Add a runtime version check for the SDK to gate callback enablement, rather than relying only on the compile-time check. - Fix SAMPLING_TIMEOUT not being handled correctly in add_test.
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
c5507e3740
Коммит
af77d93f75
@@ -179,7 +179,6 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI support" OFF)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI"
|
||||
ON)
|
||||
rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support" OFF)
|
||||
rocprofiler_systems_add_option(
|
||||
ROCPROFSYS_USE_MPI_HEADERS
|
||||
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
|
||||
@@ -209,12 +208,6 @@ elseif("$ENV{ROCPROFSYS_CI}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT ROCPROFSYS_USE_ROCM)
|
||||
set(ROCPROFSYS_USE_RCCL
|
||||
OFF
|
||||
CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_BUILD_TESTING)
|
||||
set(ROCPROFSYS_BUILD_EXAMPLES
|
||||
ON
|
||||
|
||||
@@ -1,94 +0,0 @@
|
||||
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file
|
||||
# Copyright.txt or https://cmake.org/licensing for details.
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
set(RCCL-Headers_INCLUDE_DIR_INTERNAL
|
||||
"${PROJECT_SOURCE_DIR}/source/lib/rocprof-sys/library/tpls/rccl"
|
||||
CACHE PATH "Path to internal rccl.h")
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
|
||||
set(ROCM_PATH "$ENV{ROCM_PATH}")
|
||||
endif()
|
||||
|
||||
foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl)
|
||||
if(EXISTS ${_DIR})
|
||||
get_filename_component(_ABS_DIR "${_DIR}" REALPATH)
|
||||
list(APPEND _RCCL_PATHS ${_ABS_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
find_package(
|
||||
rccl
|
||||
QUIET
|
||||
CONFIG
|
||||
HINTS
|
||||
${_RCCL_PATHS}
|
||||
PATHS
|
||||
${_RCCL_PATHS}
|
||||
PATH_SUFFIXES
|
||||
rccl/lib/cmake)
|
||||
|
||||
if(NOT rccl_FOUND)
|
||||
set(RCCL-Headers_INCLUDE_DIR
|
||||
"${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
|
||||
CACHE PATH "Path to RCCL headers")
|
||||
else()
|
||||
set(RCCL-Headers_INCLUDE_DIR
|
||||
"${rccl_INCLUDE_DIR}"
|
||||
CACHE PATH "Path to RCCL headers")
|
||||
endif()
|
||||
|
||||
# because of the annoying warning starting with v5.2.0, we've got to do this crap
|
||||
if(ROCmVersion_NUMERIC_VERSION)
|
||||
if(ROCmVersion_NUMERIC_VERSION LESS 50200)
|
||||
set(_RCCL-Headers_FILE "rccl.h")
|
||||
set(_RCCL-Headers_DIR "/rccl")
|
||||
else()
|
||||
set(_RCCL-Headers_FILE "rccl/rccl.h")
|
||||
set(_RCCL-Headers_DIR "")
|
||||
endif()
|
||||
else()
|
||||
set(_RCCL-Headers_FILE "rccl/rccl.h")
|
||||
set(_RCCL-Headers_DIR "")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}")
|
||||
rocprofiler_systems_message(
|
||||
AUTHOR_WARNING
|
||||
"RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
|
||||
)
|
||||
set(RCCL-Headers_INCLUDE_DIR
|
||||
"${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}"
|
||||
CACHE PATH "Path to RCCL headers" FORCE)
|
||||
endif()
|
||||
|
||||
unset(_RCCL-Headers_FILE)
|
||||
unset(_RCCL-Headers_DIR)
|
||||
|
||||
mark_as_advanced(RCCL-Headers_INCLUDE_DIR)
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR)
|
||||
|
||||
# ------------------------------------------------------------------------------#
|
||||
|
||||
if(RCCL-Headers_FOUND)
|
||||
add_library(roc::rccl-headers INTERFACE IMPORTED)
|
||||
set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR})
|
||||
|
||||
target_include_directories(roc::rccl-headers SYSTEM
|
||||
INTERFACE ${RCCL-Headers_INCLUDE_DIR})
|
||||
|
||||
add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED)
|
||||
target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers)
|
||||
endif()
|
||||
|
||||
# ------------------------------------------------------------------------------#
|
||||
@@ -17,9 +17,6 @@ rocprofiler_systems_add_interface_library(
|
||||
"Provides flags and libraries for Dyninst (dynamic instrumentation)")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm
|
||||
"Provides flags and libraries for ROCm")
|
||||
rocprofiler_systems_add_interface_library(
|
||||
rocprofiler-systems-rccl
|
||||
"Provides flags for ROCm Communication Collectives Library (RCCL)")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-mpi
|
||||
"Provides MPI or MPI headers")
|
||||
rocprofiler_systems_add_interface_library(rocprofiler-systems-libva
|
||||
@@ -47,7 +44,6 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition
|
||||
# libraries with relevant compile definitions
|
||||
set(ROCPROFSYS_EXTENSION_LIBRARIES
|
||||
rocprofiler-systems::rocprofiler-systems-rocm
|
||||
rocprofiler-systems::rocprofiler-systems-rccl
|
||||
rocprofiler-systems::rocprofiler-systems-bfd
|
||||
rocprofiler-systems::rocprofiler-systems-mpi
|
||||
rocprofiler-systems::rocprofiler-systems-ptl
|
||||
@@ -185,19 +181,6 @@ if(ROCPROFSYS_USE_ROCM)
|
||||
target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# RCCL
|
||||
#
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
|
||||
if(ROCPROFSYS_USE_RCCL)
|
||||
find_package(RCCL-Headers ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
|
||||
target_link_libraries(rocprofiler-systems-rccl INTERFACE roc::rccl-headers)
|
||||
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rccl
|
||||
INTERFACE ROCPROFSYS_USE_RCCL)
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# MPI
|
||||
|
||||
@@ -834,10 +834,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
|
||||
(defined(ROCPROFSYS_USE_MPI_HEADERS) && ROCPROFSYS_USE_MPI_HEADERS > 0)
|
||||
add_default_env(_env, "ROCPROFSYS_USE_MPIP", true);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
|
||||
add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true);
|
||||
#endif
|
||||
}
|
||||
|
||||
_fill("ROCPROFSYS_CAUSAL_BINARY_EXCLUDE", _binary_excludes, _generate_configs);
|
||||
|
||||
@@ -730,11 +730,10 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
}
|
||||
});
|
||||
|
||||
std::set<std::string> _backend_choices = {
|
||||
"all", "kokkosp", "mpip", "ompt",
|
||||
"rcclp", "amd-smi", "mutex-locks", "spin-locks",
|
||||
"rw-locks", "rocprofiler-sdk", "rocm"
|
||||
};
|
||||
std::set<std::string> _backend_choices = { "all", "kokkosp", "mpip",
|
||||
"ompt", "rcclp", "amd-smi",
|
||||
"mutex-locks", "spin-locks", "rw-locks",
|
||||
"rocm" };
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
|
||||
_backend_choices.erase("mpip");
|
||||
@@ -744,14 +743,10 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
_backend_choices.erase("ompt");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_RCCL)
|
||||
_backend_choices.erase("rcclp");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
_backend_choices.erase("rocm");
|
||||
_backend_choices.erase("amd-smi");
|
||||
_backend_choices.erase("rocprofiler-sdk");
|
||||
_backend_choices.erase("rcclp");
|
||||
#endif
|
||||
|
||||
parser.start_group("BACKEND OPTIONS",
|
||||
|
||||
@@ -44,7 +44,6 @@ target_link_libraries(
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-libva>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-ptl>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rccl>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libgcc-optional>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libstdcxx-optional>
|
||||
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-sanitizer>
|
||||
|
||||
@@ -35,7 +35,6 @@ set(core_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/mproc.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rccl.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/redirect.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/state.hpp
|
||||
|
||||
@@ -562,10 +562,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_data.processed_environs.emplace("periods");
|
||||
}
|
||||
|
||||
strset_t _backend_choices = {
|
||||
"all", "kokkosp", "mpip", "ompt", "rcclp",
|
||||
"amd-smi", "rocprofiler-sdk", "mutex-locks", "spin-locks", "rw-locks"
|
||||
};
|
||||
strset_t _backend_choices = { "all", "kokkosp", "mpip", "ompt",
|
||||
"rcclp", "amd-smi", "rocm", "mutex-locks",
|
||||
"spin-locks", "rw-locks" };
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
|
||||
_backend_choices.erase("mpip");
|
||||
@@ -575,14 +574,10 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
_backend_choices.erase("ompt");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_RCCL)
|
||||
_backend_choices.erase("rcclp");
|
||||
#endif
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
_backend_choices.erase("amd-smi");
|
||||
_backend_choices.erase("rocprofiler-sdk");
|
||||
_backend_choices.erase("rocm");
|
||||
_backend_choices.erase("rcclp");
|
||||
#endif
|
||||
|
||||
if(gpu::device_count() == 0)
|
||||
@@ -590,13 +585,8 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
// remove GPU-specific backends
|
||||
_backend_choices.erase("rcclp");
|
||||
_backend_choices.erase("amd-smi");
|
||||
_backend_choices.erase("rocprofiler-sdk");
|
||||
_backend_choices.erase("rocm");
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL)
|
||||
update_env(_data, "ROCPROFSYS_USE_RCCLP", false);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM)
|
||||
update_env(_data, "ROCPROFSYS_USE_AMD_SMI", false);
|
||||
update_env(_data, "ROCPROFSYS_USE_ROCM", false);
|
||||
|
||||
@@ -102,6 +102,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGOR
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm RocDecode API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocjpeg_api, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, "rocm_rocjpeg_api", "ROCm RocJPEG API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl_api, ROCPROFSYS_CATEGORY_ROCM_RCCL_API, "rocm_rccl_api", "ROCm RCCL API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "amd_smi", "AMD-SMI data")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_gfx_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, "device_busy_gfx", "Busy percentage of GFX engine on a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_umc_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, "device_busy_umc", "Busy percentage of UMC engine on a GPU device")
|
||||
@@ -171,6 +172,7 @@ using name = perfetto_category<Tp...>;
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocdecode_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocjpeg_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl_api), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_gfx_busy), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_umc_busy), \
|
||||
|
||||
@@ -1359,6 +1359,14 @@ configure_disabled_settings(const std::shared_ptr<settings>& _config)
|
||||
_config->find("ROCPROFSYS_USE_AMD_SMI")->second->set_hidden(true);
|
||||
for(const auto& itr : _config->disable_category("amd_smi"))
|
||||
_config->find(itr)->second->set_hidden(true);
|
||||
|
||||
_config->find("ROCPROFSYS_USE_RCCLP")->second->set_hidden(true);
|
||||
for(const auto& itr : _config->disable_category("rcclp"))
|
||||
_config->find(itr)->second->set_hidden(true);
|
||||
|
||||
_config->find("ROCPROFSYS_USE_ROCM")->second->set_hidden(true);
|
||||
for(const auto& itr : _config->disable_category("rocm"))
|
||||
_config->find(itr)->second->set_hidden(true);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_OMPT) || ROCPROFSYS_USE_OMPT == 0
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/defines.hpp"
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
|
||||
# include <rccl/rccl.h>
|
||||
#endif
|
||||
@@ -218,6 +218,30 @@ get_operations_impl(const std::unordered_set<int32_t>& _complete,
|
||||
|
||||
} // namespace
|
||||
|
||||
/// @brief Return the version of the rocprofiler-sdk
|
||||
/// @return The version of the rocprofiler-sdk or 0 if not initialized
|
||||
version_info&
|
||||
get_version()
|
||||
{
|
||||
static auto _version = version_info{ 0 };
|
||||
|
||||
if(_version.formatted == 0)
|
||||
{
|
||||
uint32_t _major = 0;
|
||||
uint32_t _minor = 0;
|
||||
uint32_t _patch = 0;
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_get_version(&_major, &_minor, &_patch));
|
||||
|
||||
_version.major = _major;
|
||||
_version.minor = _minor;
|
||||
_version.patch = _patch;
|
||||
_version.formatted = _major * 10000 + _minor * 100 + _patch;
|
||||
}
|
||||
|
||||
return _version;
|
||||
}
|
||||
|
||||
void
|
||||
config_settings(const std::shared_ptr<settings>& _config)
|
||||
{
|
||||
@@ -319,6 +343,7 @@ config_settings(const std::shared_ptr<settings>& _config)
|
||||
join::join(join::array_config{ ", ", "", "" }, _domain_choices));
|
||||
auto _domain_defaults = std::string{ "hip_runtime_api,marker_api,kernel_dispatch,"
|
||||
"memory_copy,scratch_memory" };
|
||||
|
||||
# if(ROCPROFILER_VERSION < 10000)
|
||||
_domain_defaults.append(",page_migration");
|
||||
# endif
|
||||
@@ -353,28 +378,48 @@ std::unordered_set<rocprofiler_callback_tracing_kind_t>
|
||||
get_callback_domains()
|
||||
{
|
||||
const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
|
||||
const auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>
|
||||
{
|
||||
auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>{
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
# if(ROCPROFILER_VERSION >= 700)
|
||||
ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API,
|
||||
# endif
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
};
|
||||
|
||||
auto _version = get_version();
|
||||
ROCPROFSYS_WARNING_IF(_version.formatted == 0,
|
||||
"Warning! rocprofiler-sdk version not initialized\n");
|
||||
|
||||
# if(ROCPROFILER_VERSION >= 600)
|
||||
if(_version.formatted >= 600)
|
||||
{
|
||||
// RCCL and rocDecode API callback tracing are enabled for rocprofiler-sdk 0.6.0 and later
|
||||
supported.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API);
|
||||
supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API);
|
||||
}
|
||||
# endif
|
||||
# if(ROCPROFILER_VERSION >= 700)
|
||||
if(_version.formatted >= 700)
|
||||
{
|
||||
supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API);
|
||||
}
|
||||
# endif
|
||||
|
||||
auto _data = std::unordered_set<rocprofiler_callback_tracing_kind_t>{};
|
||||
auto _domains =
|
||||
tim::delimit(config::get_setting_value<std::string>("ROCPROFSYS_ROCM_DOMAINS")
|
||||
.value_or(std::string{}),
|
||||
" ,;:\t\n");
|
||||
|
||||
if(config::get_use_rcclp() && _version.formatted >= 600)
|
||||
{
|
||||
// Translate ROCPROFSYS_USE_RCCLP to entry in ROCPROFSYS_ROCM_DOMAINS
|
||||
_data.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API);
|
||||
}
|
||||
|
||||
const auto valid_choices =
|
||||
settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices();
|
||||
|
||||
@@ -582,6 +627,12 @@ void
|
||||
config_settings(const std::shared_ptr<settings>&)
|
||||
{}
|
||||
|
||||
version_info&
|
||||
get_version()
|
||||
{
|
||||
static auto _version = version_info{ 0 };
|
||||
return _version;
|
||||
}
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
|
||||
|
||||
@@ -39,9 +39,20 @@ namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
struct version_info
|
||||
{
|
||||
uint32_t major = 0;
|
||||
uint32_t minor = 0;
|
||||
uint32_t patch = 0;
|
||||
uint32_t formatted = 0; // major * 10000 + minor * 100 + patch
|
||||
};
|
||||
|
||||
void
|
||||
config_settings(const std::shared_ptr<settings>&);
|
||||
|
||||
version_info&
|
||||
get_version();
|
||||
|
||||
#if defined(ROCPROFSYS_USE_ROCM)
|
||||
|
||||
std::unordered_set<rocprofiler_callback_tracing_kind_t>
|
||||
|
||||
@@ -54,6 +54,7 @@ extern "C"
|
||||
ROCPROFSYS_CATEGORY_ROCM_MARKER_API,
|
||||
ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API,
|
||||
ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API,
|
||||
ROCPROFSYS_CATEGORY_ROCM_RCCL_API,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC,
|
||||
|
||||
@@ -122,7 +122,7 @@ rocprofsys_init_library(void)
|
||||
extern "C" void
|
||||
rocprofsys_init_tooling(void)
|
||||
{
|
||||
rocprofsys_init_tooling_hidden(true);
|
||||
rocprofsys_init_tooling_hidden();
|
||||
}
|
||||
|
||||
extern "C" void
|
||||
|
||||
@@ -95,7 +95,7 @@ extern "C"
|
||||
|
||||
// these are the real implementations for internal calling convention
|
||||
void rocprofsys_init_library_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
bool rocprofsys_init_tooling_hidden(bool postinit = false) ROCPROFSYS_HIDDEN_API;
|
||||
bool rocprofsys_init_tooling_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_init_hidden(const char*, bool, const char*) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_finalize_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_reset_preload_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
|
||||
@@ -53,7 +53,6 @@
|
||||
#include "library/ompt.hpp"
|
||||
#include "library/process_sampler.hpp"
|
||||
#include "library/ptl.hpp"
|
||||
#include "library/rcclp.hpp"
|
||||
#include "library/rocprofiler-sdk.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
@@ -404,42 +403,16 @@ rocprofsys_init_library_hidden()
|
||||
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n");
|
||||
}
|
||||
|
||||
// Initialize RCCL if:
|
||||
// - postinit=true - so the code doesn't hang at the initialization stage
|
||||
// - get_state() >= State::Init - so the code doesn't throw an exception
|
||||
// - rccl_initialized=false - so we don't try to initialize RCCL twice
|
||||
// - get_use_rcclp()=true - only if the environment is configured to use RCCL
|
||||
static void
|
||||
rccl_setup(bool postinit)
|
||||
{
|
||||
// Flag used to avoid initializing RCCL twice
|
||||
static bool rccl_initialized = false;
|
||||
|
||||
if(postinit && (get_state() >= State::Init) && !rccl_initialized && get_use_rcclp())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n");
|
||||
rcclp::setup();
|
||||
rccl_initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
rocprofsys_init_library_hidden_with_rccl(bool postinit)
|
||||
{
|
||||
rocprofsys_init_library_hidden();
|
||||
rccl_setup(postinit);
|
||||
}
|
||||
|
||||
//======================================================================================//
|
||||
|
||||
extern "C" bool
|
||||
rocprofsys_init_tooling_hidden(bool postinit)
|
||||
rocprofsys_init_tooling_hidden(void)
|
||||
{
|
||||
if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true;
|
||||
|
||||
if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true))
|
||||
{
|
||||
rocprofsys_init_library_hidden_with_rccl(postinit);
|
||||
rocprofsys_init_library_hidden();
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -458,7 +431,6 @@ rocprofsys_init_tooling_hidden(bool postinit)
|
||||
|
||||
if(get_state() != State::PreInit || get_state() == State::Init || _once)
|
||||
{
|
||||
rccl_setup(postinit);
|
||||
return false;
|
||||
}
|
||||
_once = true;
|
||||
@@ -481,7 +453,7 @@ rocprofsys_init_tooling_hidden(bool postinit)
|
||||
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init,
|
||||
"Calling rocprofsys_init_library()...\n");
|
||||
|
||||
rocprofsys_init_library_hidden_with_rccl(postinit);
|
||||
rocprofsys_init_library_hidden();
|
||||
|
||||
ROCPROFSYS_DEBUG_F("\n");
|
||||
|
||||
@@ -807,12 +779,6 @@ rocprofsys_finalize_hidden(void)
|
||||
component::vaapi_gotcha::shutdown();
|
||||
}
|
||||
|
||||
if(get_use_rcclp())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down RCCLP...\n");
|
||||
rcclp::shutdown();
|
||||
}
|
||||
|
||||
if(get_use_ompt())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down OMPT...\n");
|
||||
|
||||
@@ -20,7 +20,6 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/process_sampler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/ptl.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocm.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/amd_smi.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
|
||||
@@ -34,11 +33,6 @@ set(library_headers
|
||||
target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources}
|
||||
${library_headers})
|
||||
|
||||
if(ROCPROFSYS_USE_RCCL)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_USE_ROCM)
|
||||
target_sources(
|
||||
rocprofiler-systems-object-library
|
||||
@@ -56,7 +50,6 @@ add_subdirectory(tracing)
|
||||
set(ndebug_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/components/mpi_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/components/backtrace_metrics.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/ompt.cpp)
|
||||
|
||||
@@ -30,15 +30,9 @@ set(component_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/vaapi_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp)
|
||||
|
||||
target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources}
|
||||
${component_headers})
|
||||
|
||||
if(ROCPROFSYS_USE_RCCL)
|
||||
target_sources(rocprofiler-systems-object-library
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
|
||||
endif()
|
||||
|
||||
@@ -298,6 +298,8 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sen
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL)
|
||||
// Kept for reference, but now gathered through the SDK callbacks.
|
||||
|
||||
// ncclReduce
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
@@ -403,6 +405,7 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const v
|
||||
}
|
||||
|
||||
// ncclAllGather
|
||||
// ncclAllToAll
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
|
||||
|
||||
@@ -26,7 +26,6 @@
|
||||
#include "core/common.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "core/rccl.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
|
||||
@@ -78,18 +77,6 @@ struct comm_data : base<comm_data, void>
|
||||
static constexpr auto label = "MPI Comm Send";
|
||||
};
|
||||
|
||||
struct rccl_recv
|
||||
{
|
||||
static constexpr auto value = "comm_data";
|
||||
static constexpr auto label = "RCCL Comm Recv";
|
||||
};
|
||||
|
||||
struct rccl_send
|
||||
{
|
||||
static constexpr auto value = "comm_data";
|
||||
static constexpr auto label = "RCCL Comm Send";
|
||||
};
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(comm_data)
|
||||
|
||||
static void preinit();
|
||||
@@ -148,55 +135,6 @@ struct comm_data : base<comm_data, void>
|
||||
MPI_Datatype recvtype, MPI_Comm);
|
||||
#endif
|
||||
|
||||
#if defined(ROCPROFSYS_USE_RCCL)
|
||||
static auto rccl_type_size(ncclDataType_t datatype)
|
||||
{
|
||||
switch(datatype)
|
||||
{
|
||||
case ncclInt8:
|
||||
case ncclUint8: return 1;
|
||||
case ncclFloat16: return 2;
|
||||
case ncclInt32:
|
||||
case ncclUint32:
|
||||
case ncclFloat32: return 4;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
case ncclFloat64: return 8;
|
||||
default: return 0;
|
||||
};
|
||||
}
|
||||
|
||||
// ncclReduce
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
|
||||
ncclComm_t, hipStream_t);
|
||||
|
||||
// ncclSend
|
||||
// ncclGather
|
||||
// ncclBcast
|
||||
// ncclRecv
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*,
|
||||
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
|
||||
hipStream_t);
|
||||
|
||||
// ncclBroadcast
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
|
||||
hipStream_t);
|
||||
|
||||
// ncclAllReduce
|
||||
// ncclReduceScatter
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
|
||||
hipStream_t);
|
||||
|
||||
// ncclAllGather
|
||||
// ncclAlltoAll
|
||||
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);
|
||||
|
||||
#endif
|
||||
|
||||
private:
|
||||
static auto& add(tracker_t& _t, data_type value)
|
||||
{
|
||||
|
||||
@@ -1,195 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/rcclp.hpp"
|
||||
#include "library/rcclp.hpp"
|
||||
|
||||
#include <timemory/manager.hpp>
|
||||
|
||||
std::ostream&
|
||||
operator<<(std::ostream& _os, const ncclUniqueId& _v)
|
||||
{
|
||||
for(auto itr : _v.internal)
|
||||
_os << itr;
|
||||
return _os;
|
||||
}
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
uint64_t
|
||||
activate_rcclp()
|
||||
{
|
||||
using handle_t = tim::component::rcclp_handle;
|
||||
|
||||
static auto _handle = std::shared_ptr<handle_t>{};
|
||||
|
||||
if(!_handle.get())
|
||||
{
|
||||
_handle = std::make_shared<handle_t>();
|
||||
_handle->start();
|
||||
|
||||
auto cleanup_functor = [=]() {
|
||||
if(_handle)
|
||||
{
|
||||
_handle->stop();
|
||||
_handle.reset();
|
||||
}
|
||||
};
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
|
||||
<< demangle<category::rocm_rccl>();
|
||||
tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
//
|
||||
//======================================================================================//
|
||||
//
|
||||
uint64_t
|
||||
deactivate_rcclp(uint64_t id)
|
||||
{
|
||||
if(id > 0)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
|
||||
<< demangle<category::rocm_rccl>();
|
||||
tim::manager::instance()->cleanup(ss.str());
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
//
|
||||
//======================================================================================//
|
||||
//
|
||||
void
|
||||
configure_rcclp(const std::set<std::string>& permit, const std::set<std::string>& reject)
|
||||
{
|
||||
static bool is_initialized = false;
|
||||
if(!is_initialized)
|
||||
{
|
||||
// generate the gotcha wrappers
|
||||
rcclp_gotcha_t::get_initializer() = []() {
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion);
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll);
|
||||
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv);
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum);
|
||||
// TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy);
|
||||
};
|
||||
|
||||
// provide environment variable for suppressing wrappers
|
||||
rcclp_gotcha_t::get_reject_list() = [reject]() {
|
||||
auto _reject = reject;
|
||||
// check environment
|
||||
auto reject_list =
|
||||
tim::get_env<std::string>("ROCPROFSYS_RCCLP_REJECT_LIST", "");
|
||||
// add environment setting
|
||||
for(const auto& itr : tim::delimit(reject_list))
|
||||
_reject.insert(itr);
|
||||
return _reject;
|
||||
};
|
||||
|
||||
// provide environment variable for selecting wrappers
|
||||
rcclp_gotcha_t::get_permit_list() = [permit]() {
|
||||
auto _permit = permit;
|
||||
// check environment
|
||||
auto permit_list =
|
||||
tim::get_env<std::string>("ROCPROFSYS_RCCLP_PERMIT_LIST", "");
|
||||
// add environment setting
|
||||
for(const auto& itr : tim::delimit(permit_list))
|
||||
_permit.insert(itr);
|
||||
return _permit;
|
||||
};
|
||||
|
||||
is_initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
rcclp_handle::start()
|
||||
{
|
||||
if(get_tool_count()++ == 0)
|
||||
{
|
||||
get_tool_instance() = std::make_shared<rcclp_tuple_t>("timemory_rcclp");
|
||||
get_tool_instance()->start();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
rcclp_handle::stop()
|
||||
{
|
||||
auto idx = --get_tool_count();
|
||||
if(get_tool_instance().get())
|
||||
{
|
||||
get_tool_instance()->stop();
|
||||
if(idx == 0) get_tool_instance().reset();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the single function-local static persistent_data object that backs
// the other accessors of rcclp_handle.
rcclp_handle::persistent_data&
rcclp_handle::get_persistent_data()
{
    static persistent_data _storage;
    return _storage;
}
|
||||
|
||||
std::atomic<short>&
|
||||
rcclp_handle::get_configured()
|
||||
{
|
||||
return get_persistent_data().m_configured;
|
||||
}
|
||||
|
||||
// Accessor for the shared tool pointer (`m_tool`) stored in the persistent data.
rcclp_handle::toolset_ptr_t&
rcclp_handle::get_tool_instance()
{
    auto& _data = get_persistent_data();
    return _data.m_tool;
}
|
||||
|
||||
std::atomic<int64_t>&
|
||||
rcclp_handle::get_tool_count()
|
||||
{
|
||||
return get_persistent_data().m_count;
|
||||
}
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
@@ -1,106 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "core/common.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "core/rccl.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/comm_data.hpp"
|
||||
|
||||
#include <timemory/api/macros.hpp>
|
||||
#include <timemory/components/macros.hpp>
|
||||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#if !defined(ROCPROFSYS_NUM_RCCLP_WRAPPERS)
|
||||
# define ROCPROFSYS_NUM_RCCLP_WRAPPERS 25
|
||||
#endif
|
||||
|
||||
ROCPROFSYS_COMPONENT_ALIAS(
|
||||
rccl_toolset_t,
|
||||
::tim::component_bundle<category::rocm_rccl,
|
||||
rocprofsys::component::category_region<category::rocm_rccl>,
|
||||
comm_data>)
|
||||
ROCPROFSYS_COMPONENT_ALIAS(rcclp_gotcha_t,
|
||||
::tim::component::gotcha<ROCPROFSYS_NUM_RCCLP_WRAPPERS,
|
||||
rccl_toolset_t, category::rocm_rccl>)
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_RCCL)
|
||||
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type)
|
||||
#endif
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
uint64_t
|
||||
activate_rcclp();
|
||||
|
||||
uint64_t
|
||||
deactivate_rcclp(uint64_t id);
|
||||
|
||||
void
|
||||
configure_rcclp(const std::set<std::string>& permit = {},
|
||||
const std::set<std::string>& reject = {});
|
||||
|
||||
struct rcclp_handle : base<rcclp_handle, void>
|
||||
{
|
||||
static constexpr size_t rcclp_wrapper_count = ROCPROFSYS_NUM_RCCLP_WRAPPERS;
|
||||
|
||||
using value_type = void;
|
||||
using this_type = rcclp_handle;
|
||||
using base_type = base<this_type, value_type>;
|
||||
|
||||
using rcclp_tuple_t = tim::component_tuple<rcclp_gotcha_t>;
|
||||
using toolset_ptr_t = std::shared_ptr<rcclp_tuple_t>;
|
||||
|
||||
static std::string label() { return "rcclp_handle"; }
|
||||
static std::string description() { return "Handle for activating NCCL wrappers"; }
|
||||
static void get() {}
|
||||
static void start();
|
||||
static void stop();
|
||||
static int get_count() { return get_tool_count().load(); }
|
||||
|
||||
private:
|
||||
struct persistent_data
|
||||
{
|
||||
std::atomic<short> m_configured{ 0 };
|
||||
std::atomic<int64_t> m_count{ 0 };
|
||||
toolset_ptr_t m_tool = toolset_ptr_t{};
|
||||
};
|
||||
|
||||
static persistent_data& get_persistent_data();
|
||||
static std::atomic<short>& get_configured();
|
||||
static toolset_ptr_t& get_tool_instance();
|
||||
static std::atomic<int64_t>& get_tool_count();
|
||||
};
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
@@ -1,87 +0,0 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/rcclp.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "core/dynamic_library.hpp"
|
||||
#include "core/rccl.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
|
||||
#include <timemory/timemory.hpp>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace
{
// Result of component::activate_rcclp(); stays at the maximum uint64_t value
// until setup() has run.
uint64_t global_id = std::numeric_limits<uint64_t>::max();
}  // namespace
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rcclp
|
||||
{
|
||||
// Intentionally empty: there is nothing to configure at this level. Kept because
// setup() calls it.
void
configure()
{
}
|
||||
|
||||
void
|
||||
setup()
|
||||
{
|
||||
configure();
|
||||
|
||||
// make sure the symbols are loaded to be wrapped
|
||||
dynamic_library _librccl{
|
||||
"ROCPROFSYS_RCCL_LIBRARY", "librccl.so", RTLD_NOW | RTLD_GLOBAL, true, true, true
|
||||
};
|
||||
|
||||
auto _use_data = tim::get_env("ROCPROFSYS_RCCLP_COMM_DATA", get_use_timemory());
|
||||
if(!get_use_timemory())
|
||||
{
|
||||
trait::runtime_enabled<component::comm_data>::set(false);
|
||||
trait::runtime_enabled<component::comm_data_tracker_t>::set(false);
|
||||
}
|
||||
else
|
||||
{
|
||||
trait::runtime_enabled<component::comm_data>::set(_use_data);
|
||||
trait::runtime_enabled<component::comm_data_tracker_t>::set(_use_data);
|
||||
}
|
||||
|
||||
component::configure_rcclp();
|
||||
global_id = component::activate_rcclp();
|
||||
}
|
||||
|
||||
void
|
||||
shutdown()
|
||||
{
|
||||
if(global_id < std::numeric_limits<uint64_t>::max())
|
||||
component::deactivate_rcclp(global_id);
|
||||
}
|
||||
} // namespace rcclp
|
||||
} // namespace rocprofsys
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/rocprofiler-sdk/counters.hpp"
|
||||
#include "library/rocprofiler-sdk/fwd.hpp"
|
||||
#include "library/rocprofiler-sdk/rccl.hpp"
|
||||
#include "library/thread_info.hpp"
|
||||
#include "library/tracing.hpp"
|
||||
|
||||
@@ -482,6 +483,17 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto ts = rocprofiler_timestamp_t{};
|
||||
ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts));
|
||||
|
||||
const char* name = nullptr;
|
||||
rocprofiler_query_callback_tracing_kind_operation_name(record.kind, record.operation,
|
||||
&name, nullptr);
|
||||
|
||||
auto info = std::stringstream{};
|
||||
info << std::left << "tid=" << record.thread_id << ", cid=" << std::setw(3)
|
||||
<< record.correlation_id.internal << ", kind=" << std::setw(2) << record.kind
|
||||
<< ", operation=" << std::setw(3) << record.operation
|
||||
<< ", phase=" << record.phase << ", dt_nsec=" << std::setw(8) << ts
|
||||
<< ", name=" << name;
|
||||
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
|
||||
{
|
||||
user_data->value = ts;
|
||||
@@ -525,6 +537,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
|
||||
{
|
||||
tool_tracing_callback_start(category::rocm_rccl_api{}, record, user_data,
|
||||
ts);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_NONE:
|
||||
case ROCPROFILER_CALLBACK_TRACING_LAST:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API:
|
||||
@@ -533,7 +551,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
|
||||
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
|
||||
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
case ROCPROFILER_CALLBACK_TRACING_OMPT:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
|
||||
@@ -616,6 +633,13 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
|
||||
{
|
||||
tool_tracing_callback_rccl(record, user_data->value, ts);
|
||||
tool_tracing_callback_stop(category::rocm_rccl_api{}, record, user_data,
|
||||
ts, _bt_data);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_NONE:
|
||||
case ROCPROFILER_CALLBACK_TRACING_LAST:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API:
|
||||
@@ -624,7 +648,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
|
||||
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
|
||||
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
case ROCPROFILER_CALLBACK_TRACING_OMPT:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
|
||||
@@ -651,6 +674,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
_data->dispatch_info.dispatch_id,
|
||||
timing_interval{ _data->start_timestamp, _data->end_timestamp });
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCPROFSYS_WARNING_F(
|
||||
1, "tool_tracing_callback: unhandled PHASE_NONE callback record\n\t%s\n",
|
||||
info.str().c_str());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1025,13 +1054,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
|
||||
{
|
||||
auto domains = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS");
|
||||
|
||||
ROCPROFSYS_VERBOSE_F(1, "rocprof-sys ROCm Domains:\n");
|
||||
ROCPROFSYS_VERBOSE_F(1, "Available ROCm Domains:\n");
|
||||
for(const auto& itr : domains->get_choices())
|
||||
ROCPROFSYS_VERBOSE_F(1, "- %s\n", itr.c_str());
|
||||
|
||||
auto _callback_domains = rocprofiler_sdk::get_callback_domains();
|
||||
auto _buffered_domain = rocprofiler_sdk::get_buffered_domains();
|
||||
auto _counter_events = rocprofiler_sdk::get_rocm_events();
|
||||
auto _version = rocprofiler_sdk::get_version();
|
||||
ROCPROFSYS_WARNING_IF(_version.formatted == 0,
|
||||
"Warning! rocprofiler-sdk version not initialized\n");
|
||||
|
||||
auto* _data = as_client_data(user_data);
|
||||
_data->client_fini = fini_func;
|
||||
@@ -1052,11 +1084,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
|
||||
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
|
||||
#if(ROCPROFILER_VERSION >= 700)
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_RCCL_API,
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API,
|
||||
#endif
|
||||
#if(ROCPROFILER_VERSION >= 700)
|
||||
ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API,
|
||||
#endif
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API
|
||||
})
|
||||
{
|
||||
if(_callback_domains.count(itr) > 0)
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
#
|
||||
set(rocprofiler_sdk_sources ${CMAKE_CURRENT_LIST_DIR}/counters.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/fwd.cpp)
|
||||
set(rocprofiler_sdk_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/counters.cpp ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rccl.cpp)
|
||||
|
||||
set(rocprofiler_sdk_headers ${CMAKE_CURRENT_LIST_DIR}/counters.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/fwd.hpp)
|
||||
set(rocprofiler_sdk_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/counters.hpp ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/rccl.hpp)
|
||||
|
||||
target_sources(rocprofiler-systems-object-library PRIVATE ${rocprofiler_sdk_sources}
|
||||
${rocprofiler_sdk_headers})
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/rocprofiler-sdk/rccl.hpp"
|
||||
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/perfetto.hpp"
|
||||
|
||||
#include "library/tracing.hpp"
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
namespace
|
||||
{
|
||||
struct rccl_recv
|
||||
{
|
||||
static constexpr auto value = "comm_data";
|
||||
static constexpr auto label = "RCCL Comm Recv";
|
||||
};
|
||||
|
||||
struct rccl_send
|
||||
{
|
||||
static constexpr auto value = "comm_data";
|
||||
static constexpr auto label = "RCCL Comm Send";
|
||||
};
|
||||
|
||||
template <typename Tp, typename... Args>
|
||||
void
|
||||
write_perfetto_counter_track(uint64_t _val, uint64_t _begin_ts, uint64_t _end_ts)
|
||||
{
|
||||
using counter_track = rocprofsys::perfetto_counter_track<Tp>;
|
||||
|
||||
if(rocprofsys::get_use_perfetto() &&
|
||||
rocprofsys::get_state() == rocprofsys::State::Active)
|
||||
{
|
||||
const size_t _idx = 0;
|
||||
|
||||
if(!counter_track::exists(_idx))
|
||||
{
|
||||
std::string _label =
|
||||
(_idx > 0) ? JOIN(" ", Tp::label, JOIN("", '[', _idx, ']')) : Tp::label;
|
||||
counter_track::emplace(_idx, _label, "bytes");
|
||||
}
|
||||
|
||||
TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _begin_ts, _val);
|
||||
TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _end_ts, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static auto
|
||||
rccl_type_size(ncclDataType_t datatype)
|
||||
{
|
||||
switch(datatype)
|
||||
{
|
||||
case ncclInt8:
|
||||
case ncclUint8: return 1;
|
||||
case ncclFloat16: return 2;
|
||||
case ncclInt32:
|
||||
case ncclUint32:
|
||||
case ncclFloat32: return 4;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
case ncclFloat64: return 8;
|
||||
default:
|
||||
ROCPROFSYS_CI_ABORT(true, "Unsupported RCCL datatype: %i", datatype);
|
||||
return 0;
|
||||
};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/*
|
||||
* @brief RCCL callback tracing handler
|
||||
*
|
||||
* This function processes RCCL API calls and writes the data transfer size to
|
||||
* the Perfetto counter track.
|
||||
*
|
||||
* @param record The tracing record containing the RCCL API call information.
|
||||
* @param begin_ts The timestamp when the operation started.
|
||||
* @param end_ts The timestamp when the operation ended.
|
||||
*/
|
||||
void
|
||||
tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record,
|
||||
uint64_t begin_ts, uint64_t end_ts)
|
||||
{
|
||||
if(record.kind == ROCPROFILER_CALLBACK_TRACING_RCCL_API)
|
||||
{
|
||||
auto* payload =
|
||||
static_cast<rocprofiler_callback_tracing_rccl_api_data_t*>(record.payload);
|
||||
|
||||
size_t size = 0;
|
||||
bool is_send = false;
|
||||
|
||||
auto set_recv = [&](size_t count, ncclDataType_t _dt) {
|
||||
is_send = false;
|
||||
size = count * rccl_type_size(_dt);
|
||||
};
|
||||
|
||||
auto set_send = [&](size_t count, ncclDataType_t _dt) {
|
||||
is_send = true;
|
||||
size = count * rccl_type_size(_dt);
|
||||
};
|
||||
|
||||
switch(record.operation)
|
||||
{
|
||||
// RCCL Data Receive
|
||||
case ROCPROFILER_RCCL_API_ID_ncclAllGather:
|
||||
set_recv(payload->args.ncclAllGather.sendcount,
|
||||
payload->args.ncclAllGather.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclAllToAll:
|
||||
set_recv(payload->args.ncclAllToAll.count,
|
||||
payload->args.ncclAllToAll.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclAllReduce:
|
||||
set_recv(payload->args.ncclAllReduce.count,
|
||||
payload->args.ncclAllReduce.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclGather:
|
||||
set_recv(payload->args.ncclGather.sendcount,
|
||||
payload->args.ncclGather.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclRecv:
|
||||
set_recv(payload->args.ncclRecv.count, payload->args.ncclRecv.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclReduce:
|
||||
set_recv(payload->args.ncclReduce.count,
|
||||
payload->args.ncclReduce.datatype);
|
||||
break;
|
||||
|
||||
// RCCL Data Send
|
||||
case ROCPROFILER_RCCL_API_ID_ncclBroadcast:
|
||||
set_send(payload->args.ncclBroadcast.count,
|
||||
payload->args.ncclBroadcast.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclReduceScatter:
|
||||
set_send(payload->args.ncclReduceScatter.recvcount,
|
||||
payload->args.ncclReduceScatter.datatype);
|
||||
break;
|
||||
case ROCPROFILER_RCCL_API_ID_ncclSend:
|
||||
set_send(payload->args.ncclSend.count, payload->args.ncclSend.datatype);
|
||||
break;
|
||||
|
||||
default:
|
||||
// Skip other RCCL operations
|
||||
break;
|
||||
}
|
||||
|
||||
if(config::get_use_perfetto() && size > 0)
|
||||
{
|
||||
if(is_send)
|
||||
write_perfetto_counter_track<rccl_send>(size, begin_ts, end_ts);
|
||||
else
|
||||
write_perfetto_counter_track<rccl_recv>(size, begin_ts, end_ts);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocprofiler_sdk
|
||||
} // namespace rocprofsys
|
||||
+12
-26
@@ -1,8 +1,6 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -24,34 +22,22 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/defines.hpp"
|
||||
#include <rocprofiler-sdk/buffer_tracing.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
// #include <rocprofiler-sdk/cxx/hash.hpp>
|
||||
// #include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
// #include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace rcclp
|
||||
namespace rocprofiler_sdk
|
||||
{
|
||||
void
|
||||
configure();
|
||||
tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record,
|
||||
uint64_t begin_ts, uint64_t end_ts);
|
||||
|
||||
void
|
||||
setup();
|
||||
} // namespace rocprofiler_sdk
|
||||
|
||||
void
|
||||
shutdown();
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_RCCL) || \
|
||||
(defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL == 0)
|
||||
inline void
|
||||
configure()
|
||||
{}
|
||||
|
||||
inline void
|
||||
setup()
|
||||
{}
|
||||
|
||||
inline void
|
||||
shutdown()
|
||||
{}
|
||||
#endif
|
||||
} // namespace rcclp
|
||||
} // namespace rocprofsys
|
||||
@@ -1,522 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_H_
|
||||
#define NCCL_H_
|
||||
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
/* Version of the NCCL API described by this header. */
#define NCCL_MAJOR  2
#define NCCL_MINOR  11
#define NCCL_PATCH  4
#define NCCL_SUFFIX ""

/* NCCL_MAJOR/NCCL_MINOR/NCCL_PATCH encoded with NCCL_VERSION() below. */
#define NCCL_VERSION_CODE 21104

/* Encodes a version triple as one integer. Versions with X <= 2 and Y <= 8
 * weight the major number by 1000; all other versions weight it by 10000. */
#define NCCL_VERSION(X, Y, Z)                                                            \
    ((((X) <= 2 && (Y) <= 8) ? 1000 : 10000) * (X) + (Y) *100 + (Z))
|
||||
|
||||
#define RCCL_BFLOAT16 1
|
||||
#define RCCL_GATHER_SCATTER 1
|
||||
#define RCCL_ALLTOALLV 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/*! @brief Opaque handle to communicator */
|
||||
typedef struct ncclComm* ncclComm_t;
|
||||
|
||||
/* Size in bytes of the storage inside ncclUniqueId. */
#define NCCL_UNIQUE_ID_BYTES 128

/*! @brief Fixed-size character buffer holding the id produced by ncclGetUniqueId */
typedef struct
{
    char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
|
||||
|
||||
/*! @brief Error type
 *
 * Result code returned by the functions declared in this header.
 * ncclNumResults is the number of result codes, not a result itself.
 */
typedef enum
{
    ncclSuccess            = 0,
    ncclUnhandledCudaError = 1,
    ncclSystemError        = 2,
    ncclInternalError      = 3,
    ncclInvalidArgument    = 4,
    ncclInvalidUsage       = 5,
    ncclNumResults         = 6
} ncclResult_t;
|
||||
|
||||
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
*
|
||||
* @details This integer is coded with the MAJOR, MINOR and PATCH level of the
|
||||
* NCCL library
|
||||
*/
|
||||
ncclResult_t ncclGetVersion(int* version);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetVersion(int* version);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Generates an ID for ncclCommInitRank
|
||||
|
||||
@details
|
||||
Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
|
||||
called once and the Id should be distributed to all ranks in the
|
||||
communicator before calling ncclCommInitRank.
|
||||
|
||||
@param[out]
|
||||
uniqueId ncclUniqueId*
|
||||
pointer to uniqueId
|
||||
|
||||
*/
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version).
|
||||
|
||||
@details
|
||||
rank must be between 0 and nranks-1 and unique within a communicator clique.
|
||||
Each rank is associated to a CUDA device, which has to be set before calling
|
||||
ncclCommInitRank.
|
||||
ncclCommInitRank implicitly synchronizes with other ranks, so it must be
|
||||
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
|
||||
|
||||
@param[out]
|
||||
comm ncclComm_t*
|
||||
communicator struct pointer
|
||||
*/
|
||||
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
|
||||
int rank);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
|
||||
int rank);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Creates a clique of communicators (single process version).
|
||||
*
|
||||
* @details This is a convenience function to create a single-process communicator
|
||||
* clique. Returns an array of ndev newly initialized communicators in comm. comm
|
||||
* should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is
|
||||
* NULL, the first ndev HIP devices are used. Order of devlist defines user-order of
|
||||
* processors within the communicator.
|
||||
* */
|
||||
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object, but waits for any
|
||||
* operations that might still be running on the device */
|
||||
ncclResult_t ncclCommDestroy(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommDestroy(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Frees resources associated with communicator object and aborts any
|
||||
* operations that might still be running on the device. */
|
||||
ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns a human-readable error message. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
|
||||
/*! @brief Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gets the number of ranks in the communicator clique. */
|
||||
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns the rocm device number associated with the communicator. */
|
||||
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns the user-ordered "rank" associated with the communicator. */
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
/// @endcond
|
||||
|
||||
/* Helper enum: its only use visible here is sizeof(ncclRedOp_dummy_t) in the ncclMaxRedOp initializer below. */
|
||||
/* ncclNumOps_dummy carries the same value (5) as ncclNumOps in ncclRedOp_t. */
|
||||
typedef enum
|
||||
{
|
||||
ncclNumOps_dummy = 5
|
||||
} ncclRedOp_dummy_t;
|
||||
/*! @brief Reduction operation selector */
typedef enum
|
||||
{
|
||||
ncclSum = 0,
|
||||
ncclProd = 1,
|
||||
ncclMax = 2,
|
||||
ncclMin = 3,
|
||||
ncclAvg = 4,
|
||||
/* ncclNumOps: The number of built-in ncclRedOp_t values. Also
|
||||
* serves as the least possible value for dynamic ncclRedOp_t's
|
||||
* as constructed by ncclRedOpCreate*** functions. */
|
||||
ncclNumOps = 5,
|
||||
/* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
|
||||
* It is defined to be the largest signed value (since compilers
|
||||
* are permitted to use signed enums) that won't grow
|
||||
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
|
||||
* maintain ABI compatibility. */
|
||||
ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t))
|
||||
} ncclRedOp_t;
|
||||
|
||||
/*! @brief Data types
* Several enumerators are aliases that share one value: ncclInt8/ncclChar (0),
* ncclInt32/ncclInt (2), ncclFloat16/ncclHalf (6), ncclFloat32/ncclFloat (7) and
* ncclFloat64/ncclDouble (8). ncclNumTypes (10) equals the number of distinct
* values defined before it (0 through 9). */
|
||||
typedef enum
|
||||
{
|
||||
ncclInt8 = 0,
|
||||
ncclChar = 0,
|
||||
ncclUint8 = 1,
|
||||
ncclInt32 = 2,
|
||||
ncclInt = 2,
|
||||
ncclUint32 = 3,
|
||||
ncclInt64 = 4,
|
||||
ncclUint64 = 5,
|
||||
ncclFloat16 = 6,
|
||||
ncclHalf = 6,
|
||||
ncclFloat32 = 7,
|
||||
ncclFloat = 7,
|
||||
ncclFloat64 = 8,
|
||||
ncclDouble = 8,
|
||||
ncclBfloat16 = 9,
|
||||
ncclNumTypes = 10
|
||||
} ncclDataType_t;
|
||||
|
||||
/*! @brief ncclScalarResidence_t: Location and dereferencing logic for scalar arguments.
* Passed as the *residence* argument of ncclRedOpCreatePreMulSum. */
|
||||
typedef enum
|
||||
{
|
||||
/* ncclScalarDevice: The scalar is in device-visible memory and will be
|
||||
* dereferenced while the collective is running. */
|
||||
ncclScalarDevice = 0,
|
||||
|
||||
/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
|
||||
* dereferenced before the ncclRedOpCreate***() function returns. */
|
||||
ncclScalarHostImmediate = 1
|
||||
} ncclScalarResidence_t;
|
||||
|
||||
/*
|
||||
* ncclRedOpCreatePreMulSum
|
||||
*
|
||||
* Creates a new reduction operator which pre-multiplies input values by a given
|
||||
* scalar locally before reducing them with peer values via summation. For use
|
||||
* only with collectives launched against *comm* and *datatype*. The
|
||||
* *residence* argument indicates how/when the memory pointed to by *scalar*
|
||||
* will be dereferenced. Upon return, the newly created operator's handle
|
||||
* is stored in *op*.
|
||||
*/
|
||||
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
|
||||
ncclDataType_t datatype,
|
||||
ncclScalarResidence_t residence,
|
||||
ncclComm_t comm);
|
||||
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
|
||||
ncclDataType_t datatype,
|
||||
ncclScalarResidence_t residence,
|
||||
ncclComm_t comm);
|
||||
|
||||
/*
|
||||
* ncclRedOpDestroy
|
||||
*
|
||||
* Destroys the reduction operator *op*. The operator must have been created by
|
||||
* ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
|
||||
* destroyed as soon as the last NCCL function which is given that operator returns.
|
||||
*/
|
||||
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
|
||||
|
||||
/*
|
||||
* Collective communication operations
|
||||
*
|
||||
* Collective communication operations must be called separately for each
|
||||
* communicator in a communicator clique.
|
||||
*
|
||||
* They return when operations have been enqueued on the HIP stream.
|
||||
*
|
||||
* Since they may perform inter-CPU synchronization, each call has to be done
|
||||
* from a different thread or process, or needs to use Group Semantics (see
|
||||
* below).
|
||||
*/
|
||||
|
||||
/*!
|
||||
* @brief Reduce
|
||||
*
|
||||
* @details Reduces data arrays of length count in sendbuff into recvbuff using op
|
||||
* operation.
|
||||
* recvbuff may be NULL on all calls except for root device.
|
||||
* root is the rank (not the HIP device) where data will reside after the
|
||||
* operation is complete.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief (deprecated) Broadcast (in-place)
|
||||
*
|
||||
* @details Copies count values from root to all other devices.
|
||||
* root is the rank (not the HIP device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* This operation is implicitly in place.
|
||||
*/
|
||||
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Broadcast
|
||||
*
|
||||
* @details Copies count values from root to all other devices.
|
||||
* root is the rank (not the HIP device) where data resides before the
|
||||
* operation is started.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-Reduce
|
||||
*
|
||||
* @details Reduces data arrays of length count in sendbuff using op operation, and
|
||||
* leaves identical copies of result on each recvbuff.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*!
|
||||
* @brief Reduce-Scatter
|
||||
*
|
||||
* @details Reduces data in sendbuff using op operation and leaves reduced result
|
||||
* scattered over the devices so that recvbuff on rank i will contain the i-th
|
||||
* block of the result.
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
* should have a size of at least nranks*recvcount elements.
|
||||
*
|
||||
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
||||
*/
|
||||
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, ncclRedOp_t op,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-Gather
|
||||
*
|
||||
* @details Each device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
*
|
||||
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
||||
*/
|
||||
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Send
|
||||
*
|
||||
* @details Send data from sendbuff to rank peer.
|
||||
* Rank peer needs to call ncclRecv with the same datatype and the same count from
|
||||
* this rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
|
||||
* operations need to progress concurrently to complete, they must be fused within a
|
||||
* ncclGroupStart/ ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Receive
|
||||
*
|
||||
* @details Receive data from rank peer into recvbuff.
|
||||
* Rank peer needs to call ncclSend with the same datatype and the same count to this
|
||||
* rank.
|
||||
*
|
||||
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
|
||||
* operations need to progress concurrently to complete, they must be fused within a
|
||||
* ncclGroupStart/ ncclGroupEnd section.
|
||||
*/
|
||||
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
|
||||
ncclComm_t comm, hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
int peer, ncclComm_t comm, hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Gather
|
||||
*
|
||||
* @details Root device gathers sendcount values from other GPUs into recvbuff,
|
||||
* receiving data from rank i at offset i*sendcount.
|
||||
*
|
||||
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
|
||||
* should have a size of at least nranks*sendcount elements.
|
||||
*
|
||||
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
|
||||
*/
|
||||
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Scatter
|
||||
*
|
||||
* @details Scatters data over the devices so that recvbuff on rank i will contain the
|
||||
* i-th block of the data on root.
|
||||
*
|
||||
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
|
||||
* should have a size of at least nranks*recvcount elements.
|
||||
*
|
||||
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
|
||||
*/
|
||||
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
|
||||
ncclDataType_t datatype, int root, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-To-All
|
||||
*
|
||||
* @details Device (i) sends its (j)th block of data to device (j), where it is placed as the (i)th
|
||||
* block. Each block for sending/receiving has count elements, which means
|
||||
* that recvbuff and sendbuff should have a size of nranks*count elements.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief All-To-Allv
|
||||
*
|
||||
* @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
|
||||
* to device (j). At the same time, device (i) receives recvcounts[j] of data
|
||||
* from device (j) to be placed at rdispls[j].
|
||||
|
||||
* sendcounts, sdispls, recvcounts and rdispls are all measured in the units
|
||||
* of datatype, not bytes.
|
||||
*
|
||||
* In-place operation will happen if sendbuff == recvbuff.
|
||||
*/
|
||||
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void* recvbuff,
|
||||
const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
|
||||
const size_t sdispls[], void* recvbuff,
|
||||
const size_t recvcounts[], const size_t rdispls[],
|
||||
ncclDataType_t datatype, ncclComm_t comm,
|
||||
hipStream_t stream);
|
||||
/// @endcond
|
||||
|
||||
/*
|
||||
* Group semantics
|
||||
*
|
||||
* When managing multiple GPUs from a single thread, and since NCCL collective
|
||||
* calls may perform inter-CPU synchronization, we need to "group" calls for
|
||||
* different ranks/devices into a single call.
|
||||
*
|
||||
* Grouping NCCL calls as being part of the same collective operation is done
|
||||
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
|
||||
* collective calls until the ncclGroupEnd call, which will wait for all calls
|
||||
* to be complete. Note that for collective communication, ncclGroupEnd only
|
||||
* guarantees that the operations are enqueued on the streams, not that
|
||||
* the operation is effectively done.
|
||||
*
|
||||
* Both collective communication and ncclCommInitRank can be used in conjunction
|
||||
* with ncclGroupStart/ncclGroupEnd, but not together.
|
||||
*
|
||||
* Group semantics also allow fusing multiple operations on the same device
|
||||
* to improve performance (for aggregated collective calls), or to permit
|
||||
* concurrent progress of multiple send/receive operations.
|
||||
*/
|
||||
|
||||
/*! @brief Group Start
|
||||
*
|
||||
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
|
||||
* a single NCCL operation. Nothing will be started on the HIP stream until
|
||||
* ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupStart();
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupStart();
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Group End
|
||||
*
|
||||
* End a group call. Start a fused NCCL operation consisting of all calls since
|
||||
* ncclGroupStart. Operations on the HIP stream depending on the NCCL operations
|
||||
* need to be called after ncclGroupEnd.
|
||||
*/
|
||||
ncclResult_t ncclGroupEnd();
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclGroupEnd();
|
||||
/// @endcond
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // end extern "C"
|
||||
#endif
|
||||
|
||||
#endif // end include guard
|
||||
@@ -30,12 +30,15 @@ foreach(_TARGET ${RCCL_TEST_TARGETS})
|
||||
string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}")
|
||||
string(REPLACE "_" "-" _NAME "${_NAME}")
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_RUNTIME
|
||||
NAME rccl-test-${_NAME}
|
||||
TARGET ${_TARGET}
|
||||
LABELS "rccl-tests;rcclp"
|
||||
MPI ON
|
||||
GPU ON
|
||||
NUM_PROCS 1
|
||||
SAMPLING_TIMEOUT 300
|
||||
REWRITE_TIMEOUT 300
|
||||
REWRITE_ARGS
|
||||
-e
|
||||
-v
|
||||
|
||||
@@ -166,9 +166,10 @@ set(_rccl_environment
|
||||
"ROCPROFSYS_PROFILE=ON"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
|
||||
"ROCPROFSYS_USE_RCCLP=ON"
|
||||
"ROCPROFSYS_TIME_OUTPUT=OFF"
|
||||
"ROCPROFSYS_USE_PID=OFF"
|
||||
"ROCPROFSYS_USE_RCCLP=ON"
|
||||
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy"
|
||||
"${_test_openmp_env}"
|
||||
"${_test_library_path}")
|
||||
|
||||
@@ -466,8 +467,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
|
||||
cmake_parse_arguments(
|
||||
TEST "SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME"
|
||||
"NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" "${_KWARGS}"
|
||||
${ARGN})
|
||||
"NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT"
|
||||
"${_KWARGS}" ${ARGN})
|
||||
|
||||
foreach(_PREFIX SAMPLING RUNTIME REWRITE REWRITE_RUN BASELINE)
|
||||
if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "")
|
||||
|
||||
Ссылка в новой задаче
Block a user