Use rocprofiler-sdk for RCCL-API tracing (#126)

- Add support for RCCL API tracing through rocprofiler-sdk.
- Refactored the comm_data code to use the SDK RCCL_API callbacks.
- Add a runtime version check for SDK to gate callback enablement, rather than just the compile-time check.
- Fixed: SAMPLING_TIMEOUT was not being handled correctly in add_test.
Этот коммит содержится в:
David Galiffi
2025-06-06 11:36:17 -04:00
коммит произвёл GitHub
родитель c5507e3740
Коммит af77d93f75
31 изменённых файлов: 350 добавлений и 1252 удалений
-7
Просмотреть файл
@@ -179,7 +179,6 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI support" OFF)
rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON)
rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI"
ON)
rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support" OFF)
rocprofiler_systems_add_option(
ROCPROFSYS_USE_MPI_HEADERS
"Enable wrapping MPI functions w/o enabling MPI dependency" ON)
@@ -209,12 +208,6 @@ elseif("$ENV{ROCPROFSYS_CI}")
endif()
endif()
if(NOT ROCPROFSYS_USE_ROCM)
set(ROCPROFSYS_USE_RCCL
OFF
CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE)
endif()
if(ROCPROFSYS_BUILD_TESTING)
set(ROCPROFSYS_BUILD_EXAMPLES
ON
-94
Просмотреть файл
@@ -1,94 +0,0 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file
# Copyright.txt or https://cmake.org/licensing for details.
include(FindPackageHandleStandardArgs)
# ----------------------------------------------------------------------------------------#
# Location of the RCCL header copy inside the project source tree. It is used
# below as the fallback include directory when no usable installed RCCL header
# is found. Stored as a cache PATH entry.
set(RCCL-Headers_INCLUDE_DIR_INTERNAL
"${PROJECT_SOURCE_DIR}/source/lib/rocprof-sys/library/tpls/rccl"
CACHE PATH "Path to internal rccl.h")
# ----------------------------------------------------------------------------------------#
# Fall back to the ROCM_PATH environment variable when the CMake variable of the
# same name is unset/false and the environment provides a non-empty value.
if(NOT ROCM_PATH)
    if(NOT "$ENV{ROCM_PATH}" STREQUAL "")
        set(ROCM_PATH "$ENV{ROCM_PATH}")
    endif()
endif()

# Build the list of search prefixes for the rccl package: every candidate that
# exists on disk is appended to _RCCL_PATHS with symlinks resolved.
foreach(_DIR IN ITEMS ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl)
    if(NOT EXISTS "${_DIR}")
        continue()
    endif()
    get_filename_component(_ABS_DIR "${_DIR}" REALPATH)
    list(APPEND _RCCL_PATHS "${_ABS_DIR}")
endforeach()
# ----------------------------------------------------------------------------------------#
# Try to locate an installed rccl CMake package (config mode only, no output),
# using the prefixes collected in _RCCL_PATHS both as hints and as fallback paths.
find_package(rccl QUIET CONFIG
    HINTS ${_RCCL_PATHS}
    PATHS ${_RCCL_PATHS}
    PATH_SUFFIXES rccl/lib/cmake)

# Pick the header directory: the one reported by the rccl package when it was
# found, otherwise the copy inside the project. The cache entry is written
# without FORCE, so a value already in the cache is kept.
if(rccl_FOUND)
    set(RCCL-Headers_INCLUDE_DIR
        "${rccl_INCLUDE_DIR}"
        CACHE PATH "Path to RCCL headers")
else()
    set(RCCL-Headers_INCLUDE_DIR
        "${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
        CACHE PATH "Path to RCCL headers")
endif()
# The expected header location depends on the ROCm version: for versions below
# 5.2.0 the header is looked up as "rccl.h", otherwise (or when the version is
# unknown) as "rccl/rccl.h". This distinction exists because of a warning that
# started appearing with v5.2.0.
set(_RCCL-Headers_FILE "rccl/rccl.h")
set(_RCCL-Headers_DIR "")
if(ROCmVersion_NUMERIC_VERSION AND ROCmVersion_NUMERIC_VERSION LESS 50200)
    set(_RCCL-Headers_FILE "rccl.h")
    set(_RCCL-Headers_DIR "/rccl")
endif()

# When the expected header is missing from the selected directory, warn and
# force the cache entry over to the internal header directory (plus the
# version-dependent sub-directory).
if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}")
    rocprofiler_systems_message(
        AUTHOR_WARNING
        "RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}"
    )
    set(RCCL-Headers_INCLUDE_DIR
        "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}"
        CACHE PATH "Path to RCCL headers" FORCE)
endif()

# the helper variables are not needed past this point
unset(_RCCL-Headers_DIR)
unset(_RCCL-Headers_FILE)
# Hide the cache entry from the default (non-advanced) cache view.
mark_as_advanced(RCCL-Headers_INCLUDE_DIR)
# ----------------------------------------------------------------------------------------#
# Sets RCCL-Headers_FOUND (and handles QUIET/REQUIRED) based on whether the
# include directory variable holds a valid value; default failure message.
find_package_handle_standard_args(RCCL-Headers
                                  REQUIRED_VARS RCCL-Headers_INCLUDE_DIR)
# ------------------------------------------------------------------------------#
# Expose the located RCCL headers through imported interface targets:
#   roc::rccl-headers          - carries the include directory (marked SYSTEM)
#   RCCL-Headers::RCCL-Headers - forwards to roc::rccl-headers
#
# Each target is created only when it does not exist yet. Without the guards, a
# second find_package(RCCL-Headers) issued where the targets are already visible
# would fail, because add_library() refuses to create an imported target whose
# name is already taken.
if(RCCL-Headers_FOUND)
    set(RCCL-Headers_INCLUDE_DIRS "${RCCL-Headers_INCLUDE_DIR}")

    if(NOT TARGET roc::rccl-headers)
        add_library(roc::rccl-headers INTERFACE IMPORTED)
        target_include_directories(roc::rccl-headers SYSTEM
                                   INTERFACE "${RCCL-Headers_INCLUDE_DIR}")
    endif()

    if(NOT TARGET RCCL-Headers::RCCL-Headers)
        add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED)
        target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers)
    endif()
endif()
# ------------------------------------------------------------------------------#
-17
Просмотреть файл
@@ -17,9 +17,6 @@ rocprofiler_systems_add_interface_library(
"Provides flags and libraries for Dyninst (dynamic instrumentation)")
rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm
"Provides flags and libraries for ROCm")
rocprofiler_systems_add_interface_library(
rocprofiler-systems-rccl
"Provides flags for ROCm Communication Collectives Library (RCCL)")
rocprofiler_systems_add_interface_library(rocprofiler-systems-mpi
"Provides MPI or MPI headers")
rocprofiler_systems_add_interface_library(rocprofiler-systems-libva
@@ -47,7 +44,6 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition
# libraries with relevant compile definitions
set(ROCPROFSYS_EXTENSION_LIBRARIES
rocprofiler-systems::rocprofiler-systems-rocm
rocprofiler-systems::rocprofiler-systems-rccl
rocprofiler-systems::rocprofiler-systems-bfd
rocprofiler-systems::rocprofiler-systems-mpi
rocprofiler-systems::rocprofiler-systems-ptl
@@ -185,19 +181,6 @@ if(ROCPROFSYS_USE_ROCM)
target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi)
endif()
# ----------------------------------------------------------------------------------------#
#
# RCCL
#
# ----------------------------------------------------------------------------------------#
if(ROCPROFSYS_USE_RCCL)
find_package(RCCL-Headers ${rocprofiler_systems_FIND_QUIETLY} REQUIRED)
target_link_libraries(rocprofiler-systems-rccl INTERFACE roc::rccl-headers)
rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rccl
INTERFACE ROCPROFSYS_USE_RCCL)
endif()
# ----------------------------------------------------------------------------------------#
#
# MPI
-4
Просмотреть файл
@@ -834,10 +834,6 @@ parse_args(int argc, char** argv, std::vector<char*>& _env,
(defined(ROCPROFSYS_USE_MPI_HEADERS) && ROCPROFSYS_USE_MPI_HEADERS > 0)
add_default_env(_env, "ROCPROFSYS_USE_MPIP", true);
#endif
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true);
#endif
}
_fill("ROCPROFSYS_CAUSAL_BINARY_EXCLUDE", _binary_excludes, _generate_configs);
+5 -10
Просмотреть файл
@@ -730,11 +730,10 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
}
});
std::set<std::string> _backend_choices = {
"all", "kokkosp", "mpip", "ompt",
"rcclp", "amd-smi", "mutex-locks", "spin-locks",
"rw-locks", "rocprofiler-sdk", "rocm"
};
std::set<std::string> _backend_choices = { "all", "kokkosp", "mpip",
"ompt", "rcclp", "amd-smi",
"mutex-locks", "spin-locks", "rw-locks",
"rocm" };
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
_backend_choices.erase("mpip");
@@ -744,14 +743,10 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
_backend_choices.erase("ompt");
#endif
#if !defined(ROCPROFSYS_USE_RCCL)
_backend_choices.erase("rcclp");
#endif
#if !defined(ROCPROFSYS_USE_ROCM)
_backend_choices.erase("rocm");
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rcclp");
#endif
parser.start_group("BACKEND OPTIONS",
-1
Просмотреть файл
@@ -44,7 +44,6 @@ target_link_libraries(
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-libva>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-ptl>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rocm>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-rccl>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libgcc-optional>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-static-libstdcxx-optional>
$<BUILD_INTERFACE:rocprofiler-systems::rocprofiler-systems-sanitizer>
-1
Просмотреть файл
@@ -35,7 +35,6 @@ set(core_headers
${CMAKE_CURRENT_LIST_DIR}/mproc.hpp
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp
${CMAKE_CURRENT_LIST_DIR}/rccl.hpp
${CMAKE_CURRENT_LIST_DIR}/redirect.hpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
${CMAKE_CURRENT_LIST_DIR}/state.hpp
+4 -14
Просмотреть файл
@@ -562,10 +562,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_data.processed_environs.emplace("periods");
}
strset_t _backend_choices = {
"all", "kokkosp", "mpip", "ompt", "rcclp",
"amd-smi", "rocprofiler-sdk", "mutex-locks", "spin-locks", "rw-locks"
};
strset_t _backend_choices = { "all", "kokkosp", "mpip", "ompt",
"rcclp", "amd-smi", "rocm", "mutex-locks",
"spin-locks", "rw-locks" };
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
_backend_choices.erase("mpip");
@@ -575,14 +574,10 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_backend_choices.erase("ompt");
#endif
#if !defined(ROCPROFSYS_USE_RCCL)
_backend_choices.erase("rcclp");
#endif
#if !defined(ROCPROFSYS_USE_ROCM)
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rocm");
_backend_choices.erase("rcclp");
#endif
if(gpu::device_count() == 0)
@@ -590,13 +585,8 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
// remove GPU-specific backends
_backend_choices.erase("rcclp");
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rocm");
#if defined(ROCPROFSYS_USE_RCCL)
update_env(_data, "ROCPROFSYS_USE_RCCLP", false);
#endif
#if defined(ROCPROFSYS_USE_ROCM)
update_env(_data, "ROCPROFSYS_USE_AMD_SMI", false);
update_env(_data, "ROCPROFSYS_USE_ROCM", false);
+2
Просмотреть файл
@@ -102,6 +102,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGOR
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm RocDecode API")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocjpeg_api, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, "rocm_rocjpeg_api", "ROCm RocJPEG API")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl_api, ROCPROFSYS_CATEGORY_ROCM_RCCL_API, "rocm_rccl_api", "ROCm RCCL API")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "amd_smi", "AMD-SMI data")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_gfx_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, "device_busy_gfx", "Busy percentage of GFX engine on a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_umc_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, "device_busy_umc", "Busy percentage of UMC engin on a GPU device")
@@ -171,6 +172,7 @@ using name = perfetto_category<Tp...>;
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocdecode_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocjpeg_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl_api), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_gfx_busy), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_umc_busy), \
+8
Просмотреть файл
@@ -1359,6 +1359,14 @@ configure_disabled_settings(const std::shared_ptr<settings>& _config)
_config->find("ROCPROFSYS_USE_AMD_SMI")->second->set_hidden(true);
for(const auto& itr : _config->disable_category("amd_smi"))
_config->find(itr)->second->set_hidden(true);
_config->find("ROCPROFSYS_USE_RCCLP")->second->set_hidden(true);
for(const auto& itr : _config->disable_category("rcclp"))
_config->find(itr)->second->set_hidden(true);
_config->find("ROCPROFSYS_USE_ROCM")->second->set_hidden(true);
for(const auto& itr : _config->disable_category("rocm"))
_config->find(itr)->second->set_hidden(true);
#endif
#if defined(ROCPROFSYS_USE_OMPT) || ROCPROFSYS_USE_OMPT == 0
-29
Просмотреть файл
@@ -1,29 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/defines.hpp"
#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0
# include <rccl/rccl.h>
#endif
+64 -13
Просмотреть файл
@@ -218,6 +218,30 @@ get_operations_impl(const std::unordered_set<int32_t>& _complete,
} // namespace
/// @brief Query the rocprofiler-sdk version and cache it in a function-local static
/// @return Reference to the cached version info; every field is 0 as long as no
///         non-zero version has been obtained from the SDK
version_info&
get_version()
{
    static auto _cached = version_info{ 0 };

    // a non-zero encoded version means an earlier call already filled the cache
    if(_cached.formatted != 0) return _cached;

    uint32_t _maj = 0;
    uint32_t _min = 0;
    uint32_t _pat = 0;
    ROCPROFILER_CALL(rocprofiler_get_version(&_maj, &_min, &_pat));

    _cached.major = _maj;
    _cached.minor = _min;
    _cached.patch = _pat;
    // single-integer encoding: major * 10000 + minor * 100 + patch
    _cached.formatted = (_maj * 10000) + (_min * 100) + _pat;

    return _cached;
}
void
config_settings(const std::shared_ptr<settings>& _config)
{
@@ -319,6 +343,7 @@ config_settings(const std::shared_ptr<settings>& _config)
join::join(join::array_config{ ", ", "", "" }, _domain_choices));
auto _domain_defaults = std::string{ "hip_runtime_api,marker_api,kernel_dispatch,"
"memory_copy,scratch_memory" };
# if(ROCPROFILER_VERSION < 10000)
_domain_defaults.append(",page_migration");
# endif
@@ -353,28 +378,48 @@ std::unordered_set<rocprofiler_callback_tracing_kind_t>
get_callback_domains()
{
const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names();
const auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>
{
auto supported = std::unordered_set<rocprofiler_callback_tracing_kind_t>{
ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API,
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
# if(ROCPROFILER_VERSION >= 700)
ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API,
ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API,
# endif
ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
};
auto _version = get_version();
ROCPROFSYS_WARNING_IF(_version.formatted == 0,
"Warning! rocprofiler-sdk version not initialized\n");
# if(ROCPROFILER_VERSION >= 600)
if(_version.formatted >= 600)
{
// Argument tracing is supported in rocprofiler-sdk 0.6.0 and later
supported.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API);
supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API);
}
# endif
# if(ROCPROFILER_VERSION >= 700)
if(_version.formatted >= 700)
{
supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API);
}
# endif
auto _data = std::unordered_set<rocprofiler_callback_tracing_kind_t>{};
auto _domains =
tim::delimit(config::get_setting_value<std::string>("ROCPROFSYS_ROCM_DOMAINS")
.value_or(std::string{}),
" ,;:\t\n");
if(config::get_use_rcclp() && _version.formatted >= 600)
{
// Translate ROCPROFSYS_USE_RCCLP to entry in ROCPROFSYS_ROCM_DOMAINS
_data.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API);
}
const auto valid_choices =
settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices();
@@ -582,6 +627,12 @@ void
config_settings(const std::shared_ptr<settings>&)
{}
/// @brief Variant of get_version() that never queries the SDK
/// @return Reference to a static version_info whose fields are all 0
// NOTE(review): presumably the definition compiled when rocprofiler-sdk support
// is unavailable - confirm against the enclosing preprocessor guard.
version_info&
get_version()
{
    static auto _unset = version_info{ 0 };
    return _unset;
}
} // namespace rocprofiler_sdk
} // namespace rocprofsys
+11
Просмотреть файл
@@ -39,9 +39,20 @@ namespace rocprofsys
{
namespace rocprofiler_sdk
{
/// Version triple plus a single-integer encoding of the same version.
/// All members default to 0.
struct version_info
{
    uint32_t major{ 0 };
    uint32_t minor{ 0 };
    uint32_t patch{ 0 };
    /// Encoded as major * 10000 + minor * 100 + patch
    uint32_t formatted{ 0 };
};
void
config_settings(const std::shared_ptr<settings>&);
version_info&
get_version();
#if defined(ROCPROFSYS_USE_ROCM)
std::unordered_set<rocprofiler_callback_tracing_kind_t>
+1
Просмотреть файл
@@ -54,6 +54,7 @@ extern "C"
ROCPROFSYS_CATEGORY_ROCM_MARKER_API,
ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API,
ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API,
ROCPROFSYS_CATEGORY_ROCM_RCCL_API,
ROCPROFSYS_CATEGORY_AMD_SMI,
ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX,
ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC,
+1 -1
Просмотреть файл
@@ -122,7 +122,7 @@ rocprofsys_init_library(void)
extern "C" void
rocprofsys_init_tooling(void)
{
rocprofsys_init_tooling_hidden(true);
rocprofsys_init_tooling_hidden();
}
extern "C" void
+1 -1
Просмотреть файл
@@ -95,7 +95,7 @@ extern "C"
// these are the real implementations for internal calling convention
void rocprofsys_init_library_hidden(void) ROCPROFSYS_HIDDEN_API;
bool rocprofsys_init_tooling_hidden(bool postinit = false) ROCPROFSYS_HIDDEN_API;
bool rocprofsys_init_tooling_hidden(void) ROCPROFSYS_HIDDEN_API;
void rocprofsys_init_hidden(const char*, bool, const char*) ROCPROFSYS_HIDDEN_API;
void rocprofsys_finalize_hidden(void) ROCPROFSYS_HIDDEN_API;
void rocprofsys_reset_preload_hidden(void) ROCPROFSYS_HIDDEN_API;
+3 -37
Просмотреть файл
@@ -53,7 +53,6 @@
#include "library/ompt.hpp"
#include "library/process_sampler.hpp"
#include "library/ptl.hpp"
#include "library/rcclp.hpp"
#include "library/rocprofiler-sdk.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
@@ -404,42 +403,16 @@ rocprofsys_init_library_hidden()
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n");
}
// One-time RCCLP setup. The setup runs only when every condition below holds:
// - postinit is true            - so the code doesn't hang at the initialization stage
// - get_state() >= State::Init  - so the code doesn't throw an exception
// - no earlier call did the setup - so we don't try to initialize RCCL twice
// - get_use_rcclp() is true     - only if the environment is configured to use RCCL
static void
rccl_setup(bool postinit)
{
    // Remembers whether a previous call already performed the setup
    static bool _rccl_ready = false;

    // checks are evaluated in this order and stop at the first one that fails
    if(!postinit) return;
    if(!(get_state() >= State::Init)) return;
    if(_rccl_ready) return;
    if(!get_use_rcclp()) return;

    ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n");
    rcclp::setup();
    _rccl_ready = true;
}
// Runs the regular library initialization and afterwards hands postinit to
// rccl_setup(), which decides on its own whether the RCCLP setup is performed.
static void
rocprofsys_init_library_hidden_with_rccl(bool postinit)
{
rocprofsys_init_library_hidden();
rccl_setup(postinit);
}
//======================================================================================//
extern "C" bool
rocprofsys_init_tooling_hidden(bool postinit)
rocprofsys_init_tooling_hidden(void)
{
if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true;
if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true))
{
rocprofsys_init_library_hidden_with_rccl(postinit);
rocprofsys_init_library_hidden();
return false;
}
@@ -458,7 +431,6 @@ rocprofsys_init_tooling_hidden(bool postinit)
if(get_state() != State::PreInit || get_state() == State::Init || _once)
{
rccl_setup(postinit);
return false;
}
_once = true;
@@ -481,7 +453,7 @@ rocprofsys_init_tooling_hidden(bool postinit)
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init,
"Calling rocprofsys_init_library()...\n");
rocprofsys_init_library_hidden_with_rccl(postinit);
rocprofsys_init_library_hidden();
ROCPROFSYS_DEBUG_F("\n");
@@ -807,12 +779,6 @@ rocprofsys_finalize_hidden(void)
component::vaapi_gotcha::shutdown();
}
if(get_use_rcclp())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down RCCLP...\n");
rcclp::shutdown();
}
if(get_use_ompt())
{
ROCPROFSYS_VERBOSE_F(1, "Shutting down OMPT...\n");
-7
Просмотреть файл
@@ -20,7 +20,6 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/process_sampler.hpp
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
${CMAKE_CURRENT_LIST_DIR}/ptl.hpp
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
${CMAKE_CURRENT_LIST_DIR}/rocm.hpp
${CMAKE_CURRENT_LIST_DIR}/amd_smi.hpp
${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp
@@ -34,11 +33,6 @@ set(library_headers
target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources}
${library_headers})
if(ROCPROFSYS_USE_RCCL)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
endif()
if(ROCPROFSYS_USE_ROCM)
target_sources(
rocprofiler-systems-object-library
@@ -56,7 +50,6 @@ add_subdirectory(tracing)
set(ndebug_sources
${CMAKE_CURRENT_LIST_DIR}/components/mpi_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/components/backtrace_metrics.cpp
${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp
${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp
${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp
${CMAKE_CURRENT_LIST_DIR}/ompt.cpp)
-6
Просмотреть файл
@@ -30,15 +30,9 @@ set(component_headers
${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/vaapi_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp
${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp)
target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources}
${component_headers})
if(ROCPROFSYS_USE_RCCL)
target_sources(rocprofiler-systems-object-library
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp)
endif()
+3
Просмотреть файл
@@ -298,6 +298,8 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sen
#endif
#if defined(ROCPROFSYS_USE_RCCL)
// Kept for reference, but now gathered through the SDK callbacks.
// ncclReduce
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
@@ -403,6 +405,7 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const v
}
// ncclAllGather
// ncclAllToAll
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
-62
Просмотреть файл
@@ -26,7 +26,6 @@
#include "core/common.hpp"
#include "core/components/fwd.hpp"
#include "core/defines.hpp"
#include "core/rccl.hpp"
#include "core/timemory.hpp"
#include "library/components/category_region.hpp"
@@ -78,18 +77,6 @@ struct comm_data : base<comm_data, void>
static constexpr auto label = "MPI Comm Send";
};
struct rccl_recv
{
static constexpr auto value = "comm_data";
static constexpr auto label = "RCCL Comm Recv";
};
struct rccl_send
{
static constexpr auto value = "comm_data";
static constexpr auto label = "RCCL Comm Send";
};
ROCPROFSYS_DEFAULT_OBJECT(comm_data)
static void preinit();
@@ -148,55 +135,6 @@ struct comm_data : base<comm_data, void>
MPI_Datatype recvtype, MPI_Comm);
#endif
#if defined(ROCPROFSYS_USE_RCCL)
// Maps an RCCL datatype to its element size in bytes.
// Returns 1, 2, 4 or 8 for the enumerators listed below and 0 for anything else.
// NOTE(review): datatypes not listed here (e.g. a bfloat16 enumerator, if the
// RCCL header in use defines one) report a size of 0 - confirm this is intended.
static auto rccl_type_size(ncclDataType_t datatype)
{
    int _nbytes = 0;
    switch(datatype)
    {
        case ncclInt8:
        case ncclUint8: _nbytes = 1; break;
        case ncclFloat16: _nbytes = 2; break;
        case ncclInt32:
        case ncclUint32:
        case ncclFloat32: _nbytes = 4; break;
        case ncclInt64:
        case ncclUint64:
        case ncclFloat64: _nbytes = 8; break;
        default: break;
    }
    return _nbytes;
}
// ncclReduce
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
ncclComm_t, hipStream_t);
// ncclSend
// ncclGather
// ncclBcast
// ncclRecv
static void audit(const gotcha_data& _data, audit::incoming, const void*,
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
hipStream_t);
// ncclBroadcast
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
hipStream_t);
// ncclAllReduce
// ncclReduceScatter
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
hipStream_t);
// ncclAllGather
// ncclAllToAll
static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);
#endif
private:
static auto& add(tracker_t& _t, data_type value)
{
-195
Просмотреть файл
@@ -1,195 +0,0 @@
// MIT License
//
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/rcclp.hpp"
#include "library/rcclp.hpp"
#include <timemory/manager.hpp>
/// Streams every element of the id's `internal` member, in order, with no
/// separator between elements.
std::ostream&
operator<<(std::ostream& _os, const ncclUniqueId& _v)
{
    for(const auto _elem : _v.internal)
    {
        _os << _elem;
    }
    return _os;
}
namespace rocprofsys
{
namespace component
{
/// Creates and starts the rcclp handle on first use and registers a matching
/// cleanup callback with the timemory manager.
/// @return 1 when this call created and started the handle, 0 when a handle
///         already existed and nothing was done
uint64_t
activate_rcclp()
{
    using handle_t = tim::component::rcclp_handle;

    static auto _handle = std::shared_ptr<handle_t>{};

    // a handle is already held: nothing to activate
    if(_handle) return 0;

    _handle = std::make_shared<handle_t>();
    _handle->start();

    // stops and releases the handle; a no-op once the handle has been reset
    auto _on_cleanup = [=]() {
        if(!_handle) return;
        _handle->stop();
        _handle.reset();
    };

    // key under which the cleanup is registered with the manager
    std::stringstream _key;
    _key << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
         << demangle<category::rocm_rccl>();
    tim::manager::instance()->add_cleanup(_key.str(), _on_cleanup);

    return 1;
}
}
//
//======================================================================================//
//
/// Asks the timemory manager to run the cleanup stored under the rcclp key.
/// @param id  a non-zero value triggers the cleanup; 0 does nothing
/// @return 0 after the cleanup was requested, 1 when id was 0
uint64_t
deactivate_rcclp(uint64_t id)
{
    // nothing was activated for an id of zero
    if(id == 0) return 1;

    std::stringstream _key;
    _key << "timemory-rcclp-" << demangle<rccl_toolset_t>() << "-"
         << demangle<category::rocm_rccl>();
    tim::manager::instance()->cleanup(_key.str());

    return 0;
}
//
//======================================================================================//
//
/// Installs the gotcha initializer and the permit/reject list providers for the
/// RCCL wrappers. Only the first invocation has an effect; subsequent calls
/// return immediately.
/// @param permit  entries for the permit list; entries from the
///                ROCPROFSYS_RCCLP_PERMIT_LIST environment variable are added
/// @param reject  entries for the reject list; entries from the
///                ROCPROFSYS_RCCLP_REJECT_LIST environment variable are added
void
configure_rcclp(const std::set<std::string>& permit, const std::set<std::string>& reject)
{
    static bool is_initialized = false;
    if(is_initialized) return;

    // generate the gotcha wrappers
    rcclp_gotcha_t::get_initializer() = []() {
        // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion);
        // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll);
        TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv);
        // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum);
        // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy);
    };

    // reject list: the entries passed in plus those from the environment,
    // which is read each time the provider is invoked
    rcclp_gotcha_t::get_reject_list() = [reject]() {
        auto _merged = reject;
        auto _from_env =
            tim::get_env<std::string>("ROCPROFSYS_RCCLP_REJECT_LIST", "");
        for(const auto& _entry : tim::delimit(_from_env))
        {
            _merged.insert(_entry);
        }
        return _merged;
    };

    // permit list: the entries passed in plus those from the environment,
    // which is read each time the provider is invoked
    rcclp_gotcha_t::get_permit_list() = [permit]() {
        auto _merged = permit;
        auto _from_env =
            tim::get_env<std::string>("ROCPROFSYS_RCCLP_PERMIT_LIST", "");
        for(const auto& _entry : tim::delimit(_from_env))
        {
            _merged.insert(_entry);
        }
        return _merged;
    };

    is_initialized = true;
}
/// Increments the tool counter; the call that moves it away from 0 creates the
/// shared tool instance and starts it.
void
rcclp_handle::start()
{
    // fetch_add yields the value held before the increment (same as post-increment)
    const auto _previous = get_tool_count().fetch_add(1);
    if(_previous != 0) return;

    get_tool_instance() = std::make_shared<rcclp_tuple_t>("timemory_rcclp");
    get_tool_instance()->start();
}
/// Decrements the tool counter and, if a tool instance exists, stops it; the
/// instance is released when the counter has reached 0.
// NOTE(review): the instance is stopped on every call, not only when the counter
// reaches 0, whereas start() only starts it on the first call - confirm intended.
void
rcclp_handle::stop()
{
    // value of the counter after the decrement (same as pre-decrement)
    const auto _remaining = get_tool_count().fetch_sub(1) - 1;

    if(!get_tool_instance().get()) return;

    get_tool_instance()->stop();
    if(_remaining == 0)
    {
        get_tool_instance().reset();
    }
}
// Returns the single persistent_data object shared by all accessors below;
// it is a function-local static, created on first call.
rcclp_handle::persistent_data&
rcclp_handle::get_persistent_data()
{
static persistent_data _instance;
return _instance;
}
// Accessor for the m_configured member of the shared persistent data.
std::atomic<short>&
rcclp_handle::get_configured()
{
return get_persistent_data().m_configured;
}
// Accessor for the shared tool pointer (m_tool) of the persistent data.
rcclp_handle::toolset_ptr_t&
rcclp_handle::get_tool_instance()
{
return get_persistent_data().m_tool;
}
// Accessor for the counter (m_count) of the persistent data.
std::atomic<int64_t>&
rcclp_handle::get_tool_count()
{
return get_persistent_data().m_count;
}
} // namespace component
} // namespace rocprofsys
-106
Просмотреть файл
@@ -1,106 +0,0 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "core/common.hpp"
#include "core/components/fwd.hpp"
#include "core/defines.hpp"
#include "core/rccl.hpp"
#include "core/timemory.hpp"
#include "library/components/category_region.hpp"
#include "library/components/comm_data.hpp"
#include <timemory/api/macros.hpp>
#include <timemory/components/macros.hpp>
#include <atomic>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <utility>
#if !defined(ROCPROFSYS_NUM_RCCLP_WRAPPERS)
# define ROCPROFSYS_NUM_RCCLP_WRAPPERS 25
#endif
ROCPROFSYS_COMPONENT_ALIAS(
rccl_toolset_t,
::tim::component_bundle<category::rocm_rccl,
rocprofsys::component::category_region<category::rocm_rccl>,
comm_data>)
ROCPROFSYS_COMPONENT_ALIAS(rcclp_gotcha_t,
::tim::component::gotcha<ROCPROFSYS_NUM_RCCLP_WRAPPERS,
rccl_toolset_t, category::rocm_rccl>)
// Mark the RCCL gotcha bundle as unavailable when RCCL support is compiled out.
// A macro that is defined but set to 0 also counts as "disabled", which matches
// the guard used for the inline rcclp::configure/setup/shutdown stubs.
#if !defined(ROCPROFSYS_USE_RCCL) ||                                                     \
    (defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL == 0)
ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type)
#endif
namespace rocprofsys
{
namespace component
{
// Declarations only; the definitions are not part of this header.

// Returns a uint64_t id.
// NOTE(review): presumably identifies the activation so it can be undone later;
// confirm against the definition.
uint64_t
activate_rcclp();
// Takes a uint64_t id and returns a uint64_t.
// NOTE(review): presumably expects the id returned by activate_rcclp(); confirm.
uint64_t
deactivate_rcclp(uint64_t id);
// Both arguments default to empty sets.
// NOTE(review): presumably the names of functions to wrap (permit) and to skip
// (reject); confirm against the definition.
void
configure_rcclp(const std::set<std::string>& permit = {},
const std::set<std::string>& reject = {});
// Component whose value_type is void and whose get() is a no-op: it stores no
// measurement of its own. start()/stop() are declared here and defined elsewhere.
struct rcclp_handle : base<rcclp_handle, void>
{
// Size of the gotcha wrapper table (ROCPROFSYS_NUM_RCCLP_WRAPPERS, 25 unless overridden).
static constexpr size_t rcclp_wrapper_count = ROCPROFSYS_NUM_RCCLP_WRAPPERS;
using value_type = void;
using this_type = rcclp_handle;
using base_type = base<this_type, value_type>;
// Tuple holding the single gotcha bundle; instances are shared via shared_ptr.
using rcclp_tuple_t = tim::component_tuple<rcclp_gotcha_t>;
using toolset_ptr_t = std::shared_ptr<rcclp_tuple_t>;
static std::string label() { return "rcclp_handle"; }
static std::string description() { return "Handle for activating NCCL wrappers"; }
// Intentionally empty: there is no value to report (value_type is void).
static void get() {}
static void start();
static void stop();
// Current value of the tool counter; the int64_t counter is returned as int.
static int get_count() { return get_tool_count().load(); }
private:
// State shared by all users of the handle; reached through get_persistent_data().
struct persistent_data
{
std::atomic<short> m_configured{ 0 };
std::atomic<int64_t> m_count{ 0 };
// Empty (null) until a tool instance is assigned.
toolset_ptr_t m_tool = toolset_ptr_t{};
};
static persistent_data& get_persistent_data();
static std::atomic<short>& get_configured();
static toolset_ptr_t& get_tool_instance();
static std::atomic<int64_t>& get_tool_count();
};
} // namespace component
} // namespace rocprofsys
-87
Просмотреть файл
@@ -1,87 +0,0 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/rcclp.hpp"
#include "core/components/fwd.hpp"
#include "core/defines.hpp"
#include "core/dynamic_library.hpp"
#include "core/rccl.hpp"
#include "core/timemory.hpp"
#include "library/components/category_region.hpp"
#include <timemory/timemory.hpp>
#include <dlfcn.h>
#include <limits>
#include <memory>
#include <set>
#include <unordered_map>
namespace
{
// Id returned by component::activate_rcclp() in setup(). The uint64_t maximum is
// the "not activated" sentinel: shutdown() only deactivates when the id is below it.
uint64_t global_id = std::numeric_limits<uint64_t>::max();
}
namespace rocprofsys
{
namespace rcclp
{
// Currently a no-op; setup() calls it before doing anything else.
void
configure()
{}
// Loads librccl, sets the runtime-enabled flag of the comm_data components, then
// configures and activates the RCCL wrappers, storing the activation id in global_id.
void
setup()
{
    configure();

    // make sure the symbols are loaded to be wrapped
    dynamic_library _librccl{
        "ROCPROFSYS_RCCL_LIBRARY", "librccl.so", RTLD_NOW | RTLD_GLOBAL, true, true, true
    };

    // The environment is always queried (default: the timemory setting), but the
    // result only takes effect when timemory is in use; otherwise comm_data is off.
    auto       _use_data         = tim::get_env("ROCPROFSYS_RCCLP_COMM_DATA", get_use_timemory());
    const bool _enable_comm_data = get_use_timemory() && _use_data;

    trait::runtime_enabled<component::comm_data>::set(_enable_comm_data);
    trait::runtime_enabled<component::comm_data_tracker_t>::set(_enable_comm_data);

    component::configure_rcclp();
    global_id = component::activate_rcclp();
}
// Deactivates the RCCL wrappers if setup() stored an activation id.
void
shutdown()
{
    // global_id still holds the sentinel (uint64_t max): nothing was activated.
    if(global_id == std::numeric_limits<uint64_t>::max()) return;

    component::deactivate_rcclp(global_id);
}
} // namespace rcclp
} // namespace rocprofsys
+40 -5
Просмотреть файл
@@ -34,6 +34,7 @@
#include "library/components/category_region.hpp"
#include "library/rocprofiler-sdk/counters.hpp"
#include "library/rocprofiler-sdk/fwd.hpp"
#include "library/rocprofiler-sdk/rccl.hpp"
#include "library/thread_info.hpp"
#include "library/tracing.hpp"
@@ -482,6 +483,17 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
auto ts = rocprofiler_timestamp_t{};
ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts));
// Build a one-line description of the record; it is used in the warning for
// unhandled records. The status of the name query is not checked, so `name` may
// still be null afterwards. Streaming a null `const char*` is undefined
// behavior, hence the placeholder below.
const char* name = nullptr;
rocprofiler_query_callback_tracing_kind_operation_name(record.kind, record.operation,
                                                       &name, nullptr);
auto info = std::stringstream{};
info << std::left << "tid=" << record.thread_id << ", cid=" << std::setw(3)
     << record.correlation_id.internal << ", kind=" << std::setw(2) << record.kind
     << ", operation=" << std::setw(3) << record.operation
     << ", phase=" << record.phase << ", dt_nsec=" << std::setw(8) << ts
     << ", name=" << (name ? name : "<unknown>");
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
{
user_data->value = ts;
@@ -525,6 +537,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
break;
}
#endif
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
{
tool_tracing_callback_start(category::rocm_rccl_api{}, record, user_data,
ts);
break;
}
case ROCPROFILER_CALLBACK_TRACING_NONE:
case ROCPROFILER_CALLBACK_TRACING_LAST:
case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API:
@@ -533,7 +551,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
#if(ROCPROFILER_VERSION >= 600)
case ROCPROFILER_CALLBACK_TRACING_OMPT:
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
@@ -616,6 +633,13 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
break;
}
#endif
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
{
tool_tracing_callback_rccl(record, user_data->value, ts);
tool_tracing_callback_stop(category::rocm_rccl_api{}, record, user_data,
ts, _bt_data);
break;
}
case ROCPROFILER_CALLBACK_TRACING_NONE:
case ROCPROFILER_CALLBACK_TRACING_LAST:
case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API:
@@ -624,7 +648,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
#if(ROCPROFILER_VERSION >= 600)
case ROCPROFILER_CALLBACK_TRACING_OMPT:
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
@@ -651,6 +674,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
_data->dispatch_info.dispatch_id,
timing_interval{ _data->start_timestamp, _data->end_timestamp });
}
else
{
ROCPROFSYS_WARNING_F(
1, "tool_tracing_callback: unhandled PHASE_NONE callback record\n\t%s\n",
info.str().c_str());
}
}
else
{
@@ -1025,13 +1054,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
{
auto domains = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS");
ROCPROFSYS_VERBOSE_F(1, "rocprof-sys ROCm Domains:\n");
ROCPROFSYS_VERBOSE_F(1, "Available ROCm Domains:\n");
for(const auto& itr : domains->get_choices())
ROCPROFSYS_VERBOSE_F(1, "- %s\n", itr.c_str());
auto _callback_domains = rocprofiler_sdk::get_callback_domains();
auto _buffered_domain = rocprofiler_sdk::get_buffered_domains();
auto _counter_events = rocprofiler_sdk::get_rocm_events();
auto _version = rocprofiler_sdk::get_version();
ROCPROFSYS_WARNING_IF(_version.formatted == 0,
"Warning! rocprofiler-sdk version not initialized\n");
auto* _data = as_client_data(user_data);
_data->client_fini = fini_func;
@@ -1052,11 +1084,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
#if(ROCPROFILER_VERSION >= 700)
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
ROCPROFILER_CALLBACK_TRACING_RCCL_API,
#if(ROCPROFILER_VERSION >= 600)
ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API,
#endif
#if(ROCPROFILER_VERSION >= 700)
ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API,
#endif
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API
})
{
if(_callback_domains.count(itr) > 0)
+6 -4
Просмотреть файл
@@ -1,9 +1,11 @@
#
set(rocprofiler_sdk_sources ${CMAKE_CURRENT_LIST_DIR}/counters.cpp
${CMAKE_CURRENT_LIST_DIR}/fwd.cpp)
set(rocprofiler_sdk_sources
${CMAKE_CURRENT_LIST_DIR}/counters.cpp ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp
${CMAKE_CURRENT_LIST_DIR}/rccl.cpp)
set(rocprofiler_sdk_headers ${CMAKE_CURRENT_LIST_DIR}/counters.hpp
${CMAKE_CURRENT_LIST_DIR}/fwd.hpp)
set(rocprofiler_sdk_headers
${CMAKE_CURRENT_LIST_DIR}/counters.hpp ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp
${CMAKE_CURRENT_LIST_DIR}/rccl.hpp)
target_sources(rocprofiler-systems-object-library PRIVATE ${rocprofiler_sdk_sources}
${rocprofiler_sdk_headers})
+182
Просмотреть файл
@@ -0,0 +1,182 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/rocprofiler-sdk/rccl.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/perfetto.hpp"
#include "library/tracing.hpp"
namespace rocprofsys
{
namespace rocprofiler_sdk
{
namespace
{
// Tag type for the receive-side counter track: `value` is passed as the first
// argument of TRACE_COUNTER, `label` is the track's display name.
struct rccl_recv
{
    static constexpr const char* value = "comm_data";
    static constexpr const char* label = "RCCL Comm Recv";
};

// Tag type for the send-side counter track; same `value`, distinct label.
struct rccl_send
{
    static constexpr const char* value = "comm_data";
    static constexpr const char* label = "RCCL Comm Send";
};
/// Emits one transfer as two samples on the counter track selected by @p Tp:
/// `_val` at `_begin_ts`, then 0 at `_end_ts`.
///
/// Nothing is written unless perfetto is in use and the state is Active.
///
/// @tparam Tp   Tag type providing `value` and `label`.
/// @tparam Args Unused; kept so existing explicit instantiations still compile.
/// @param _val      Counter value for the first sample; the track unit is "bytes".
/// @param _begin_ts Timestamp of the first sample.
/// @param _end_ts   Timestamp of the sample that resets the counter to 0.
template <typename Tp, typename... Args>
void
write_perfetto_counter_track(uint64_t _val, uint64_t _begin_ts, uint64_t _end_ts)
{
    using counter_track = rocprofsys::perfetto_counter_track<Tp>;

    if(!rocprofsys::get_use_perfetto() ||
       rocprofsys::get_state() != rocprofsys::State::Active)
        return;

    // Only one track (index 0) is used per tag type, so the label never needs an
    // "[idx]" suffix; the former `_idx > 0` branch was unreachable.
    const size_t _idx = 0;
    if(!counter_track::exists(_idx))
    {
        std::string _label = Tp::label;
        counter_track::emplace(_idx, _label, "bytes");
    }

    TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _begin_ts, _val);
    TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _end_ts, 0);
}
/// Size in bytes of a single element of @p datatype.
///
/// ncclBfloat16 is handled as a 2-byte type; it was previously missing and fell
/// through to the "unsupported" path.
///
/// @param datatype RCCL element type taken from an API call's arguments.
/// @return Element size in bytes, or 0 for a datatype not listed here, after
///         ROCPROFSYS_CI_ABORT has been invoked (NOTE(review): presumably that
///         macro only aborts in CI builds — confirm).
static auto
rccl_type_size(ncclDataType_t datatype)
{
    switch(datatype)
    {
        case ncclInt8:
        case ncclUint8: return 1;
        case ncclFloat16:
        case ncclBfloat16: return 2;
        case ncclInt32:
        case ncclUint32:
        case ncclFloat32: return 4;
        case ncclInt64:
        case ncclUint64:
        case ncclFloat64: return 8;
        default:
            ROCPROFSYS_CI_ABORT(true, "Unsupported RCCL datatype: %i",
                                static_cast<int>(datatype));
            return 0;
    }
}
} // namespace
/**
 * @brief Records the payload size of an RCCL API call on a Perfetto counter track.
 *
 * The element count and datatype are read from the call's arguments and turned
 * into a byte count (count * element size). AllGather, AllToAll, AllReduce,
 * Gather, Recv and Reduce go to the "recv" track; Broadcast, ReduceScatter and
 * Send go to the "send" track. Every other operation is ignored, as are records
 * of any kind other than RCCL_API.
 *
 * @param record   The tracing record containing the RCCL API call information.
 * @param begin_ts The timestamp when the operation started.
 * @param end_ts   The timestamp when the operation ended.
 */
void
tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record,
                           uint64_t begin_ts, uint64_t end_ts)
{
    if(record.kind != ROCPROFILER_CALLBACK_TRACING_RCCL_API) return;

    auto* _data =
        static_cast<rocprofiler_callback_tracing_rccl_api_data_t*>(record.payload);

    size_t _nbytes  = 0;
    bool   _sending = false;

    // Stores the direction and byte count of the transfer being described.
    auto _note = [&_nbytes, &_sending](bool _is_send, size_t _count,
                                       ncclDataType_t _dt) {
        _sending = _is_send;
        _nbytes  = _count * rccl_type_size(_dt);
    };

    constexpr bool _recv = false;
    constexpr bool _send = true;

    switch(record.operation)
    {
        // RCCL Data Receive
        case ROCPROFILER_RCCL_API_ID_ncclAllGather:
        {
            const auto& _a = _data->args.ncclAllGather;
            _note(_recv, _a.sendcount, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclAllToAll:
        {
            const auto& _a = _data->args.ncclAllToAll;
            _note(_recv, _a.count, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclAllReduce:
        {
            const auto& _a = _data->args.ncclAllReduce;
            _note(_recv, _a.count, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclGather:
        {
            const auto& _a = _data->args.ncclGather;
            _note(_recv, _a.sendcount, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclRecv:
        {
            const auto& _a = _data->args.ncclRecv;
            _note(_recv, _a.count, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclReduce:
        {
            const auto& _a = _data->args.ncclReduce;
            _note(_recv, _a.count, _a.datatype);
            break;
        }
        // RCCL Data Send
        case ROCPROFILER_RCCL_API_ID_ncclBroadcast:
        {
            const auto& _a = _data->args.ncclBroadcast;
            _note(_send, _a.count, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclReduceScatter:
        {
            const auto& _a = _data->args.ncclReduceScatter;
            _note(_send, _a.recvcount, _a.datatype);
            break;
        }
        case ROCPROFILER_RCCL_API_ID_ncclSend:
        {
            const auto& _a = _data->args.ncclSend;
            _note(_send, _a.count, _a.datatype);
            break;
        }
        default:
            // Skip other RCCL operations
            break;
    }

    // Nothing to emit when perfetto is off or no transfer size was recorded.
    if(!config::get_use_perfetto() || _nbytes == 0) return;

    if(_sending)
        write_perfetto_counter_track<rccl_send>(_nbytes, begin_ts, end_ts);
    else
        write_perfetto_counter_track<rccl_recv>(_nbytes, begin_ts, end_ts);
}
} // namespace rocprofiler_sdk
} // namespace rocprofsys
@@ -1,8 +1,6 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@@ -24,34 +22,22 @@
#pragma once
#include "core/defines.hpp"
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
// #include <rocprofiler-sdk/cxx/hash.hpp>
// #include <rocprofiler-sdk/cxx/name_info.hpp>
// #include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
namespace rocprofsys
{
namespace rcclp
namespace rocprofiler_sdk
{
void
configure();
tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record,
uint64_t begin_ts, uint64_t end_ts);
void
setup();
} // namespace rocprofiler_sdk
void
shutdown();
#if !defined(ROCPROFSYS_USE_RCCL) || \
(defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL == 0)
inline void
configure()
{}
inline void
setup()
{}
inline void
shutdown()
{}
#endif
} // namespace rcclp
} // namespace rocprofsys
-522
Просмотреть файл
@@ -1,522 +0,0 @@
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_H_
#define NCCL_H_
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#define NCCL_MAJOR 2
#define NCCL_MINOR 11
#define NCCL_PATCH 4
#define NCCL_SUFFIX ""
#define NCCL_VERSION_CODE 21104
#define NCCL_VERSION(X, Y, Z) \
(((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z))
#define RCCL_BFLOAT16 1
#define RCCL_GATHER_SCATTER 1
#define RCCL_ALLTOALLV 1
#ifdef __cplusplus
extern "C"
{
#endif
/*! @brief Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct
{
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
/*! @brief Error type */
typedef enum
{
ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclNumResults = 6
} ncclResult_t;
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
*
* @details This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
*/
ncclResult_t ncclGetVersion(int* version);
/// @cond include_hidden
ncclResult_t pncclGetVersion(int* version);
/// @endcond
/*! @brief Generates an ID for ncclCommInitRank
@details
Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
called once and the Id should be distributed to all ranks in the
communicator before calling ncclCommInitRank.
@param[in]
uniqueId ncclUniqueId*
pointer to uniqueId
*/
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
/// @cond include_hidden
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
/// @endcond
/*! @brief Creates a new communicator (multi thread/process version).
@details
rank must be between 0 and nranks-1 and unique within a communicator clique.
Each rank is associated to a CUDA device, which has to be set before calling
ncclCommInitRank.
ncclCommInitRank implicitly synchronizes with other ranks, so it must be
called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
@param[in]
comm ncclComm_t*
communicator struct pointer
*/
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank);
/// @cond include_hidden
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank);
/// @endcond
/*! @brief Creates a clique of communicators (single process version).
*
* @details This is a convenience function to create a single-process communicator
* clique. Returns an array of ndev newly initialized communicators in comm. comm
* should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is
* NULL, the first ndev HIP devices are used. Order of devlist defines user-order of
* processors within the communicator.
* */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/// @cond include_hidden
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
/// @endcond
/*! @brief Frees resources associated with communicator object, but waits for any
* operations that might still be running on the device */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
/// @cond include_hidden
ncclResult_t pncclCommDestroy(ncclComm_t comm);
/// @endcond
/*! @brief Frees resources associated with communicator object and aborts any
* operations that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
/// @cond include_hidden
ncclResult_t pncclCommAbort(ncclComm_t comm);
/// @endcond
/*! @brief Returns a human-readable error message. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
/*! @brief Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/// @cond include_hidden
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError);
/// @endcond
/*! @brief Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
/// @cond include_hidden
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
/// @endcond
/*! @brief Returns the rocm device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
/// @cond include_hidden
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
/// @endcond
/*! @brief Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
/// @cond include_hidden
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
/// @endcond
/*! @brief Reduction operation selector */
/* Reduction operation selector */
typedef enum
{
ncclNumOps_dummy = 5
} ncclRedOp_dummy_t;
typedef enum
{
ncclSum = 0,
ncclProd = 1,
ncclMax = 2,
ncclMin = 3,
ncclAvg = 4,
/* ncclNumOps: The number of built-in ncclRedOp_t values. Also
* serves as the least possible value for dynamic ncclRedOp_t's
* as constructed by ncclRedOpCreate*** functions. */
ncclNumOps = 5,
/* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
* It is defined to be the largest signed value (since compilers
* are permitted to use signed enums) that won't grow
* sizeof(ncclRedOp_t) when compared to previous NCCL versions to
* maintain ABI compatibility. */
ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t))
} ncclRedOp_t;
/*! @brief Data types
 *
 * Several enumerators are aliases that share a value with the enumerator listed
 * directly above them (ncclChar/ncclInt8, ncclInt/ncclInt32, ncclHalf/ncclFloat16,
 * ncclFloat/ncclFloat32, ncclDouble/ncclFloat64). ncclNumTypes is one past the
 * highest datatype value declared here and is not itself a datatype.
 */
typedef enum
{
ncclInt8 = 0,
ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2,
ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6,
ncclHalf = 6,
ncclFloat32 = 7,
ncclFloat = 7,
ncclFloat64 = 8,
ncclDouble = 8,
ncclBfloat16 = 9,
ncclNumTypes = 10
} ncclDataType_t;
/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum
{
/* ncclScalarDevice: The scalar is in device-visible memory and will be
* dereferenced while the collective is running. */
ncclScalarDevice = 0,
/* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
* dereferenced before the ncclRedOpCreate***() function returns. */
ncclScalarHostImmediate = 1
} ncclScalarResidence_t;
/*
* ncclRedOpCreatePreMulSum
*
* Creates a new reduction operator which pre-multiplies input values by a given
* scalar locally before reducing them with peer values via summation. For use
* only with collectives launched against *comm* and *datatype*. The
* *residence* argument indicates how/when the memory pointed to by *scalar*
* will be dereferenced. Upon return, the newly created operator's handle
* is stored in *op*.
*/
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
ncclDataType_t datatype,
ncclScalarResidence_t residence,
ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar,
ncclDataType_t datatype,
ncclScalarResidence_t residence,
ncclComm_t comm);
/*
* ncclRedOpDestroy
*
* Destroys the reduction operator *op*. The operator must have been created by
* ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
* destroyed as soon as the last NCCL function which is given that operator returns.
*/
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
/*
* Collective communication operations
*
* Collective communication operations must be called separately for each
* communicator in a communicator clique.
*
* They return when operations have been enqueued on the CUDA stream.
*
* Since they may perform inter-CPU synchronization, each call has to be done
* from a different thread or process, or need to use Group Semantics (see
* below).
*/
/*!
* @brief Reduce
*
* @details Reduces data arrays of length count in sendbuff into recvbuff using op
* operation.
* recvbuff may be NULL on all calls except for root device.
* root is the rank (not the CUDA device) where data will reside after the
* operation is complete.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root,
ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief (deprecated) Broadcast (in-place)
*
* @details Copies count values from root to all other devices.
* root is the rank (not the CUDA device) where data resides before the
* operation is started.
*
* This operation is implicitely in place.
*/
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Broadcast
*
* @details Copies count values from root to all other devices.
* root is the rank (not the HIP device) where data resides before the
* operation is started.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief All-Reduce
*
* @details Reduces data arrays of length count in sendbuff using op operation, and
* leaves identical copies of result on each recvbuff.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*!
* @brief Reduce-Scatter
*
* @details Reduces data in sendbuff using op operation and leaves reduced result
* scattered over the devices so that recvbuff on rank i will contain the i-th
* block of the result.
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
* should have a size of at least nranks*recvcount elements.
*
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
*/
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype,
ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief All-Gather
*
* @details Each device gathers sendcount values from other GPUs into recvbuff,
* receiving data from rank i at offset i*sendcount.
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
* should have a size of at least nranks*sendcount elements.
*
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
*/
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief Send
*
* @details Send data from sendbuff to rank peer.
* Rank peer needs to call ncclRecv with the same datatype and the same count from
* this rank.
*
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
* operations need to progress concurrently to complete, they must be fused within a
* ncclGroupStart/ ncclGroupEnd section.
*/
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Receive
*
* @details Receive data from rank peer into recvbuff.
* Rank peer needs to call ncclSend with the same datatype and the same count to this
* rank.
*
* This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
* operations need to progress concurrently to complete, they must be fused within a
* ncclGroupStart/ ncclGroupEnd section.
*/
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype,
int peer, ncclComm_t comm, hipStream_t stream);
/// @endcond
/*! @brief Gather
*
* @details Root device gathers sendcount values from other GPUs into recvbuff,
* receiving data from rank i at offset i*sendcount.
*
* Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
* should have a size of at least nranks*sendcount elements.
*
* In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
*/
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief Scatter
*
* @details Data on root is scattered over the devices so that recvbuff on rank i will
* contain the i-th block of the data on root.
*
* Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
* should have a size of at least nranks*recvcount elements.
*
* In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
*/
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, int root, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief All-To-All
*
* @details Device (i) sends its (j)th block of data to device (j), where it is placed
* as the (i)th block. Each block for sending/receiving has count elements, which means
* that recvbuff and sendbuff should have a size of nranks*count elements.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*! @brief All-To-Allv
*
* @details Device (i) sends sendcounts[j] of data from offset sdispls[j]
* to device (j). At the same time, device (i) receives recvcounts[j] of data
* from device (j) to be placed at rdispls[j].
* sendcounts, sdispls, recvcounts and rdispls are all measured in the units
* of datatype, not bytes.
*
* In-place operation will happen if sendbuff == recvbuff.
*/
ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
const size_t sdispls[], void* recvbuff,
const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @cond include_hidden
ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[],
const size_t sdispls[], void* recvbuff,
const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm,
hipStream_t stream);
/// @endcond
/*
* Group semantics
*
* When managing multiple GPUs from a single thread, and since NCCL collective
* calls may perform inter-CPU synchronization, we need to "group" calls for
* different ranks/devices into a single call.
*
* Grouping NCCL calls as being part of the same collective operation is done
* using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
* collective calls until the ncclGroupEnd call, which will wait for all calls
* to be complete. Note that for collective communication, ncclGroupEnd only
* guarantees that the operations are enqueued on the streams, not that
* the operation is effectively done.
*
* Both collective communication and ncclCommInitRank can be used in conjunction
* with ncclGroupStart/ncclGroupEnd, but not together.
*
* Group semantics also allow fusing multiple operations on the same device
* to improve performance (for aggregated collective calls), or permitting
* concurrent progress of multiple send/receive operations.
*/
/*! @brief Group Start
*
* Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
* a single NCCL operation. Nothing will be started on the HIP stream until
* ncclGroupEnd.
*
* @return Status of the call, as an ncclResult_t.
*/
ncclResult_t ncclGroupStart();
/// @cond include_hidden
// NOTE(review): pncclGroupStart repeats ncclGroupStart's signature exactly; presumably
// the profiling-layer twin of the same call (PMPI-style "p" prefix) — confirm.
ncclResult_t pncclGroupStart();
/// @endcond
/*! @brief Group End
*
* End a group call. Start a fused NCCL operation consisting of all calls since
* ncclGroupStart. Operations on the HIP stream depending on the NCCL operations
* need to be called after ncclGroupEnd.
*
* @return Status of the call, as an ncclResult_t.
*/
ncclResult_t ncclGroupEnd();
/// @cond include_hidden
// NOTE(review): pncclGroupEnd repeats ncclGroupEnd's signature exactly; presumably
// the profiling-layer twin of the same call (PMPI-style "p" prefix) — confirm.
ncclResult_t pncclGroupEnd();
/// @endcond
#ifdef __cplusplus
} // end extern "C"
#endif
#endif // end include guard
+3
Просмотреть файл
@@ -30,12 +30,15 @@ foreach(_TARGET ${RCCL_TEST_TARGETS})
string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}")
string(REPLACE "_" "-" _NAME "${_NAME}")
rocprofiler_systems_add_test(
SKIP_RUNTIME
NAME rccl-test-${_NAME}
TARGET ${_TARGET}
LABELS "rccl-tests;rcclp"
MPI ON
GPU ON
NUM_PROCS 1
SAMPLING_TIMEOUT 300
REWRITE_TIMEOUT 300
REWRITE_ARGS
-e
-v
+4 -3
Просмотреть файл
@@ -166,9 +166,10 @@ set(_rccl_environment
"ROCPROFSYS_PROFILE=ON"
"ROCPROFSYS_USE_SAMPLING=OFF"
"ROCPROFSYS_USE_PROCESS_SAMPLING=ON"
"ROCPROFSYS_USE_RCCLP=ON"
"ROCPROFSYS_TIME_OUTPUT=OFF"
"ROCPROFSYS_USE_PID=OFF"
"ROCPROFSYS_USE_RCCLP=ON"
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy"
"${_test_openmp_env}"
"${_test_library_path}")
@@ -466,8 +467,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
cmake_parse_arguments(
TEST "SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME"
"NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" "${_KWARGS}"
${ARGN})
"NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT"
"${_KWARGS}" ${ARGN})
foreach(_PREFIX SAMPLING RUNTIME REWRITE REWRITE_RUN BASELINE)
if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "")