From c7c3c3f97eaed7bba0bd50e1e846bc7008133de4 Mon Sep 17 00:00:00 2001 From: David Galiffi Date: Fri, 6 Jun 2025 11:36:17 -0400 Subject: [PATCH] Use rocprofiler-sdk for RCCL-API tracing (#126) - Add support for RCCL API tracing through rocprofiler-sdk. - Refactored the comm_data code to use the SDK RCCL_API callbacks. - Add a runtime version check for SDK to gate callback enablement, rather than just the compile-time check. - Fixed: SAMPLING_TIMEOUT was not being handled correctly in add_test. [ROCm/rocprofiler-systems commit: af77d93f750df829e2fd12b49562614f2207975e] --- projects/rocprofiler-systems/CMakeLists.txt | 7 - .../cmake/Modules/FindRCCL-Headers.cmake | 94 ---- .../rocprofiler-systems/cmake/Packages.cmake | 17 - .../source/bin/rocprof-sys-causal/impl.cpp | 4 - .../source/bin/rocprof-sys-sample/impl.cpp | 15 +- .../source/lib/CMakeLists.txt | 1 - .../source/lib/core/CMakeLists.txt | 1 - .../source/lib/core/argparse.cpp | 18 +- .../source/lib/core/categories.hpp | 2 + .../source/lib/core/config.cpp | 8 + .../source/lib/core/rccl.hpp | 29 - .../source/lib/core/rocprofiler-sdk.cpp | 77 ++- .../source/lib/core/rocprofiler-sdk.hpp | 11 + .../rocprofiler-systems/categories.h | 1 + .../source/lib/rocprof-sys/api.cpp | 2 +- .../source/lib/rocprof-sys/api.hpp | 2 +- .../source/lib/rocprof-sys/library.cpp | 40 +- .../lib/rocprof-sys/library/CMakeLists.txt | 7 - .../library/components/CMakeLists.txt | 6 - .../library/components/comm_data.cpp | 3 + .../library/components/comm_data.hpp | 62 --- .../rocprof-sys/library/components/rcclp.cpp | 195 ------- .../rocprof-sys/library/components/rcclp.hpp | 106 ---- .../source/lib/rocprof-sys/library/rcclp.cpp | 87 --- .../rocprof-sys/library/rocprofiler-sdk.cpp | 45 +- .../library/rocprofiler-sdk/CMakeLists.txt | 10 +- .../library/rocprofiler-sdk/rccl.cpp | 182 ++++++ .../{rcclp.hpp => rocprofiler-sdk/rccl.hpp} | 38 +- .../rocprof-sys/library/tpls/rccl/rccl/rccl.h | 522 ------------------ .../tests/rocprof-sys-rccl-tests.cmake 
| 3 + .../tests/rocprof-sys-testing.cmake | 7 +- 31 files changed, 350 insertions(+), 1252 deletions(-) delete mode 100644 projects/rocprofiler-systems/cmake/Modules/FindRCCL-Headers.cmake delete mode 100644 projects/rocprofiler-systems/source/lib/core/rccl.hpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.cpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.hpp delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.cpp create mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp rename projects/rocprofiler-systems/source/lib/rocprof-sys/library/{rcclp.hpp => rocprofiler-sdk/rccl.hpp} (67%) delete mode 100644 projects/rocprofiler-systems/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index c8976c3d8f..4bbe2a44f9 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -179,7 +179,6 @@ rocprofiler_systems_add_option(ROCPROFSYS_USE_MPI "Enable MPI support" OFF) rocprofiler_systems_add_option(ROCPROFSYS_USE_ROCM "Enable ROCm support" ON) rocprofiler_systems_add_option(ROCPROFSYS_USE_PAPI "Enable HW counter support via PAPI" ON) -rocprofiler_systems_add_option(ROCPROFSYS_USE_RCCL "Enable RCCL support" OFF) rocprofiler_systems_add_option( ROCPROFSYS_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) @@ -209,12 +208,6 @@ elseif("$ENV{ROCPROFSYS_CI}") endif() endif() -if(NOT ROCPROFSYS_USE_ROCM) - set(ROCPROFSYS_USE_RCCL - OFF - CACHE BOOL "Disabled via ROCPROFSYS_USE_ROCM=OFF" FORCE) -endif() - if(ROCPROFSYS_BUILD_TESTING) set(ROCPROFSYS_BUILD_EXAMPLES ON diff --git a/projects/rocprofiler-systems/cmake/Modules/FindRCCL-Headers.cmake b/projects/rocprofiler-systems/cmake/Modules/FindRCCL-Headers.cmake deleted file mode 
100644 index 8d0befed39..0000000000 --- a/projects/rocprofiler-systems/cmake/Modules/FindRCCL-Headers.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file -# Copyright.txt or https://cmake.org/licensing for details. - -include(FindPackageHandleStandardArgs) - -# ----------------------------------------------------------------------------------------# - -set(RCCL-Headers_INCLUDE_DIR_INTERNAL - "${PROJECT_SOURCE_DIR}/source/lib/rocprof-sys/library/tpls/rccl" - CACHE PATH "Path to internal rccl.h") - -# ----------------------------------------------------------------------------------------# - -if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "") - set(ROCM_PATH "$ENV{ROCM_PATH}") -endif() - -foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl) - if(EXISTS ${_DIR}) - get_filename_component(_ABS_DIR "${_DIR}" REALPATH) - list(APPEND _RCCL_PATHS ${_ABS_DIR}) - endif() -endforeach() - -# ----------------------------------------------------------------------------------------# - -find_package( - rccl - QUIET - CONFIG - HINTS - ${_RCCL_PATHS} - PATHS - ${_RCCL_PATHS} - PATH_SUFFIXES - rccl/lib/cmake) - -if(NOT rccl_FOUND) - set(RCCL-Headers_INCLUDE_DIR - "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" - CACHE PATH "Path to RCCL headers") -else() - set(RCCL-Headers_INCLUDE_DIR - "${rccl_INCLUDE_DIR}" - CACHE PATH "Path to RCCL headers") -endif() - -# because of the annoying warning starting with v5.2.0, we've got to do this crap -if(ROCmVersion_NUMERIC_VERSION) - if(ROCmVersion_NUMERIC_VERSION LESS 50200) - set(_RCCL-Headers_FILE "rccl.h") - set(_RCCL-Headers_DIR "/rccl") - else() - set(_RCCL-Headers_FILE "rccl/rccl.h") - set(_RCCL-Headers_DIR "") - endif() -else() - set(_RCCL-Headers_FILE "rccl/rccl.h") - set(_RCCL-Headers_DIR "") -endif() - -if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}") - rocprofiler_systems_message( - AUTHOR_WARNING - "RCCL header 
(${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}" - ) - set(RCCL-Headers_INCLUDE_DIR - "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}" - CACHE PATH "Path to RCCL headers" FORCE) -endif() - -unset(_RCCL-Headers_FILE) -unset(_RCCL-Headers_DIR) - -mark_as_advanced(RCCL-Headers_INCLUDE_DIR) - -# ----------------------------------------------------------------------------------------# - -find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR) - -# ------------------------------------------------------------------------------# - -if(RCCL-Headers_FOUND) - add_library(roc::rccl-headers INTERFACE IMPORTED) - set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR}) - - target_include_directories(roc::rccl-headers SYSTEM - INTERFACE ${RCCL-Headers_INCLUDE_DIR}) - - add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED) - target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers) -endif() - -# ------------------------------------------------------------------------------# diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake index c911da78ae..767bdda1a3 100644 --- a/projects/rocprofiler-systems/cmake/Packages.cmake +++ b/projects/rocprofiler-systems/cmake/Packages.cmake @@ -17,9 +17,6 @@ rocprofiler_systems_add_interface_library( "Provides flags and libraries for Dyninst (dynamic instrumentation)") rocprofiler_systems_add_interface_library(rocprofiler-systems-rocm "Provides flags and libraries for ROCm") -rocprofiler_systems_add_interface_library( - rocprofiler-systems-rccl - "Provides flags for ROCm Communication Collectives Library (RCCL)") rocprofiler_systems_add_interface_library(rocprofiler-systems-mpi "Provides MPI or MPI headers") rocprofiler_systems_add_interface_library(rocprofiler-systems-libva @@ -47,7 +44,6 @@ 
rocprofiler_systems_add_interface_library(rocprofiler-systems-compile-definition # libraries with relevant compile definitions set(ROCPROFSYS_EXTENSION_LIBRARIES rocprofiler-systems::rocprofiler-systems-rocm - rocprofiler-systems::rocprofiler-systems-rccl rocprofiler-systems::rocprofiler-systems-bfd rocprofiler-systems::rocprofiler-systems-mpi rocprofiler-systems::rocprofiler-systems-ptl @@ -185,19 +181,6 @@ if(ROCPROFSYS_USE_ROCM) target_link_libraries(rocprofiler-systems-rocm INTERFACE amd-smi::amd-smi) endif() -# ----------------------------------------------------------------------------------------# -# -# RCCL -# -# ----------------------------------------------------------------------------------------# - -if(ROCPROFSYS_USE_RCCL) - find_package(RCCL-Headers ${rocprofiler_systems_FIND_QUIETLY} REQUIRED) - target_link_libraries(rocprofiler-systems-rccl INTERFACE roc::rccl-headers) - rocprofiler_systems_target_compile_definitions(rocprofiler-systems-rccl - INTERFACE ROCPROFSYS_USE_RCCL) -endif() - # ----------------------------------------------------------------------------------------# # # MPI diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp index 39237443cb..91b7bceb0d 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-causal/impl.cpp @@ -834,10 +834,6 @@ parse_args(int argc, char** argv, std::vector& _env, (defined(ROCPROFSYS_USE_MPI_HEADERS) && ROCPROFSYS_USE_MPI_HEADERS > 0) add_default_env(_env, "ROCPROFSYS_USE_MPIP", true); #endif - -#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 - add_default_env(_env, "ROCPROFSYS_USE_RCCLP", true); -#endif } _fill("ROCPROFSYS_CAUSAL_BINARY_EXCLUDE", _binary_excludes, _generate_configs); diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp 
b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp index 37d177b5df..7225a21b62 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp @@ -730,11 +730,10 @@ parse_args(int argc, char** argv, std::vector& _env) } }); - std::set _backend_choices = { - "all", "kokkosp", "mpip", "ompt", - "rcclp", "amd-smi", "mutex-locks", "spin-locks", - "rw-locks", "rocprofiler-sdk", "rocm" - }; + std::set _backend_choices = { "all", "kokkosp", "mpip", + "ompt", "rcclp", "amd-smi", + "mutex-locks", "spin-locks", "rw-locks", + "rocm" }; #if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS) _backend_choices.erase("mpip"); @@ -744,14 +743,10 @@ parse_args(int argc, char** argv, std::vector& _env) _backend_choices.erase("ompt"); #endif -#if !defined(ROCPROFSYS_USE_RCCL) - _backend_choices.erase("rcclp"); -#endif - #if !defined(ROCPROFSYS_USE_ROCM) _backend_choices.erase("rocm"); _backend_choices.erase("amd-smi"); - _backend_choices.erase("rocprofiler-sdk"); + _backend_choices.erase("rcclp"); #endif parser.start_group("BACKEND OPTIONS", diff --git a/projects/rocprofiler-systems/source/lib/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/CMakeLists.txt index 29a65bf6c7..0a2b8ea51d 100644 --- a/projects/rocprofiler-systems/source/lib/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/CMakeLists.txt @@ -44,7 +44,6 @@ target_link_libraries( $ $ $ - $ $ $ $ diff --git a/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt index ee14f6baec..a0363a16f6 100644 --- a/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/core/CMakeLists.txt @@ -35,7 +35,6 @@ set(core_headers ${CMAKE_CURRENT_LIST_DIR}/mproc.hpp ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp - ${CMAKE_CURRENT_LIST_DIR}/rccl.hpp 
${CMAKE_CURRENT_LIST_DIR}/redirect.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp ${CMAKE_CURRENT_LIST_DIR}/state.hpp diff --git a/projects/rocprofiler-systems/source/lib/core/argparse.cpp b/projects/rocprofiler-systems/source/lib/core/argparse.cpp index 30a41cc479..a8026c26f8 100644 --- a/projects/rocprofiler-systems/source/lib/core/argparse.cpp +++ b/projects/rocprofiler-systems/source/lib/core/argparse.cpp @@ -562,10 +562,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _data.processed_environs.emplace("periods"); } - strset_t _backend_choices = { - "all", "kokkosp", "mpip", "ompt", "rcclp", - "amd-smi", "rocprofiler-sdk", "mutex-locks", "spin-locks", "rw-locks" - }; + strset_t _backend_choices = { "all", "kokkosp", "mpip", "ompt", + "rcclp", "amd-smi", "rocm", "mutex-locks", + "spin-locks", "rw-locks" }; #if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS) _backend_choices.erase("mpip"); @@ -575,14 +574,10 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _backend_choices.erase("ompt"); #endif -#if !defined(ROCPROFSYS_USE_RCCL) - _backend_choices.erase("rcclp"); -#endif - #if !defined(ROCPROFSYS_USE_ROCM) _backend_choices.erase("amd-smi"); - _backend_choices.erase("rocprofiler-sdk"); _backend_choices.erase("rocm"); + _backend_choices.erase("rcclp"); #endif if(gpu::device_count() == 0) @@ -590,13 +585,8 @@ add_core_arguments(parser_t& _parser, parser_data& _data) // remove GPU-specific backends _backend_choices.erase("rcclp"); _backend_choices.erase("amd-smi"); - _backend_choices.erase("rocprofiler-sdk"); _backend_choices.erase("rocm"); -#if defined(ROCPROFSYS_USE_RCCL) - update_env(_data, "ROCPROFSYS_USE_RCCLP", false); -#endif - #if defined(ROCPROFSYS_USE_ROCM) update_env(_data, "ROCPROFSYS_USE_AMD_SMI", false); update_env(_data, "ROCPROFSYS_USE_ROCM", false); diff --git a/projects/rocprofiler-systems/source/lib/core/categories.hpp b/projects/rocprofiler-systems/source/lib/core/categories.hpp index 
fe28aa548e..bb9db10b03 100644 --- a/projects/rocprofiler-systems/source/lib/core/categories.hpp +++ b/projects/rocprofiler-systems/source/lib/core/categories.hpp @@ -102,6 +102,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGOR ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm RocDecode API") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocjpeg_api, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, "rocm_rocjpeg_api", "ROCm RocJPEG API") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl_api, ROCPROFSYS_CATEGORY_ROCM_RCCL_API, "rocm_rccl_api", "ROCm RCCL API") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "amd_smi", "AMD-SMI data") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_gfx_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, "device_busy_gfx", "Busy percentage of GFX engine on a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_umc_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, "device_busy_umc", "Busy percentage of UMC engin on a GPU device") @@ -171,6 +172,7 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_marker_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocdecode_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rocjpeg_api), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi), \ ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_gfx_busy), \ ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_umc_busy), \ diff --git a/projects/rocprofiler-systems/source/lib/core/config.cpp b/projects/rocprofiler-systems/source/lib/core/config.cpp index 95b735462a..2bc3a77a3b 100644 --- a/projects/rocprofiler-systems/source/lib/core/config.cpp +++ b/projects/rocprofiler-systems/source/lib/core/config.cpp @@ -1359,6 +1359,14 @@ 
configure_disabled_settings(const std::shared_ptr& _config) _config->find("ROCPROFSYS_USE_AMD_SMI")->second->set_hidden(true); for(const auto& itr : _config->disable_category("amd_smi")) _config->find(itr)->second->set_hidden(true); + + _config->find("ROCPROFSYS_USE_RCCLP")->second->set_hidden(true); + for(const auto& itr : _config->disable_category("rcclp")) + _config->find(itr)->second->set_hidden(true); + + _config->find("ROCPROFSYS_USE_ROCM")->second->set_hidden(true); + for(const auto& itr : _config->disable_category("rocm")) + _config->find(itr)->second->set_hidden(true); #endif #if defined(ROCPROFSYS_USE_OMPT) || ROCPROFSYS_USE_OMPT == 0 diff --git a/projects/rocprofiler-systems/source/lib/core/rccl.hpp b/projects/rocprofiler-systems/source/lib/core/rccl.hpp deleted file mode 100644 index b5f271ab01..0000000000 --- a/projects/rocprofiler-systems/source/lib/core/rccl.hpp +++ /dev/null @@ -1,29 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#pragma once - -#include "core/defines.hpp" - -#if defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL > 0 -# include -#endif diff --git a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp index 8c5764d848..e39ca29023 100644 --- a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp @@ -218,6 +218,30 @@ get_operations_impl(const std::unordered_set& _complete, } // namespace +/// @brief Return the version of the rocprofiler-sdk +/// @return The version of the rocprofiler-sdk or 0 if not initialized +version_info& +get_version() +{ + static auto _version = version_info{ 0 }; + + if(_version.formatted == 0) + { + uint32_t _major = 0; + uint32_t _minor = 0; + uint32_t _patch = 0; + + ROCPROFILER_CALL(rocprofiler_get_version(&_major, &_minor, &_patch)); + + _version.major = _major; + _version.minor = _minor; + _version.patch = _patch; + _version.formatted = _major * 10000 + _minor * 100 + _patch; + } + + return _version; +} + void config_settings(const std::shared_ptr& _config) { @@ -319,6 +343,7 @@ config_settings(const std::shared_ptr& _config) join::join(join::array_config{ ", ", "", "" }, _domain_choices)); auto _domain_defaults = std::string{ "hip_runtime_api,marker_api,kernel_dispatch," "memory_copy,scratch_memory" }; + # if(ROCPROFILER_VERSION < 10000) _domain_defaults.append(",page_migration"); # endif @@ -353,28 +378,48 @@ std::unordered_set get_callback_domains() { const auto callback_tracing_info = rocprofiler::sdk::get_callback_tracing_names(); - const auto supported = std::unordered_set - { + auto supported = 
std::unordered_set{ ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API, - ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, - ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, - ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, - ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, - ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, -# if(ROCPROFILER_VERSION >= 700) - ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API, - ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API, -# endif + ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, }; + auto _version = get_version(); + ROCPROFSYS_WARNING_IF(_version.formatted == 0, + "Warning! rocprofiler-sdk version not initialized\n"); + +# if(ROCPROFILER_VERSION >= 600) + if(_version.formatted >= 600) + { + // Argument tracing is supported in rocprofiler-sdk 0.6.0 and later + supported.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API); + supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API); + } +# endif +# if(ROCPROFILER_VERSION >= 700) + if(_version.formatted >= 700) + { + supported.emplace(ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API); + } +# endif + auto _data = std::unordered_set{}; auto _domains = tim::delimit(config::get_setting_value("ROCPROFSYS_ROCM_DOMAINS") .value_or(std::string{}), " ,;:\t\n"); + if(config::get_use_rcclp() && _version.formatted >= 600) + { + // Translate ROCPROFSYS_USE_RCCLP to entry in ROCPROFSYS_ROCM_DOMAINS + _data.emplace(ROCPROFILER_CALLBACK_TRACING_RCCL_API); + } + const auto valid_choices = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS")->get_choices(); @@ -582,6 +627,12 @@ void config_settings(const std::shared_ptr&) {} +version_info& +get_version() +{ + 
static auto _version = version_info{ 0 }; + return _version; +} } // namespace rocprofiler_sdk } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp index 5ceee14e2a..b70c1af633 100644 --- a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp +++ b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.hpp @@ -39,9 +39,20 @@ namespace rocprofsys { namespace rocprofiler_sdk { +struct version_info +{ + uint32_t major = 0; + uint32_t minor = 0; + uint32_t patch = 0; + uint32_t formatted = 0; // major * 10000 + minor * 100 + patch +}; + void config_settings(const std::shared_ptr&); +version_info& +get_version(); + #if defined(ROCPROFSYS_USE_ROCM) std::unordered_set diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index 5357469cf8..2084113032 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -54,6 +54,7 @@ extern "C" ROCPROFSYS_CATEGORY_ROCM_MARKER_API, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, + ROCPROFSYS_CATEGORY_ROCM_RCCL_API, ROCPROFSYS_CATEGORY_AMD_SMI, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp index bda609ef6e..ad2369ca59 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp @@ -122,7 +122,7 @@ rocprofsys_init_library(void) extern "C" void rocprofsys_init_tooling(void) { - rocprofsys_init_tooling_hidden(true); + rocprofsys_init_tooling_hidden(); } 
extern "C" void diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp index 55dfbbdd0a..478f96ec58 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp @@ -95,7 +95,7 @@ extern "C" // these are the real implementations for internal calling convention void rocprofsys_init_library_hidden(void) ROCPROFSYS_HIDDEN_API; - bool rocprofsys_init_tooling_hidden(bool postinit = false) ROCPROFSYS_HIDDEN_API; + bool rocprofsys_init_tooling_hidden(void) ROCPROFSYS_HIDDEN_API; void rocprofsys_init_hidden(const char*, bool, const char*) ROCPROFSYS_HIDDEN_API; void rocprofsys_finalize_hidden(void) ROCPROFSYS_HIDDEN_API; void rocprofsys_reset_preload_hidden(void) ROCPROFSYS_HIDDEN_API; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp index e90594264c..ca6e4f6927 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp @@ -53,7 +53,6 @@ #include "library/ompt.hpp" #include "library/process_sampler.hpp" #include "library/ptl.hpp" -#include "library/rcclp.hpp" #include "library/rocprofiler-sdk.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" @@ -404,42 +403,16 @@ rocprofsys_init_library_hidden() ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n"); } -// Initialize RCCL if: -// - postinit=true - so the code doesn't hang at the initialization stage -// - get_state() >= State::Init - so the code doesn't throw an exception -// - rccl_initialized=false - so we don't try to initialize RCCL twice -// - get_use_rcclp()=true - only if the environment is configured to use RCCL -static void -rccl_setup(bool postinit) -{ - // Flag used to avoid initializing RCCL twice - static bool rccl_initialized = false; - - if(postinit && 
(get_state() >= State::Init) && !rccl_initialized && get_use_rcclp()) - { - ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n"); - rcclp::setup(); - rccl_initialized = true; - } -} - -static void -rocprofsys_init_library_hidden_with_rccl(bool postinit) -{ - rocprofsys_init_library_hidden(); - rccl_setup(postinit); -} - //======================================================================================// extern "C" bool -rocprofsys_init_tooling_hidden(bool postinit) +rocprofsys_init_tooling_hidden(void) { if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true; if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) { - rocprofsys_init_library_hidden_with_rccl(postinit); + rocprofsys_init_library_hidden(); return false; } @@ -458,7 +431,6 @@ rocprofsys_init_tooling_hidden(bool postinit) if(get_state() != State::PreInit || get_state() == State::Init || _once) { - rccl_setup(postinit); return false; } _once = true; @@ -481,7 +453,7 @@ rocprofsys_init_tooling_hidden(bool postinit) ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "Calling rocprofsys_init_library()...\n"); - rocprofsys_init_library_hidden_with_rccl(postinit); + rocprofsys_init_library_hidden(); ROCPROFSYS_DEBUG_F("\n"); @@ -807,12 +779,6 @@ rocprofsys_finalize_hidden(void) component::vaapi_gotcha::shutdown(); } - if(get_use_rcclp()) - { - ROCPROFSYS_VERBOSE_F(1, "Shutting down RCCLP...\n"); - rcclp::shutdown(); - } - if(get_use_ompt()) { ROCPROFSYS_VERBOSE_F(1, "Shutting down OMPT...\n"); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt index 104ee1fa66..17b06c63d7 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt @@ -20,7 +20,6 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/process_sampler.hpp ${CMAKE_CURRENT_LIST_DIR}/perf.hpp 
${CMAKE_CURRENT_LIST_DIR}/ptl.hpp - ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm.hpp ${CMAKE_CURRENT_LIST_DIR}/amd_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp @@ -34,11 +33,6 @@ set(library_headers target_sources(rocprofiler-systems-object-library PRIVATE ${library_sources} ${library_headers}) -if(ROCPROFSYS_USE_RCCL) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) -endif() - if(ROCPROFSYS_USE_ROCM) target_sources( rocprofiler-systems-object-library @@ -56,7 +50,6 @@ add_subdirectory(tracing) set(ndebug_sources ${CMAKE_CURRENT_LIST_DIR}/components/mpi_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/components/backtrace_metrics.cpp - ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp ${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp ${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp ${CMAKE_CURRENT_LIST_DIR}/ompt.cpp) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt index 977d302569..15940da05d 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/CMakeLists.txt @@ -30,15 +30,9 @@ set(component_headers ${CMAKE_CURRENT_LIST_DIR}/mpi_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/numa_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/vaapi_gotcha.hpp - ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_create_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/pthread_mutex_gotcha.hpp) target_sources(rocprofiler-systems-object-library PRIVATE ${component_sources} ${component_headers}) - -if(ROCPROFSYS_USE_RCCL) - target_sources(rocprofiler-systems-object-library - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rcclp.cpp) -endif() diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.cpp 
b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.cpp index f733f53f05..50b5b437e4 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.cpp @@ -298,6 +298,8 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sen #endif #if defined(ROCPROFSYS_USE_RCCL) +// Kept for reference, but now gathered through the SDK callbacks. + // ncclReduce void comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, @@ -403,6 +405,7 @@ comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const v } // ncclAllGather +// ncclAllToAll void comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.hpp index 037358623e..93dfac7521 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/comm_data.hpp @@ -26,7 +26,6 @@ #include "core/common.hpp" #include "core/components/fwd.hpp" #include "core/defines.hpp" -#include "core/rccl.hpp" #include "core/timemory.hpp" #include "library/components/category_region.hpp" @@ -78,18 +77,6 @@ struct comm_data : base static constexpr auto label = "MPI Comm Send"; }; - struct rccl_recv - { - static constexpr auto value = "comm_data"; - static constexpr auto label = "RCCL Comm Recv"; - }; - - struct rccl_send - { - static constexpr auto value = "comm_data"; - static constexpr auto label = "RCCL Comm Send"; - }; - ROCPROFSYS_DEFAULT_OBJECT(comm_data) static void preinit(); @@ -148,55 +135,6 @@ struct comm_data : base MPI_Datatype
recvtype, MPI_Comm); #endif -#if defined(ROCPROFSYS_USE_RCCL) - static auto rccl_type_size(ncclDataType_t datatype) - { - switch(datatype) - { - case ncclInt8: - case ncclUint8: return 1; - case ncclFloat16: return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: return 8; - default: return 0; - }; - } - - // ncclReduce - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, - ncclComm_t, hipStream_t); - - // ncclSend - // ncclGather - // ncclBcast - // ncclRecv - static void audit(const gotcha_data& _data, audit::incoming, const void*, - size_t count, ncclDataType_t datatype, int peer, ncclComm_t, - hipStream_t); - - // ncclBroadcast - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, int root, ncclComm_t, - hipStream_t); - - // ncclAllReduce - // ncclReduceScatter - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, - hipStream_t); - - // ncclAllGather - // ncclAlltoAll - static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, - size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t); - -#endif - private: static auto& add(tracker_t& _t, data_type value) { diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.cpp deleted file mode 100644 index 9b5f00d62d..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.cpp +++ /dev/null @@ -1,195 +0,0 @@ -// MIT License -// -// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "library/components/rcclp.hpp" -#include "library/rcclp.hpp" - -#include - -std::ostream& -operator<<(std::ostream& _os, const ncclUniqueId& _v) -{ - for(auto itr : _v.internal) - _os << itr; - return _os; -} - -namespace rocprofsys -{ -namespace component -{ -uint64_t -activate_rcclp() -{ - using handle_t = tim::component::rcclp_handle; - - static auto _handle = std::shared_ptr{}; - - if(!_handle.get()) - { - _handle = std::make_shared(); - _handle->start(); - - auto cleanup_functor = [=]() { - if(_handle) - { - _handle->stop(); - _handle.reset(); - } - }; - - std::stringstream ss; - ss << "timemory-rcclp-" << demangle() << "-" - << demangle(); - tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); - return 1; - } - return 0; -} -// -//======================================================================================// -// -uint64_t -deactivate_rcclp(uint64_t id) -{ - if(id > 0) - { - std::stringstream ss; - ss << "timemory-rcclp-" << demangle() << "-" - << demangle(); - tim::manager::instance()->cleanup(ss.str()); - return 0; - } - return 1; -} -// -//======================================================================================// -// -void -configure_rcclp(const std::set& permit, const std::set& reject) -{ - static bool is_initialized = false; - if(!is_initialized) - { - // generate the gotcha wrappers - rcclp_gotcha_t::get_initializer() = []() { - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast); - 
TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll); - TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum); - // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy); - }; - - // provide environment variable for suppressing wrappers - rcclp_gotcha_t::get_reject_list() = [reject]() { - auto _reject = reject; - // check environment - auto reject_list = - tim::get_env("ROCPROFSYS_RCCLP_REJECT_LIST", ""); - // add environment setting - for(const auto& itr : tim::delimit(reject_list)) - _reject.insert(itr); - return _reject; - }; - - // provide environment variable for selecting wrappers - rcclp_gotcha_t::get_permit_list() = [permit]() { - auto _permit = permit; - // check environment - auto permit_list = - tim::get_env("ROCPROFSYS_RCCLP_PERMIT_LIST", ""); - // add environment setting - for(const auto& itr : tim::delimit(permit_list)) - _permit.insert(itr); - return _permit; - }; - - is_initialized = true; - } -} - -void -rcclp_handle::start() -{ - if(get_tool_count()++ == 0) - { - get_tool_instance() = std::make_shared("timemory_rcclp"); - get_tool_instance()->start(); - } -} - -void -rcclp_handle::stop() -{ - auto idx = --get_tool_count(); - if(get_tool_instance().get()) - { - get_tool_instance()->stop(); - if(idx == 0) get_tool_instance().reset(); - } -} - -rcclp_handle::persistent_data& -rcclp_handle::get_persistent_data() -{ - static persistent_data 
_instance; - return _instance; -} - -std::atomic& -rcclp_handle::get_configured() -{ - return get_persistent_data().m_configured; -} - -rcclp_handle::toolset_ptr_t& -rcclp_handle::get_tool_instance() -{ - return get_persistent_data().m_tool; -} - -std::atomic& -rcclp_handle::get_tool_count() -{ - return get_persistent_data().m_count; -} -} // namespace component -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.hpp deleted file mode 100644 index 957dec1ca1..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/rcclp.hpp +++ /dev/null @@ -1,106 +0,0 @@ -// MIT License -// -// Copyright (c) 2020, The Regents of the University of California, -// through Lawrence Berkeley National Laboratory (subject to receipt of any -// required approvals from the U.S. Dept. of Energy). All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "core/rccl.hpp" -#include "core/timemory.hpp" -#include "library/components/category_region.hpp" -#include "library/components/comm_data.hpp" - -#include -#include - -#include -#include -#include -#include -#include -#include - -#if !defined(ROCPROFSYS_NUM_RCCLP_WRAPPERS) -# define ROCPROFSYS_NUM_RCCLP_WRAPPERS 25 -#endif - -ROCPROFSYS_COMPONENT_ALIAS( - rccl_toolset_t, - ::tim::component_bundle, - comm_data>) -ROCPROFSYS_COMPONENT_ALIAS(rcclp_gotcha_t, - ::tim::component::gotcha) - -#if !defined(ROCPROFSYS_USE_RCCL) -ROCPROFSYS_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) -#endif - -namespace rocprofsys -{ -namespace component -{ -uint64_t -activate_rcclp(); - -uint64_t -deactivate_rcclp(uint64_t id); - -void -configure_rcclp(const std::set& permit = {}, - const std::set& reject = {}); - -struct rcclp_handle : base -{ - static constexpr size_t rcclp_wrapper_count = ROCPROFSYS_NUM_RCCLP_WRAPPERS; - - using value_type = void; - using this_type = rcclp_handle; - using base_type = base; - - using rcclp_tuple_t = tim::component_tuple; - using toolset_ptr_t = std::shared_ptr; - - static std::string label() { return "rcclp_handle"; } - static std::string description() { return "Handle for activating NCCL wrappers"; } - static void get() {} - static void start(); - static void stop(); - static int get_count() { return get_tool_count().load(); } - -private: - struct persistent_data - { - std::atomic m_configured{ 0 }; - std::atomic m_count{ 0 }; - toolset_ptr_t m_tool = toolset_ptr_t{}; - }; - - static persistent_data& get_persistent_data(); - static std::atomic& 
get_configured(); - static toolset_ptr_t& get_tool_instance(); - static std::atomic& get_tool_count(); -}; -} // namespace component -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.cpp deleted file mode 100644 index 541f0575fb..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// MIT License -// -// Copyright (c) 2020, The Regents of the University of California, -// through Lawrence Berkeley National Laboratory (subject to receipt of any -// required approvals from the U.S. Dept. of Energy). All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#include "library/components/rcclp.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "core/dynamic_library.hpp" -#include "core/rccl.hpp" -#include "core/timemory.hpp" -#include "library/components/category_region.hpp" - -#include - -#include -#include -#include -#include -#include - -namespace -{ -uint64_t global_id = std::numeric_limits::max(); -} - -namespace rocprofsys -{ -namespace rcclp -{ -void -configure() -{} - -void -setup() -{ - configure(); - - // make sure the symbols are loaded to be wrapped - dynamic_library _librccl{ - "ROCPROFSYS_RCCL_LIBRARY", "librccl.so", RTLD_NOW | RTLD_GLOBAL, true, true, true - }; - - auto _use_data = tim::get_env("ROCPROFSYS_RCCLP_COMM_DATA", get_use_timemory()); - if(!get_use_timemory()) - { - trait::runtime_enabled::set(false); - trait::runtime_enabled::set(false); - } - else - { - trait::runtime_enabled::set(_use_data); - trait::runtime_enabled::set(_use_data); - } - - component::configure_rcclp(); - global_id = component::activate_rcclp(); -} - -void -shutdown() -{ - if(global_id < std::numeric_limits::max()) - component::deactivate_rcclp(global_id); -} -} // namespace rcclp -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index a13d024265..ffbe9592b3 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -34,6 +34,7 @@ #include "library/components/category_region.hpp" #include "library/rocprofiler-sdk/counters.hpp" #include "library/rocprofiler-sdk/fwd.hpp" +#include "library/rocprofiler-sdk/rccl.hpp" #include "library/thread_info.hpp" #include "library/tracing.hpp" @@ -482,6 +483,17 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, auto ts = rocprofiler_timestamp_t{}; 
ROCPROFILER_CALL(rocprofiler_get_timestamp(&ts)); + const char* name = nullptr; + rocprofiler_query_callback_tracing_kind_operation_name(record.kind, record.operation, + &name, nullptr); + + auto info = std::stringstream{}; + info << std::left << "tid=" << record.thread_id << ", cid=" << std::setw(3) + << record.correlation_id.internal << ", kind=" << std::setw(2) << record.kind + << ", operation=" << std::setw(3) << record.operation + << ", phase=" << record.phase << ", dt_nsec=" << std::setw(8) << ts + << ", name=" << name; + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) { user_data->value = ts; @@ -525,6 +537,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, break; } #endif + case ROCPROFILER_CALLBACK_TRACING_RCCL_API: + { + tool_tracing_callback_start(category::rocm_rccl_api{}, record, user_data, + ts); + break; + } case ROCPROFILER_CALLBACK_TRACING_NONE: case ROCPROFILER_CALLBACK_TRACING_LAST: case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API: @@ -533,7 +551,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: - case ROCPROFILER_CALLBACK_TRACING_RCCL_API: #if(ROCPROFILER_VERSION >= 600) case ROCPROFILER_CALLBACK_TRACING_OMPT: case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: @@ -616,6 +633,13 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, break; } #endif + case ROCPROFILER_CALLBACK_TRACING_RCCL_API: + { + tool_tracing_callback_rccl(record, user_data->value, ts); + tool_tracing_callback_stop(category::rocm_rccl_api{}, record, user_data, + ts, _bt_data); + break; + } case ROCPROFILER_CALLBACK_TRACING_NONE: case ROCPROFILER_CALLBACK_TRACING_LAST: case ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API: @@ -624,7 +648,6 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY: 
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: - case ROCPROFILER_CALLBACK_TRACING_RCCL_API: #if(ROCPROFILER_VERSION >= 600) case ROCPROFILER_CALLBACK_TRACING_OMPT: case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: @@ -651,6 +674,12 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, _data->dispatch_info.dispatch_id, timing_interval{ _data->start_timestamp, _data->end_timestamp }); } + else + { + ROCPROFSYS_WARNING_F( + 1, "tool_tracing_callback: unhandled PHASE_NONE callback record\n\t%s\n", + info.str().c_str()); + } } else { @@ -1025,13 +1054,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) { auto domains = settings::instance()->at("ROCPROFSYS_ROCM_DOMAINS"); - ROCPROFSYS_VERBOSE_F(1, "rocprof-sys ROCm Domains:\n"); + ROCPROFSYS_VERBOSE_F(1, "Available ROCm Domains:\n"); for(const auto& itr : domains->get_choices()) ROCPROFSYS_VERBOSE_F(1, "- %s\n", itr.c_str()); auto _callback_domains = rocprofiler_sdk::get_callback_domains(); auto _buffered_domain = rocprofiler_sdk::get_buffered_domains(); auto _counter_events = rocprofiler_sdk::get_rocm_events(); + auto _version = rocprofiler_sdk::get_version(); + ROCPROFSYS_WARNING_IF(_version.formatted == 0, + "Warning! 
rocprofiler-sdk version not initialized\n"); auto* _data = as_client_data(user_data); _data->client_fini = fini_func; @@ -1052,11 +1084,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API, ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, -#if(ROCPROFILER_VERSION >= 700) + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + ROCPROFILER_CALLBACK_TRACING_RCCL_API, +#if(ROCPROFILER_VERSION >= 600) ROCPROFILER_CALLBACK_TRACING_ROCDECODE_API, +#endif +#if(ROCPROFILER_VERSION >= 700) ROCPROFILER_CALLBACK_TRACING_ROCJPEG_API, #endif - ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API }) { if(_callback_domains.count(itr) > 0) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt index 97446e34c4..407037a67f 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/CMakeLists.txt @@ -1,9 +1,11 @@ # -set(rocprofiler_sdk_sources ${CMAKE_CURRENT_LIST_DIR}/counters.cpp - ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp) +set(rocprofiler_sdk_sources + ${CMAKE_CURRENT_LIST_DIR}/counters.cpp ${CMAKE_CURRENT_LIST_DIR}/fwd.cpp + ${CMAKE_CURRENT_LIST_DIR}/rccl.cpp) -set(rocprofiler_sdk_headers ${CMAKE_CURRENT_LIST_DIR}/counters.hpp - ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp) +set(rocprofiler_sdk_headers + ${CMAKE_CURRENT_LIST_DIR}/counters.hpp ${CMAKE_CURRENT_LIST_DIR}/fwd.hpp + ${CMAKE_CURRENT_LIST_DIR}/rccl.hpp) target_sources(rocprofiler-systems-object-library PRIVATE ${rocprofiler_sdk_sources} ${rocprofiler_sdk_headers}) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp new file mode 100644 index 
0000000000..7b2e04b254 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.cpp @@ -0,0 +1,182 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/rocprofiler-sdk/rccl.hpp" + +#include "core/config.hpp" +#include "core/debug.hpp" +#include "core/perfetto.hpp" + +#include "library/tracing.hpp" + +namespace rocprofsys +{ +namespace rocprofiler_sdk +{ +namespace +{ +struct rccl_recv +{ + static constexpr auto value = "comm_data"; + static constexpr auto label = "RCCL Comm Recv"; +}; + +struct rccl_send +{ + static constexpr auto value = "comm_data"; + static constexpr auto label = "RCCL Comm Send"; +}; + +template +void +write_perfetto_counter_track(uint64_t _val, uint64_t _begin_ts, uint64_t _end_ts) +{ + using counter_track = rocprofsys::perfetto_counter_track; + + if(rocprofsys::get_use_perfetto() && + rocprofsys::get_state() == rocprofsys::State::Active) + { + const size_t _idx = 0; + + if(!counter_track::exists(_idx)) + { + std::string _label = + (_idx > 0) ? JOIN(" ", Tp::label, JOIN("", '[', _idx, ']')) : Tp::label; + counter_track::emplace(_idx, _label, "bytes"); + } + + TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _begin_ts, _val); + TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _end_ts, 0); + } +} + +static auto +rccl_type_size(ncclDataType_t datatype) +{ + switch(datatype) + { + case ncclInt8: + case ncclUint8: return 1; + case ncclFloat16: return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: return 8; + default: + ROCPROFSYS_CI_ABORT(true, "Unsupported RCCL datatype: %i", datatype); + return 0; + }; +} + +} // namespace + +/* + * @brief RCCL callback tracing handler + * + * This function processes RCCL API calls and writes the data transfer size to + * the Perfetto counter track. + * + * @param record The tracing record containing the RCCL API call information. + * @param begin_ts The timestamp when the operation started. + * @param end_ts The timestamp when the operation ended. 
+ */ +void +tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record, + uint64_t begin_ts, uint64_t end_ts) +{ + if(record.kind == ROCPROFILER_CALLBACK_TRACING_RCCL_API) + { + auto* payload = + static_cast(record.payload); + + size_t size = 0; + bool is_send = false; + + auto set_recv = [&](size_t count, ncclDataType_t _dt) { + is_send = false; + size = count * rccl_type_size(_dt); + }; + + auto set_send = [&](size_t count, ncclDataType_t _dt) { + is_send = true; + size = count * rccl_type_size(_dt); + }; + + switch(record.operation) + { + // RCCL Data Receive + case ROCPROFILER_RCCL_API_ID_ncclAllGather: + set_recv(payload->args.ncclAllGather.sendcount, + payload->args.ncclAllGather.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclAllToAll: + set_recv(payload->args.ncclAllToAll.count, + payload->args.ncclAllToAll.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclAllReduce: + set_recv(payload->args.ncclAllReduce.count, + payload->args.ncclAllReduce.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclGather: + set_recv(payload->args.ncclGather.sendcount, + payload->args.ncclGather.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclRecv: + set_recv(payload->args.ncclRecv.count, payload->args.ncclRecv.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclReduce: + set_recv(payload->args.ncclReduce.count, + payload->args.ncclReduce.datatype); + break; + + // RCCL Data Send + case ROCPROFILER_RCCL_API_ID_ncclBroadcast: + set_send(payload->args.ncclBroadcast.count, + payload->args.ncclBroadcast.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclReduceScatter: + set_send(payload->args.ncclReduceScatter.recvcount, + payload->args.ncclReduceScatter.datatype); + break; + case ROCPROFILER_RCCL_API_ID_ncclSend: + set_send(payload->args.ncclSend.count, payload->args.ncclSend.datatype); + break; + + default: + // Skip other RCCL operations + break; + } + + if(config::get_use_perfetto() && size > 0) + { + if(is_send) + 
write_perfetto_counter_track(size, begin_ts, end_ts); + else + write_perfetto_counter_track(size, begin_ts, end_ts); + } + } +} + +} // namespace rocprofiler_sdk +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.hpp similarity index 67% rename from projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.hpp rename to projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.hpp index 0534336710..0a1ab3b9c5 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rcclp.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/rccl.hpp @@ -1,8 +1,6 @@ // MIT License // -// Copyright (c) 2020, The Regents of the University of California, -// through Lawrence Berkeley National Laboratory (subject to receipt of any -// required approvals from the U.S. Dept. of Energy). All rights reserved. +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,34 +22,22 @@ #pragma once -#include "core/defines.hpp" +#include +#include +// #include +// #include +// #include +#include +#include namespace rocprofsys { -namespace rcclp +namespace rocprofiler_sdk { void -configure(); +tool_tracing_callback_rccl(rocprofiler_callback_tracing_record_t record, + uint64_t begin_ts, uint64_t end_ts); -void -setup(); +} // namespace rocprofiler_sdk -void -shutdown(); - -#if !defined(ROCPROFSYS_USE_RCCL) || \ - (defined(ROCPROFSYS_USE_RCCL) && ROCPROFSYS_USE_RCCL == 0) -inline void -configure() -{} - -inline void -setup() -{} - -inline void -shutdown() -{} -#endif -} // namespace rcclp } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h deleted file mode 100644 index 5fb23b1ab7..0000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/tpls/rccl/rccl/rccl.h +++ /dev/null @@ -1,522 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_H_ -#define NCCL_H_ - -#include -#include - -#define NCCL_MAJOR 2 -#define NCCL_MINOR 11 -#define NCCL_PATCH 4 -#define NCCL_SUFFIX "" - -#define NCCL_VERSION_CODE 21104 -#define NCCL_VERSION(X, Y, Z) \ - (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z)) - -#define RCCL_BFLOAT16 1 -#define RCCL_GATHER_SCATTER 1 -#define RCCL_ALLTOALLV 1 - -#ifdef __cplusplus -extern "C" -{ -#endif - - /*! 
@brief Opaque handle to communicator */ - typedef struct ncclComm* ncclComm_t; - -#define NCCL_UNIQUE_ID_BYTES 128 - typedef struct - { - char internal[NCCL_UNIQUE_ID_BYTES]; - } ncclUniqueId; - - /*! @brief Error type */ - typedef enum - { - ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclNumResults = 6 - } ncclResult_t; - - /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. - * - * @details This integer is coded with the MAJOR, MINOR and PATCH level of the - * NCCL library - */ - ncclResult_t ncclGetVersion(int* version); - /// @cond include_hidden - ncclResult_t pncclGetVersion(int* version); - /// @endcond - - /*! @brief Generates an ID for ncclCommInitRank - - @details - Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be - called once and the Id should be distributed to all ranks in the - communicator before calling ncclCommInitRank. - - @param[in] - uniqueId ncclUniqueId* - pointer to uniqueId - - */ - ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); - /// @cond include_hidden - ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); - /// @endcond - - /*! @brief Creates a new communicator (multi thread/process version). - - @details - rank must be between 0 and nranks-1 and unique within a communicator clique. - Each rank is associated to a CUDA device, which has to be set before calling - ncclCommInitRank. - ncclCommInitRank implicitly syncronizes with other ranks, so it must be - called by different threads/processes or use ncclGroupStart/ncclGroupEnd. - - @param[in] - comm ncclComm_t* - communicator struct pointer - */ - ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, - int rank); - /// @cond include_hidden - ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, - int rank); - /// @endcond - - /*! 
@brief Creates a clique of communicators (single process version). - * - * @details This is a convenience function to create a single-process communicator - * clique. Returns an array of ndev newly initialized communicators in comm. comm - * should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is - * NULL, the first ndev HIP devices are used. Order of devlist defines user-order of - * processors within the communicator. - * */ - ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - /// @cond include_hidden - ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - /// @endcond - - /*! @brief Frees resources associated with communicator object, but waits for any - * operations that might still be running on the device */ - ncclResult_t ncclCommDestroy(ncclComm_t comm); - /// @cond include_hidden - ncclResult_t pncclCommDestroy(ncclComm_t comm); - /// @endcond - - /*! @brief Frees resources associated with communicator object and aborts any - * operations that might still be running on the device. */ - ncclResult_t ncclCommAbort(ncclComm_t comm); - /// @cond include_hidden - ncclResult_t pncclCommAbort(ncclComm_t comm); - /// @endcond - - /*! @brief Returns a human-readable error message. */ - const char* ncclGetErrorString(ncclResult_t result); - const char* pncclGetErrorString(ncclResult_t result); - - /*! @brief Checks whether the comm has encountered any asynchronous errors */ - ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); - /// @cond include_hidden - ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); - /// @endcond - - /*! @brief Gets the number of ranks in the communicator clique. */ - ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); - /// @cond include_hidden - ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); - /// @endcond - - /*! 
@brief Returns the rocm device number associated with the communicator. */ - ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); - /// @cond include_hidden - ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); - /// @endcond - - /*! @brief Returns the user-ordered "rank" associated with the communicator. */ - ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); - /// @cond include_hidden - ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); - /// @endcond - - /*! @brief Reduction operation selector */ - /* Reduction operation selector */ - typedef enum - { - ncclNumOps_dummy = 5 - } ncclRedOp_dummy_t; - typedef enum - { - ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) - } ncclRedOp_t; - - /*! @brief Data types */ - typedef enum - { - ncclInt8 = 0, - ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, - ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, - ncclHalf = 6, - ncclFloat32 = 7, - ncclFloat = 7, - ncclFloat64 = 8, - ncclDouble = 8, - ncclBfloat16 = 9, - ncclNumTypes = 10 - } ncclDataType_t; - - /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ - typedef enum - { - /* ncclScalarDevice: The scalar is in device-visible memory and will be - * dereferenced while the collective is running. 
*/ - ncclScalarDevice = 0, - - /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be - * dereferenced before the ncclRedOpCreate***() function returns. */ - ncclScalarHostImmediate = 1 - } ncclScalarResidence_t; - - /* - * ncclRedOpCreatePreMulSum - * - * Creates a new reduction operator which pre-multiplies input values by a given - * scalar locally before reducing them with peer values via summation. For use - * only with collectives launched against *comm* and *datatype*. The - * *residence* argument indicates how/when the memory pointed to by *scalar* - * will be dereferenced. Upon return, the newly created operator's handle - * is stored in *op*. - */ - ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - ncclDataType_t datatype, - ncclScalarResidence_t residence, - ncclComm_t comm); - ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - ncclDataType_t datatype, - ncclScalarResidence_t residence, - ncclComm_t comm); - - /* - * ncclRedOpDestroy - * - * Destroys the reduction operator *op*. The operator must have been created by - * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be - * destroyed as soon as the last NCCL function which is given that operator returns. - */ - ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); - ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); - - /* - * Collective communication operations - * - * Collective communication operations must be called separately for each - * communicator in a communicator clique. - * - * They return when operations have been enqueued on the CUDA stream. - * - * Since they may perform inter-CPU synchronization, each call has to be done - * from a different thread or process, or need to use Group Semantics (see - * below). - */ - - /*! - * @brief Reduce - * - * @details Reduces data arrays of length count in sendbuff into recvbuff using op - * operation. 
- * recvbuff may be NULL on all calls except for root device. - * root is the rank (not the CUDA device) where data will reside after the - * operation is complete. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, - ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief (deprecated) Broadcast (in-place) - * - * @details Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * This operation is implicitely in place. - */ - ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief Broadcast - * - * @details Copies count values from root to all other devices. - * root is the rank (not the HIP device) where data resides before the - * operation is started. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief All-Reduce - * - * @details Reduces data arrays of length count in sendbuff using op operation, and - * leaves identical copies of result on each recvbuff. 
- * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! - * @brief Reduce-Scatter - * - * @details Reduces data in sendbuff using op operation and leaves reduced result - * scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the result. - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ - ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, ncclRedOp_t op, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, - ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief All-Gather - * - * @details Each device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. 
- */ - ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief Send - * - * @details Send data from sendbuff to rank peer. - * Rank peer needs to call ncclRecv with the same datatype and the same count from - * this rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv - * operations need to progress concurrently to complete, they must be fused within a - * ncclGroupStart/ ncclGroupEnd section. - */ - ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief Receive - * - * @details Receive data from rank peer into recvbuff. - * Rank peer needs to call ncclSend with the same datatype and the same count to this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv - * operations need to progress concurrently to complete, they must be fused within a - * ncclGroupStart/ ncclGroupEnd section. - */ - ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, - int peer, ncclComm_t comm, hipStream_t stream); - /// @endcond - - /*! @brief Gather - * - * @details Root device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. 
- * - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. - */ - ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief Scatter - * - * @details Scattered over the devices so that recvbuff on rank i will contain the - * i-th block of the data on root. - * - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ - ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, int root, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief All-To-All - * - * @details Device (i) send (j)th block of data to device (j) and be placed as (i)th - * block. Each block for sending/receiving has count elements, which means - * that recvbuff and sendbuff should have a size of nranks*count elements. - * - * In-place operation will happen if sendbuff == recvbuff. 
- */ - ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /*! @brief All-To-Allv - * - * @details Device (i) sends sendcounts[j] of data from offset sdispls[j] - * to device (j). In the same time, device (i) receives recvcounts[j] of data - * from device (j) to be placed at rdispls[j]. - - * sendcounts, sdispls, recvcounts and rdispls are all measured in the units - * of datatype, not bytes. - * - * In-place operation will happen if sendbuff == recvbuff. - */ - ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], - const size_t sdispls[], void* recvbuff, - const size_t recvcounts[], const size_t rdispls[], - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @cond include_hidden - ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[], - const size_t sdispls[], void* recvbuff, - const size_t recvcounts[], const size_t rdispls[], - ncclDataType_t datatype, ncclComm_t comm, - hipStream_t stream); - /// @endcond - - /* - * Group semantics - * - * When managing multiple GPUs from a single thread, and since NCCL collective - * calls may perform inter-CPU synchronization, we need to "group" calls for - * different ranks/devices into a single call. - * - * Grouping NCCL calls as being part of the same collective operation is done - * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all - * collective calls until the ncclGroupEnd call, which will wait for all calls - * to be complete. Note that for collective communication, ncclGroupEnd only - * guarantees that the operations are enqueued on the streams, not that - * the operation is effectively done. 
- * - * Both collective communication and ncclCommInitRank can be used in conjunction - * of ncclGroupStart/ncclGroupEnd, but not together. - * - * Group semantics also allow to fuse multiple operations on the same device - * to improve performance (for aggregated collective calls), or to permit - * concurrent progress of multiple send/receive operations. - */ - - /*! @brief Group Start - * - * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into - * a single NCCL operation. Nothing will be started on the CUDA stream until - * ncclGroupEnd. - */ - ncclResult_t ncclGroupStart(); - /// @cond include_hidden - ncclResult_t pncclGroupStart(); - /// @endcond - - /*! @brief Group End - * - * End a group call. Start a fused NCCL operation consisting of all calls since - * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations - * need to be called after ncclGroupEnd. - */ - ncclResult_t ncclGroupEnd(); - /// @cond include_hidden - ncclResult_t pncclGroupEnd(); - /// @endcond - -#ifdef __cplusplus -} // end extern "C" -#endif - -#endif // end include guard diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-rccl-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-rccl-tests.cmake index ea17deccb2..60d5bf0035 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-rccl-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-rccl-tests.cmake @@ -30,12 +30,15 @@ foreach(_TARGET ${RCCL_TEST_TARGETS}) string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}") string(REPLACE "_" "-" _NAME "${_NAME}") rocprofiler_systems_add_test( + SKIP_RUNTIME NAME rccl-test-${_NAME} TARGET ${_TARGET} LABELS "rccl-tests;rcclp" MPI ON GPU ON NUM_PROCS 1 + SAMPLING_TIMEOUT 300 + REWRITE_TIMEOUT 300 REWRITE_ARGS -e -v diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake index a72ddaf857..15d4d42584 100644 --- 
a/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-testing.cmake @@ -166,9 +166,10 @@ set(_rccl_environment "ROCPROFSYS_PROFILE=ON" "ROCPROFSYS_USE_SAMPLING=OFF" "ROCPROFSYS_USE_PROCESS_SAMPLING=ON" - "ROCPROFSYS_USE_RCCLP=ON" "ROCPROFSYS_TIME_OUTPUT=OFF" "ROCPROFSYS_USE_PID=OFF" + "ROCPROFSYS_USE_RCCLP=ON" + "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy" "${_test_openmp_env}" "${_test_library_path}") @@ -466,8 +467,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) cmake_parse_arguments( TEST "SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME" - "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" "${_KWARGS}" - ${ARGN}) + "NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" + "${_KWARGS}" ${ARGN}) foreach(_PREFIX SAMPLING RUNTIME REWRITE REWRITE_RUN BASELINE) if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "")