15c82d6da8
## Motivation Enable UCX communication tracing and communication metadata ## Technical Details Implement UCX API wrappers to trace transport-layer communication. This adds communication data tracking and exposes “UCX Comm Send/Recv” timelines, enabling detailed analysis of MPI, OpenSHMEM, and other UCX-based runtime communication patterns. - Implements function interception for UCX functions across multiple categories using gotcha component. - Extended comm_data component to track UCX send/recv operations - Added ucx_send and ucx_recv labels for Perfetto counter tracks. Integrated UCX data tracking with existing MPI/RCCL tracking infrastructure. - Added ROCPROFSYS_USE_UCX configuration option (enabled by default). - Created FindUCX.cmake module for UCX header detection. Falls back to internal UCX headers if system headers not found. - Updated all Dockerfiles to include UCX dependencies.
115 linhas
4.6 KiB
C++
115 linhas
4.6 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
#pragma once
|
|
|
|
#include "core/common.hpp"
|
|
#include "core/defines.hpp"
|
|
#include "core/timemory.hpp"
|
|
#include "library/components/comm_data.hpp"
|
|
|
|
#include <timemory/components/base.hpp>
|
|
#include <timemory/components/gotcha/backends.hpp>
|
|
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <string>
|
|
#include <utility>
|
|
|
|
namespace rocprofsys
|
|
{
|
|
namespace component
|
|
{
|
|
struct ucx_gotcha : tim::component::base<ucx_gotcha, void>
|
|
{
|
|
static constexpr size_t gotcha_capacity = 100;
|
|
|
|
using gotcha_data = tim::component::gotcha_data;
|
|
|
|
ROCPROFSYS_DEFAULT_OBJECT(ucx_gotcha)
|
|
|
|
// string id for component
|
|
static std::string label() { return "ucx_gotcha"; }
|
|
|
|
// generate the gotcha wrappers
|
|
static void configure();
|
|
static void shutdown();
|
|
|
|
static void start();
|
|
static void stop();
|
|
|
|
// Generic template audit function for UCX operations with void* parameters
|
|
template <typename... Args>
|
|
static void audit(const gotcha_data& _data, audit::incoming, Args...)
|
|
{
|
|
category_region<category::ucx>::start(std::string_view{ _data.tool_id });
|
|
}
|
|
|
|
public:
|
|
// Specific audit functions for tag operations (with uint64_t tags)
|
|
// ucp_tag_send_nbx: (void* ep, const void* buffer, size_t count, uint64_t tag, const
|
|
// void* param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, const void*, size_t,
|
|
uint64_t, const void*);
|
|
// ucp_tag_recv_nbx: (void* worker, void* buffer, size_t count, uint64_t tag, uint64_t
|
|
// tag_mask, const void* param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, void*, size_t, uint64_t,
|
|
uint64_t, const void*);
|
|
|
|
// RMA operations
|
|
// ucp_put_nbx: (void* ep, const void* buffer, size_t count, uint64_t remote_addr,
|
|
// void* rkey, const void* param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, const void*, size_t,
|
|
uint64_t, void*, const void*);
|
|
// ucp_get_nbx: (void* ep, void* buffer, size_t count, uint64_t remote_addr, void*
|
|
// rkey, const void* param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, void*, size_t, uint64_t,
|
|
void*, const void*);
|
|
|
|
// Active message send
|
|
// ucp_am_send_nbx: (void* ep, unsigned id, const void* header, size_t header_length,
|
|
// const void* buffer, size_t count, const void* param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, unsigned, const void*,
|
|
size_t, const void*, size_t, const void*);
|
|
|
|
// Stream operations
|
|
// ucp_stream_send_nbx: (void* ep, const void* buffer, size_t count, const void*
|
|
// param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, const void*, size_t,
|
|
const void*);
|
|
// ucp_stream_recv_nbx: (void* ep, void* buffer, size_t count, size_t* length, const
|
|
// void* param)
|
|
static void audit(const gotcha_data&, audit::incoming, void*, void*, size_t, size_t*,
|
|
const void*);
|
|
|
|
// Outgoing audit for return values
|
|
static void audit(const gotcha_data&, audit::outgoing, void*);
|
|
static void audit(const gotcha_data&, audit::outgoing, int);
|
|
};
|
|
} // namespace component
|
|
|
|
using ucx_bundle_t =
|
|
tim::component_bundle<category::ucx, component::ucx_gotcha, component::comm_data>;
|
|
using ucx_gotcha_t = tim::component::gotcha<component::ucx_gotcha::gotcha_capacity,
|
|
ucx_bundle_t, category::ucx>;
|
|
} // namespace rocprofsys
|