SWDEV-492625 memory free functions (#11)

* SWDEV-492625: Track free memory HSA functions to help determine total amount of memory allocated on the system at any one time

* Minor fixes to address comments

* Update allocation size description

* Moved get function back to specialization, minor typo fixes

* Removed memory_operation_type field, removed memory_pool allocation enum, converted starting address to hex string for json format.

* Made conversion to hex_string a function, changed address to use union rocprofiler_address_t type, changed VMEM descriptors

* Removed as_hex from the global namespace

* Formatting

* Removed TRACK_EVENT for memory allocation, now TRACK_COUNTER for memory allocation is being performed

* Check if address was recorded before retrieving allocation size in generate Perfetto

* Formatting

* Update source/lib/output/generatePerfetto.cpp

* Explicitly disable app-abort tests

* Remove excluding app-abort test from workflow CI

- redundant bc these tests are explicitly marked as disabled now

---------

Co-authored-by: Madsen, Jonathan <Jonathan.Madsen@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>

[ROCm/rocprofiler-sdk commit: 79006bb896]
Этот коммит содержится в:
Trowbridge, Ian
2024-12-06 00:05:30 -06:00
коммит произвёл GitHub
родитель a79f8a0198
Коммит 792329fefd
23 изменённых файлов: 564 добавлений и 248 удалений
+4 -4
Просмотреть файл
@@ -23,10 +23,10 @@ env:
ROCM_PATH: "/opt/rocm"
GPU_TARGETS: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102"
PATH: "/usr/bin:$PATH"
navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
navi3_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$"
vega20_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$"
mi200_EXCLUDE_LABEL_REGEX: "^(openmp-target)$"
+7 -4
Просмотреть файл
@@ -1,4 +1,7 @@
"Kind","Operation","Agent_Id","Allocation_Size","Starting_Address","Correlation_Id","Start_Timestamp","End_Timestamp"
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497356288,1,65788054621500,65788055678893
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497348096,1,65788055691832,65788056666844
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497339904,1,65788056672061,65788057643457
"Kind","Operation","Agent_Id","Allocation_Size","Address","Correlation_Id","Start_Timestamp","End_Timestamp"
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,11,3721742710532634,3721742710584854
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,12,3721742710596404,3721742710933366
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,13,3721742710941416,3721742710960916
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,14,3721742710967236,3721742711197647
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,15,3721742711204077,3721742711219717
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,16,3721742711225857,3721742711466018
1 Kind Operation Agent_Id Allocation_Size Starting_Address Address Correlation_Id Start_Timestamp End_Timestamp
2 MEMORY_ALLOCATION MEMORY_ALLOCATION_ALLOCATE 0 1024 140341497356288 0x7fb2d0005000 1 11 65788054621500 3721742710532634 65788055678893 3721742710584854
3 MEMORY_ALLOCATION MEMORY_ALLOCATION_ALLOCATE MEMORY_ALLOCATION_FREE 0 1024 0 140341497348096 0x7fb2d0005000 1 12 65788055691832 3721742710596404 65788056666844 3721742710933366
4 MEMORY_ALLOCATION MEMORY_ALLOCATION_ALLOCATE 0 1024 140341497339904 0x7fb2d0005000 1 13 65788056672061 3721742710941416 65788057643457 3721742710960916
5 MEMORY_ALLOCATION MEMORY_ALLOCATION_FREE 0 0 0x7fb2d0005000 14 3721742710967236 3721742711197647
6 MEMORY_ALLOCATION MEMORY_ALLOCATION_ALLOCATE 0 1024 0x7fb2d0005000 15 3721742711204077 3721742711219717
7 MEMORY_ALLOCATION MEMORY_ALLOCATION_FREE 0 0 0x7fb2d0005000 16 3721742711225857 3721742711466018
+1 -1
Просмотреть файл
@@ -1318,5 +1318,5 @@ Properties
- **`thread_id`** *(integer, required)*: Thread ID.
- **`agent_id`** *(object, required)*: Agent ID.
- **`handle`** *(integer, required)*: Handle of the agent.
- **`starting_address`** *(string, required)*: Starting address of allocation.
- **`address`** *(string, required)*: Starting address of allocation.
- **`allocation_size`** *(integer, required)*: Size of allocation.
+5 -5
Просмотреть файл
@@ -1566,13 +1566,13 @@
"handle"
]
},
"starting_address": {
"type": "integer",
"description": "Starting address of allocation"
"address": {
"type": "string",
"description": "Starting address for allocation or freeing memory."
},
"allocation_size": {
"type": "integer",
"description": "allocation_size"
"description": "Size of memory allocation. Free operations not tracked currently."
}
},
"required": [
@@ -1584,7 +1584,7 @@
"end_timestamp",
"thread_id",
"agent_id",
"starting_address",
"address",
"allocation_size"
]
}
+3 -3
Просмотреть файл
@@ -217,9 +217,9 @@ typedef struct
rocprofiler_thread_id_t thread_id; ///< id for thread that triggered copy
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation
uint64_t starting_address; ///< starting address for memory allocation
uint64_t allocation_size; ///< size for memory allocation
rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation
rocprofiler_address_t address; ///< starting address for memory allocation
uint64_t allocation_size; ///< size for memory allocation
/// @var kind
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION
/// @var operation
+7 -7
Просмотреть файл
@@ -211,16 +211,16 @@ typedef struct
} rocprofiler_callback_tracing_memory_copy_data_t;
/**
* @brief ROCProfiler Memory Copy Allocation Tracer Record.
* @brief ROCProfiler Memory Allocation Tracer Record.
*/
typedef struct
{
uint64_t size; ///< size of this struct
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation
uint64_t starting_address; ///< starting address for memory allocation
uint64_t allocation_size; ///< size of memory allocation
uint64_t size; ///< size of this struct
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation
rocprofiler_address_t address; ///< starting address for memory allocation
uint64_t allocation_size; ///< size of memory allocation
} rocprofiler_callback_tracing_memory_allocation_data_t;
/**
+1 -1
Просмотреть файл
@@ -4,7 +4,7 @@
#
#
set(ROCPROFILER_CXX_HEADER_FILES hash.hpp name_info.hpp operators.hpp perfetto.hpp
serialization.hpp)
utility.hpp serialization.hpp)
install(
FILES ${ROCPROFILER_CXX_HEADER_FILES}
+5 -2
Просмотреть файл
@@ -30,6 +30,8 @@
#include <rocprofiler-sdk/internal_threading.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <rocprofiler-sdk/cxx/name_info.hpp>
#include <rocprofiler-sdk/cxx/perfetto.hpp>
#include <rocprofiler-sdk/cxx/utility.hpp>
#include <cereal/archives/binary.hpp>
#include <cereal/archives/json.hpp>
@@ -64,6 +66,7 @@
#define ROCP_SDK_SAVE_DATA_FIELD(FIELD) ar(make_nvp(#FIELD, data.FIELD))
#define ROCP_SDK_SAVE_DATA_VALUE(NAME, VALUE) ar(make_nvp(NAME, data.VALUE))
#define ROCP_SDK_SAVE_VALUE(NAME, VALUE) ar(make_nvp(NAME, VALUE))
#define ROCP_SDK_SAVE_DATA_CSTR(FIELD) \
ar(make_nvp(#FIELD, std::string{data.FIELD ? data.FIELD : ""}))
#define ROCP_SDK_SAVE_DATA_BITFIELD(NAME, VALUE) \
@@ -338,7 +341,7 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_memory_allocation_data_t data)
ROCP_SDK_SAVE_DATA_FIELD(start_timestamp);
ROCP_SDK_SAVE_DATA_FIELD(end_timestamp);
ROCP_SDK_SAVE_DATA_FIELD(agent_id);
ROCP_SDK_SAVE_DATA_FIELD(starting_address);
ROCP_SDK_SAVE_VALUE("address", rocprofiler::sdk::utility::as_hex(data.address.value, 16));
ROCP_SDK_SAVE_DATA_FIELD(allocation_size);
}
@@ -543,7 +546,7 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_allocation_record_t data)
ROCP_SDK_SAVE_DATA_FIELD(start_timestamp);
ROCP_SDK_SAVE_DATA_FIELD(end_timestamp);
ROCP_SDK_SAVE_DATA_FIELD(agent_id);
ROCP_SDK_SAVE_DATA_FIELD(starting_address);
ROCP_SDK_SAVE_VALUE("address", rocprofiler::sdk::utility::as_hex(data.address.value, 16));
ROCP_SDK_SAVE_DATA_FIELD(allocation_size);
}
+54
Просмотреть файл
@@ -0,0 +1,54 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
#pragma once
#include <iomanip>
#include <sstream>
namespace rocprofiler
{
namespace sdk
{
namespace utility
{
/// @brief Format a value as a "0x"-prefixed hexadecimal string.
///
/// The value is streamed with `std::hex`, so integral types are rendered as
/// lowercase hex digits. `width` is the minimum number of characters emitted
/// for the value itself (the "0x" prefix is not counted); shorter values are
/// left-padded with '0'. Values needing more characters than `width` are
/// printed in full, never truncated.
///
/// @tparam Tp    Type of the value; must be streamable to a std::ostream.
/// @param  val   Value to format.
/// @param  width Minimum field width for the digits (0 = no padding).
/// @return The formatted string, e.g. `_as_hex(uint64_t{255}, 4)` yields "0x00ff".
template <typename Tp>
auto
_as_hex(Tp val, size_t width = 0)
{
    auto ss = std::stringstream{};
    // std::setw takes an int: convert explicitly rather than relying on an
    // implicit size_t -> int narrowing, which warns under -Wconversion.
    ss << "0x" << std::hex << std::setw(static_cast<int>(width)) << std::setfill('0') << val;
    return ss.str();
}

// Emits a non-template as_hex(TYPE, width) overload that forwards to _as_hex.
// Overloads are generated per type so that only explicitly listed types are
// accepted by as_hex.
#define ROCPROFILER_CXX_DEFINE_AS_HEX(TYPE)                                                        \
    inline auto as_hex(TYPE val, size_t width = 0)                                                 \
    {                                                                                              \
        return ::rocprofiler::sdk::utility::_as_hex(val, width);                                   \
    }

ROCPROFILER_CXX_DEFINE_AS_HEX(uint64_t)

#undef ROCPROFILER_CXX_DEFINE_AS_HEX
}  // namespace utility
}  // namespace sdk
}  // namespace rocprofiler
+15 -9
Просмотреть файл
@@ -238,10 +238,11 @@ typedef enum // NOLINT(performance-enum-size)
*/
typedef enum // NOLINT(performance-enum-size)
{
ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function
ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function
ROCPROFILER_MEMORY_ALLOCATION_MEMORY_POOL_ALLOCATE, ///< Allocate memory pool
ROCPROFILER_MEMORY_ALLOCATION_VMEM_HANDLE_CREATE, ///< Allocate vmem memory handle
ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function
ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function
ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE, ///< Allocate vmem memory handle
ROCPROFILER_MEMORY_ALLOCATION_FREE, ///< Free memory function
ROCPROFILER_MEMORY_ALLOCATION_VMEM_FREE, ///< Release vmem memory handle
ROCPROFILER_MEMORY_ALLOCATION_LAST,
} rocprofiler_memory_allocation_operation_t;
@@ -458,11 +459,6 @@ typedef enum
*/
typedef uint64_t rocprofiler_timestamp_t;
/**
* @brief ROCProfiler Address.
*/
typedef uint64_t rocprofiler_address_t;
/**
* @brief Thread ID. Value will be equivalent to `syscall(__NR_gettid)`
*/
@@ -519,6 +515,16 @@ typedef union rocprofiler_user_data_t
void* ptr; ///< usage example: set to address of data allocation
} rocprofiler_user_data_t;
/**
 * @brief Stores a memory address for profiling.
 *
 * Both members share the same storage, so the address can be written through
 * one member and read through the other: use @c value when the address is
 * needed as an integer (e.g. for formatting or hashing) and @c ptr when it is
 * handled as a generic pointer.
 */
typedef union rocprofiler_address_t
{
uint64_t value; ///< address as an unsigned 64-bit integer
void* ptr; ///< address as a generic (untyped) pointer
} rocprofiler_address_t;
//--------------------------------------------------------------------------------------//
//
// STRUCTS
+13 -3
Просмотреть файл
@@ -33,6 +33,7 @@
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/marker/api_id.h>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <rocprofiler-sdk/cxx/utility.hpp>
#include <unistd.h>
#include <cstdint>
@@ -456,7 +457,7 @@ generate_csv(const output_config&
"Operation",
"Agent_Id",
"Allocation_Size",
"Starting_Address",
"Address",
"Correlation_Id",
"Start_Timestamp",
"End_Timestamp"}};
@@ -464,15 +465,24 @@ generate_csv(const output_config&
{
for(auto record : data.get(ditr))
{
uint64_t agent_info{0};
// Free functions currently do not track agent information. Only set it on allocation
// operations, otherwise set it to 0 currently
if(record.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE ||
record.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE)
{
agent_info = tool_metadata.get_node_id(record.agent_id);
}
auto api_name = tool_metadata.get_operation_name(record.kind, record.operation);
auto row_ss = std::stringstream{};
rocprofiler::tool::csv::memory_allocation_csv_encoder::write_row(
row_ss,
tool_metadata.get_kind_name(record.kind),
api_name,
tool_metadata.get_node_id(record.agent_id),
agent_info,
record.allocation_size,
record.starting_address,
rocprofiler::sdk::utility::as_hex(record.address.value, 16),
record.correlation_id.internal,
record.start_timestamp,
record.end_timestamp);
+21 -7
Просмотреть файл
@@ -486,17 +486,25 @@ write_otf2(
{
for(auto& [agent, evt] : itr)
{
const auto* _agent = _get_agent(agent);
auto _type_name = std::string_view{"UNK"};
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
// Free functions do not track agent information. Below handles case where
// null rocprof agent id is passed to generate OTF2
constexpr auto null_rocp_agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
const rocprofiler_agent_t* _agent = nullptr;
if(agent != null_rocp_agent_id)
{
_agent = _get_agent(agent);
}
auto _type_name = std::string_view{"UNK"};
if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_type_name = "CPU";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_type_name = "GPU";
evt.name = fmt::format("Thread {}, Memory Allocation at {} {}",
evt.name = fmt::format("Thread {}, Memory Operation at {} {}",
tid,
_type_name,
_agent->logical_node_type_id);
_agent == nullptr ? 0 : _agent->logical_node_type_id);
}
}
@@ -860,6 +868,12 @@ write_otf2(
for(auto& [agent, evt] : itr)
{
auto _hash = get_hash_id(evt.name);
// Using max numeric limits results in an out-of-bound runtime error for OTF2
// and perfetto for agent ids. Setting handle to 0 for free functions.
constexpr auto null_rocp_agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
auto handle = agent.handle;
if(agent == null_rocp_agent_id) handle = 0;
add_write_string(_hash, evt.name);
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
@@ -867,7 +881,7 @@ write_otf2(
_hash,
OTF2_LOCATION_TYPE_ACCELERATOR_STREAM,
2 * evt.event_count, // # events
agent.handle // location group
handle // location group
));
}
}
+93 -68
Просмотреть файл
@@ -241,33 +241,6 @@ write_perfetto(
}
}
for(const auto& itr : agent_thread_ids_alloc)
{
const auto* _agent = _get_agent(itr.first);
for(auto titr : itr.second)
{
auto _namess = std::stringstream{};
_namess << "MEMORY ALLOCATION on AGENT [" << _agent->logical_node_id << "] THREAD ["
<< thread_indexes.at(titr) << "] ";
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << "(CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << "(GPU)";
else
_namess << "(UNK)";
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_thread_tracks_alloc[itr.first].emplace(titr, _track);
}
}
for(const auto& aitr : agent_queue_ids)
{
uint32_t nqueue = 0;
@@ -463,47 +436,6 @@ write_perfetto(
tracing_session->FlushBlocking();
}
for(auto ditr : memory_allocation_gen)
for(auto itr : memory_allocation_gen.get(ditr))
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = agent_thread_tracks_alloc.at(itr.agent_id).at(itr.thread_id);
std::stringstream hex_stream;
hex_stream << "0x" << std::hex << std::setw(16) << std::setfill('0')
<< itr.starting_address;
std::string hex_starting_address(hex_stream.str());
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_allocation>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"end_ns",
itr.end_timestamp,
"delta_ns",
(itr.end_timestamp - itr.start_timestamp),
"kind",
itr.kind,
"operation",
itr.operation,
"agent",
agents_map.at(itr.agent_id).logical_node_id,
"allocation_size",
itr.allocation_size,
"starting_address",
hex_starting_address,
"corr_id",
itr.correlation_id.internal,
"tid",
itr.thread_id);
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_allocation>::name,
track,
itr.end_timestamp);
tracing_session->FlushBlocking();
}
for(auto ditr : kernel_dispatch_gen)
for(auto itr : kernel_dispatch_gen.get(ditr))
{
@@ -634,6 +566,99 @@ write_perfetto(
tracing_session->FlushBlocking();
}
}
// memory allocation counter track
//
// First pass over the memory allocation records. For every record this
//   - registers the per-agent timestamps ("endpoints") at which the counter
//     track will later be sampled (all initialised to 0 bytes),
//   - widens the global [earliest start, latest end] window, and
//   - remembers the size of each allocation by address so that later records
//     for the same address (e.g. frees, which report size 0) can look it up.
auto mem_alloc_endpoints = std::map<rocprofiler_agent_id_t, std::map<uint64_t, uint64_t>>{};
// Seed with {max, min} so the first record replaces both bounds. Seeding with
// {0, 0} would pin the lower bound at 0 and make the later
// "extremes.first - 5000" padding wrap around to a timestamp near 2^64.
auto mem_alloc_extremes = std::pair<uint64_t, uint64_t>{std::numeric_limits<uint64_t>::max(),
                                                        std::numeric_limits<uint64_t>::min()};
auto address_to_size    = std::unordered_map<uint64_t, uint64_t>{};
for(auto ditr : memory_allocation_gen)
    for(auto itr : memory_allocation_gen.get(ditr))
    {
        // Midpoint of the operation, computed in integer arithmetic so that
        // nanosecond timestamps are not rounded through a double.
        const uint64_t _mean_timestamp =
            itr.start_timestamp + ((itr.end_timestamp - itr.start_timestamp) / 2);

        // Sample points: just before, at start, midpoint, at end, just after.
        auto& _endpoints = mem_alloc_endpoints[itr.agent_id];
        _endpoints.emplace(itr.start_timestamp - 1000, 0);
        _endpoints.emplace(itr.start_timestamp, 0);
        _endpoints.emplace(_mean_timestamp, 0);
        _endpoints.emplace(itr.end_timestamp, 0);
        _endpoints.emplace(itr.end_timestamp + 1000, 0);

        mem_alloc_extremes.first  = std::min(mem_alloc_extremes.first, itr.start_timestamp);
        mem_alloc_extremes.second = std::max(mem_alloc_extremes.second, itr.end_timestamp);

        // Only allocation operations carry a size.
        // NOTE(review): emplace keeps the first size seen for an address; if an
        // address is reused with a different size the earlier size is retained.
        if(itr.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE ||
           itr.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE)
        {
            address_to_size.emplace(itr.address.value, itr.allocation_size);
        }
    }
for(auto ditr : memory_allocation_gen)
for(auto itr : memory_allocation_gen.get(ditr))
{
auto alloc_beg =
mem_alloc_endpoints.at(itr.agent_id).lower_bound(itr.start_timestamp);
auto alloc_end =
mem_alloc_endpoints.at(itr.agent_id).upper_bound(itr.end_timestamp);
LOG_IF(FATAL, alloc_beg == alloc_end)
<< "Missing range for timestamp [" << itr.start_timestamp << ", "
<< itr.end_timestamp << "]";
for(auto alloc_itr = alloc_beg; alloc_itr != alloc_end; ++alloc_itr)
{
if(address_to_size.count(itr.address.value) > 0)
{
alloc_itr->second += address_to_size.at(itr.address.value);
}
}
}
auto mem_alloc_tracks =
std::unordered_map<rocprofiler_agent_id_t, ::perfetto::CounterTrack>{};
auto mem_alloc_cnt_names = std::vector<std::string>{};
constexpr auto null_rocp_agent_id =
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
mem_alloc_cnt_names.reserve(mem_alloc_endpoints.size());
for(auto& alloc_itr : mem_alloc_endpoints)
{
mem_alloc_endpoints[alloc_itr.first].emplace(mem_alloc_extremes.first - 5000, 0);
mem_alloc_endpoints[alloc_itr.first].emplace(mem_alloc_extremes.second + 5000, 0);
auto _track_name = std::stringstream{};
const rocprofiler_agent_t* _agent = nullptr;
if(alloc_itr.first != null_rocp_agent_id)
{
_agent = _get_agent(alloc_itr.first);
}
if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_track_name << "ALLOCATE BYTES on AGENT [" << _agent->logical_node_id << "] (CPU)";
else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_track_name << "ALLOCATE BYTES on AGENT [" << _agent->logical_node_id << "] (GPU)";
else
_track_name << "FREE BYTES";
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = mem_alloc_cnt_names.emplace_back(_track_name.str());
mem_alloc_tracks.emplace(alloc_itr.first,
::perfetto::CounterTrack{_name.c_str()}
.set_unit(_unit)
.set_unit_multiplier(bytes_multiplier)
.set_is_incremental(false));
}
for(auto& alloc_itr : mem_alloc_endpoints)
{
for(auto itr : alloc_itr.second)
{
TRACE_COUNTER(sdk::perfetto_category<sdk::category::memory_allocation>::name,
mem_alloc_tracks.at(alloc_itr.first),
itr.first,
itr.second / bytes_multiplier);
tracing_session->FlushBlocking();
}
}
}
::perfetto::TrackEvent::Flush();
+3
Просмотреть файл
@@ -481,6 +481,9 @@ HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, hsa_memory_allocate, hsa_memory_allocate_fn)
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate_fn)
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create_fn)
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_free, hsa_memory_free, hsa_memory_free_fn)
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_free, hsa_amd_memory_pool_free, hsa_amd_memory_pool_free_fn)
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_release, hsa_amd_vmem_handle_release, hsa_amd_vmem_handle_release_fn)
// clang-format on
#else
+260 -50
Просмотреть файл
@@ -68,49 +68,125 @@ using memory_pool_to_agent_map = std::unordered_map<hsa_amd_memory_pool_t, rocpr
using region_to_agent_pair = std::pair<region_to_agent_map*, rocprofiler_agent_id_t>;
using map_pool_to_agent_pair = std::pair<memory_pool_to_agent_map*, rocprofiler_agent_id_t>;
// Forward declaration of the wrapper used for the tracked allocation functions.
// Declared ahead of its definition so that the memory_allocation_info
// specializations below can take its address.
// TableIdx/OpIdx select the HSA API table and the function within it; Args are
// the wrapped function's parameter types.
template <size_t TableIdx, size_t OpIdx, typename... Args>
hsa_status_t
memory_allocation_impl(Args... args);
// Forward declaration of the wrapper used for the tracked free/release
// functions; same template parameters as memory_allocation_impl.
template <size_t TableIdx, size_t OpIdx, typename... Args>
hsa_status_t
memory_free_impl(Args... args);
// File-local selector identifying which wrapped HSA memory function is being
// handled. It keys the memory_allocation_info specializations and is distinct
// from the public rocprofiler_memory_allocation_operation_t values.
enum hsa_memory_operation_functions_t
{
    HSA_NONE = 0,                  // unknown / unspecified memory function
    HSA_MEMORY_ALLOCATE,           // memory allocate
    HSA_AMD_MEMORY_POOL_ALLOCATE,  // memory pool allocate
    HSA_AMD_VMEM_ALLOCATE,         // vmem handle allocate
    HSA_MEMORY_FREE,               // memory free
    HSA_AMD_MEMORY_POOL_FREE,      // memory pool free
    HSA_AMD_VMEM_FREE,             // vmem handle release
    HSA_LAST,                      // one past the last valid entry
};
// Set up information to identify agent from regions/pool
template <size_t OpIdx>
struct memory_allocation_info;
#define SPECIALIZE_MEMORY_ALLOCATION_INFO(FUNCTION, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC) \
#define SPECIALIZE_MEMORY_ALLOCATION_INFO( \
FUNCTION, ENUM, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC, IMPLEMENTATION) \
template <> \
struct memory_allocation_info<ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION> \
struct memory_allocation_info<FUNCTION> \
{ \
using maptype = MAPTYPE; \
using pairtype = PAIRTYPE; \
using searchtype = SEARCHTYPE; \
auto& operator()() const { return ITERATEFUNC; } \
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \
static constexpr auto name = "MEMORY_ALLOCATION_" #FUNCTION; \
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##ENUM; \
static constexpr auto name = "MEMORY_ALLOCATION_" #ENUM; \
\
template <size_t TableIdx, size_t OpIdx, typename RetT, typename... Args> \
static auto get_memory_allocation_impl(RetT (*)(Args...)) \
{ \
return &IMPLEMENTATION<TableIdx, OpIdx, Args...>; \
} \
};
SPECIALIZE_MEMORY_ALLOCATION_INFO(NONE,
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_NONE,
NONE,
region_to_agent_map,
region_to_agent_pair,
hsa_region_t,
get_core_table()->hsa_agent_iterate_regions_fn)
SPECIALIZE_MEMORY_ALLOCATION_INFO(ALLOCATE,
get_core_table()->hsa_agent_iterate_regions_fn,
memory_allocation_impl)
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_MEMORY_ALLOCATE,
ALLOCATE,
region_to_agent_map,
region_to_agent_pair,
hsa_region_t,
get_core_table()->hsa_agent_iterate_regions_fn)
SPECIALIZE_MEMORY_ALLOCATION_INFO(MEMORY_POOL_ALLOCATE,
get_core_table()->hsa_agent_iterate_regions_fn,
memory_allocation_impl)
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_MEMORY_POOL_ALLOCATE,
ALLOCATE,
memory_pool_to_agent_map,
map_pool_to_agent_pair,
hsa_amd_memory_pool_t,
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn)
SPECIALIZE_MEMORY_ALLOCATION_INFO(VMEM_HANDLE_CREATE,
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
memory_allocation_impl)
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_VMEM_ALLOCATE,
VMEM_ALLOCATE,
memory_pool_to_agent_map,
map_pool_to_agent_pair,
hsa_amd_memory_pool_t,
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn)
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
memory_allocation_impl)
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_MEMORY_FREE,
FREE,
region_to_agent_map,
region_to_agent_pair,
hsa_region_t,
get_core_table()->hsa_agent_iterate_regions_fn,
memory_free_impl)
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_MEMORY_POOL_FREE,
FREE,
memory_pool_to_agent_map,
map_pool_to_agent_pair,
hsa_amd_memory_pool_t,
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
memory_free_impl)
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_VMEM_FREE,
VMEM_FREE,
memory_pool_to_agent_map,
map_pool_to_agent_pair,
hsa_amd_memory_pool_t,
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
memory_free_impl)
#undef SPECIALIZE_MEMORY_ALLOCATION_INFO
// Compile-time table from a public rocprofiler_memory_allocation_operation_t
// value to its printable name. Each specialization exposes:
//   operation_idx - the ROCPROFILER_MEMORY_ALLOCATION_<OP> enumerator
//   name          - the string "MEMORY_ALLOCATION_<OP>"
template <size_t OpIdx>
struct memory_allocation_name;

// Generates one memory_allocation_name specialization per public operation.
#define MEMORY_ALLOCATION_NAME(OP)                                                                 \
    template <>                                                                                    \
    struct memory_allocation_name<ROCPROFILER_MEMORY_ALLOCATION_##OP>                              \
    {                                                                                              \
        static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##OP;                  \
        static constexpr auto name          = "MEMORY_ALLOCATION_" #OP;                            \
    };

MEMORY_ALLOCATION_NAME(NONE)
MEMORY_ALLOCATION_NAME(ALLOCATE)
MEMORY_ALLOCATION_NAME(VMEM_ALLOCATE)
MEMORY_ALLOCATION_NAME(FREE)
MEMORY_ALLOCATION_NAME(VMEM_FREE)

#undef MEMORY_ALLOCATION_NAME
template <size_t Idx, size_t... IdxTail>
const char*
name_by_id(const uint32_t id, std::index_sequence<Idx, IdxTail...>)
{
if(Idx == id) return memory_allocation_info<Idx>::name;
if(Idx == id) return memory_allocation_name<Idx>::name;
if constexpr(sizeof...(IdxTail) > 0)
return name_by_id(id, std::index_sequence<IdxTail...>{});
else
@@ -121,8 +197,8 @@ template <size_t Idx, size_t... IdxTail>
uint32_t
id_by_name(const char* name, std::index_sequence<Idx, IdxTail...>)
{
if(std::string_view{memory_allocation_info<Idx>::name} == std::string_view{name})
return memory_allocation_info<Idx>::operation_idx;
if(std::string_view{memory_allocation_name<Idx>::name} == std::string_view{name})
return memory_allocation_name<Idx>::operation_idx;
if constexpr(sizeof...(IdxTail) > 0)
return id_by_name(name, std::index_sequence<IdxTail...>{});
else
@@ -137,7 +213,7 @@ get_ids(std::vector<uint32_t>& _id_list, std::index_sequence<Idx...>)
if(_v < static_cast<uint32_t>(ROCPROFILER_MEMORY_ALLOCATION_LAST)) _vec.emplace_back(_v);
};
(_emplace(_id_list, memory_allocation_info<Idx>::operation_idx), ...);
(_emplace(_id_list, memory_allocation_name<Idx>::operation_idx), ...);
}
template <size_t... Idx>
@@ -148,7 +224,7 @@ get_names(std::vector<const char*>& _name_list, std::index_sequence<Idx...>)
if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v);
};
(_emplace(_name_list, memory_allocation_info<Idx>::name), ...);
(_emplace(_name_list, memory_allocation_name<Idx>::name), ...);
}
bool
@@ -169,17 +245,24 @@ context_filter(const context::context* ctx)
// Short aliases for the HSA core-table API ids whose calls are tracked as
// memory allocation operations (one allocate and one free function).
enum memory_allocation_core_id
{
memory_allocation_core_allocate_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate,
memory_allocation_core_free_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_free,
};
using memory_allocation_core_index_seq_t = std::index_sequence<memory_allocation_core_allocate_id>;
using memory_allocation_core_index_seq_t =
std::index_sequence<memory_allocation_core_allocate_id, memory_allocation_core_free_id>;
enum memory_allocation_amd_ext_id
{
memory_allocation_amd_ext_allocate_id =
ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate,
memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create
memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create,
memory_allocation_amd_ext_free_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_free,
memory_allocation_vmem_release_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_release,
};
using memory_allocation_amd_ext_index_seq_t =
std::index_sequence<memory_allocation_amd_ext_allocate_id, memory_allocation_vmem_allocate_id>;
std::index_sequence<memory_allocation_amd_ext_allocate_id,
memory_allocation_vmem_allocate_id,
memory_allocation_amd_ext_free_id,
memory_allocation_vmem_release_id>;
template <size_t TableIdx>
struct memory_allocation_seq;
@@ -203,14 +286,17 @@ struct arg_indices;
template <> \
struct arg_indices<ENUM_ID> \
{ \
static constexpr auto starting_address_idx = STARTING_ADDRESS_IDX; \
static constexpr auto size_idx = SIZE_IDX; \
static constexpr auto region_idx = REGION_IDX; \
static constexpr auto address_idx = STARTING_ADDRESS_IDX; \
static constexpr auto size_idx = SIZE_IDX; \
static constexpr auto region_idx = REGION_IDX; \
};
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_allocate_id, 2, 1, 0)
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_allocate_id, 3, 1, 0)
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_allocate_id, 4, 1, 0)
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_free_id, 0, 0, 0)
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_free_id, 0, 0, 0)
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_release_id, 0, 0, 0)
// Define operation indices for each tracked functions
template <size_t Idx>
@@ -220,12 +306,15 @@ struct memory_allocation_op;
template <> \
struct memory_allocation_op<ENUM_ID> \
{ \
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \
static constexpr auto operation_idx = FUNCTION; \
};
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, ALLOCATE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, MEMORY_POOL_ALLOCATE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, VMEM_HANDLE_CREATE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, HSA_MEMORY_ALLOCATE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, HSA_AMD_MEMORY_POOL_ALLOCATE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, HSA_AMD_VMEM_ALLOCATE)
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_free_id, HSA_MEMORY_FREE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_free_id, HSA_AMD_MEMORY_POOL_FREE);
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_release_id, HSA_AMD_VMEM_FREE);
template <typename FuncT, typename ArgsT, size_t... Idx>
decltype(auto)
@@ -255,7 +344,7 @@ struct memory_allocation_data
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_agent_id_t agent = null_rocp_agent_id;
uint64_t size_allocated = 0;
uint64_t starting_addr = 0;
rocprofiler_address_t address = {.value = 0};
uint64_t start_ts = 0;
context::correlation_id* correlation_id = nullptr;
tracing::tracing_data tracing_data = {};
@@ -271,7 +360,7 @@ memory_allocation_data::callback_data_t
memory_allocation_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const
{
return common::init_public_api_struct(
callback_data_t{}, _beg, _end, agent, starting_addr, size_allocated);
callback_data_t{}, _beg, _end, agent, address, size_allocated);
}
memory_allocation_data::buffered_data_t
@@ -291,7 +380,7 @@ memory_allocation_data::get_buffered_record(const context_t* _ctx,
_beg,
_end,
agent,
starting_addr,
address,
size_allocated);
}
@@ -336,16 +425,32 @@ get_agent(T val, IterateFunc iterate_func, CallbackFunc callback)
return existing.count(val) == 0 ? null_rocp_agent_id : existing.at(val);
}
uint64_t
void*
handle_starting_addr(void** starting_addr_pointer)
{
return reinterpret_cast<uint64_t>(*starting_addr_pointer);
return *starting_addr_pointer;
}
uint64_t
// The handle field of hsa_amd_vmem_alloc_handle_t is the starting address
// cast as uint64_t, so returning the handle field after casting to void* suffices
void*
handle_starting_addr(hsa_amd_vmem_alloc_handle_t* vmem_alloc_handle)
{
return vmem_alloc_handle->handle;
return reinterpret_cast<void*>(vmem_alloc_handle->handle);
}
// Overload used by the memory free operations: the argument is already the
// address to record, so it is handed back unchanged.
void*
handle_starting_addr(void* starting_addr_pointer)
{
    void* const recorded_address = starting_addr_pointer;
    return recorded_address;
}
// Overload used when a virtual memory handle is released: the handle is passed
// by value and its `handle` field is converted to a pointer.
void*
handle_starting_addr(hsa_amd_vmem_alloc_handle_t vmem_alloc_handle)
{
    const auto raw_handle = vmem_alloc_handle.handle;
    return reinterpret_cast<void*>(raw_handle);
}
// Wrapper implementation that stores memory allocation information
@@ -353,11 +458,12 @@ template <size_t TableIdx, size_t OpIdx, typename... Args>
hsa_status_t
memory_allocation_impl(Args... args)
{
constexpr auto N = sizeof...(Args);
constexpr auto starting_address_idx = arg_indices<OpIdx>::starting_address_idx;
constexpr auto size_idx = arg_indices<OpIdx>::size_idx;
constexpr auto region_idx = arg_indices<OpIdx>::region_idx;
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
constexpr auto N = sizeof...(Args);
constexpr auto address_idx = arg_indices<OpIdx>::address_idx;
constexpr auto size_idx = arg_indices<OpIdx>::size_idx;
constexpr auto region_idx = arg_indices<OpIdx>::region_idx;
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
constexpr auto rocprofiler_enum = memory_allocation_info<operation>::operation_idx;
auto&& _tied_args = std::tie(args...);
memory_allocation_data _data{};
@@ -380,7 +486,7 @@ memory_allocation_impl(Args... args)
}
auto& tracing_data = _data.tracing_data;
auto starting_addr_pointer = std::get<starting_address_idx>(_tied_args);
auto starting_addr_pointer = std::get<address_idx>(_tied_args);
auto region_or_pool = std::get<region_idx>(_tied_args);
_data.tid = common::get_tid();
@@ -389,7 +495,7 @@ memory_allocation_impl(Args... args)
memory_allocation_info<operation>{}(),
callback_populate_map<operation, typename memory_allocation_info<operation>::searchtype>);
_data.size_allocated = std::get<size_idx>(_tied_args);
_data.func = operation;
_data.func = rocprofiler_enum;
_data.correlation_id = context::get_latest_correlation_id();
if(!_data.correlation_id)
@@ -405,7 +511,7 @@ memory_allocation_impl(Args... args)
tracing_data.external_correlation_ids,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION,
operation,
rocprofiler_enum,
_data.correlation_id->internal);
if(!tracing_data.callback_contexts.empty())
@@ -417,7 +523,7 @@ memory_allocation_impl(Args... args)
_data.correlation_id->internal,
tracing_data.external_correlation_ids,
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
operation,
rocprofiler_enum,
_tracer_data);
// enter callback may update the external correlation id field
tracing::update_external_correlation_ids(
@@ -433,7 +539,7 @@ memory_allocation_impl(Args... args)
// checks before retrieving starting address?
if(starting_addr_pointer != nullptr)
{
_data.starting_addr = handle_starting_addr(starting_addr_pointer);
_data.address.ptr = handle_starting_addr(starting_addr_pointer);
}
if(!tracing_data.empty())
@@ -445,7 +551,7 @@ memory_allocation_impl(Args... args)
tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts,
_data.tracing_data.external_correlation_ids,
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
operation,
rocprofiler_enum,
_tracer_data);
}
@@ -458,7 +564,7 @@ memory_allocation_impl(Args... args)
_data.correlation_id->internal,
_data.tracing_data.external_correlation_ids,
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
operation,
rocprofiler_enum,
record);
}
}
@@ -468,11 +574,113 @@ memory_allocation_impl(Args... args)
return _ret;
}
template <size_t TableIdx, size_t OpIdx, typename RetT, typename... Args>
auto get_memory_allocation_impl(RetT (*)(Args...))
// Wrapper implementation that stores memory free operation information.
//
// Forwards the call to the next entry in the dispatch table selected by
// <TableIdx, OpIdx>. When at least one context is tracing memory allocation
// operations, the call is additionally reported through the callback and/or
// buffer tracing services.
//
// Only the address (taken from the argument at arg_indices<OpIdx>::address_idx)
// is filled in here; the size_allocated and agent fields of
// memory_allocation_data are not assigned in this function and keep their
// default values.
//
// Returns the status produced by the wrapped function.
template <size_t TableIdx, size_t OpIdx, typename... Args>
hsa_status_t
memory_free_impl(Args... args)
{
return &memory_allocation_impl<TableIdx, OpIdx, Args...>;
constexpr auto N = sizeof...(Args);
constexpr auto address_idx = arg_indices<OpIdx>::address_idx;
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
constexpr auto rocprofiler_enum = memory_allocation_info<operation>::operation_idx;
auto&& _tied_args = std::tie(args...);
memory_allocation_data _data{};
{
auto tracing_data = tracing::tracing_data{};
tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
OpIdx,
tracing_data);
// if no contexts are tracing this memory allocation operation, execute as usual
if(tracing_data.empty())
{
return invoke(get_next_dispatch<TableIdx, OpIdx>(),
std::move(_tied_args),
std::make_index_sequence<N>{});
}
_data.tracing_data = std::move(tracing_data);
}
auto& tracing_data = _data.tracing_data;
_data.tid = common::get_tid();
_data.func = rocprofiler_enum;
_data.correlation_id = context::get_latest_correlation_id();
// the address is captured before the wrapped free/release function is invoked
_data.address.ptr = handle_starting_addr(std::get<address_idx>(_tied_args));
// no correlation id is active on this thread: construct one for this operation
if(!_data.correlation_id)
{
constexpr auto ref_count = 1;
_data.correlation_id = context::correlation_tracing_service::construct(ref_count);
}
// increase the reference count to denote that this correlation id is in use by the
// callbacks/buffer records below (released again before returning)
_data.correlation_id->add_ref_count();
auto thr_id = _data.correlation_id->thread_idx;
tracing::populate_external_correlation_ids(
tracing_data.external_correlation_ids,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION,
rocprofiler_enum,
_data.correlation_id->internal);
if(!tracing_data.callback_contexts.empty())
{
auto _tracer_data = _data.get_callback_data();
tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts,
thr_id,
_data.correlation_id->internal,
tracing_data.external_correlation_ids,
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
rocprofiler_enum,
_tracer_data);
// enter callback may update the external correlation id field
tracing::update_external_correlation_ids(
tracing_data.external_correlation_ids,
thr_id,
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION);
}
// timestamps bracket only the wrapped call, not the tracing callbacks
auto start_ts = common::timestamp_ns();
auto _ret = invoke(
get_next_dispatch<TableIdx, OpIdx>(), std::move(_tied_args), std::make_index_sequence<N>{});
auto end_ts = common::timestamp_ns();
if(!tracing_data.empty())
{
if(!_data.tracing_data.callback_contexts.empty())
{
auto _tracer_data = _data.get_callback_data(start_ts, end_ts);
tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts,
_data.tracing_data.external_correlation_ids,
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
rocprofiler_enum,
_tracer_data);
}
if(!_data.tracing_data.buffered_contexts.empty())
{
auto record = _data.get_buffered_record(nullptr, start_ts, end_ts);
tracing::execute_buffer_record_emplace(_data.tracing_data.buffered_contexts,
_data.tid,
_data.correlation_id->internal,
_data.tracing_data.external_correlation_ids,
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
rocprofiler_enum,
record);
}
}
// decrement the reference count after usage in the callback/buffers
_data.correlation_id->sub_ref_count();
return _ret;
}
} // namespace
// check out the assembly here... this compiles to a switch statement
const char*
@@ -573,7 +781,9 @@ memory_allocation_wrap(Tp* _orig, std::integral_constant<size_t, OpIdx>)
auto& _dispatch = get_next_dispatch<TableIdx, OpIdx>();
CHECK_NOTNULL(_dispatch);
_func = get_memory_allocation_impl<TableIdx, OpIdx>(_func);
constexpr auto LocalIdx = memory_allocation_op<OpIdx>::operation_idx;
_func = memory_allocation_info<LocalIdx>::template get_memory_allocation_impl<TableIdx, OpIdx>(
_func);
}
template <size_t TableIdx, typename Tp, size_t OpIdx, size_t... OpIdxTail>
+7 -1
Просмотреть файл
@@ -170,6 +170,8 @@ call_hsa_memory_allocate(const size_t i, const size_t base_size, hsa_agent_t age
status = hsa_memory_allocate(region_list[0], base_size, &addr);
RET_IF_HSA_ERR(status)
status = hsa_memory_free(addr);
RET_IF_HSA_ERR(status)
}
}
@@ -199,6 +201,8 @@ call_hsa_memory_pool_allocate(const size_t i, const size_t base_size, hsa_agent_
status = hsa_amd_memory_pool_allocate(memory_pool_list[0], base_size, flags, &addr);
RET_IF_HSA_ERR(status)
status = hsa_amd_memory_pool_free(addr);
RET_IF_HSA_ERR(status)
}
}
@@ -243,6 +247,8 @@ call_hsa_vmem_allocate(const size_t i, hsa_agent_t agent)
status = hsa_amd_vmem_handle_create(
memory_pool_list[0], size, MEMORY_TYPE_NONE, 0, &memory_handle);
RET_IF_HSA_ERR(status)
status = hsa_amd_vmem_handle_release(memory_handle);
RET_IF_HSA_ERR(status)
}
}
@@ -257,7 +263,7 @@ main()
hsa_agent_t cpu_agent = get_cpu_agent(agents);
hsa_agent_t gpu_agent = get_gpu_agent(agents);
call_hsa_memory_allocate(6, 1024, cpu_agent);
call_hsa_memory_pool_allocate(9, 512, gpu_agent);
call_hsa_memory_pool_allocate(9, 2048, gpu_agent);
// Virtual memory API not supported in CI. Will add back if this changes
// call_hsa_vmem_allocate(3, gpu_agent);
+18 -19
Просмотреть файл
@@ -175,25 +175,26 @@ def test_memory_alloc_sizes(input_data):
# Op values:
# 0 == ??? (unknown)
# 1 == hsa_memory_allocate
# 2 == hsa_amd_ext_memory_allocate
# 3 == hsa_amd_vmem_handle_create
# 2 == hsa_amd_vmem_handle_create
# 3 == hsa_memory_free
# 4 == hsa_amd_vmem_handle_release
memory_alloc_cnt = dict(
[
(idx, {"agent": set(), "starting_addr": set(), "size": set(), "count": 0})
for idx in range(1, 4)
for idx in range(1, 5)
]
)
for itr in sdk_data["buffer_records"]["memory_allocations"]:
op_id = itr["operation"]
assert op_id > 0 and op_id <= 3, f"{itr}"
assert op_id > 0 and op_id <= 5, f"{itr}"
memory_alloc_cnt[op_id]["count"] += 1
memory_alloc_cnt[op_id]["starting_addr"].add(itr.starting_address)
memory_alloc_cnt[op_id]["starting_addr"].add(itr.address)
memory_alloc_cnt[op_id]["size"].add(itr.allocation_size)
memory_alloc_cnt[op_id]["agent"].add(itr.agent_id.handle)
for itr in sdk_data["callback_records"]["memory_copies"]:
op_id = itr.operation
assert op_id > 0 and op_id <= 3, f"{itr}"
assert op_id > 0 and op_id <= 5, f"{itr}"
memory_alloc_cnt[op_id]["count"] += 1
phase = itr.phase
@@ -210,7 +211,7 @@ def test_memory_alloc_sizes(input_data):
assert pitr.end_timestamp > 0, f"{itr}"
assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}"
memory_alloc_cnt[op_id]["starting_addr"].add(pitr.starting_address)
memory_alloc_cnt[op_id]["starting_addr"].add(pitr.address)
memory_alloc_cnt[op_id]["size"].add(pitr.allocation_size)
memory_alloc_cnt[op_id]["agent"].add(pitr.agent_id.handle)
else:
@@ -218,24 +219,22 @@ def test_memory_alloc_sizes(input_data):
# In the memory allocation test which generates this file
# 6 hsa_memory_allocation calls with 1024 bytes were called
# and 9 hsa_amd_memory_pool_allocations with 512 bytes
# and 9 hsa_amd_memory_pool_allocations with 2048 bytes
# were called
assert memory_alloc_cnt[1]["count"] == 6
assert memory_alloc_cnt[2]["count"] == 9
assert memory_alloc_cnt[1]["count"] == 15
assert memory_alloc_cnt[3]["count"] == 15
# assert memory_alloc_cnt[3]["count"] == 3
assert len(memory_alloc_cnt[1]["starting_addr"]) == 6
assert len(memory_alloc_cnt[2]["starting_addr"]) == 9
assert len(memory_alloc_cnt[1]["starting_addr"]) == len(
memory_alloc_cnt[3]["starting_addr"]
)
# assert len(memory_alloc_cnt[3]["starting_addr"]) == 3
assert len(memory_alloc_cnt[1]["size"]) == 1
assert len(memory_alloc_cnt[2]["size"]) == 1
assert len(memory_alloc_cnt[1]["size"]) == 2
# assert len(memory_alloc_cnt[3]["size"]) == 1
assert 1024 in memory_alloc_cnt[1]["size"]
assert 512 in memory_alloc_cnt[2]["size"]
assert len(memory_alloc_cnt[1]["agent"]) == 1
assert len(memory_alloc_cnt[2]["agent"]) == 1
assert 2048 in memory_alloc_cnt[1]["size"]
assert len(memory_alloc_cnt[1]["agent"]) == 2
# assert len(memory_alloc_cnt[3]["agent"]) == 1
assert memory_alloc_cnt[1]["agent"] != memory_alloc_cnt[2]["agent"]
# assert memory_alloc_cnt[2]["agent"] == memory_alloc_cnt[3]["agent"]
def test_retired_correlation_ids(input_data):
+6 -4
Просмотреть файл
@@ -19,9 +19,11 @@ string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
# disable this test for thread sanitizers because of "signal-unsafe call inside signal"
# issues on mi200 and mi300 (works fine on vega20 and navi32)
if(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer")
set(IS_THREAD_SANITIZER ON)
set(DISABLE_THIS_TEST ON)
else()
set(IS_THREAD_SANITIZER OFF)
# set(DISABLE_THIS_TEST OFF)
set(DISABLE_THIS_TEST ON) # this test is currently unstable so we are disabling it
# unconditionally for now
endif()
set(aborted-app-env "${PRELOAD_ENV}" ROCPROF_TESTING_RAISE_SIGNAL=1
@@ -46,7 +48,7 @@ set_tests_properties(
WILL_FAIL
TRUE
DISABLED
"${IS_THREAD_SANITIZER}")
"${DISABLE_THIS_TEST}")
add_test(
NAME rocprofv3-test-validate-app-abort
@@ -64,4 +66,4 @@ set_tests_properties(
FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
DISABLED
"${IS_THREAD_SANITIZER}")
"${DISABLE_THIS_TEST}")
+1 -3
Просмотреть файл
@@ -21,7 +21,7 @@ add_test(
NAME rocprofv3-test-memory-allocation-tracing-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --memory-allocation-trace -d
${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json pftrace otf2
${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json otf2
--log-level env -- $<TARGET_FILE:hsa-memory-allocation>)
set_tests_properties(
@@ -35,8 +35,6 @@ add_test(
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input
${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.json
--pftrace-input
${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.pftrace
--otf2-input
${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.otf2)
-12
Просмотреть файл
@@ -17,12 +17,6 @@ def pytest_addoption(parser):
default="memory-allocation-tracing/out_results.json",
help="Input JSON",
)
parser.addoption(
"--pftrace-input",
action="store",
default="memory-allocation-tracing/out_results.pftrace",
help="Input JSON",
)
parser.addoption(
"--otf2-input",
action="store",
@@ -38,12 +32,6 @@ def json_data(request):
return dotdict(collapse_dict_list(json.load(inp)))
@pytest.fixture
def pftrace_data(request):
filename = request.config.getoption("--pftrace-input")
return PerfettoReader(filename).read()[0]
@pytest.fixture
def otf2_data(request):
filename = request.config.getoption("--otf2-input")
+4 -12
Просмотреть файл
@@ -35,7 +35,7 @@ def test_memory_allocation(json_data):
_, bf_op_names = get_operation(data, "MEMORY_ALLOCATION")
assert len(bf_op_names) == 4
assert len(bf_op_names) == 5
allocation_reported_agent_ids = set()
# check buffering data
@@ -49,12 +49,12 @@ def test_memory_allocation(json_data):
assert "thread_id" in node
assert "agent_id" in node
assert "starting_address" in node
assert "address" in node
assert "allocation_size" in node
assert node.size > 0
assert node.allocation_size > 0
assert node.starting_address > 0
assert node.allocation_size >= 0
assert len(node.address) > 0
assert node.thread_id > 0
assert node.agent_id.handle > 0
assert node.start_timestamp > 0
@@ -69,14 +69,6 @@ def test_memory_allocation(json_data):
allocation_reported_agent_ids.add(node["agent_id"]["handle"])
assert 2**64 - 1 not in allocation_reported_agent_ids
def test_perfetto_data(pftrace_data, json_data):
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
rocprofv3.test_perfetto_data(pftrace_data, json_data, ("memory_allocation",))
def test_otf2_data(otf2_data, json_data):
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
+3 -3
Просмотреть файл
@@ -242,12 +242,12 @@ def test_summary_display_data(json_data, summary_data):
assert get_dims(marker) == [7, 9], f"{marker}"
assert get_dims(memcpy) == [2, 9], f"{memcpy}"
assert get_dims(memalloc) == [1, 9], f"{memalloc}"
assert get_dims(memalloc) == [2, 9], f"{memalloc}"
assert get_dims(dispatch) == [3, 9], f"{dispatch}"
assert get_dims(dispatch_and_copy) == [5, 9], f"{dispatch_and_copy}"
assert get_dims(hip) == [14, 9], f"{hip}"
assert get_dims(hip_and_marker) == expected_hip_and_marker_dims, f"{hip_and_marker}"
assert get_dims(total) == [24, 9], f"{total}"
assert get_dims(total) == [25, 9], f"{total}"
def test_perfetto_data(pftrace_data, json_data):
@@ -256,7 +256,7 @@ def test_perfetto_data(pftrace_data, json_data):
rocprofv3.test_perfetto_data(
pftrace_data,
json_data,
("hip", "marker", "kernel", "memory_copy", "memory_allocation"),
("hip", "marker", "kernel", "memory_copy"),
)
+33 -30
Просмотреть файл
@@ -49,6 +49,7 @@
#include <rocprofiler-sdk/internal_threading.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <rocprofiler-sdk/cxx/utility.hpp>
#include <unistd.h>
#include <algorithm>
@@ -1843,6 +1844,7 @@ write_perfetto()
auto tids = std::set<rocprofiler_thread_id_t>{};
auto agent_ids = std::set<uint64_t>{};
auto agent_ids_alloc = std::set<uint64_t>{};
auto agent_queue_ids = std::map<uint64_t, std::set<uint64_t>>{};
auto _get_agent = [](uint64_t id_handle) -> const rocprofiler_agent_t* {
@@ -1875,7 +1877,7 @@ write_perfetto()
for(auto itr : memory_allocation_bf_records)
{
tids.emplace(itr.thread_id);
agent_ids.emplace(itr.agent_id.handle);
agent_ids_alloc.emplace(itr.agent_id.handle);
}
for(auto itr : kernel_dispatch_bf_records)
@@ -1934,6 +1936,36 @@ write_perfetto()
agent_tracks.emplace(itr, _track);
}
for(auto itr : agent_ids_alloc)
{
const auto* _agent = _get_agent(itr);
auto _namess = std::stringstream{};
if(_agent != nullptr)
{
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << "CPU MEMORY OPERATION [" << itr << "] ";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << "GPU MEMORY OPERATION [" << itr << "] ";
if(!std::string_view{_agent->model_name}.empty())
_namess << _agent->model_name;
else
_namess << _agent->product_name;
}
else
{
_namess << "UNKNOWN MEMORY OPERATION [" << itr << "] ";
}
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_tracks.emplace(itr, _track);
}
auto agent_queue_tracks =
std::unordered_map<uint64_t, std::unordered_map<uint64_t, ::perfetto::Track>>{};
@@ -2155,35 +2187,6 @@ write_perfetto()
itr.end_timestamp);
}
for(auto itr : memory_allocation_bf_records)
{
auto name = buffer_names.at(itr.kind, itr.operation);
auto& track = agent_tracks.at(itr.agent_id.handle);
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_allocation>::name,
::perfetto::StaticString(name.data()),
track,
itr.start_timestamp,
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
"begin_ns",
itr.start_timestamp,
"kind",
itr.kind,
"operation",
itr.operation,
"agent",
agents_map.at(itr.agent_id).logical_node_id,
"Allocation_size",
itr.allocation_size,
"Starting_address",
itr.starting_address);
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_allocation>::name,
track,
itr.end_timestamp,
"end_ns",
itr.end_timestamp);
}
auto demangled = std::unordered_map<std::string_view, std::string>{};
for(auto itr : kernel_dispatch_bf_records)
{