SWDEV-492625 memory free functions (#11)
* SWDEV-492625: Track free memory HSA functions to help determine total amount of memory allocated on the system at any one time
* Minor fixes to address comments
* Update allocation size description
* Moved get function back to specialization, minor typo fixes
* Removed memory_operation_type field, removed memory_pool allocation enum, converted starting address to hex string for json format.
* Made conversion to hex_string a function, changed address to use union rocprofiler_address_t type, changed VMEM descriptors
* Removed as_hex from the global namespace
* Formatting
* Removed TRACK_EVENT for memory allocation, now TRACK_COUNTER for memory allocation is being performed
* Check if address was recorded before retrieving allocation size in generate Perfetto
* Formatting
* Update source/lib/output/generatePerfetto.cpp
* Explicitly disable app-abort tests
* Remove excluding app-abort test from workflow CI
- redundant because these tests are now explicitly marked as disabled
---------
Co-authored-by: Madsen, Jonathan <Jonathan.Madsen@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
[ROCm/rocprofiler-sdk commit: 79006bb896]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
a79f8a0198
Коммит
792329fefd
+4
-4
@@ -23,10 +23,10 @@ env:
|
||||
ROCM_PATH: "/opt/rocm"
|
||||
GPU_TARGETS: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102"
|
||||
PATH: "/usr/bin:$PATH"
|
||||
navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
|
||||
vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
|
||||
mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
|
||||
mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate)|rocprofv3-test-(execute|validate)-app-abort)$"
|
||||
navi3_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
|
||||
vega20_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
|
||||
mi200_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
|
||||
mi300_EXCLUDE_TESTS_REGEX: "^(test-page-migration-(execute|validate))$"
|
||||
navi3_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$"
|
||||
vega20_EXCLUDE_LABEL_REGEX: "^(pc-sampling|openmp-target)$"
|
||||
mi200_EXCLUDE_LABEL_REGEX: "^(openmp-target)$"
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
"Kind","Operation","Agent_Id","Allocation_Size","Starting_Address","Correlation_Id","Start_Timestamp","End_Timestamp"
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497356288,1,65788054621500,65788055678893
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497348096,1,65788055691832,65788056666844
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497339904,1,65788056672061,65788057643457
|
||||
"Kind","Operation","Agent_Id","Allocation_Size","Address","Correlation_Id","Start_Timestamp","End_Timestamp"
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,11,3721742710532634,3721742710584854
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,12,3721742710596404,3721742710933366
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,13,3721742710941416,3721742710960916
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,14,3721742710967236,3721742711197647
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,0x7fb2d0005000,15,3721742711204077,3721742711219717
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_FREE",0,0,0x7fb2d0005000,16,3721742711225857,3721742711466018
|
||||
|
||||
|
@@ -1318,5 +1318,5 @@ Properties
|
||||
- **`thread_id`** *(integer, required)*: Thread ID.
|
||||
- **`agent_id`** *(object, required)*: Agent ID.
|
||||
- **`handle`** *(integer, required)*: Handle of the agent.
|
||||
- **`starting_address`** *(string, required)*: Starting address of allocation.
|
||||
- **`address`** *(string, required)*: Starting address of allocation.
|
||||
- **`allocation_size`** *(integer, required)*: Size of allocation.
|
||||
|
||||
@@ -1566,13 +1566,13 @@
|
||||
"handle"
|
||||
]
|
||||
},
|
||||
"starting_address": {
|
||||
"type": "integer",
|
||||
"description": "Starting address of allocation"
|
||||
"address": {
|
||||
"type": "string",
|
||||
"description": "Starting address for allocation or freeing memory."
|
||||
},
|
||||
"allocation_size": {
|
||||
"type": "integer",
|
||||
"description": "allocation_size"
|
||||
"description": "Size of memory allocation. Free operations not tracked currently."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
@@ -1584,7 +1584,7 @@
|
||||
"end_timestamp",
|
||||
"thread_id",
|
||||
"agent_id",
|
||||
"starting_address",
|
||||
"address",
|
||||
"allocation_size"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -217,9 +217,9 @@ typedef struct
|
||||
rocprofiler_thread_id_t thread_id; ///< id for thread that triggered copy
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation
|
||||
uint64_t starting_address; ///< starting address for memory allocation
|
||||
uint64_t allocation_size; ///< size for memory allocation
|
||||
rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation
|
||||
rocprofiler_address_t address; ///< starting address for memory allocation
|
||||
uint64_t allocation_size; ///< size for memory allocation
|
||||
/// @var kind
|
||||
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION
|
||||
/// @var operation
|
||||
|
||||
@@ -211,16 +211,16 @@ typedef struct
|
||||
} rocprofiler_callback_tracing_memory_copy_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Memory Copy Allocation Tracer Record.
|
||||
* @brief ROCProfiler Memory Allocation Tracer Record.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation
|
||||
uint64_t starting_address; ///< starting address for memory allocation
|
||||
uint64_t allocation_size; ///< size of memory allocation
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation
|
||||
rocprofiler_address_t address; ///< starting address for memory allocation
|
||||
uint64_t allocation_size; ///< size of memory allocation
|
||||
} rocprofiler_callback_tracing_memory_allocation_data_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#
|
||||
#
|
||||
set(ROCPROFILER_CXX_HEADER_FILES hash.hpp name_info.hpp operators.hpp perfetto.hpp
|
||||
serialization.hpp)
|
||||
utility.hpp serialization.hpp)
|
||||
|
||||
install(
|
||||
FILES ${ROCPROFILER_CXX_HEADER_FILES}
|
||||
|
||||
+5
-2
@@ -30,6 +30,8 @@
|
||||
#include <rocprofiler-sdk/internal_threading.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include <rocprofiler-sdk/cxx/name_info.hpp>
|
||||
#include <rocprofiler-sdk/cxx/perfetto.hpp>
|
||||
#include <rocprofiler-sdk/cxx/utility.hpp>
|
||||
|
||||
#include <cereal/archives/binary.hpp>
|
||||
#include <cereal/archives/json.hpp>
|
||||
@@ -64,6 +66,7 @@
|
||||
|
||||
#define ROCP_SDK_SAVE_DATA_FIELD(FIELD) ar(make_nvp(#FIELD, data.FIELD))
|
||||
#define ROCP_SDK_SAVE_DATA_VALUE(NAME, VALUE) ar(make_nvp(NAME, data.VALUE))
|
||||
#define ROCP_SDK_SAVE_VALUE(NAME, VALUE) ar(make_nvp(NAME, VALUE))
|
||||
#define ROCP_SDK_SAVE_DATA_CSTR(FIELD) \
|
||||
ar(make_nvp(#FIELD, std::string{data.FIELD ? data.FIELD : ""}))
|
||||
#define ROCP_SDK_SAVE_DATA_BITFIELD(NAME, VALUE) \
|
||||
@@ -338,7 +341,7 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_memory_allocation_data_t data)
|
||||
ROCP_SDK_SAVE_DATA_FIELD(start_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(end_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(agent_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(starting_address);
|
||||
ROCP_SDK_SAVE_VALUE("address", rocprofiler::sdk::utility::as_hex(data.address.value, 16));
|
||||
ROCP_SDK_SAVE_DATA_FIELD(allocation_size);
|
||||
}
|
||||
|
||||
@@ -543,7 +546,7 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_allocation_record_t data)
|
||||
ROCP_SDK_SAVE_DATA_FIELD(start_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(end_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(agent_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(starting_address);
|
||||
ROCP_SDK_SAVE_VALUE("address", rocprofiler::sdk::utility::as_hex(data.address.value, 16));
|
||||
ROCP_SDK_SAVE_DATA_FIELD(allocation_size);
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <sstream>
#include <string>
#include <type_traits>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace sdk
|
||||
{
|
||||
namespace utility
|
||||
{
|
||||
/// @brief Format a value as a hexadecimal string prefixed with "0x".
///
/// @tparam Tp    Type of the value; it is streamed with `operator<<` in `std::hex` mode.
/// @param val    Value to format.
/// @param width  Minimum number of characters after the "0x" prefix; shorter output is
///               left-padded with '0'. A width of 0 disables padding.
/// @return The formatted string, e.g. `_as_hex<uint64_t>(255, 4)` yields "0x00ff".
template <typename Tp>
std::string
_as_hex(Tp val, size_t width = 0)
{
    auto ss = std::stringstream{};

    // std::setw takes an int: convert explicitly instead of relying on an implicit
    // narrowing conversion from size_t.
    ss << "0x" << std::hex << std::setfill('0') << std::setw(static_cast<int>(width));

    // One-byte integer types (char, int8_t, uint8_t, ...) are streamed as characters by
    // operator<<, so route them through unsigned to get their numeric hex digits. bool is
    // excluded and keeps its regular stream formatting.
    if constexpr(std::is_integral_v<Tp> && sizeof(Tp) == 1 && !std::is_same_v<Tp, bool>)
        ss << static_cast<unsigned>(static_cast<unsigned char>(val));
    else
        ss << val;

    return ss.str();
}
|
||||
|
||||
// Defines a non-template `as_hex(TYPE val, size_t width = 0)` overload that forwards its
// arguments to the `_as_hex` template.
#define ROCPROFILER_CXX_DEFINE_AS_HEX(TYPE) \
|
||||
inline auto as_hex(TYPE val, size_t width = 0) \
|
||||
{ \
|
||||
return ::rocprofiler::sdk::utility::_as_hex(val, width); \
|
||||
}
|
||||
|
||||
// Only a uint64_t overload is instantiated here.
ROCPROFILER_CXX_DEFINE_AS_HEX(uint64_t)
|
||||
// The helper macro is removed again so it does not leak out of this header.
#undef ROCPROFILER_CXX_DEFINE_AS_HEX
|
||||
} // namespace utility
|
||||
} // namespace sdk
|
||||
} // namespace rocprofiler
|
||||
@@ -238,10 +238,11 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
*/
|
||||
typedef enum // NOLINT(performance-enum-size)
|
||||
{
|
||||
ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_MEMORY_POOL_ALLOCATE, ///< Allocate memory pool
|
||||
ROCPROFILER_MEMORY_ALLOCATION_VMEM_HANDLE_CREATE, ///< Allocate vmem memory handle
|
||||
ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE, ///< Allocate vmem memory handle
|
||||
ROCPROFILER_MEMORY_ALLOCATION_FREE, ///< Free memory function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_VMEM_FREE, ///< Release vmem memory handle
|
||||
ROCPROFILER_MEMORY_ALLOCATION_LAST,
|
||||
} rocprofiler_memory_allocation_operation_t;
|
||||
|
||||
@@ -458,11 +459,6 @@ typedef enum
|
||||
*/
|
||||
typedef uint64_t rocprofiler_timestamp_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Address.
|
||||
*/
|
||||
typedef uint64_t rocprofiler_address_t;
|
||||
|
||||
/**
|
||||
* @brief Thread ID. Value will be equivalent to `syscall(__NR_gettid)`
|
||||
*/
|
||||
@@ -519,6 +515,16 @@ typedef union rocprofiler_user_data_t
|
||||
void* ptr; ///< usage example: set to address of data allocation
|
||||
} rocprofiler_user_data_t;
|
||||
|
||||
/**
|
||||
* @brief Stores a memory address for profiling records.
|
||||
* Both members share the same storage, so the address can be read either as an integer (`value`) or as a generic pointer (`ptr`).
|
||||
*/
|
||||
typedef union rocprofiler_address_t
|
||||
{
|
||||
uint64_t value; ///< address viewed as an unsigned 64-bit integer
|
||||
void* ptr; ///< address viewed as a generic (untyped) pointer
|
||||
} rocprofiler_address_t;
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
//
|
||||
// STRUCTS
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/marker/api_id.h>
|
||||
#include <rocprofiler-sdk/cxx/operators.hpp>
|
||||
#include <rocprofiler-sdk/cxx/utility.hpp>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <cstdint>
|
||||
@@ -456,7 +457,7 @@ generate_csv(const output_config&
|
||||
"Operation",
|
||||
"Agent_Id",
|
||||
"Allocation_Size",
|
||||
"Starting_Address",
|
||||
"Address",
|
||||
"Correlation_Id",
|
||||
"Start_Timestamp",
|
||||
"End_Timestamp"}};
|
||||
@@ -464,15 +465,24 @@ generate_csv(const output_config&
|
||||
{
|
||||
for(auto record : data.get(ditr))
|
||||
{
|
||||
uint64_t agent_info{0};
|
||||
// Free functions currently do not track agent information. Only set it on allocation
|
||||
// operations, otherwise set it to 0 currently
|
||||
if(record.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE ||
|
||||
record.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE)
|
||||
{
|
||||
agent_info = tool_metadata.get_node_id(record.agent_id);
|
||||
}
|
||||
auto api_name = tool_metadata.get_operation_name(record.kind, record.operation);
|
||||
auto row_ss = std::stringstream{};
|
||||
|
||||
rocprofiler::tool::csv::memory_allocation_csv_encoder::write_row(
|
||||
row_ss,
|
||||
tool_metadata.get_kind_name(record.kind),
|
||||
api_name,
|
||||
tool_metadata.get_node_id(record.agent_id),
|
||||
agent_info,
|
||||
record.allocation_size,
|
||||
record.starting_address,
|
||||
rocprofiler::sdk::utility::as_hex(record.address.value, 16),
|
||||
record.correlation_id.internal,
|
||||
record.start_timestamp,
|
||||
record.end_timestamp);
|
||||
|
||||
@@ -486,17 +486,25 @@ write_otf2(
|
||||
{
|
||||
for(auto& [agent, evt] : itr)
|
||||
{
|
||||
const auto* _agent = _get_agent(agent);
|
||||
auto _type_name = std::string_view{"UNK"};
|
||||
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
// Free functions do not track agent information. Below handles case where
|
||||
// null rocprof agent id is passed to generate OTF2
|
||||
constexpr auto null_rocp_agent_id =
|
||||
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
|
||||
const rocprofiler_agent_t* _agent = nullptr;
|
||||
if(agent != null_rocp_agent_id)
|
||||
{
|
||||
_agent = _get_agent(agent);
|
||||
}
|
||||
auto _type_name = std::string_view{"UNK"};
|
||||
if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
_type_name = "CPU";
|
||||
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
_type_name = "GPU";
|
||||
|
||||
evt.name = fmt::format("Thread {}, Memory Allocation at {} {}",
|
||||
evt.name = fmt::format("Thread {}, Memory Operation at {} {}",
|
||||
tid,
|
||||
_type_name,
|
||||
_agent->logical_node_type_id);
|
||||
_agent == nullptr ? 0 : _agent->logical_node_type_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -860,6 +868,12 @@ write_otf2(
|
||||
for(auto& [agent, evt] : itr)
|
||||
{
|
||||
auto _hash = get_hash_id(evt.name);
|
||||
// Using max numeric limits results in an out-of-bound runtime error for OTF2
|
||||
// and perfetto for agent ids. Setting handle to 0 for free functions.
|
||||
constexpr auto null_rocp_agent_id =
|
||||
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
|
||||
auto handle = agent.handle;
|
||||
if(agent == null_rocp_agent_id) handle = 0;
|
||||
|
||||
add_write_string(_hash, evt.name);
|
||||
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
|
||||
@@ -867,7 +881,7 @@ write_otf2(
|
||||
_hash,
|
||||
OTF2_LOCATION_TYPE_ACCELERATOR_STREAM,
|
||||
2 * evt.event_count, // # events
|
||||
agent.handle // location group
|
||||
handle // location group
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,33 +241,6 @@ write_perfetto(
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& itr : agent_thread_ids_alloc)
|
||||
{
|
||||
const auto* _agent = _get_agent(itr.first);
|
||||
|
||||
for(auto titr : itr.second)
|
||||
{
|
||||
auto _namess = std::stringstream{};
|
||||
_namess << "MEMORY ALLOCATION on AGENT [" << _agent->logical_node_id << "] THREAD ["
|
||||
<< thread_indexes.at(titr) << "] ";
|
||||
|
||||
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
_namess << "(CPU)";
|
||||
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
_namess << "(GPU)";
|
||||
else
|
||||
_namess << "(UNK)";
|
||||
|
||||
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
|
||||
auto _desc = _track.Serialize();
|
||||
_desc.set_name(_namess.str());
|
||||
|
||||
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
|
||||
|
||||
agent_thread_tracks_alloc[itr.first].emplace(titr, _track);
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& aitr : agent_queue_ids)
|
||||
{
|
||||
uint32_t nqueue = 0;
|
||||
@@ -463,47 +436,6 @@ write_perfetto(
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
|
||||
for(auto ditr : memory_allocation_gen)
|
||||
for(auto itr : memory_allocation_gen.get(ditr))
|
||||
{
|
||||
auto name = buffer_names.at(itr.kind, itr.operation);
|
||||
auto& track = agent_thread_tracks_alloc.at(itr.agent_id).at(itr.thread_id);
|
||||
std::stringstream hex_stream;
|
||||
hex_stream << "0x" << std::hex << std::setw(16) << std::setfill('0')
|
||||
<< itr.starting_address;
|
||||
std::string hex_starting_address(hex_stream.str());
|
||||
|
||||
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
::perfetto::StaticString(name.data()),
|
||||
track,
|
||||
itr.start_timestamp,
|
||||
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
|
||||
"begin_ns",
|
||||
itr.start_timestamp,
|
||||
"end_ns",
|
||||
itr.end_timestamp,
|
||||
"delta_ns",
|
||||
(itr.end_timestamp - itr.start_timestamp),
|
||||
"kind",
|
||||
itr.kind,
|
||||
"operation",
|
||||
itr.operation,
|
||||
"agent",
|
||||
agents_map.at(itr.agent_id).logical_node_id,
|
||||
"allocation_size",
|
||||
itr.allocation_size,
|
||||
"starting_address",
|
||||
hex_starting_address,
|
||||
"corr_id",
|
||||
itr.correlation_id.internal,
|
||||
"tid",
|
||||
itr.thread_id);
|
||||
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
track,
|
||||
itr.end_timestamp);
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
|
||||
for(auto ditr : kernel_dispatch_gen)
|
||||
for(auto itr : kernel_dispatch_gen.get(ditr))
|
||||
{
|
||||
@@ -634,6 +566,99 @@ write_perfetto(
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
}
|
||||
|
||||
// memory allocation counter track
|
||||
auto mem_alloc_endpoints = std::map<rocprofiler_agent_id_t, std::map<uint64_t, uint64_t>>{};
|
||||
auto mem_alloc_extremes = std::pair<uint64_t, uint64_t>{};
|
||||
auto address_to_size = std::unordered_map<uint64_t, uint64_t>{};
|
||||
for(auto ditr : memory_allocation_gen)
|
||||
for(auto itr : memory_allocation_gen.get(ditr))
|
||||
{
|
||||
uint64_t _mean_timestamp =
|
||||
itr.start_timestamp + (0.5 * (itr.end_timestamp - itr.start_timestamp));
|
||||
|
||||
mem_alloc_endpoints[itr.agent_id].emplace(itr.start_timestamp - 1000, 0);
|
||||
mem_alloc_endpoints[itr.agent_id].emplace(itr.start_timestamp, 0);
|
||||
mem_alloc_endpoints[itr.agent_id].emplace(_mean_timestamp, 0);
|
||||
mem_alloc_endpoints[itr.agent_id].emplace(itr.end_timestamp, 0);
|
||||
mem_alloc_endpoints[itr.agent_id].emplace(itr.end_timestamp + 1000, 0);
|
||||
|
||||
mem_alloc_extremes =
|
||||
std::make_pair(std::min(mem_alloc_extremes.first, itr.start_timestamp),
|
||||
std::max(mem_alloc_extremes.second, itr.end_timestamp));
|
||||
if(itr.operation == ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE ||
|
||||
itr.operation == ROCPROFILER_MEMORY_ALLOCATION_VMEM_ALLOCATE)
|
||||
{
|
||||
address_to_size.emplace(itr.address.value, itr.allocation_size);
|
||||
}
|
||||
}
|
||||
|
||||
// For every record, add the size known for its address to each sample point of the
// record's agent that lies within [start_timestamp, end_timestamp].
for(auto ditr : memory_allocation_gen)
    for(auto itr : memory_allocation_gen.get(ditr))
    {
        // Single lookup of the agent's sample points (throws if the agent is unknown)
        auto& _endpoints = mem_alloc_endpoints.at(itr.agent_id);
        auto  alloc_beg  = _endpoints.lower_bound(itr.start_timestamp);
        auto  alloc_end  = _endpoints.upper_bound(itr.end_timestamp);

        LOG_IF(FATAL, alloc_beg == alloc_end)
            << "Missing range for timestamp [" << itr.start_timestamp << ", "
            << itr.end_timestamp << "]";

        // The size does not depend on the sample point, so look it up once. Addresses
        // that were never recorded by an allocate operation contribute nothing.
        const auto size_itr = address_to_size.find(itr.address.value);
        if(size_itr == address_to_size.end()) continue;

        for(auto alloc_itr = alloc_beg; alloc_itr != alloc_end; ++alloc_itr)
        {
            alloc_itr->second += size_itr->second;
        }
    }
|
||||
|
||||
auto mem_alloc_tracks =
|
||||
std::unordered_map<rocprofiler_agent_id_t, ::perfetto::CounterTrack>{};
|
||||
auto mem_alloc_cnt_names = std::vector<std::string>{};
|
||||
constexpr auto null_rocp_agent_id =
|
||||
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
|
||||
mem_alloc_cnt_names.reserve(mem_alloc_endpoints.size());
|
||||
for(auto& alloc_itr : mem_alloc_endpoints)
|
||||
{
|
||||
mem_alloc_endpoints[alloc_itr.first].emplace(mem_alloc_extremes.first - 5000, 0);
|
||||
mem_alloc_endpoints[alloc_itr.first].emplace(mem_alloc_extremes.second + 5000, 0);
|
||||
|
||||
auto _track_name = std::stringstream{};
|
||||
const rocprofiler_agent_t* _agent = nullptr;
|
||||
if(alloc_itr.first != null_rocp_agent_id)
|
||||
{
|
||||
_agent = _get_agent(alloc_itr.first);
|
||||
}
|
||||
|
||||
if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
_track_name << "ALLOCATE BYTES on AGENT [" << _agent->logical_node_id << "] (CPU)";
|
||||
else if(_agent != nullptr && _agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
_track_name << "ALLOCATE BYTES on AGENT [" << _agent->logical_node_id << "] (GPU)";
|
||||
else
|
||||
_track_name << "FREE BYTES";
|
||||
|
||||
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
|
||||
auto& _name = mem_alloc_cnt_names.emplace_back(_track_name.str());
|
||||
mem_alloc_tracks.emplace(alloc_itr.first,
|
||||
::perfetto::CounterTrack{_name.c_str()}
|
||||
.set_unit(_unit)
|
||||
.set_unit_multiplier(bytes_multiplier)
|
||||
.set_is_incremental(false));
|
||||
}
|
||||
|
||||
// Emit one counter sample per (timestamp, bytes) entry on the agent's counter track
for(auto& alloc_itr : mem_alloc_endpoints)
{
    // The track is the same for every sample of this agent: look it up once
    auto& _track = mem_alloc_tracks.at(alloc_itr.first);

    for(const auto& itr : alloc_itr.second)
    {
        TRACE_COUNTER(sdk::perfetto_category<sdk::category::memory_allocation>::name,
                      _track,
                      itr.first,
                      itr.second / bytes_multiplier);
        tracing_session->FlushBlocking();
    }
}
|
||||
}
|
||||
|
||||
::perfetto::TrackEvent::Flush();
|
||||
|
||||
@@ -481,6 +481,9 @@ HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, hsa_memory_allocate, hsa_memory_allocate_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_free, hsa_memory_free, hsa_memory_free_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_free, hsa_amd_memory_pool_free, hsa_amd_memory_pool_free_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_release, hsa_amd_vmem_handle_release, hsa_amd_vmem_handle_release_fn)
|
||||
// clang-format on
|
||||
|
||||
#else
|
||||
|
||||
+260
-50
@@ -68,49 +68,125 @@ using memory_pool_to_agent_map = std::unordered_map<hsa_amd_memory_pool_t, rocpr
|
||||
using region_to_agent_pair = std::pair<region_to_agent_map*, rocprofiler_agent_id_t>;
|
||||
using map_pool_to_agent_pair = std::pair<memory_pool_to_agent_map*, rocprofiler_agent_id_t>;
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
memory_allocation_impl(Args... args);
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
memory_free_impl(Args... args);
|
||||
|
||||
// Local enum identifying each wrapped HSA memory function; used to select the wrapper implementation for that function
|
||||
typedef enum
|
||||
{
|
||||
HSA_NONE = 0, ///< Unknown memory function
|
||||
HSA_MEMORY_ALLOCATE, ///< Memory allocate function
|
||||
HSA_AMD_MEMORY_POOL_ALLOCATE, ///< Memory pool allocate function
|
||||
HSA_AMD_VMEM_ALLOCATE, ///< Allocate vmem memory handle
|
||||
HSA_MEMORY_FREE, ///< Memory free function
|
||||
HSA_AMD_MEMORY_POOL_FREE, ///< Memory pool free function
|
||||
HSA_AMD_VMEM_FREE, ///< Release vmem memory handle
|
||||
HSA_LAST, ///< Last enumerator; marks the end of the list
|
||||
} hsa_memory_operation_functions_t;
|
||||
|
||||
// Set up information to identify agent from regions/pool
|
||||
template <size_t OpIdx>
|
||||
struct memory_allocation_info;
|
||||
|
||||
#define SPECIALIZE_MEMORY_ALLOCATION_INFO(FUNCTION, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC) \
|
||||
#define SPECIALIZE_MEMORY_ALLOCATION_INFO( \
|
||||
FUNCTION, ENUM, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC, IMPLEMENTATION) \
|
||||
template <> \
|
||||
struct memory_allocation_info<ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION> \
|
||||
struct memory_allocation_info<FUNCTION> \
|
||||
{ \
|
||||
using maptype = MAPTYPE; \
|
||||
using pairtype = PAIRTYPE; \
|
||||
using searchtype = SEARCHTYPE; \
|
||||
auto& operator()() const { return ITERATEFUNC; } \
|
||||
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \
|
||||
static constexpr auto name = "MEMORY_ALLOCATION_" #FUNCTION; \
|
||||
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##ENUM; \
|
||||
static constexpr auto name = "MEMORY_ALLOCATION_" #ENUM; \
|
||||
\
|
||||
template <size_t TableIdx, size_t OpIdx, typename RetT, typename... Args> \
|
||||
static auto get_memory_allocation_impl(RetT (*)(Args...)) \
|
||||
{ \
|
||||
return &IMPLEMENTATION<TableIdx, OpIdx, Args...>; \
|
||||
} \
|
||||
};
|
||||
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(NONE,
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_NONE,
|
||||
NONE,
|
||||
region_to_agent_map,
|
||||
region_to_agent_pair,
|
||||
hsa_region_t,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(ALLOCATE,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn,
|
||||
memory_allocation_impl)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_MEMORY_ALLOCATE,
|
||||
ALLOCATE,
|
||||
region_to_agent_map,
|
||||
region_to_agent_pair,
|
||||
hsa_region_t,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(MEMORY_POOL_ALLOCATE,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn,
|
||||
memory_allocation_impl)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_MEMORY_POOL_ALLOCATE,
|
||||
ALLOCATE,
|
||||
memory_pool_to_agent_map,
|
||||
map_pool_to_agent_pair,
|
||||
hsa_amd_memory_pool_t,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(VMEM_HANDLE_CREATE,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
|
||||
memory_allocation_impl)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_VMEM_ALLOCATE,
|
||||
VMEM_ALLOCATE,
|
||||
memory_pool_to_agent_map,
|
||||
map_pool_to_agent_pair,
|
||||
hsa_amd_memory_pool_t,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn)
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
|
||||
memory_allocation_impl)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_MEMORY_FREE,
|
||||
FREE,
|
||||
region_to_agent_map,
|
||||
region_to_agent_pair,
|
||||
hsa_region_t,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn,
|
||||
memory_free_impl)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_MEMORY_POOL_FREE,
|
||||
FREE,
|
||||
memory_pool_to_agent_map,
|
||||
map_pool_to_agent_pair,
|
||||
hsa_amd_memory_pool_t,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
|
||||
memory_free_impl)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(HSA_AMD_VMEM_FREE,
|
||||
VMEM_FREE,
|
||||
memory_pool_to_agent_map,
|
||||
map_pool_to_agent_pair,
|
||||
hsa_amd_memory_pool_t,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn,
|
||||
memory_free_impl)
|
||||
#undef SPECIALIZE_MEMORY_ALLOCATION_INFO
|
||||
|
||||
// Map rocprofiler_memory_allocation_operation_t to respective name
|
||||
template <size_t OpIdx>
|
||||
struct memory_allocation_name;
|
||||
|
||||
#define MEMORY_ALLOCATION_NAME(ENUM) \
|
||||
template <> \
|
||||
struct memory_allocation_name<ROCPROFILER_MEMORY_ALLOCATION_##ENUM> \
|
||||
{ \
|
||||
static constexpr auto name = "MEMORY_ALLOCATION_" #ENUM; \
|
||||
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##ENUM; \
|
||||
};
|
||||
|
||||
MEMORY_ALLOCATION_NAME(NONE)
|
||||
MEMORY_ALLOCATION_NAME(ALLOCATE)
|
||||
MEMORY_ALLOCATION_NAME(VMEM_ALLOCATE)
|
||||
MEMORY_ALLOCATION_NAME(FREE)
|
||||
MEMORY_ALLOCATION_NAME(VMEM_FREE)
|
||||
#undef MEMORY_ALLOCATION_NAME
|
||||
|
||||
template <size_t Idx, size_t... IdxTail>
|
||||
const char*
|
||||
name_by_id(const uint32_t id, std::index_sequence<Idx, IdxTail...>)
|
||||
{
|
||||
if(Idx == id) return memory_allocation_info<Idx>::name;
|
||||
if(Idx == id) return memory_allocation_name<Idx>::name;
|
||||
if constexpr(sizeof...(IdxTail) > 0)
|
||||
return name_by_id(id, std::index_sequence<IdxTail...>{});
|
||||
else
|
||||
@@ -121,8 +197,8 @@ template <size_t Idx, size_t... IdxTail>
|
||||
uint32_t
|
||||
id_by_name(const char* name, std::index_sequence<Idx, IdxTail...>)
|
||||
{
|
||||
if(std::string_view{memory_allocation_info<Idx>::name} == std::string_view{name})
|
||||
return memory_allocation_info<Idx>::operation_idx;
|
||||
if(std::string_view{memory_allocation_name<Idx>::name} == std::string_view{name})
|
||||
return memory_allocation_name<Idx>::operation_idx;
|
||||
if constexpr(sizeof...(IdxTail) > 0)
|
||||
return id_by_name(name, std::index_sequence<IdxTail...>{});
|
||||
else
|
||||
@@ -137,7 +213,7 @@ get_ids(std::vector<uint32_t>& _id_list, std::index_sequence<Idx...>)
|
||||
if(_v < static_cast<uint32_t>(ROCPROFILER_MEMORY_ALLOCATION_LAST)) _vec.emplace_back(_v);
|
||||
};
|
||||
|
||||
(_emplace(_id_list, memory_allocation_info<Idx>::operation_idx), ...);
|
||||
(_emplace(_id_list, memory_allocation_name<Idx>::operation_idx), ...);
|
||||
}
|
||||
|
||||
template <size_t... Idx>
|
||||
@@ -148,7 +224,7 @@ get_names(std::vector<const char*>& _name_list, std::index_sequence<Idx...>)
|
||||
if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v);
|
||||
};
|
||||
|
||||
(_emplace(_name_list, memory_allocation_info<Idx>::name), ...);
|
||||
(_emplace(_name_list, memory_allocation_name<Idx>::name), ...);
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -169,17 +245,24 @@ context_filter(const context::context* ctx)
|
||||
enum memory_allocation_core_id
|
||||
{
|
||||
memory_allocation_core_allocate_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate,
|
||||
memory_allocation_core_free_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_free,
|
||||
};
|
||||
using memory_allocation_core_index_seq_t = std::index_sequence<memory_allocation_core_allocate_id>;
|
||||
using memory_allocation_core_index_seq_t =
|
||||
std::index_sequence<memory_allocation_core_allocate_id, memory_allocation_core_free_id>;
|
||||
|
||||
enum memory_allocation_amd_ext_id
|
||||
{
|
||||
memory_allocation_amd_ext_allocate_id =
|
||||
ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate,
|
||||
memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create
|
||||
memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create,
|
||||
memory_allocation_amd_ext_free_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_free,
|
||||
memory_allocation_vmem_release_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_release,
|
||||
};
|
||||
using memory_allocation_amd_ext_index_seq_t =
|
||||
std::index_sequence<memory_allocation_amd_ext_allocate_id, memory_allocation_vmem_allocate_id>;
|
||||
std::index_sequence<memory_allocation_amd_ext_allocate_id,
|
||||
memory_allocation_vmem_allocate_id,
|
||||
memory_allocation_amd_ext_free_id,
|
||||
memory_allocation_vmem_release_id>;
|
||||
|
||||
template <size_t TableIdx>
|
||||
struct memory_allocation_seq;
|
||||
@@ -203,14 +286,17 @@ struct arg_indices;
|
||||
template <> \
|
||||
struct arg_indices<ENUM_ID> \
|
||||
{ \
|
||||
static constexpr auto starting_address_idx = STARTING_ADDRESS_IDX; \
|
||||
static constexpr auto size_idx = SIZE_IDX; \
|
||||
static constexpr auto region_idx = REGION_IDX; \
|
||||
static constexpr auto address_idx = STARTING_ADDRESS_IDX; \
|
||||
static constexpr auto size_idx = SIZE_IDX; \
|
||||
static constexpr auto region_idx = REGION_IDX; \
|
||||
};
|
||||
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_allocate_id, 2, 1, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_allocate_id, 3, 1, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_allocate_id, 4, 1, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_free_id, 0, 0, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_free_id, 0, 0, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_release_id, 0, 0, 0)
|
||||
|
||||
// Define operation indices for each tracked function
|
||||
template <size_t Idx>
|
||||
@@ -220,12 +306,15 @@ struct memory_allocation_op;
|
||||
template <> \
|
||||
struct memory_allocation_op<ENUM_ID> \
|
||||
{ \
|
||||
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \
|
||||
static constexpr auto operation_idx = FUNCTION; \
|
||||
};
|
||||
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, ALLOCATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, MEMORY_POOL_ALLOCATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, VMEM_HANDLE_CREATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, HSA_MEMORY_ALLOCATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, HSA_AMD_MEMORY_POOL_ALLOCATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, HSA_AMD_VMEM_ALLOCATE)
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_free_id, HSA_MEMORY_FREE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_free_id, HSA_AMD_MEMORY_POOL_FREE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_release_id, HSA_AMD_VMEM_FREE);
|
||||
|
||||
template <typename FuncT, typename ArgsT, size_t... Idx>
|
||||
decltype(auto)
|
||||
@@ -255,7 +344,7 @@ struct memory_allocation_data
|
||||
rocprofiler_thread_id_t tid = common::get_tid();
|
||||
rocprofiler_agent_id_t agent = null_rocp_agent_id;
|
||||
uint64_t size_allocated = 0;
|
||||
uint64_t starting_addr = 0;
|
||||
rocprofiler_address_t address = {.value = 0};
|
||||
uint64_t start_ts = 0;
|
||||
context::correlation_id* correlation_id = nullptr;
|
||||
tracing::tracing_data tracing_data = {};
|
||||
@@ -271,7 +360,7 @@ memory_allocation_data::callback_data_t
|
||||
memory_allocation_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const
|
||||
{
|
||||
return common::init_public_api_struct(
|
||||
callback_data_t{}, _beg, _end, agent, starting_addr, size_allocated);
|
||||
callback_data_t{}, _beg, _end, agent, address, size_allocated);
|
||||
}
|
||||
|
||||
memory_allocation_data::buffered_data_t
|
||||
@@ -291,7 +380,7 @@ memory_allocation_data::get_buffered_record(const context_t* _ctx,
|
||||
_beg,
|
||||
_end,
|
||||
agent,
|
||||
starting_addr,
|
||||
address,
|
||||
size_allocated);
|
||||
}
|
||||
|
||||
@@ -336,16 +425,32 @@ get_agent(T val, IterateFunc iterate_func, CallbackFunc callback)
|
||||
return existing.count(val) == 0 ? null_rocp_agent_id : existing.at(val);
|
||||
}
|
||||
|
||||
// Handles starting address for allocation functions that report the address
// through an output pointer argument: returns the pointer value stored at
// starting_addr_pointer. The argument is dereferenced unconditionally, so it
// must not be null.
void*
handle_starting_addr(void** starting_addr_pointer)
{
    return *starting_addr_pointer;
}
|
||||
|
||||
uint64_t
|
||||
// The handle field of hsa_amd_vmem_alloc_handle_t is the starting address
|
||||
// cast as uint64_t, so returning the handle field after casting to void* suffices
|
||||
void*
|
||||
handle_starting_addr(hsa_amd_vmem_alloc_handle_t* vmem_alloc_handle)
|
||||
{
|
||||
return vmem_alloc_handle->handle;
|
||||
return reinterpret_cast<void*>(vmem_alloc_handle->handle);
|
||||
}
|
||||
|
||||
// Handling starting address for free memory operations: the argument already
// is the address being freed, so it is returned unchanged (a null pointer is
// passed through as null).
void*
handle_starting_addr(void* starting_addr_pointer)
{
    return starting_addr_pointer;
}
|
||||
|
||||
// Handles starting address for releasing handle
|
||||
void*
|
||||
handle_starting_addr(hsa_amd_vmem_alloc_handle_t vmem_alloc_handle)
|
||||
{
|
||||
return reinterpret_cast<void*>(vmem_alloc_handle.handle);
|
||||
}
|
||||
|
||||
// Wrapper implementation that stores memory allocation information
|
||||
@@ -353,11 +458,12 @@ template <size_t TableIdx, size_t OpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
memory_allocation_impl(Args... args)
|
||||
{
|
||||
constexpr auto N = sizeof...(Args);
|
||||
constexpr auto starting_address_idx = arg_indices<OpIdx>::starting_address_idx;
|
||||
constexpr auto size_idx = arg_indices<OpIdx>::size_idx;
|
||||
constexpr auto region_idx = arg_indices<OpIdx>::region_idx;
|
||||
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
|
||||
constexpr auto N = sizeof...(Args);
|
||||
constexpr auto address_idx = arg_indices<OpIdx>::address_idx;
|
||||
constexpr auto size_idx = arg_indices<OpIdx>::size_idx;
|
||||
constexpr auto region_idx = arg_indices<OpIdx>::region_idx;
|
||||
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
|
||||
constexpr auto rocprofiler_enum = memory_allocation_info<operation>::operation_idx;
|
||||
|
||||
auto&& _tied_args = std::tie(args...);
|
||||
memory_allocation_data _data{};
|
||||
@@ -380,7 +486,7 @@ memory_allocation_impl(Args... args)
|
||||
}
|
||||
|
||||
auto& tracing_data = _data.tracing_data;
|
||||
auto starting_addr_pointer = std::get<starting_address_idx>(_tied_args);
|
||||
auto starting_addr_pointer = std::get<address_idx>(_tied_args);
|
||||
auto region_or_pool = std::get<region_idx>(_tied_args);
|
||||
|
||||
_data.tid = common::get_tid();
|
||||
@@ -389,7 +495,7 @@ memory_allocation_impl(Args... args)
|
||||
memory_allocation_info<operation>{}(),
|
||||
callback_populate_map<operation, typename memory_allocation_info<operation>::searchtype>);
|
||||
_data.size_allocated = std::get<size_idx>(_tied_args);
|
||||
_data.func = operation;
|
||||
_data.func = rocprofiler_enum;
|
||||
_data.correlation_id = context::get_latest_correlation_id();
|
||||
|
||||
if(!_data.correlation_id)
|
||||
@@ -405,7 +511,7 @@ memory_allocation_impl(Args... args)
|
||||
tracing_data.external_correlation_ids,
|
||||
thr_id,
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
rocprofiler_enum,
|
||||
_data.correlation_id->internal);
|
||||
|
||||
if(!tracing_data.callback_contexts.empty())
|
||||
@@ -417,7 +523,7 @@ memory_allocation_impl(Args... args)
|
||||
_data.correlation_id->internal,
|
||||
tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
rocprofiler_enum,
|
||||
_tracer_data);
|
||||
// enter callback may update the external correlation id field
|
||||
tracing::update_external_correlation_ids(
|
||||
@@ -433,7 +539,7 @@ memory_allocation_impl(Args... args)
|
||||
// checks before retrieving starting address?
|
||||
if(starting_addr_pointer != nullptr)
|
||||
{
|
||||
_data.starting_addr = handle_starting_addr(starting_addr_pointer);
|
||||
_data.address.ptr = handle_starting_addr(starting_addr_pointer);
|
||||
}
|
||||
|
||||
if(!tracing_data.empty())
|
||||
@@ -445,7 +551,7 @@ memory_allocation_impl(Args... args)
|
||||
tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts,
|
||||
_data.tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
rocprofiler_enum,
|
||||
_tracer_data);
|
||||
}
|
||||
|
||||
@@ -458,7 +564,7 @@ memory_allocation_impl(Args... args)
|
||||
_data.correlation_id->internal,
|
||||
_data.tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
rocprofiler_enum,
|
||||
record);
|
||||
}
|
||||
}
|
||||
@@ -468,11 +574,113 @@ memory_allocation_impl(Args... args)
|
||||
return _ret;
|
||||
}
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx, typename RetT, typename... Args>
|
||||
auto get_memory_allocation_impl(RetT (*)(Args...))
|
||||
// Wrapper implementation that stores memory free operation information
|
||||
template <size_t TableIdx, size_t OpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
memory_free_impl(Args... args)
|
||||
{
|
||||
return &memory_allocation_impl<TableIdx, OpIdx, Args...>;
|
||||
constexpr auto N = sizeof...(Args);
|
||||
constexpr auto address_idx = arg_indices<OpIdx>::address_idx;
|
||||
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
|
||||
constexpr auto rocprofiler_enum = memory_allocation_info<operation>::operation_idx;
|
||||
|
||||
auto&& _tied_args = std::tie(args...);
|
||||
memory_allocation_data _data{};
|
||||
|
||||
{
|
||||
auto tracing_data = tracing::tracing_data{};
|
||||
|
||||
tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
OpIdx,
|
||||
tracing_data);
|
||||
// if no contexts are tracing memory copies for this direction, execute as usual
|
||||
if(tracing_data.empty())
|
||||
{
|
||||
return invoke(get_next_dispatch<TableIdx, OpIdx>(),
|
||||
std::move(_tied_args),
|
||||
std::make_index_sequence<N>{});
|
||||
}
|
||||
_data.tracing_data = std::move(tracing_data);
|
||||
}
|
||||
|
||||
auto& tracing_data = _data.tracing_data;
|
||||
|
||||
_data.tid = common::get_tid();
|
||||
_data.func = rocprofiler_enum;
|
||||
_data.correlation_id = context::get_latest_correlation_id();
|
||||
_data.address.ptr = handle_starting_addr(std::get<address_idx>(_tied_args));
|
||||
|
||||
if(!_data.correlation_id)
|
||||
{
|
||||
constexpr auto ref_count = 1;
|
||||
_data.correlation_id = context::correlation_tracing_service::construct(ref_count);
|
||||
}
|
||||
|
||||
// increase the reference count to denote that this correlation id is being used in a kernel
|
||||
_data.correlation_id->add_ref_count();
|
||||
auto thr_id = _data.correlation_id->thread_idx;
|
||||
tracing::populate_external_correlation_ids(
|
||||
tracing_data.external_correlation_ids,
|
||||
thr_id,
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION,
|
||||
rocprofiler_enum,
|
||||
_data.correlation_id->internal);
|
||||
|
||||
if(!tracing_data.callback_contexts.empty())
|
||||
{
|
||||
auto _tracer_data = _data.get_callback_data();
|
||||
|
||||
tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts,
|
||||
thr_id,
|
||||
_data.correlation_id->internal,
|
||||
tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
rocprofiler_enum,
|
||||
_tracer_data);
|
||||
// enter callback may update the external correlation id field
|
||||
tracing::update_external_correlation_ids(
|
||||
tracing_data.external_correlation_ids,
|
||||
thr_id,
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION);
|
||||
}
|
||||
auto start_ts = common::timestamp_ns();
|
||||
auto _ret = invoke(
|
||||
get_next_dispatch<TableIdx, OpIdx>(), std::move(_tied_args), std::make_index_sequence<N>{});
|
||||
auto end_ts = common::timestamp_ns();
|
||||
|
||||
if(!tracing_data.empty())
|
||||
{
|
||||
if(!_data.tracing_data.callback_contexts.empty())
|
||||
{
|
||||
auto _tracer_data = _data.get_callback_data(start_ts, end_ts);
|
||||
|
||||
tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts,
|
||||
_data.tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
rocprofiler_enum,
|
||||
_tracer_data);
|
||||
}
|
||||
|
||||
if(!_data.tracing_data.buffered_contexts.empty())
|
||||
{
|
||||
auto record = _data.get_buffered_record(nullptr, start_ts, end_ts);
|
||||
|
||||
tracing::execute_buffer_record_emplace(_data.tracing_data.buffered_contexts,
|
||||
_data.tid,
|
||||
_data.correlation_id->internal,
|
||||
_data.tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
rocprofiler_enum,
|
||||
record);
|
||||
}
|
||||
}
|
||||
|
||||
// decrement the reference count after usage in the callback/buffers
|
||||
_data.correlation_id->sub_ref_count();
|
||||
return _ret;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
// check out the assembly here... this compiles to a switch statement
|
||||
const char*
|
||||
@@ -573,7 +781,9 @@ memory_allocation_wrap(Tp* _orig, std::integral_constant<size_t, OpIdx>)
|
||||
|
||||
auto& _dispatch = get_next_dispatch<TableIdx, OpIdx>();
|
||||
CHECK_NOTNULL(_dispatch);
|
||||
_func = get_memory_allocation_impl<TableIdx, OpIdx>(_func);
|
||||
constexpr auto LocalIdx = memory_allocation_op<OpIdx>::operation_idx;
|
||||
_func = memory_allocation_info<LocalIdx>::template get_memory_allocation_impl<TableIdx, OpIdx>(
|
||||
_func);
|
||||
}
|
||||
|
||||
template <size_t TableIdx, typename Tp, size_t OpIdx, size_t... OpIdxTail>
|
||||
|
||||
+7
-1
@@ -170,6 +170,8 @@ call_hsa_memory_allocate(const size_t i, const size_t base_size, hsa_agent_t age
|
||||
|
||||
status = hsa_memory_allocate(region_list[0], base_size, &addr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
status = hsa_memory_free(addr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,6 +201,8 @@ call_hsa_memory_pool_allocate(const size_t i, const size_t base_size, hsa_agent_
|
||||
|
||||
status = hsa_amd_memory_pool_allocate(memory_pool_list[0], base_size, flags, &addr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
status = hsa_amd_memory_pool_free(addr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,6 +247,8 @@ call_hsa_vmem_allocate(const size_t i, hsa_agent_t agent)
|
||||
status = hsa_amd_vmem_handle_create(
|
||||
memory_pool_list[0], size, MEMORY_TYPE_NONE, 0, &memory_handle);
|
||||
RET_IF_HSA_ERR(status)
|
||||
status = hsa_amd_vmem_handle_release(memory_handle);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,7 +263,7 @@ main()
|
||||
hsa_agent_t cpu_agent = get_cpu_agent(agents);
|
||||
hsa_agent_t gpu_agent = get_gpu_agent(agents);
|
||||
call_hsa_memory_allocate(6, 1024, cpu_agent);
|
||||
call_hsa_memory_pool_allocate(9, 512, gpu_agent);
|
||||
call_hsa_memory_pool_allocate(9, 2048, gpu_agent);
|
||||
// Virtual memory API not supported in CI. Will add back if this changes
|
||||
// call_hsa_vmem_allocate(3, gpu_agent);
|
||||
|
||||
|
||||
@@ -175,25 +175,26 @@ def test_memory_alloc_sizes(input_data):
|
||||
# Op values:
|
||||
# 0 == ??? (unknown)
|
||||
# 1 == hsa_memory_allocate
|
||||
# 2 == hsa_amd_ext_memory_allocate
|
||||
# 3 == hsa_amd_vmem_handle_create
|
||||
# 2 == hsa_amd_vmem_handle_create
|
||||
# 3 == hsa_memory_free
|
||||
# 4 == hsa_amd_vmem_handle_release
|
||||
memory_alloc_cnt = dict(
|
||||
[
|
||||
(idx, {"agent": set(), "starting_addr": set(), "size": set(), "count": 0})
|
||||
for idx in range(1, 4)
|
||||
for idx in range(1, 5)
|
||||
]
|
||||
)
|
||||
for itr in sdk_data["buffer_records"]["memory_allocations"]:
|
||||
op_id = itr["operation"]
|
||||
assert op_id > 0 and op_id <= 3, f"{itr}"
|
||||
assert op_id > 0 and op_id <= 5, f"{itr}"
|
||||
memory_alloc_cnt[op_id]["count"] += 1
|
||||
memory_alloc_cnt[op_id]["starting_addr"].add(itr.starting_address)
|
||||
memory_alloc_cnt[op_id]["starting_addr"].add(itr.address)
|
||||
memory_alloc_cnt[op_id]["size"].add(itr.allocation_size)
|
||||
memory_alloc_cnt[op_id]["agent"].add(itr.agent_id.handle)
|
||||
|
||||
for itr in sdk_data["callback_records"]["memory_copies"]:
|
||||
op_id = itr.operation
|
||||
assert op_id > 0 and op_id <= 3, f"{itr}"
|
||||
assert op_id > 0 and op_id <= 5, f"{itr}"
|
||||
memory_alloc_cnt[op_id]["count"] += 1
|
||||
|
||||
phase = itr.phase
|
||||
@@ -210,7 +211,7 @@ def test_memory_alloc_sizes(input_data):
|
||||
assert pitr.end_timestamp > 0, f"{itr}"
|
||||
assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}"
|
||||
|
||||
memory_alloc_cnt[op_id]["starting_addr"].add(pitr.starting_address)
|
||||
memory_alloc_cnt[op_id]["starting_addr"].add(pitr.address)
|
||||
memory_alloc_cnt[op_id]["size"].add(pitr.allocation_size)
|
||||
memory_alloc_cnt[op_id]["agent"].add(pitr.agent_id.handle)
|
||||
else:
|
||||
@@ -218,24 +219,22 @@ def test_memory_alloc_sizes(input_data):
|
||||
|
||||
# In the memory allocation test which generates this file
|
||||
# 6 hsa_memory_allocation calls with 1024 bytes were called
|
||||
# and 9 hsa_amd_memory_pool_allocations with 512 bytes
|
||||
# and 9 hsa_amd_memory_pool_allocations with 2048 bytes
|
||||
# were called
|
||||
assert memory_alloc_cnt[1]["count"] == 6
|
||||
assert memory_alloc_cnt[2]["count"] == 9
|
||||
assert memory_alloc_cnt[1]["count"] == 15
|
||||
assert memory_alloc_cnt[3]["count"] == 15
|
||||
# assert memory_alloc_cnt[3]["count"] == 3
|
||||
assert len(memory_alloc_cnt[1]["starting_addr"]) == 6
|
||||
assert len(memory_alloc_cnt[2]["starting_addr"]) == 9
|
||||
assert len(memory_alloc_cnt[1]["starting_addr"]) == len(
|
||||
memory_alloc_cnt[3]["starting_addr"]
|
||||
)
|
||||
|
||||
# assert len(memory_alloc_cnt[3]["starting_addr"]) == 3
|
||||
assert len(memory_alloc_cnt[1]["size"]) == 1
|
||||
assert len(memory_alloc_cnt[2]["size"]) == 1
|
||||
assert len(memory_alloc_cnt[1]["size"]) == 2
|
||||
# assert len(memory_alloc_cnt[3]["size"]) == 1
|
||||
assert 1024 in memory_alloc_cnt[1]["size"]
|
||||
assert 512 in memory_alloc_cnt[2]["size"]
|
||||
assert len(memory_alloc_cnt[1]["agent"]) == 1
|
||||
assert len(memory_alloc_cnt[2]["agent"]) == 1
|
||||
assert 2048 in memory_alloc_cnt[1]["size"]
|
||||
assert len(memory_alloc_cnt[1]["agent"]) == 2
|
||||
# assert len(memory_alloc_cnt[3]["agent"]) == 1
|
||||
assert memory_alloc_cnt[1]["agent"] != memory_alloc_cnt[2]["agent"]
|
||||
# assert memory_alloc_cnt[2]["agent"] == memory_alloc_cnt[3]["agent"]
|
||||
|
||||
|
||||
def test_retired_correlation_ids(input_data):
|
||||
|
||||
@@ -19,9 +19,11 @@ string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
|
||||
# disable this test for thread sanitizers because of "signal-unsafe call inside signal"
|
||||
# issues on mi200 and mi300 (works fine on vega20 and navi32)
|
||||
if(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer")
|
||||
set(IS_THREAD_SANITIZER ON)
|
||||
set(DISABLE_THIS_TEST ON)
|
||||
else()
|
||||
set(IS_THREAD_SANITIZER OFF)
|
||||
# set(DISABLE_THIS_TEST OFF)
|
||||
set(DISABLE_THIS_TEST ON) # this test is currently unstable so we are disabling it
|
||||
# unconditionally for now
|
||||
endif()
|
||||
|
||||
set(aborted-app-env "${PRELOAD_ENV}" ROCPROF_TESTING_RAISE_SIGNAL=1
|
||||
@@ -46,7 +48,7 @@ set_tests_properties(
|
||||
WILL_FAIL
|
||||
TRUE
|
||||
DISABLED
|
||||
"${IS_THREAD_SANITIZER}")
|
||||
"${DISABLE_THIS_TEST}")
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-validate-app-abort
|
||||
@@ -64,4 +66,4 @@ set_tests_properties(
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}"
|
||||
DISABLED
|
||||
"${IS_THREAD_SANITIZER}")
|
||||
"${DISABLE_THIS_TEST}")
|
||||
|
||||
@@ -21,7 +21,7 @@ add_test(
|
||||
NAME rocprofv3-test-memory-allocation-tracing-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --memory-allocation-trace -d
|
||||
${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json pftrace otf2
|
||||
${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json otf2
|
||||
--log-level env -- $<TARGET_FILE:hsa-memory-allocation>)
|
||||
|
||||
set_tests_properties(
|
||||
@@ -35,8 +35,6 @@ add_test(
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.json
|
||||
--pftrace-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.pftrace
|
||||
--otf2-input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.otf2)
|
||||
|
||||
|
||||
@@ -17,12 +17,6 @@ def pytest_addoption(parser):
|
||||
default="memory-allocation-tracing/out_results.json",
|
||||
help="Input JSON",
|
||||
)
|
||||
parser.addoption(
|
||||
"--pftrace-input",
|
||||
action="store",
|
||||
default="memory-allocation-tracing/out_results.pftrace",
|
||||
help="Input JSON",
|
||||
)
|
||||
parser.addoption(
|
||||
"--otf2-input",
|
||||
action="store",
|
||||
@@ -38,12 +32,6 @@ def json_data(request):
|
||||
return dotdict(collapse_dict_list(json.load(inp)))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pftrace_data(request):
|
||||
filename = request.config.getoption("--pftrace-input")
|
||||
return PerfettoReader(filename).read()[0]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def otf2_data(request):
|
||||
filename = request.config.getoption("--otf2-input")
|
||||
|
||||
@@ -35,7 +35,7 @@ def test_memory_allocation(json_data):
|
||||
|
||||
_, bf_op_names = get_operation(data, "MEMORY_ALLOCATION")
|
||||
|
||||
assert len(bf_op_names) == 4
|
||||
assert len(bf_op_names) == 5
|
||||
|
||||
allocation_reported_agent_ids = set()
|
||||
# check buffering data
|
||||
@@ -49,12 +49,12 @@ def test_memory_allocation(json_data):
|
||||
assert "thread_id" in node
|
||||
|
||||
assert "agent_id" in node
|
||||
assert "starting_address" in node
|
||||
assert "address" in node
|
||||
assert "allocation_size" in node
|
||||
|
||||
assert node.size > 0
|
||||
assert node.allocation_size > 0
|
||||
assert node.starting_address > 0
|
||||
assert node.allocation_size >= 0
|
||||
assert len(node.address) > 0
|
||||
assert node.thread_id > 0
|
||||
assert node.agent_id.handle > 0
|
||||
assert node.start_timestamp > 0
|
||||
@@ -69,14 +69,6 @@ def test_memory_allocation(json_data):
|
||||
|
||||
allocation_reported_agent_ids.add(node["agent_id"]["handle"])
|
||||
|
||||
assert 2**64 - 1 not in allocation_reported_agent_ids
|
||||
|
||||
|
||||
def test_perfetto_data(pftrace_data, json_data):
|
||||
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
|
||||
|
||||
rocprofv3.test_perfetto_data(pftrace_data, json_data, ("memory_allocation",))
|
||||
|
||||
|
||||
def test_otf2_data(otf2_data, json_data):
|
||||
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
|
||||
|
||||
@@ -242,12 +242,12 @@ def test_summary_display_data(json_data, summary_data):
|
||||
|
||||
assert get_dims(marker) == [7, 9], f"{marker}"
|
||||
assert get_dims(memcpy) == [2, 9], f"{memcpy}"
|
||||
assert get_dims(memalloc) == [1, 9], f"{memalloc}"
|
||||
assert get_dims(memalloc) == [2, 9], f"{memalloc}"
|
||||
assert get_dims(dispatch) == [3, 9], f"{dispatch}"
|
||||
assert get_dims(dispatch_and_copy) == [5, 9], f"{dispatch_and_copy}"
|
||||
assert get_dims(hip) == [14, 9], f"{hip}"
|
||||
assert get_dims(hip_and_marker) == expected_hip_and_marker_dims, f"{hip_and_marker}"
|
||||
assert get_dims(total) == [24, 9], f"{total}"
|
||||
assert get_dims(total) == [25, 9], f"{total}"
|
||||
|
||||
|
||||
def test_perfetto_data(pftrace_data, json_data):
|
||||
@@ -256,7 +256,7 @@ def test_perfetto_data(pftrace_data, json_data):
|
||||
rocprofv3.test_perfetto_data(
|
||||
pftrace_data,
|
||||
json_data,
|
||||
("hip", "marker", "kernel", "memory_copy", "memory_allocation"),
|
||||
("hip", "marker", "kernel", "memory_copy"),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@
|
||||
#include <rocprofiler-sdk/internal_threading.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
#include <rocprofiler-sdk/cxx/utility.hpp>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <algorithm>
|
||||
@@ -1843,6 +1844,7 @@ write_perfetto()
|
||||
|
||||
auto tids = std::set<rocprofiler_thread_id_t>{};
|
||||
auto agent_ids = std::set<uint64_t>{};
|
||||
auto agent_ids_alloc = std::set<uint64_t>{};
|
||||
auto agent_queue_ids = std::map<uint64_t, std::set<uint64_t>>{};
|
||||
|
||||
auto _get_agent = [](uint64_t id_handle) -> const rocprofiler_agent_t* {
|
||||
@@ -1875,7 +1877,7 @@ write_perfetto()
|
||||
for(auto itr : memory_allocation_bf_records)
|
||||
{
|
||||
tids.emplace(itr.thread_id);
|
||||
agent_ids.emplace(itr.agent_id.handle);
|
||||
agent_ids_alloc.emplace(itr.agent_id.handle);
|
||||
}
|
||||
|
||||
for(auto itr : kernel_dispatch_bf_records)
|
||||
@@ -1934,6 +1936,36 @@ write_perfetto()
|
||||
agent_tracks.emplace(itr, _track);
|
||||
}
|
||||
|
||||
for(auto itr : agent_ids_alloc)
|
||||
{
|
||||
const auto* _agent = _get_agent(itr);
|
||||
auto _namess = std::stringstream{};
|
||||
|
||||
if(_agent != nullptr)
|
||||
{
|
||||
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
_namess << "CPU MEMORY OPERATION [" << itr << "] ";
|
||||
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
_namess << "GPU MEMORY OPERATION [" << itr << "] ";
|
||||
|
||||
if(!std::string_view{_agent->model_name}.empty())
|
||||
_namess << _agent->model_name;
|
||||
else
|
||||
_namess << _agent->product_name;
|
||||
}
|
||||
else
|
||||
{
|
||||
_namess << "UNKNOWN MEMORY OPERATION [" << itr << "] ";
|
||||
}
|
||||
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
|
||||
auto _desc = _track.Serialize();
|
||||
_desc.set_name(_namess.str());
|
||||
|
||||
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
|
||||
|
||||
agent_tracks.emplace(itr, _track);
|
||||
}
|
||||
|
||||
auto agent_queue_tracks =
|
||||
std::unordered_map<uint64_t, std::unordered_map<uint64_t, ::perfetto::Track>>{};
|
||||
|
||||
@@ -2155,35 +2187,6 @@ write_perfetto()
|
||||
itr.end_timestamp);
|
||||
}
|
||||
|
||||
for(auto itr : memory_allocation_bf_records)
|
||||
{
|
||||
auto name = buffer_names.at(itr.kind, itr.operation);
|
||||
auto& track = agent_tracks.at(itr.agent_id.handle);
|
||||
|
||||
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
::perfetto::StaticString(name.data()),
|
||||
track,
|
||||
itr.start_timestamp,
|
||||
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
|
||||
"begin_ns",
|
||||
itr.start_timestamp,
|
||||
"kind",
|
||||
itr.kind,
|
||||
"operation",
|
||||
itr.operation,
|
||||
"agent",
|
||||
agents_map.at(itr.agent_id).logical_node_id,
|
||||
"Allocation_size",
|
||||
itr.allocation_size,
|
||||
"Starting_address",
|
||||
itr.starting_address);
|
||||
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
track,
|
||||
itr.end_timestamp,
|
||||
"end_ns",
|
||||
itr.end_timestamp);
|
||||
}
|
||||
|
||||
auto demangled = std::unordered_map<std::string_view, std::string>{};
|
||||
for(auto itr : kernel_dispatch_bf_records)
|
||||
{
|
||||
|
||||
Ссылка в новой задаче
Block a user