Memory Allocation Tracking (#1142)
* Initial commit: Need to implement wrapper function to collect data and test that wrapper function is correctly replacing core HSA functions * Attempted to implement wrapper implementation for hsa memory allocation functions. Need to modify generate record files and test if implementation is working as expected * Debugging and implementing generateCSV function * Memory allocation size and starting address outputted to csv and json file formats * Formatting * Initial setup for OTF2 and Perfetto generation * Collecting agent id for memory_allocation and formatting * Modified memory_allocation.cpp to set up code for AMD_EXT commands * Support for memory_pool_allocate added * Removed accidently added file * Made flag optional and added more OTF2 and Perfetto code. Needs testing to ensure perfetto and OTF2 works * Formatting * Fixed perfetto and otf2 output * Fixed flag issue due to incorrect buffer use * Updated documentation * Small cleaning and comments * Added test for HSA memory allocation tracing * Fixed summary test validation errors due to allocation tracing. Added type to location_base to create unique event ids for allocation due to OTF2 trace error * Decreased lower limit of hip calls for test * Modified summary tests to vary number of allocate requests * Minor fixes to address comments. Still need to address OTF2 comments * Fix docs and changed OTF2 to use enum for type specified in location_base construction * Fixed schema error * Added vmem command tracking. Need to add test * Updated test to work with vmem command and updated generateCSV to output int instead of hex string. * OTF2 enum update and mispelling fix * CI does not support Virtual Memory API. Removed vmem test. 
Will add back if CI is modified to support the vmem API * Update CMakeLists.txt for memory allocation test * Updated summary test * Minor fixes to address comments * Moved domain_type.hpp enum to before LAST * Fixed compile errors and formatting * Fixed stats summary domain name error * Added rocprofv3 test * Page migration test fix * Undo page migration test changes. Failures do not appear to have to do with memory allocation
This commit is contained in:
@@ -108,6 +108,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
|
||||
- Start and end timestamp columns to the counter collection csv output
|
||||
- Check to force tools to initialize context id with zero
|
||||
- Support to specify hardware counters for collection using rocprofv3 as `rocprofv3 --pmc [COUNTER [COUNTER ...]]`
|
||||
- Memory Allocation Tracing
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
+10
-2
@@ -141,13 +141,13 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
aggregate_tracing_options,
|
||||
"-r",
|
||||
"--runtime-trace",
|
||||
help="Collect tracing data for HIP runtime API, Marker (ROCTx) API, RCCL API, Memory operations (copies and scratch), and Kernel dispatches. Similar to --sys-trace but without tracing HIP compiler API and the underlying HSA API.",
|
||||
help="Collect tracing data for HIP runtime API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocation), and Kernel dispatches. Similar to --sys-trace but without tracing HIP compiler API and the underlying HSA API.",
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
aggregate_tracing_options,
|
||||
"-s",
|
||||
"--sys-trace",
|
||||
help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies and scratch), and Kernel dispatches.",
|
||||
help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocations), and Kernel dispatches.",
|
||||
)
|
||||
|
||||
basic_tracing_options = parser.add_argument_group("Basic tracing options")
|
||||
@@ -173,6 +173,11 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
"--memory-copy-trace",
|
||||
help="For collecting Memory Copy Traces. This was part of HIP and HSA traces in previous rocprof versions but is now a separate option",
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
basic_tracing_options,
|
||||
"--memory-allocation-trace",
|
||||
help="For collecting Memory Allocation Traces. Displays starting address, allocation size, and agent where allocation occurred.",
|
||||
)
|
||||
add_parser_bool_argument(
|
||||
basic_tracing_options,
|
||||
"--scratch-memory-trace",
|
||||
@@ -686,6 +691,7 @@ def run(app_args, args, **kwargs):
|
||||
"marker_trace",
|
||||
"kernel_trace",
|
||||
"memory_copy_trace",
|
||||
"memory_allocation_trace",
|
||||
"scratch_memory_trace",
|
||||
"rccl_trace",
|
||||
):
|
||||
@@ -697,6 +703,7 @@ def run(app_args, args, **kwargs):
|
||||
"marker_trace",
|
||||
"kernel_trace",
|
||||
"memory_copy_trace",
|
||||
"memory_allocation_trace",
|
||||
"scratch_memory_trace",
|
||||
"rccl_trace",
|
||||
):
|
||||
@@ -724,6 +731,7 @@ def run(app_args, args, **kwargs):
|
||||
["rccl_trace", "RCCL_API_TRACE"],
|
||||
["kernel_trace", "KERNEL_TRACE"],
|
||||
["memory_copy_trace", "MEMORY_COPY_TRACE"],
|
||||
["memory_allocation_trace", "MEMORY_ALLOCATION_TRACE"],
|
||||
["scratch_memory_trace", "SCRATCH_MEMORY_TRACE"],
|
||||
]
|
||||
).items():
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
"Kind","Operation","Agent_Id","Allocation_Size","Starting_Address","Correlation_Id","Start_Timestamp","End_Timestamp"
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497356288,1,65788054621500,65788055678893
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497348096,1,65788055691832,65788056666844
|
||||
"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497339904,1,65788056672061,65788057643457
|
||||
|
@@ -55,11 +55,11 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt
|
||||
- Output control
|
||||
|
||||
* - ``-r`` \| ``--runtime-trace``
|
||||
- Collects HIP (runtime), memory copy, marker, scratch memory, and kernel dispatch traces.
|
||||
- Collects HIP (runtime), memory copy, memory allocation, marker, scratch memory, and kernel dispatch traces.
|
||||
- Application Tracing
|
||||
|
||||
* - ``-s`` \| ``--sys-trace``
|
||||
- Collects HIP, HSA, memory copy, marker, scratch memory, and kernel dispatch traces.
|
||||
- Collects HIP, HSA, memory copy, memory allocation, marker, scratch memory, and kernel dispatch traces.
|
||||
- Application Tracing
|
||||
|
||||
* - ``--hip-trace``
|
||||
@@ -78,6 +78,10 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt
|
||||
- Collects memory copy traces.
|
||||
- Application tracing
|
||||
|
||||
* - ``--memory-allocation-trace``
|
||||
- Collects memory allocation traces.
|
||||
- Application tracing
|
||||
|
||||
* - ``--scratch-memory-trace``
|
||||
- Collects scratch memory operations traces.
|
||||
- Application tracing
|
||||
@@ -356,6 +360,30 @@ Here are the contents of ``memory_copy_trace.csv`` file:
|
||||
|
||||
For the description of the fields in the output file, see :ref:`output-file-fields`.
|
||||
|
||||
Memory allocation trace
|
||||
+++++++++++++++++++++++++
|
||||
|
||||
To trace memory allocations during the application run, use:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
rocprofv3 --memory-allocation-trace -- < app_path >
|
||||
|
||||
The above command generates a ``memory_allocation_trace.csv`` file prefixed with the process ID.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
$ cat 6489_memory_allocation_trace.csv
|
||||
|
||||
Here are the contents of ``memory_allocation_trace.csv`` file:
|
||||
|
||||
.. csv-table:: Memory allocation trace
|
||||
:file: /data/memory_allocation_trace.csv
|
||||
:widths: 10,10,10,10,10,10,20,20
|
||||
:header-rows: 1
|
||||
|
||||
For the description of the fields in the output file, see :ref:`output-file-fields`.
|
||||
|
||||
Runtime trace
|
||||
+++++++++++++++
|
||||
|
||||
@@ -374,7 +402,7 @@ memory operations (copies and scratch).
|
||||
|
||||
rocprofv3 --runtime-trace -- < app_relative_path >
|
||||
|
||||
Running the above command generates ``hip_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, ``scratch_memory_trace.csv``,and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID.
|
||||
Running the above command generates ``hip_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, ``scratch_memory_trace.csv``, ``memory_allocation_trace.csv``, and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID.
|
||||
|
||||
System trace
|
||||
++++++++++++++
|
||||
@@ -385,7 +413,7 @@ This is an all-inclusive option to collect all the above-mentioned traces.
|
||||
|
||||
rocprofv3 --sys-trace -- < app_relative_path >
|
||||
|
||||
Running the above command generates ``hip_api_trace.csv``, ``hsa_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID.
|
||||
Running the above command generates ``hip_api_trace.csv``, ``hsa_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, ``memory_allocation_trace.csv``, and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID.
|
||||
|
||||
Scratch memory trace
|
||||
++++++++++++++++++++++
|
||||
@@ -464,6 +492,8 @@ Properties
|
||||
Dispatch Traces.
|
||||
- **``memory_copy_trace``** *(boolean)*: For Collecting Memory
|
||||
Copy Traces.
|
||||
- **``memory_allocation_trace``** *(boolean)*: For Collecting Memory
|
||||
Allocation Traces.
|
||||
- **``scratch_memory_trace``** *(boolean)*: For Collecting
|
||||
Scratch Memory operations Traces.
|
||||
- **``stats``** *(boolean)*: For Collecting statistics of enabled
|
||||
@@ -479,8 +509,8 @@ Properties
|
||||
- **``hsa_image_trace``** *(boolean)*: For Collecting HSA API
|
||||
Traces (Image-extension API).
|
||||
- **``sys_trace``** *(boolean)*: For Collecting HIP, HSA, Marker
|
||||
(ROCTx), Memory copy, Scratch memory, and Kernel dispatch
|
||||
traces.
|
||||
(ROCTx), Memory copy, Memory allocation, Scratch memory, and
|
||||
Kernel dispatch traces.
|
||||
- **``mangled_kernels``** *(boolean)*: Do not demangle the kernel
|
||||
names.
|
||||
- **``truncate_kernels``** *(boolean)*: Truncate the demangled
|
||||
@@ -990,3 +1020,18 @@ Properties
|
||||
- **`src_agent_id`** *(object, required)*: Source Agent ID.
|
||||
- **`handle`** *(integer, required)*: Handle of the agent.
|
||||
- **`bytes`** *(integer, required)*: Bytes copied.
|
||||
- **`memory_allocation`** *(array)*: Memory allocation records.
|
||||
- **Items** *(object)*
|
||||
- **`size`** *(integer, required)*: Size of the memory allocation record.
|
||||
- **`kind`** *(integer, required)*: Kind of the memory allocation record.
|
||||
- **`operation`** *(integer, required)*: Operation of the memory allocation record.
|
||||
- **`correlation_id`** *(object, required)*: Correlation ID information.
|
||||
- **`internal`** *(integer, required)*: Internal correlation ID.
|
||||
- **`external`** *(integer, required)*: External correlation ID.
|
||||
- **`start_timestamp`** *(integer, required)*: Start timestamp.
|
||||
- **`end_timestamp`** *(integer, required)*: End timestamp.
|
||||
- **`thread_id`** *(integer, required)*: Thread ID.
|
||||
- **`agent_id`** *(object, required)*: Agent ID.
|
||||
- **`handle`** *(integer, required)*: Handle of the agent.
|
||||
- **`starting_address`** *(integer, required)*: Starting address of allocation.
|
||||
- **`allocation_size`** *(integer, required)*: Size of allocation.
|
||||
|
||||
@@ -1374,6 +1374,90 @@
|
||||
"bytes"
|
||||
]
|
||||
}
|
||||
},
|
||||
"memory_allocation": {
|
||||
"type": "array",
|
||||
"description": "Memory allocation records.",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"size": {
|
||||
"type": "integer",
|
||||
"description": "Size of the memory allocation record."
|
||||
},
|
||||
"kind": {
|
||||
"type": "integer",
|
||||
"description": "Kind of the memory allocation record."
|
||||
},
|
||||
"operation": {
|
||||
"type": "integer",
|
||||
"description": "Operation of the memory allocation record."
|
||||
},
|
||||
"correlation_id": {
|
||||
"type": "object",
|
||||
"description": "Correlation ID information.",
|
||||
"properties": {
|
||||
"internal": {
|
||||
"type": "integer",
|
||||
"description": "Internal correlation ID."
|
||||
},
|
||||
"external": {
|
||||
"type": "integer",
|
||||
"description": "External correlation ID."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"internal",
|
||||
"external"
|
||||
]
|
||||
},
|
||||
"start_timestamp": {
|
||||
"type": "integer",
|
||||
"description": "Start timestamp."
|
||||
},
|
||||
"end_timestamp": {
|
||||
"type": "integer",
|
||||
"description": "End timestamp."
|
||||
},
|
||||
"thread_id": {
|
||||
"type": "integer",
|
||||
"description": "Thread ID."
|
||||
},
|
||||
"agent_id": {
|
||||
"type": "object",
|
||||
"description": "Agent ID.",
|
||||
"properties": {
|
||||
"handle": {
|
||||
"type": "integer",
|
||||
"description": "Handle of the agent."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"handle"
|
||||
]
|
||||
},
|
||||
"starting_address": {
|
||||
"type": "integer",
|
||||
"description": "Starting address of allocation"
|
||||
},
|
||||
"allocation_size": {
|
||||
"type": "integer",
|
||||
"description": "Size of allocation."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"size",
|
||||
"kind",
|
||||
"operation",
|
||||
"correlation_id",
|
||||
"start_timestamp",
|
||||
"end_timestamp",
|
||||
"thread_id",
|
||||
"agent_id",
|
||||
"starting_address",
|
||||
"allocation_size"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,6 +61,11 @@
|
||||
"description": "For Collecting Memory Copy Traces"
|
||||
},
|
||||
|
||||
"memory_allocation_trace": {
|
||||
"type": "boolean",
|
||||
"description": "For Collecting Memory Allocation Traces"
|
||||
},
|
||||
|
||||
"scratch_memory_trace": {
|
||||
"type": "boolean",
|
||||
"description": "For Collecting Scratch Memory operations Traces"
|
||||
@@ -98,7 +103,7 @@
|
||||
|
||||
"sys_trace" : {
|
||||
"type": "boolean",
|
||||
"description": "For Collecting HIP, HSA, Marker (ROCTx), Memory copy, Scratch memory, and Kernel dispatch traces"
|
||||
"description": "For Collecting HIP, HSA, Marker (ROCTx), Memory copy, Memory allocation, Scratch memory, and Kernel dispatch traces"
|
||||
},
|
||||
|
||||
"mangled_kernels": {
|
||||
|
||||
@@ -203,6 +203,28 @@ typedef struct
|
||||
/// ::rocprofiler_memory_copy_operation_t)
|
||||
} rocprofiler_buffer_tracing_memory_copy_record_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Buffer Memory Allocation Tracer Record.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_buffer_tracing_kind_t kind;
|
||||
rocprofiler_memory_allocation_operation_t operation;
|
||||
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
|
||||
rocprofiler_thread_id_t thread_id; ///< id for thread that triggered the allocation
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation
|
||||
uint64_t starting_address; ///< starting address for memory allocation
|
||||
uint64_t allocation_size; ///< size for memory allocation
|
||||
/// @var kind
|
||||
|
||||
/// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION
|
||||
/// @var operation
|
||||
/// @brief Specification of the memory allocation function (@see
|
||||
/// ::rocprofiler_memory_allocation_operation_t)
|
||||
} rocprofiler_buffer_tracing_memory_allocation_record_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Buffer Kernel Dispatch Tracer Record.
|
||||
*/
|
||||
|
||||
@@ -210,6 +210,19 @@ typedef struct
|
||||
uint64_t bytes; ///< bytes copied
|
||||
} rocprofiler_callback_tracing_memory_copy_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Memory Allocation Callback Data.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation
|
||||
uint64_t starting_address; ///< starting address for memory allocation
|
||||
uint64_t allocation_size; ///< size of memory allocation
|
||||
} rocprofiler_callback_tracing_memory_allocation_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Scratch Memory Callback Data.
|
||||
*/
|
||||
|
||||
@@ -66,6 +66,8 @@ ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(rocprofiler_callback_thread_t)
|
||||
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_agent_t)
|
||||
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_signal_t)
|
||||
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_executable_t)
|
||||
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_region_t)
|
||||
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_amd_memory_pool_t)
|
||||
|
||||
#undef ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER
|
||||
} // namespace std
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/defines.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
#include <rocprofiler-sdk/internal_threading.h>
|
||||
|
||||
#include <tuple>
|
||||
@@ -104,6 +105,8 @@ ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_signal_t)
|
||||
ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_executable_t)
|
||||
ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_agent_v0_t&)
|
||||
ROCPROFILER_CXX_DECLARE_OPERATORS(rocprofiler_dim3_t)
|
||||
ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_region_t)
|
||||
ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_amd_memory_pool_t)
|
||||
|
||||
// definitions of operator==
|
||||
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_context_id_t)
|
||||
@@ -116,6 +119,8 @@ ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_callback_thread_t)
|
||||
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_agent_t)
|
||||
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_signal_t)
|
||||
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_executable_t)
|
||||
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_region_t)
|
||||
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_amd_memory_pool_t)
|
||||
|
||||
inline bool
|
||||
operator==(const rocprofiler_agent_v0_t& lhs, const rocprofiler_agent_v0_t& rhs)
|
||||
@@ -142,6 +147,8 @@ ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_signal_t)
|
||||
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_executable_t)
|
||||
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(const rocprofiler_agent_v0_t&)
|
||||
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_dim3_t)
|
||||
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_region_t)
|
||||
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_amd_memory_pool_t)
|
||||
|
||||
// definitions of operator<
|
||||
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_context_id_t)
|
||||
@@ -154,6 +161,8 @@ ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_callback_thread_t)
|
||||
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_agent_t)
|
||||
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_signal_t)
|
||||
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_executable_t)
|
||||
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_region_t)
|
||||
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_amd_memory_pool_t)
|
||||
|
||||
inline bool
|
||||
operator<(const rocprofiler_agent_v0_t& lhs, const rocprofiler_agent_v0_t& rhs)
|
||||
@@ -185,6 +194,8 @@ ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_signal_t)
|
||||
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_executable_t)
|
||||
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(const rocprofiler_agent_v0_t&)
|
||||
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_dim3_t)
|
||||
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_region_t)
|
||||
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_amd_memory_pool_t)
|
||||
|
||||
// cleanup defines
|
||||
#undef ROCPROFILER_CXX_DECLARE_OPERATORS
|
||||
|
||||
@@ -80,6 +80,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, marker_api, "Marker API region")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, rccl_api, "RCCL API function")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, kernel_dispatch, "GPU kernel dispatch")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy")
|
||||
ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation")
|
||||
|
||||
#define ROCPROFILER_PERFETTO_CATEGORIES \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \
|
||||
@@ -87,7 +88,8 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy")
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::marker_api), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::rccl_api), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy)
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy), \
|
||||
ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation)
|
||||
|
||||
#include <perfetto.h>
|
||||
|
||||
|
||||
@@ -330,6 +330,18 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_memory_copy_data_t data)
|
||||
ROCP_SDK_SAVE_DATA_FIELD(bytes);
|
||||
}
|
||||
|
||||
// Serializes a memory allocation callback payload: each field listed below is passed to
// ROCP_SDK_SAVE_DATA_FIELD in this order. `ar` and `data` are not named in the body;
// presumably the macro refers to them by name -- confirm against the macro definition.
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_callback_tracing_memory_allocation_data_t data)
|
||||
{
|
||||
ROCP_SDK_SAVE_DATA_FIELD(size);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(start_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(end_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(agent_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(starting_address);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(allocation_size);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_rccl_api_retval_t data)
|
||||
@@ -461,6 +473,22 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data)
|
||||
ROCP_SDK_SAVE_DATA_FIELD(bytes);
|
||||
}
|
||||
|
||||
// Serializes a buffered memory allocation record: each field listed below is passed to
// ROCP_SDK_SAVE_DATA_FIELD in this order (note thread_id is emitted before correlation_id).
// `ar` and `data` are not named in the body; presumably the macro refers to them by name --
// confirm against the macro definition.
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_allocation_record_t data)
|
||||
{
|
||||
ROCP_SDK_SAVE_DATA_FIELD(size);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(kind);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(operation);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(thread_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(correlation_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(start_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(end_timestamp);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(agent_id);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(starting_address);
|
||||
ROCP_SDK_SAVE_DATA_FIELD(allocation_size);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, const rocprofiler_page_migration_page_fault_start_t& data)
|
||||
|
||||
@@ -68,6 +68,7 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_SCRATCH_MEMORY, ///<
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_RCCL_API, ///<
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_OPENMP, ///<
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION, ///<
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_LAST,
|
||||
} rocprofiler_external_correlation_id_request_kind_t;
|
||||
|
||||
|
||||
@@ -167,10 +167,12 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, ///< @see ::rocprofiler_marker_name_api_id_t
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_RCCL_API, ///< @RCCL tracing
|
||||
ROCPROFILER_CALLBACK_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_RCCL_API, ///< @RCCL tracing
|
||||
ROCPROFILER_CALLBACK_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, ///< @see
|
||||
///< ::rocprofiler_memory_allocation_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_LAST,
|
||||
} rocprofiler_callback_tracing_kind_t;
|
||||
|
||||
@@ -197,6 +199,8 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, ///< Correlation ID in no longer in use
|
||||
ROCPROFILER_BUFFER_TRACING_RCCL_API, ///< RCCL tracing
|
||||
ROCPROFILER_BUFFER_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, ///< @see
|
||||
///< ::rocprofiler_memory_allocation_operation_t
|
||||
ROCPROFILER_BUFFER_TRACING_LAST,
|
||||
} rocprofiler_buffer_tracing_kind_t;
|
||||
|
||||
@@ -224,6 +228,18 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_MEMORY_COPY_LAST,
|
||||
} rocprofiler_memory_copy_operation_t;
|
||||
|
||||
/**
|
||||
* @brief Memory Allocation Operation. Identifies the memory allocation function recorded in the `operation` field of a memory allocation record.
|
||||
*/
|
||||
typedef enum // NOLINT(performance-enum-size)
|
||||
{
|
||||
ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_MEMORY_POOL_ALLOCATE, ///< Memory pool allocate function
|
||||
ROCPROFILER_MEMORY_ALLOCATION_VMEM_HANDLE_CREATE, ///< Allocate vmem memory handle
|
||||
ROCPROFILER_MEMORY_ALLOCATION_LAST,
|
||||
} rocprofiler_memory_allocation_operation_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Kernel Dispatch Tracing Operation Types.
|
||||
*/
|
||||
|
||||
@@ -153,5 +153,8 @@ using counter_collection_buffered_output_t =
|
||||
using scratch_memory_buffered_output_t =
|
||||
buffered_output<rocprofiler_buffer_tracing_scratch_memory_record_t,
|
||||
domain_type::SCRATCH_MEMORY>;
|
||||
using memory_allocation_buffered_output_t =
|
||||
buffered_output<rocprofiler_buffer_tracing_memory_allocation_record_t,
|
||||
domain_type::MEMORY_ALLOCATION>;
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -104,6 +104,7 @@ using agent_info_csv_encoder = csv_encoder<53>;
|
||||
using kernel_trace_csv_encoder = csv_encoder<18>;
|
||||
using counter_collection_csv_encoder = csv_encoder<18>;
|
||||
using memory_copy_csv_encoder = csv_encoder<7>;
|
||||
using memory_allocation_csv_encoder = csv_encoder<8>;
|
||||
using marker_csv_encoder = csv_encoder<7>;
|
||||
using list_basic_metrics_csv_encoder = csv_encoder<5>;
|
||||
using list_derived_metrics_csv_encoder = csv_encoder<5>;
|
||||
|
||||
@@ -52,6 +52,10 @@ DEFINE_BUFFER_TYPE_NAME(COUNTER_COLLECTION,
|
||||
"counter_collection",
|
||||
"counter_collection_stats")
|
||||
DEFINE_BUFFER_TYPE_NAME(RCCL, "RCCL_API", "rccl_api_trace", "rccl_api_stats")
|
||||
DEFINE_BUFFER_TYPE_NAME(MEMORY_ALLOCATION,
|
||||
"MEMORY_ALLOCATION",
|
||||
"memory_allocation",
|
||||
"memory_allocation_stats")
|
||||
|
||||
#undef DEFINE_BUFFER_TYPE_NAME
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ enum class domain_type
|
||||
SCRATCH_MEMORY,
|
||||
COUNTER_COLLECTION,
|
||||
RCCL,
|
||||
MEMORY_ALLOCATION,
|
||||
LAST,
|
||||
};
|
||||
|
||||
|
||||
@@ -437,6 +437,50 @@ generate_csv(const output_config& c
|
||||
}
|
||||
}
|
||||
|
||||
// Writes buffered memory allocation records to the MEMORY_ALLOCATION CSV output.
// - Does nothing when `data` is empty.
// - When `cfg.stats` is set and `stats` evaluates to true, also writes the stats entries to the
//   MEMORY_ALLOCATION stats output file.
// - Emits one row per record with the columns named in the header list below.
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& data,
|
||||
const stats_entry_t& stats)
|
||||
{
|
||||
if(data.empty()) return;
|
||||
|
||||
if(cfg.stats && stats)
|
||||
write_stats(get_stats_output_file(cfg, domain_type::MEMORY_ALLOCATION), stats.entries);
|
||||
|
||||
// Header order must match the argument order passed to write_row() below.
auto ofs = tool::csv_output_file{cfg,
|
||||
domain_type::MEMORY_ALLOCATION,
|
||||
tool::csv::memory_allocation_csv_encoder{},
|
||||
{"Kind",
|
||||
"Operation",
|
||||
"Agent_Id",
|
||||
"Allocation_Size",
|
||||
"Starting_Address",
|
||||
"Correlation_Id",
|
||||
"Start_Timestamp",
|
||||
"End_Timestamp"}};
|
||||
for(auto ditr : data)
|
||||
{
|
||||
for(auto record : data.get(ditr))
|
||||
{
|
||||
auto api_name = tool_metadata.get_operation_name(record.kind, record.operation);
|
||||
auto row_ss = std::stringstream{};
|
||||
rocprofiler::tool::csv::memory_allocation_csv_encoder::write_row(
|
||||
row_ss,
|
||||
tool_metadata.get_kind_name(record.kind),
|
||||
api_name,
|
||||
tool_metadata.get_node_id(record.agent_id), // "Agent_Id" column holds the node id looked up from the agent id
|
||||
record.allocation_size,
|
||||
record.starting_address,
|
||||
record.correlation_id.internal, // only the internal correlation id is written
|
||||
record.start_timestamp,
|
||||
record.end_timestamp);
|
||||
|
||||
ofs << row_ss.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
|
||||
@@ -87,6 +87,12 @@ generate_csv(const output_config& cfg,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& data,
|
||||
const stats_entry_t& stats);
|
||||
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& data,
|
||||
const stats_entry_t& stats);
|
||||
|
||||
void
|
||||
generate_csv(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
|
||||
@@ -169,15 +169,16 @@ void
|
||||
write_json(json_output& json_ar,
|
||||
const output_config& /*cfg*/,
|
||||
const metadata& /*tool_metadata*/,
|
||||
const domain_stats_vec_t& domain_stats,
|
||||
generator<rocprofiler_buffer_tracing_hip_api_record_t>&& hip_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_kernel_dispatch_record_t> kernel_dispatch_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_copy_record_t> memory_copy_gen,
|
||||
generator<tool_counter_record_t> counter_collection_gen,
|
||||
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
|
||||
generator<rocprofiler_buffer_tracing_rccl_api_record_t> rccl_api_gen)
|
||||
const domain_stats_vec_t& domain_stats,
|
||||
generator<rocprofiler_buffer_tracing_hip_api_record_t>&& hip_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_kernel_dispatch_record_t> kernel_dispatch_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_copy_record_t> memory_copy_gen,
|
||||
generator<tool_counter_record_t> counter_collection_gen,
|
||||
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
|
||||
generator<rocprofiler_buffer_tracing_rccl_api_record_t> rccl_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_allocation_record_t> memory_allocation_gen)
|
||||
|
||||
{
|
||||
// summary
|
||||
@@ -216,6 +217,7 @@ write_json(json_output& json_ar,
|
||||
json_ar(cereal::make_nvp("marker_api", marker_api_gen));
|
||||
json_ar(cereal::make_nvp("rccl_api", rccl_api_gen));
|
||||
json_ar(cereal::make_nvp("memory_copy", memory_copy_gen));
|
||||
json_ar(cereal::make_nvp("memory_allocation", memory_allocation_gen));
|
||||
json_ar(cereal::make_nvp("scratch_memory", scratch_memory_gen));
|
||||
json_ar.finishNode();
|
||||
}
|
||||
|
||||
@@ -81,17 +81,18 @@ void
|
||||
write_json(json_output&, const output_config& cfg, const metadata& tool_metadata, uint64_t pid);
|
||||
|
||||
void
|
||||
write_json(json_output& json_ar,
|
||||
const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const domain_stats_vec_t& domain_stats,
|
||||
generator<rocprofiler_buffer_tracing_hip_api_record_t>&& hip_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_kernel_dispatch_record_t> kernel_dispatch_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_copy_record_t> memory_copy_gen,
|
||||
generator<tool_counter_record_t> counter_collection_gen,
|
||||
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
|
||||
generator<rocprofiler_buffer_tracing_rccl_api_record_t> rccl_api_gen);
|
||||
write_json(json_output& json_ar,
|
||||
const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const domain_stats_vec_t& domain_stats,
|
||||
generator<rocprofiler_buffer_tracing_hip_api_record_t>&& hip_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_kernel_dispatch_record_t> kernel_dispatch_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_copy_record_t> memory_copy_gen,
|
||||
generator<tool_counter_record_t> counter_collection_gen,
|
||||
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
|
||||
generator<rocprofiler_buffer_tracing_rccl_api_record_t> rccl_api_gen,
|
||||
generator<rocprofiler_buffer_tracing_memory_allocation_record_t> memory_allocation_gen);
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -127,43 +127,55 @@ auto main_tid = common::get_tid();
|
||||
archive_t* archive = nullptr;
|
||||
auto flush_callbacks = OTF2_FlushCallbacks{pre_flush, post_flush};
|
||||
|
||||
// Tag describing what kind of activity a location_base stands for. It is
// stored in location_base::type and takes part in that struct's hash and
// ordering, so locations built from the same (pid, tid, agent) for memory
// copies and for memory allocations stay distinct.
enum rocprofiler_location_type_t
{
    ROCPROFILER_AGENT_NO_TYPE           = 0,
    ROCPROFILER_AGENT_MEMORY_COPY_TYPE  = 1,
    ROCPROFILER_AGENT_DISPATCH_TYPE     = 2,
    ROCPROFILER_AGENT_MEMORY_ALLOC_TYPE = 3,
};
|
||||
|
||||
// Identity of one trace location: process id, thread id, agent handle, queue
// handle and a rocprofiler_location_type_t tag.
// NOTE(review): this listing appears to carry both sides of a diff hunk, so
// the first four members, the four-parameter constructor and the 4-element
// hash each sit next to a replacement that adds `type`. Presumably only the
// `type`-aware variants are live -- confirm against the real file.
struct location_base
|
||||
{
|
||||
uint64_t pid = 0;
|
||||
rocprofiler_thread_id_t tid = 0;
|
||||
rocprofiler_agent_id_t agent = {.handle = 0};
|
||||
rocprofiler_queue_id_t queue = {.handle = 0};
|
||||
uint64_t pid = 0;
|
||||
rocprofiler_thread_id_t tid = 0;
|
||||
rocprofiler_agent_id_t agent = {.handle = 0};
|
||||
rocprofiler_queue_id_t queue = {.handle = 0};
|
||||
rocprofiler_location_type_t type = ROCPROFILER_AGENT_NO_TYPE;
|
||||
|
||||
// In the `type`-aware signature `_type` is placed ahead of `_queue`, so a
// caller that passes a queue positionally has to pass the type as well.
location_base(uint64_t _pid,
|
||||
rocprofiler_thread_id_t _tid,
|
||||
rocprofiler_agent_id_t _agent = {.handle = 0},
|
||||
rocprofiler_queue_id_t _queue = {.handle = 0})
|
||||
location_base(uint64_t _pid,
|
||||
rocprofiler_thread_id_t _tid,
|
||||
rocprofiler_agent_id_t _agent = {.handle = 0},
|
||||
rocprofiler_location_type_t _type = ROCPROFILER_AGENT_NO_TYPE,
|
||||
rocprofiler_queue_id_t _queue = {.handle = 0})
|
||||
: pid{_pid}
|
||||
, tid{_tid}
|
||||
, agent{_agent}
|
||||
, queue{_queue}
|
||||
, type{_type}
|
||||
{}
|
||||
|
||||
// Combines the fields through array_hash. agent.handle and queue.handle are
// passed as handle + 1 (presumably so a default zero handle still perturbs
// the result -- TODO confirm); the 5-element form also mixes in `type`.
auto hash() const
|
||||
{
|
||||
return array_hash<uint64_t, 4>{}(pid, tid, agent.handle + 1, queue.handle + 1);
|
||||
return array_hash<uint64_t, 5>{}(pid, tid, agent.handle + 1, queue.handle + 1, type);
|
||||
}
|
||||
};
|
||||
|
||||
// Orders two location_base values lexicographically via std::tie tuple
// comparison over pid, tid, agent.handle and queue.handle; the second return
// statement extends the key with `type`.
bool
|
||||
operator<(const location_base& lhs, const location_base& rhs)
|
||||
{
|
||||
return std::tie(lhs.pid, lhs.tid, lhs.agent.handle, lhs.queue.handle) <
|
||||
std::tie(rhs.pid, rhs.tid, rhs.agent.handle, rhs.queue.handle);
|
||||
// NOTE(review): read literally, this `type`-aware comparison is unreachable
// because it follows the return above. The pair looks like the removed and
// added sides of a diff hunk -- confirm only this form remains in the file.
return std::tie(lhs.pid, lhs.tid, lhs.agent.handle, lhs.queue.handle, lhs.type) <
|
||||
std::tie(rhs.pid, rhs.tid, rhs.agent.handle, rhs.queue.handle, rhs.type);
|
||||
}
|
||||
|
||||
struct location_data : location_base
|
||||
{
|
||||
location_data(uint64_t _pid,
|
||||
rocprofiler_thread_id_t _tid,
|
||||
rocprofiler_agent_id_t _agent = {.handle = 0},
|
||||
rocprofiler_queue_id_t _queue = {.handle = 0})
|
||||
: location_base{_pid, _tid, _agent, _queue}
|
||||
location_data(uint64_t _pid,
|
||||
rocprofiler_thread_id_t _tid,
|
||||
rocprofiler_agent_id_t _agent = {.handle = 0},
|
||||
rocprofiler_location_type_t _type = ROCPROFILER_AGENT_NO_TYPE,
|
||||
rocprofiler_queue_id_t _queue = {.handle = 0})
|
||||
: location_base{_pid, _tid, _agent, _type, _queue}
|
||||
, index{++index_counter}
|
||||
, event_writer{OTF2_Archive_GetEvtWriter(CHECK_NOTNULL(archive), index)}
|
||||
{
|
||||
@@ -206,7 +218,7 @@ get_location(const location_base& _location, bool _init = false)
|
||||
if(_init)
|
||||
return get_locations()
|
||||
.emplace_back(std::make_unique<location_data>(
|
||||
_location.pid, _location.tid, _location.agent, _location.queue))
|
||||
_location.pid, _location.tid, _location.agent, _location.type, _location.queue))
|
||||
.get();
|
||||
|
||||
return nullptr;
|
||||
@@ -299,7 +311,7 @@ setup(const output_config& cfg)
|
||||
OTF2_CHECK(OTF2_Pthread_Archive_SetLockingCallbacks(archive, nullptr));
|
||||
OTF2_CHECK(OTF2_Archive_OpenEvtFiles(archive));
|
||||
|
||||
ROCP_ERROR << "Opened result file: " << _filename << ".oft2";
|
||||
ROCP_ERROR << "Opened result file: " << _filename << ".otf2";
|
||||
}
|
||||
|
||||
void
|
||||
@@ -343,17 +355,19 @@ create_attribute_list()
|
||||
} // namespace
|
||||
|
||||
void
|
||||
write_otf2(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
uint64_t pid,
|
||||
const std::vector<agent_info>& agent_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hip_api_record_t>* hip_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>* kernel_dispatch_data,
|
||||
std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>* memory_copy_data,
|
||||
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* /*scratch_memory_data*/,
|
||||
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data)
|
||||
write_otf2(
|
||||
const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
uint64_t pid,
|
||||
const std::vector<agent_info>& agent_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hip_api_record_t>* hip_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>* kernel_dispatch_data,
|
||||
std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>* memory_copy_data,
|
||||
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* /*scratch_memory_data*/,
|
||||
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>* memory_allocation_data)
|
||||
{
|
||||
namespace sdk = ::rocprofiler::sdk;
|
||||
|
||||
@@ -366,6 +380,8 @@ write_otf2(const output_config& cfg,
|
||||
const auto& buffer_names = tool_metadata.buffer_names;
|
||||
auto tids = std::set<rocprofiler_thread_id_t>{};
|
||||
auto agent_thread_ids = std::map<rocprofiler_thread_id_t, std::set<rocprofiler_agent_id_t>>{};
|
||||
auto agent_thread_ids_alloc =
|
||||
std::map<rocprofiler_thread_id_t, std::set<rocprofiler_agent_id_t>>{};
|
||||
auto agent_queue_ids =
|
||||
std::map<rocprofiler_thread_id_t,
|
||||
std::map<rocprofiler_agent_id_t, std::unordered_set<rocprofiler_queue_id_t>>>{};
|
||||
@@ -373,6 +389,8 @@ write_otf2(const output_config& cfg,
|
||||
auto thread_event_info = std::map<rocprofiler_thread_id_t, event_info>{};
|
||||
auto agent_memcpy_info =
|
||||
std::map<rocprofiler_thread_id_t, std::map<rocprofiler_agent_id_t, event_info>>{};
|
||||
auto agent_memalloc_info =
|
||||
std::map<rocprofiler_thread_id_t, std::map<rocprofiler_agent_id_t, event_info>>{};
|
||||
auto agent_dispatch_info =
|
||||
std::map<rocprofiler_thread_id_t,
|
||||
std::map<rocprofiler_agent_id_t, std::map<rocprofiler_queue_id_t, event_info>>>{};
|
||||
@@ -407,6 +425,12 @@ write_otf2(const output_config& cfg,
|
||||
agent_thread_ids[itr.thread_id].emplace(itr.dst_agent_id);
|
||||
}
|
||||
|
||||
for(auto itr : *memory_allocation_data)
|
||||
{
|
||||
tids.emplace(itr.thread_id);
|
||||
agent_thread_ids_alloc[itr.thread_id].emplace(itr.agent_id);
|
||||
}
|
||||
|
||||
for(auto itr : *kernel_dispatch_data)
|
||||
{
|
||||
tids.emplace(itr.thread_id);
|
||||
@@ -421,13 +445,20 @@ write_otf2(const output_config& cfg,
|
||||
|
||||
for(const auto& [tid, itr] : agent_thread_ids)
|
||||
for(auto agent : itr)
|
||||
agent_memcpy_info[tid].emplace(agent, location_base{pid, tid, agent});
|
||||
agent_memcpy_info[tid].emplace(
|
||||
agent, location_base{pid, tid, agent, ROCPROFILER_AGENT_MEMORY_COPY_TYPE});
|
||||
|
||||
for(const auto& [tid, itr] : agent_thread_ids_alloc)
|
||||
for(auto agent : itr)
|
||||
agent_memalloc_info[tid].emplace(
|
||||
agent, location_base{pid, tid, agent, ROCPROFILER_AGENT_MEMORY_ALLOC_TYPE});
|
||||
|
||||
for(const auto& [tid, itr] : agent_queue_ids)
|
||||
for(const auto& [agent, qitr] : itr)
|
||||
for(auto queue : qitr)
|
||||
agent_dispatch_info[tid][agent].emplace(queue,
|
||||
location_base{pid, tid, agent, queue});
|
||||
agent_dispatch_info[tid][agent].emplace(
|
||||
queue,
|
||||
location_base{pid, tid, agent, ROCPROFILER_AGENT_DISPATCH_TYPE, queue});
|
||||
}
|
||||
|
||||
for(auto& [tid, evt] : thread_event_info)
|
||||
@@ -451,6 +482,24 @@ write_otf2(const output_config& cfg,
|
||||
}
|
||||
}
|
||||
|
||||
for(auto& [tid, itr] : agent_memalloc_info)
|
||||
{
|
||||
for(auto& [agent, evt] : itr)
|
||||
{
|
||||
const auto* _agent = _get_agent(agent);
|
||||
auto _type_name = std::string_view{"UNK"};
|
||||
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
_type_name = "CPU";
|
||||
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
_type_name = "GPU";
|
||||
|
||||
evt.name = fmt::format("Thread {}, Memory Allocation at {} {}",
|
||||
tid,
|
||||
_type_name,
|
||||
_agent->logical_node_type_id);
|
||||
}
|
||||
}
|
||||
|
||||
auto _queue_ids = std::map<rocprofiler_queue_id_t, uint64_t>{};
|
||||
for(auto& [tid, itr] : agent_dispatch_info)
|
||||
for(auto& [agent, qitr] : itr)
|
||||
@@ -583,6 +632,30 @@ write_otf2(const output_config& cfg,
|
||||
nullptr});
|
||||
}
|
||||
|
||||
for(auto itr : *memory_allocation_data)
|
||||
{
|
||||
auto name = buffer_names.at(itr.kind, itr.operation);
|
||||
_hash_data.emplace(
|
||||
get_hash_id(name),
|
||||
region_info{std::string{name}, OTF2_REGION_ROLE_ALLOCATE, OTF2_PARADIGM_HIP});
|
||||
|
||||
// TODO: add attributes for memory allocation parameters
|
||||
|
||||
auto& _evt_info = agent_memalloc_info.at(itr.thread_id).at(itr.agent_id);
|
||||
_evt_info.event_count += 1;
|
||||
|
||||
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER,
|
||||
name,
|
||||
_evt_info.get_location(),
|
||||
itr.start_timestamp,
|
||||
get_attr(sdk::category::memory_allocation{})});
|
||||
_data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT,
|
||||
name,
|
||||
_evt_info.get_location(),
|
||||
itr.end_timestamp,
|
||||
nullptr});
|
||||
}
|
||||
|
||||
for(auto itr : *kernel_dispatch_data)
|
||||
{
|
||||
const auto& info = itr.dispatch_info;
|
||||
@@ -781,6 +854,24 @@ write_otf2(const output_config& cfg,
|
||||
}
|
||||
}
|
||||
|
||||
// Memalloc Events
|
||||
for(auto& [tid, itr] : agent_memalloc_info)
|
||||
{
|
||||
for(auto& [agent, evt] : itr)
|
||||
{
|
||||
auto _hash = get_hash_id(evt.name);
|
||||
|
||||
add_write_string(_hash, evt.name);
|
||||
OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer,
|
||||
evt.id(), // id
|
||||
_hash,
|
||||
OTF2_LOCATION_TYPE_ACCELERATOR_STREAM,
|
||||
2 * evt.event_count, // # events
|
||||
agent.handle // location group
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatch Events
|
||||
for(auto& [tid, itr] : agent_dispatch_info)
|
||||
{
|
||||
|
||||
@@ -34,16 +34,18 @@ namespace rocprofiler
|
||||
namespace tool
|
||||
{
|
||||
void
|
||||
write_otf2(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
uint64_t pid,
|
||||
const std::vector<agent_info>& agent_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hip_api_record_t>* hip_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>* kernel_dispatch_data,
|
||||
std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>* memory_copy_data,
|
||||
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* scratch_memory_data,
|
||||
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data);
|
||||
write_otf2(
|
||||
const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
uint64_t pid,
|
||||
const std::vector<agent_info>& agent_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hip_api_record_t>* hip_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>* kernel_dispatch_data,
|
||||
std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>* memory_copy_data,
|
||||
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* scratch_memory_data,
|
||||
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data,
|
||||
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>* memory_allocation_data);
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -71,7 +71,8 @@ write_perfetto(
|
||||
const generator<rocprofiler_buffer_tracing_memory_copy_record_t>& memory_copy_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& /*scratch_memory_gen*/,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen)
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen)
|
||||
{
|
||||
namespace sdk = ::rocprofiler::sdk;
|
||||
|
||||
@@ -127,9 +128,10 @@ write_perfetto(
|
||||
tracing_session->Setup(cfg);
|
||||
tracing_session->StartBlocking();
|
||||
|
||||
auto tids = std::set<rocprofiler_thread_id_t>{};
|
||||
auto demangled = std::unordered_map<std::string_view, std::string>{};
|
||||
auto agent_thread_ids = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
|
||||
auto tids = std::set<rocprofiler_thread_id_t>{};
|
||||
auto demangled = std::unordered_map<std::string_view, std::string>{};
|
||||
auto agent_thread_ids = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
|
||||
auto agent_thread_ids_alloc = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
|
||||
auto agent_queue_ids =
|
||||
std::unordered_map<rocprofiler_agent_id_t, std::unordered_set<rocprofiler_queue_id_t>>{};
|
||||
auto thread_indexes = std::unordered_map<rocprofiler_thread_id_t, uint64_t>{};
|
||||
@@ -138,6 +140,9 @@ write_perfetto(
|
||||
auto agent_thread_tracks =
|
||||
std::unordered_map<rocprofiler_agent_id_t,
|
||||
std::unordered_map<uint64_t, ::perfetto::Track>>{};
|
||||
auto agent_thread_tracks_alloc =
|
||||
std::unordered_map<rocprofiler_agent_id_t,
|
||||
std::unordered_map<uint64_t, ::perfetto::Track>>{};
|
||||
auto agent_queue_tracks =
|
||||
std::unordered_map<rocprofiler_agent_id_t,
|
||||
std::unordered_map<rocprofiler_queue_id_t, ::perfetto::Track>>{};
|
||||
@@ -171,6 +176,13 @@ write_perfetto(
|
||||
agent_thread_ids[itr.dst_agent_id].emplace(itr.thread_id);
|
||||
}
|
||||
|
||||
for(auto ditr : memory_allocation_gen)
|
||||
for(auto itr : memory_allocation_gen.get(ditr))
|
||||
{
|
||||
tids.emplace(itr.thread_id);
|
||||
agent_thread_ids_alloc[itr.agent_id].emplace(itr.thread_id);
|
||||
}
|
||||
|
||||
for(auto ditr : kernel_dispatch_gen)
|
||||
for(auto itr : kernel_dispatch_gen.get(ditr))
|
||||
{
|
||||
@@ -229,6 +241,33 @@ write_perfetto(
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& itr : agent_thread_ids_alloc)
|
||||
{
|
||||
const auto* _agent = _get_agent(itr.first);
|
||||
|
||||
for(auto titr : itr.second)
|
||||
{
|
||||
auto _namess = std::stringstream{};
|
||||
_namess << "MEMORY ALLOCATION on AGENT [" << _agent->logical_node_id << "] THREAD ["
|
||||
<< thread_indexes.at(titr) << "] ";
|
||||
|
||||
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
|
||||
_namess << "(CPU)";
|
||||
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
|
||||
_namess << "(GPU)";
|
||||
else
|
||||
_namess << "(UNK)";
|
||||
|
||||
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
|
||||
auto _desc = _track.Serialize();
|
||||
_desc.set_name(_namess.str());
|
||||
|
||||
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
|
||||
|
||||
agent_thread_tracks_alloc[itr.first].emplace(titr, _track);
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto& aitr : agent_queue_ids)
|
||||
{
|
||||
uint32_t nqueue = 0;
|
||||
@@ -424,6 +463,47 @@ write_perfetto(
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
|
||||
for(auto ditr : memory_allocation_gen)
|
||||
for(auto itr : memory_allocation_gen.get(ditr))
|
||||
{
|
||||
auto name = buffer_names.at(itr.kind, itr.operation);
|
||||
auto& track = agent_thread_tracks_alloc.at(itr.agent_id).at(itr.thread_id);
|
||||
std::stringstream hex_stream;
|
||||
hex_stream << "0x" << std::hex << std::setw(16) << std::setfill('0')
|
||||
<< itr.starting_address;
|
||||
std::string hex_starting_address(hex_stream.str());
|
||||
|
||||
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
::perfetto::StaticString(name.data()),
|
||||
track,
|
||||
itr.start_timestamp,
|
||||
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
|
||||
"begin_ns",
|
||||
itr.start_timestamp,
|
||||
"end_ns",
|
||||
itr.end_timestamp,
|
||||
"delta_ns",
|
||||
(itr.end_timestamp - itr.start_timestamp),
|
||||
"kind",
|
||||
itr.kind,
|
||||
"operation",
|
||||
itr.operation,
|
||||
"agent",
|
||||
agents_map.at(itr.agent_id).logical_node_id,
|
||||
"allocation_size",
|
||||
itr.allocation_size,
|
||||
"starting_address",
|
||||
hex_starting_address,
|
||||
"corr_id",
|
||||
itr.correlation_id.internal,
|
||||
"tid",
|
||||
itr.thread_id);
|
||||
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
track,
|
||||
itr.end_timestamp);
|
||||
tracing_session->FlushBlocking();
|
||||
}
|
||||
|
||||
for(auto ditr : kernel_dispatch_gen)
|
||||
for(auto itr : kernel_dispatch_gen.get(ditr))
|
||||
{
|
||||
|
||||
@@ -36,15 +36,16 @@ namespace tool
|
||||
{
|
||||
void
|
||||
write_perfetto(
|
||||
const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
std::vector<agent_info> agent_data,
|
||||
const generator<rocprofiler_buffer_tracing_hip_api_record_t>& hip_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_kernel_dispatch_record_t>& kernel_dispatch_gen,
|
||||
const generator<rocprofiler_buffer_tracing_memory_copy_record_t>& memory_copy_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen);
|
||||
const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
std::vector<agent_info> agent_data,
|
||||
const generator<rocprofiler_buffer_tracing_hip_api_record_t>& hip_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_kernel_dispatch_record_t>& kernel_dispatch_gen,
|
||||
const generator<rocprofiler_buffer_tracing_memory_copy_record_t>& memory_copy_gen,
|
||||
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen);
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -166,6 +166,24 @@ generate_stats(const output_config& /*cfg*/,
|
||||
return get_stats(marker_stats);
|
||||
}
|
||||
|
||||
// Builds the statistics entry for memory-allocation tracing records: the
// elapsed time (end_timestamp - start_timestamp) of every record is added to
// the bucket keyed by its operation name, and the totals go to get_stats().
// The output configuration argument is not used.
stats_entry_t
generate_stats(const output_config& /*cfg*/,
               const metadata&      tool_metadata,
               const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& data)
{
    auto durations_by_op = stats_map_t{};

    for(auto chunk : data)
        for(auto rec : data.get(chunk))
        {
            auto op_name = tool_metadata.get_operation_name(rec.kind, rec.operation);
            durations_by_op[op_name] += (rec.end_timestamp - rec.start_timestamp);
        }

    return get_stats(durations_by_op);
}
|
||||
|
||||
stats_entry_t
|
||||
generate_stats(const output_config& /*cfg*/,
|
||||
const metadata& /*tool_metadata*/,
|
||||
|
||||
@@ -70,6 +70,11 @@ generate_stats(const output_config& cf
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& data);
|
||||
|
||||
// Statistics overload for memory-allocation buffer-tracing records: takes the
// output configuration, the tool metadata and a generator over the records,
// and returns the resulting stats_entry_t.
stats_entry_t
|
||||
generate_stats(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& data);
|
||||
|
||||
void
|
||||
generate_stats(const output_config& cfg,
|
||||
const metadata& tool_metadata,
|
||||
|
||||
@@ -84,6 +84,7 @@ struct config : output_config
|
||||
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
|
||||
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
||||
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
||||
bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false);
|
||||
bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false);
|
||||
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
||||
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
|
||||
@@ -123,6 +124,7 @@ config::save(ArchiveT& ar) const
|
||||
CFG_SERIALIZE_MEMBER(hsa_finalizer_ext_api_trace);
|
||||
CFG_SERIALIZE_MEMBER(marker_api_trace);
|
||||
CFG_SERIALIZE_MEMBER(memory_copy_trace);
|
||||
CFG_SERIALIZE_MEMBER(memory_allocation_trace);
|
||||
CFG_SERIALIZE_MEMBER(scratch_memory_trace);
|
||||
CFG_SERIALIZE_MEMBER(counter_collection);
|
||||
CFG_SERIALIZE_MEMBER(hip_runtime_api_trace);
|
||||
|
||||
@@ -148,20 +148,22 @@ get_list_derived_metrics_file()
|
||||
|
||||
struct buffer_ids
|
||||
{
|
||||
rocprofiler_buffer_id_t hsa_api_trace = {};
|
||||
rocprofiler_buffer_id_t hip_api_trace = {};
|
||||
rocprofiler_buffer_id_t kernel_trace = {};
|
||||
rocprofiler_buffer_id_t memory_copy_trace = {};
|
||||
rocprofiler_buffer_id_t counter_collection = {};
|
||||
rocprofiler_buffer_id_t scratch_memory = {};
|
||||
rocprofiler_buffer_id_t rccl_api_trace = {};
|
||||
rocprofiler_buffer_id_t hsa_api_trace = {};
|
||||
rocprofiler_buffer_id_t hip_api_trace = {};
|
||||
rocprofiler_buffer_id_t kernel_trace = {};
|
||||
rocprofiler_buffer_id_t memory_copy_trace = {};
|
||||
rocprofiler_buffer_id_t memory_allocation_trace = {};
|
||||
rocprofiler_buffer_id_t counter_collection = {};
|
||||
rocprofiler_buffer_id_t scratch_memory = {};
|
||||
rocprofiler_buffer_id_t rccl_api_trace = {};
|
||||
|
||||
auto as_array() const
|
||||
{
|
||||
return std::array<rocprofiler_buffer_id_t, 7>{hsa_api_trace,
|
||||
return std::array<rocprofiler_buffer_id_t, 8>{hsa_api_trace,
|
||||
hip_api_trace,
|
||||
kernel_trace,
|
||||
memory_copy_trace,
|
||||
memory_allocation_trace,
|
||||
counter_collection,
|
||||
scratch_memory,
|
||||
rccl_api_trace};
|
||||
@@ -635,6 +637,13 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
|
||||
tool::write_ring_buffer(*record, domain_type::MEMORY_COPY);
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION)
|
||||
{
|
||||
auto* record = static_cast<rocprofiler_buffer_tracing_memory_allocation_record_t*>(
|
||||
header->payload);
|
||||
|
||||
tool::write_ring_buffer(*record, domain_type::MEMORY_ALLOCATION);
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* record = static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
|
||||
@@ -1110,6 +1119,26 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
"buffer tracing service for memory copy configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().memory_allocation_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
nullptr,
|
||||
&get_buffers().memory_allocation_trace),
|
||||
"create memory allocation buffer");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().memory_allocation_trace),
|
||||
"buffer tracing service for memory allocation configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().scratch_memory_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
@@ -1357,6 +1386,8 @@ tool_fini(void* /*tool_data*/)
|
||||
auto scratch_memory_output =
|
||||
tool::scratch_memory_buffered_output_t{tool::get_config().scratch_memory_trace};
|
||||
auto rccl_output = tool::rccl_buffered_output_t{tool::get_config().rccl_api_trace};
|
||||
auto memory_allocation_output =
|
||||
tool::memory_allocation_buffered_output_t{tool::get_config().memory_allocation_trace};
|
||||
|
||||
auto node_id_sort = [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; };
|
||||
|
||||
@@ -1374,6 +1405,7 @@ tool_fini(void* /*tool_data*/)
|
||||
generate_output(hsa_output, contributions);
|
||||
generate_output(hip_output, contributions);
|
||||
generate_output(memory_copy_output, contributions);
|
||||
generate_output(memory_allocation_output, contributions);
|
||||
generate_output(marker_output, contributions);
|
||||
generate_output(rccl_output, contributions);
|
||||
generate_output(counters_output, contributions);
|
||||
@@ -1401,7 +1433,8 @@ tool_fini(void* /*tool_data*/)
|
||||
counters_output.get_generator(),
|
||||
marker_output.get_generator(),
|
||||
scratch_memory_output.get_generator(),
|
||||
rccl_output.get_generator());
|
||||
rccl_output.get_generator(),
|
||||
memory_allocation_output.get_generator());
|
||||
json_ar.finish_process();
|
||||
|
||||
tool::close_json(json_ar);
|
||||
@@ -1418,18 +1451,20 @@ tool_fini(void* /*tool_data*/)
|
||||
memory_copy_output.get_generator(),
|
||||
marker_output.get_generator(),
|
||||
scratch_memory_output.get_generator(),
|
||||
rccl_output.get_generator());
|
||||
rccl_output.get_generator(),
|
||||
memory_allocation_output.get_generator());
|
||||
}
|
||||
|
||||
if(tool::get_config().otf2_output)
|
||||
{
|
||||
auto hip_elem_data = hip_output.load_all();
|
||||
auto hsa_elem_data = hsa_output.load_all();
|
||||
auto kernel_dispatch_elem_data = kernel_dispatch_output.load_all();
|
||||
auto memory_copy_elem_data = memory_copy_output.load_all();
|
||||
auto marker_elem_data = marker_output.load_all();
|
||||
auto scratch_memory_elem_data = scratch_memory_output.load_all();
|
||||
auto rccl_elem_data = rccl_output.load_all();
|
||||
auto hip_elem_data = hip_output.load_all();
|
||||
auto hsa_elem_data = hsa_output.load_all();
|
||||
auto kernel_dispatch_elem_data = kernel_dispatch_output.load_all();
|
||||
auto memory_copy_elem_data = memory_copy_output.load_all();
|
||||
auto marker_elem_data = marker_output.load_all();
|
||||
auto scratch_memory_elem_data = scratch_memory_output.load_all();
|
||||
auto rccl_elem_data = rccl_output.load_all();
|
||||
auto memory_allocation_elem_data = memory_allocation_output.load_all();
|
||||
|
||||
tool::write_otf2(tool::get_config(),
|
||||
*tool_metadata,
|
||||
@@ -1441,7 +1476,8 @@ tool_fini(void* /*tool_data*/)
|
||||
&memory_copy_elem_data,
|
||||
&marker_elem_data,
|
||||
&scratch_memory_elem_data,
|
||||
&rccl_elem_data);
|
||||
&rccl_elem_data,
|
||||
&memory_allocation_elem_data);
|
||||
}
|
||||
|
||||
if(tool::get_config().summary_output)
|
||||
@@ -1455,6 +1491,7 @@ tool_fini(void* /*tool_data*/)
|
||||
destroy_output(hsa_output);
|
||||
destroy_output(hip_output);
|
||||
destroy_output(memory_copy_output);
|
||||
destroy_output(memory_allocation_output);
|
||||
destroy_output(marker_output);
|
||||
destroy_output(counters_output);
|
||||
destroy_output(scratch_memory_output);
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "lib/rocprofiler-sdk/hip/hip.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/async_copy.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp"
|
||||
#include "lib/rocprofiler-sdk/marker/marker.hpp"
|
||||
@@ -80,6 +81,7 @@ ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_CORE_API)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_CONTROL_API)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_NAME_API)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_COPY)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_ALLOCATION)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(KERNEL_DISPATCH)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(PAGE_MIGRATION)
|
||||
ROCPROFILER_BUFFER_TRACING_KIND_STRING(SCRATCH_MEMORY)
|
||||
@@ -218,6 +220,11 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_
|
||||
val = rocprofiler::hsa::async_copy::name_by_id(operation);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION:
|
||||
{
|
||||
val = rocprofiler::hsa::memory_allocation::name_by_id(operation);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY:
|
||||
{
|
||||
val = rocprofiler::hsa::scratch_memory::name_by_id(operation);
|
||||
@@ -339,6 +346,11 @@ rocprofiler_iterate_buffer_tracing_kind_operations(
|
||||
ops = rocprofiler::hsa::async_copy::get_ids();
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION:
|
||||
{
|
||||
ops = rocprofiler::hsa::memory_allocation::get_ids();
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY:
|
||||
{
|
||||
ops = rocprofiler::hsa::scratch_memory::get_ids();
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "lib/rocprofiler-sdk/hip/hip.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/async_copy.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp"
|
||||
#include "lib/rocprofiler-sdk/marker/marker.hpp"
|
||||
@@ -81,6 +82,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(CODE_OBJECT)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(SCRATCH_MEMORY)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_COPY)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_ALLOCATION)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(RCCL_API)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(OPENMP)
|
||||
|
||||
@@ -256,6 +258,11 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac
|
||||
return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
|
||||
{
|
||||
val = rocprofiler::hsa::memory_allocation::name_by_id(operation);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
if(!val)
|
||||
@@ -375,6 +382,11 @@ rocprofiler_iterate_callback_tracing_kind_operations(
|
||||
return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
|
||||
{
|
||||
ops = rocprofiler::hsa::memory_allocation::get_ids();
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
for(const auto& itr : ops)
|
||||
@@ -504,6 +516,7 @@ rocprofiler_iterate_callback_tracing_kind_operation_args(
|
||||
case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT:
|
||||
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY:
|
||||
case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION:
|
||||
case ROCPROFILER_CALLBACK_TRACING_RCCL_API:
|
||||
case ROCPROFILER_CALLBACK_TRACING_OPENMP:
|
||||
{
|
||||
|
||||
@@ -6,6 +6,7 @@ set(ROCPROFILER_LIB_HSA_SOURCES
|
||||
async_copy.cpp
|
||||
hsa_barrier.cpp
|
||||
hsa.cpp
|
||||
memory_allocation.cpp
|
||||
pc_sampling.hpp
|
||||
profile_serializer.cpp
|
||||
queue_controller.cpp
|
||||
@@ -16,6 +17,7 @@ set(ROCPROFILER_LIB_HSA_HEADERS
|
||||
agent_cache.hpp
|
||||
aql_packet.hpp
|
||||
async_copy.hpp
|
||||
memory_allocation.hpp
|
||||
defines.hpp
|
||||
hsa_barrier.hpp
|
||||
hsa.hpp
|
||||
|
||||
@@ -474,6 +474,15 @@ HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_async_copy_rect, hsa_amd_memory_async_copy_rect, hsa_amd_memory_async_copy_rect_fn)
|
||||
// clang-format on
|
||||
|
||||
#elif defined(ROCPROFILER_LIB_ROCPROFILER_HSA_MEMORY_ALLOCATION_CPP_IMPL) && \
|
||||
ROCPROFILER_LIB_ROCPROFILER_HSA_MEMORY_ALLOCATION_CPP_IMPL == 1
|
||||
|
||||
// clang-format off
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, hsa_memory_allocate, hsa_memory_allocate_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate_fn)
|
||||
HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create_fn)
|
||||
// clang-format on
|
||||
|
||||
#else
|
||||
# error "Do not compile this file directly. It is included by lib/rocprofiler/hsa/hsa.cpp"
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,635 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp"
|
||||
|
||||
#include "lib/common/logging.hpp"
|
||||
#include "lib/common/scope_destructor.hpp"
|
||||
#include "lib/common/static_object.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/agent.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
#include "lib/rocprofiler-sdk/tracing/fwd.hpp"
|
||||
#include "lib/rocprofiler-sdk/tracing/tracing.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/external_correlation.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa/api_id.h>
|
||||
#include <rocprofiler-sdk/hsa/table_id.h>
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include <hsa/amd_hsa_signal.h>
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <type_traits>
|
||||
|
||||
#define ROCPROFILER_LIB_ROCPROFILER_HSA_MEMORY_ALLOCATION_CPP_IMPL 1
|
||||
|
||||
// template specializations
|
||||
#include "hsa.def.cpp"
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
namespace memory_allocation
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using context_t = context::context;
|
||||
using external_corr_id_map_t = std::unordered_map<const context_t*, rocprofiler_user_data_t>;
|
||||
using region_to_agent_map = std::unordered_map<hsa_region_t, rocprofiler_agent_id_t>;
|
||||
using memory_pool_to_agent_map = std::unordered_map<hsa_amd_memory_pool_t, rocprofiler_agent_id_t>;
|
||||
using region_to_agent_pair = std::pair<region_to_agent_map*, rocprofiler_agent_id_t>;
|
||||
using map_pool_to_agent_pair = std::pair<memory_pool_to_agent_map*, rocprofiler_agent_id_t>;
|
||||
|
||||
// Set up information to identify agent from regions/pool
|
||||
template <size_t OpIdx>
|
||||
struct memory_allocation_info;
|
||||
|
||||
#define SPECIALIZE_MEMORY_ALLOCATION_INFO(FUNCTION, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC) \
|
||||
template <> \
|
||||
struct memory_allocation_info<ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION> \
|
||||
{ \
|
||||
using maptype = MAPTYPE; \
|
||||
using pairtype = PAIRTYPE; \
|
||||
using searchtype = SEARCHTYPE; \
|
||||
auto& operator()() const { return ITERATEFUNC; } \
|
||||
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \
|
||||
static constexpr auto name = "MEMORY_ALLOCATION_" #FUNCTION; \
|
||||
};
|
||||
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(NONE,
|
||||
region_to_agent_map,
|
||||
region_to_agent_pair,
|
||||
hsa_region_t,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(ALLOCATE,
|
||||
region_to_agent_map,
|
||||
region_to_agent_pair,
|
||||
hsa_region_t,
|
||||
get_core_table()->hsa_agent_iterate_regions_fn)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(MEMORY_POOL_ALLOCATE,
|
||||
memory_pool_to_agent_map,
|
||||
map_pool_to_agent_pair,
|
||||
hsa_amd_memory_pool_t,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn)
|
||||
SPECIALIZE_MEMORY_ALLOCATION_INFO(VMEM_HANDLE_CREATE,
|
||||
memory_pool_to_agent_map,
|
||||
map_pool_to_agent_pair,
|
||||
hsa_amd_memory_pool_t,
|
||||
get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn)
|
||||
#undef SPECIALIZE_MEMORY_ALLOCATION_INFO
|
||||
|
||||
template <size_t Idx, size_t... IdxTail>
|
||||
const char*
|
||||
name_by_id(const uint32_t id, std::index_sequence<Idx, IdxTail...>)
|
||||
{
|
||||
if(Idx == id) return memory_allocation_info<Idx>::name;
|
||||
if constexpr(sizeof...(IdxTail) > 0)
|
||||
return name_by_id(id, std::index_sequence<IdxTail...>{});
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <size_t Idx, size_t... IdxTail>
|
||||
uint32_t
|
||||
id_by_name(const char* name, std::index_sequence<Idx, IdxTail...>)
|
||||
{
|
||||
if(std::string_view{memory_allocation_info<Idx>::name} == std::string_view{name})
|
||||
return memory_allocation_info<Idx>::operation_idx;
|
||||
if constexpr(sizeof...(IdxTail) > 0)
|
||||
return id_by_name(name, std::index_sequence<IdxTail...>{});
|
||||
else
|
||||
return ROCPROFILER_MEMORY_ALLOCATION_LAST;
|
||||
}
|
||||
|
||||
template <size_t... Idx>
|
||||
void
|
||||
get_ids(std::vector<uint32_t>& _id_list, std::index_sequence<Idx...>)
|
||||
{
|
||||
auto _emplace = [](auto& _vec, uint32_t _v) {
|
||||
if(_v < static_cast<uint32_t>(ROCPROFILER_MEMORY_ALLOCATION_LAST)) _vec.emplace_back(_v);
|
||||
};
|
||||
|
||||
(_emplace(_id_list, memory_allocation_info<Idx>::operation_idx), ...);
|
||||
}
|
||||
|
||||
template <size_t... Idx>
|
||||
void
|
||||
get_names(std::vector<const char*>& _name_list, std::index_sequence<Idx...>)
|
||||
{
|
||||
auto _emplace = [](auto& _vec, const char* _v) {
|
||||
if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v);
|
||||
};
|
||||
|
||||
(_emplace(_name_list, memory_allocation_info<Idx>::name), ...);
|
||||
}
|
||||
|
||||
bool
|
||||
context_filter(const context::context* ctx)
|
||||
{
|
||||
auto has_buffered =
|
||||
(ctx->buffered_tracer &&
|
||||
(ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION)));
|
||||
|
||||
auto has_callback =
|
||||
(ctx->callback_tracer &&
|
||||
(ctx->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION)));
|
||||
|
||||
return (has_buffered || has_callback);
|
||||
}
|
||||
|
||||
// Sequence of HSA functions being tracked. Add to these to trace new commands
|
||||
enum memory_allocation_core_id
|
||||
{
|
||||
memory_allocation_core_allocate_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate,
|
||||
};
|
||||
using memory_allocation_core_index_seq_t = std::index_sequence<memory_allocation_core_allocate_id>;
|
||||
|
||||
enum memory_allocation_amd_ext_id
|
||||
{
|
||||
memory_allocation_amd_ext_allocate_id =
|
||||
ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate,
|
||||
memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create
|
||||
};
|
||||
using memory_allocation_amd_ext_index_seq_t =
|
||||
std::index_sequence<memory_allocation_amd_ext_allocate_id, memory_allocation_vmem_allocate_id>;
|
||||
|
||||
template <size_t TableIdx>
|
||||
struct memory_allocation_seq;
|
||||
|
||||
#define MEMORY_ALLOCATION_DEFINE_SEQ(TABLE_TYPE, SEQ) \
|
||||
template <> \
|
||||
struct memory_allocation_seq<TABLE_TYPE> \
|
||||
{ \
|
||||
static constexpr auto memory_allocation_index_seq_t = SEQ{}; \
|
||||
};
|
||||
|
||||
MEMORY_ALLOCATION_DEFINE_SEQ(ROCPROFILER_HSA_TABLE_ID_Core, memory_allocation_core_index_seq_t)
|
||||
MEMORY_ALLOCATION_DEFINE_SEQ(ROCPROFILER_HSA_TABLE_ID_AmdExt, memory_allocation_amd_ext_index_seq_t)
|
||||
|
||||
// Set argument indices for tracked functions
|
||||
template <size_t Idx>
|
||||
struct arg_indices;
|
||||
|
||||
#define HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES( \
|
||||
ENUM_ID, STARTING_ADDRESS_IDX, SIZE_IDX, REGION_IDX) \
|
||||
template <> \
|
||||
struct arg_indices<ENUM_ID> \
|
||||
{ \
|
||||
static constexpr auto starting_address_idx = STARTING_ADDRESS_IDX; \
|
||||
static constexpr auto size_idx = SIZE_IDX; \
|
||||
static constexpr auto region_idx = REGION_IDX; \
|
||||
};
|
||||
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_allocate_id, 2, 1, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_allocate_id, 3, 1, 0)
|
||||
HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_allocate_id, 4, 1, 0)
|
||||
|
||||
// Define operation indices for each tracked functions
|
||||
template <size_t Idx>
|
||||
struct memory_allocation_op;
|
||||
|
||||
#define MEMORY_ALLOCATE_OPERATION_IDX(ENUM_ID, FUNCTION) \
|
||||
template <> \
|
||||
struct memory_allocation_op<ENUM_ID> \
|
||||
{ \
|
||||
static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \
|
||||
};
|
||||
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, ALLOCATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, MEMORY_POOL_ALLOCATE);
|
||||
MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, VMEM_HANDLE_CREATE);
|
||||
|
||||
// Calls `_func` with the tuple elements selected by `Idx...`, in that order, and
// returns whatever the call returns (value category preserved).
template <typename FuncT, typename ArgsT, size_t... Idx>
decltype(auto)
invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence<Idx...>)
{
    // `_args` is deliberately used as an lvalue: elements are read, not moved out
    auto&& _callable = std::forward<FuncT>(_func);
    return std::forward<FuncT>(_callable)(std::get<Idx>(_args)...);
}
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx>
|
||||
auto&
|
||||
get_next_dispatch()
|
||||
{
|
||||
using function_t = typename hsa_api_meta<TableIdx, OpIdx>::function_type;
|
||||
static function_t _v = nullptr;
|
||||
return _v;
|
||||
}
|
||||
|
||||
constexpr auto null_rocp_agent_id =
|
||||
rocprofiler_agent_id_t{.handle = std::numeric_limits<uint64_t>::max()};
|
||||
|
||||
struct memory_allocation_data
|
||||
{
|
||||
using timestamp_t = rocprofiler_timestamp_t;
|
||||
using callback_data_t = rocprofiler_callback_tracing_memory_allocation_data_t;
|
||||
using buffered_data_t = rocprofiler_buffer_tracing_memory_allocation_record_t;
|
||||
|
||||
rocprofiler_thread_id_t tid = common::get_tid();
|
||||
rocprofiler_agent_id_t agent = null_rocp_agent_id;
|
||||
uint64_t size_allocated = 0;
|
||||
uint64_t starting_addr = 0;
|
||||
uint64_t start_ts = 0;
|
||||
context::correlation_id* correlation_id = nullptr;
|
||||
tracing::tracing_data tracing_data = {};
|
||||
rocprofiler_memory_allocation_operation_t func = ROCPROFILER_MEMORY_ALLOCATION_NONE;
|
||||
|
||||
callback_data_t get_callback_data(timestamp_t _beg = 0, timestamp_t _end = 0) const;
|
||||
buffered_data_t get_buffered_record(const context_t* _ctx,
|
||||
timestamp_t _beg = 0,
|
||||
timestamp_t _end = 0) const;
|
||||
};
|
||||
|
||||
// Packs the begin/end timestamps, agent, starting address and size into the public
// callback tracing payload.
memory_allocation_data::callback_data_t
memory_allocation_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const
{
    auto _payload = callback_data_t{};
    return common::init_public_api_struct(
        _payload, _beg, _end, agent, starting_addr, size_allocated);
}
|
||||
|
||||
// Builds the public buffer tracing record for this allocation. When `_ctx` is given,
// its external correlation id is looked up in the tracing data; otherwise the null
// user data value is used.
memory_allocation_data::buffered_data_t
memory_allocation_data::get_buffered_record(const context_t* _ctx,
                                            timestamp_t      _beg,
                                            timestamp_t      _end) const
{
    auto _external = context::null_user_data;
    if(_ctx) _external = tracing_data.external_correlation_ids.at(_ctx);

    const auto _correlation = rocprofiler_correlation_id_t{correlation_id->internal, _external};

    return common::init_public_api_struct(buffered_data_t{},
                                          ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
                                          func,
                                          _correlation,
                                          correlation_id->thread_idx,
                                          _beg,
                                          _end,
                                          agent,
                                          starting_addr,
                                          size_allocated);
}
|
||||
|
||||
// Callback function to populate the mapping of agents to regions
|
||||
template <size_t OpIdx, typename T>
|
||||
hsa_status_t
|
||||
callback_populate_map(T region_or_pool, void* data)
|
||||
{
|
||||
auto _agent_map_pair = static_cast<typename memory_allocation_info<OpIdx>::pairtype*>(data);
|
||||
auto _rocprof_agent = _agent_map_pair->second;
|
||||
auto existing_map = _agent_map_pair->first;
|
||||
|
||||
existing_map->insert({region_or_pool, _rocprof_agent});
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Returns the rocprofiler agent when given the region/pool
|
||||
template <size_t OpIdx, typename T, typename IterateFunc, typename CallbackFunc>
|
||||
rocprofiler_agent_id_t
|
||||
get_agent(T val, IterateFunc iterate_func, CallbackFunc callback)
|
||||
{
|
||||
static auto existing = typename memory_allocation_info<OpIdx>::maptype();
|
||||
|
||||
if(existing.count(val) == 0)
|
||||
{
|
||||
auto agents = rocprofiler::agent::get_agents();
|
||||
for(auto itr : agents)
|
||||
{
|
||||
auto hsa_agent = rocprofiler::agent::get_hsa_agent(itr);
|
||||
if(hsa_agent)
|
||||
{
|
||||
auto rocprof_agent = rocprofiler::agent::get_rocprofiler_agent(*hsa_agent);
|
||||
if(rocprof_agent)
|
||||
{
|
||||
auto data = typename memory_allocation_info<OpIdx>::pairtype{&existing,
|
||||
rocprof_agent->id};
|
||||
iterate_func(*hsa_agent, callback, &data);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return existing.count(val) == 0 ? null_rocp_agent_id : existing.at(val);
|
||||
}
|
||||
|
||||
// Reads the pointer stored in the allocation's output slot and returns it as an integer.
uint64_t
handle_starting_addr(void** starting_addr_pointer)
{
    void* const _allocated = *starting_addr_pointer;
    return reinterpret_cast<uint64_t>(_allocated);
}
|
||||
|
||||
uint64_t
|
||||
handle_starting_addr(hsa_amd_vmem_alloc_handle_t* vmem_alloc_handle)
|
||||
{
|
||||
return vmem_alloc_handle->handle;
|
||||
}
|
||||
|
||||
// Wrapper implementation that stores memory allocation information
|
||||
template <size_t TableIdx, size_t OpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
memory_allocation_impl(Args... args)
|
||||
{
|
||||
constexpr auto N = sizeof...(Args);
|
||||
constexpr auto starting_address_idx = arg_indices<OpIdx>::starting_address_idx;
|
||||
constexpr auto size_idx = arg_indices<OpIdx>::size_idx;
|
||||
constexpr auto region_idx = arg_indices<OpIdx>::region_idx;
|
||||
constexpr auto operation = memory_allocation_op<OpIdx>::operation_idx;
|
||||
|
||||
auto&& _tied_args = std::tie(args...);
|
||||
memory_allocation_data _data{};
|
||||
|
||||
{
|
||||
auto tracing_data = tracing::tracing_data{};
|
||||
|
||||
tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
OpIdx,
|
||||
tracing_data);
|
||||
// if no contexts are tracing memory copies for this direction, execute as usual
|
||||
if(tracing_data.empty())
|
||||
{
|
||||
return invoke(get_next_dispatch<TableIdx, OpIdx>(),
|
||||
std::move(_tied_args),
|
||||
std::make_index_sequence<N>{});
|
||||
}
|
||||
_data.tracing_data = std::move(tracing_data);
|
||||
}
|
||||
|
||||
auto& tracing_data = _data.tracing_data;
|
||||
auto starting_addr_pointer = std::get<starting_address_idx>(_tied_args);
|
||||
auto region_or_pool = std::get<region_idx>(_tied_args);
|
||||
|
||||
_data.tid = common::get_tid();
|
||||
_data.agent = get_agent<operation>(
|
||||
region_or_pool,
|
||||
memory_allocation_info<operation>{}(),
|
||||
callback_populate_map<operation, typename memory_allocation_info<operation>::searchtype>);
|
||||
_data.size_allocated = std::get<size_idx>(_tied_args);
|
||||
_data.func = operation;
|
||||
_data.correlation_id = context::get_latest_correlation_id();
|
||||
|
||||
if(!_data.correlation_id)
|
||||
{
|
||||
constexpr auto ref_count = 1;
|
||||
_data.correlation_id = context::correlation_tracing_service::construct(ref_count);
|
||||
}
|
||||
|
||||
// increase the reference count to denote that this correlation id is being used in a kernel
|
||||
_data.correlation_id->add_ref_count();
|
||||
auto thr_id = _data.correlation_id->thread_idx;
|
||||
tracing::populate_external_correlation_ids(
|
||||
tracing_data.external_correlation_ids,
|
||||
thr_id,
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
_data.correlation_id->internal);
|
||||
|
||||
if(!tracing_data.callback_contexts.empty())
|
||||
{
|
||||
auto _tracer_data = _data.get_callback_data();
|
||||
|
||||
tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts,
|
||||
thr_id,
|
||||
_data.correlation_id->internal,
|
||||
tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
_tracer_data);
|
||||
// enter callback may update the external correlation id field
|
||||
tracing::update_external_correlation_ids(
|
||||
tracing_data.external_correlation_ids,
|
||||
thr_id,
|
||||
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION);
|
||||
}
|
||||
auto start_ts = common::timestamp_ns();
|
||||
auto _ret = invoke(
|
||||
get_next_dispatch<TableIdx, OpIdx>(), std::move(_tied_args), std::make_index_sequence<N>{});
|
||||
auto end_ts = common::timestamp_ns();
|
||||
// Starting address is set after memory_allocation function is run. May need additional safety
|
||||
// checks before retrieving starting address?
|
||||
if(starting_addr_pointer != nullptr)
|
||||
{
|
||||
_data.starting_addr = handle_starting_addr(starting_addr_pointer);
|
||||
}
|
||||
|
||||
if(!tracing_data.empty())
|
||||
{
|
||||
if(!_data.tracing_data.callback_contexts.empty())
|
||||
{
|
||||
auto _tracer_data = _data.get_callback_data(start_ts, end_ts);
|
||||
|
||||
tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts,
|
||||
_data.tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
_tracer_data);
|
||||
}
|
||||
|
||||
if(!_data.tracing_data.buffered_contexts.empty())
|
||||
{
|
||||
auto record = _data.get_buffered_record(nullptr, start_ts, end_ts);
|
||||
|
||||
tracing::execute_buffer_record_emplace(_data.tracing_data.buffered_contexts,
|
||||
_data.tid,
|
||||
_data.correlation_id->internal,
|
||||
_data.tracing_data.external_correlation_ids,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
operation,
|
||||
record);
|
||||
}
|
||||
}
|
||||
|
||||
// decrement the reference count after usage in the callback/buffers
|
||||
_data.correlation_id->sub_ref_count();
|
||||
return _ret;
|
||||
}
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx, typename RetT, typename... Args>
|
||||
auto get_memory_allocation_impl(RetT (*)(Args...))
|
||||
{
|
||||
return &memory_allocation_impl<TableIdx, OpIdx, Args...>;
|
||||
}
|
||||
} // namespace
|
||||
// check out the assembly here... this compiles to a switch statement
|
||||
const char*
|
||||
name_by_id(uint32_t id)
|
||||
{
|
||||
return name_by_id(id, std::make_index_sequence<ROCPROFILER_MEMORY_ALLOCATION_LAST>{});
|
||||
}
|
||||
|
||||
uint32_t
|
||||
id_by_name(const char* name)
|
||||
{
|
||||
return id_by_name(name, std::make_index_sequence<ROCPROFILER_MEMORY_ALLOCATION_LAST>{});
|
||||
}
|
||||
|
||||
std::vector<uint32_t>
|
||||
get_ids()
|
||||
{
|
||||
auto _data = std::vector<uint32_t>{};
|
||||
_data.reserve(ROCPROFILER_MEMORY_ALLOCATION_LAST);
|
||||
get_ids(_data, std::make_index_sequence<ROCPROFILER_MEMORY_ALLOCATION_LAST>{});
|
||||
return _data;
|
||||
}
|
||||
|
||||
std::vector<const char*>
|
||||
get_names()
|
||||
{
|
||||
auto _data = std::vector<const char*>{};
|
||||
_data.reserve(ROCPROFILER_MEMORY_ALLOCATION_LAST);
|
||||
get_names(_data, std::make_index_sequence<ROCPROFILER_MEMORY_ALLOCATION_LAST>{});
|
||||
return _data;
|
||||
}
|
||||
|
||||
template <size_t TableIdx, typename LookupT = internal_table, typename Tp, size_t OpIdx>
|
||||
void
|
||||
memory_allocation_save(Tp* _orig, uint64_t _tbl_instance, std::integral_constant<size_t, OpIdx>)
|
||||
{
|
||||
using table_type = typename hsa_table_lookup<TableIdx>::type;
|
||||
|
||||
if constexpr(std::is_same<table_type, Tp>::value)
|
||||
{
|
||||
auto _meta = hsa_api_meta<TableIdx, OpIdx>{};
|
||||
|
||||
// original table and function
|
||||
auto& _orig_table = _meta.get_table(_orig);
|
||||
auto& _orig_func = _meta.get_table_func(_orig_table);
|
||||
|
||||
// table with copy function
|
||||
auto& _allocate_func = get_next_dispatch<TableIdx, OpIdx>();
|
||||
|
||||
ROCP_FATAL_IF(_allocate_func && _tbl_instance == 0)
|
||||
<< _meta.name << " has non-null function pointer " << _allocate_func
|
||||
<< " despite this being the first instance of the library being copies";
|
||||
|
||||
if(!_allocate_func)
|
||||
{
|
||||
ROCP_TRACE << "copying table entry for " << _meta.name;
|
||||
_allocate_func = _orig_func;
|
||||
}
|
||||
else
|
||||
{
|
||||
ROCP_TRACE << "skipping copying table entry for " << _meta.name
|
||||
<< " from table instance " << _tbl_instance;
|
||||
}
|
||||
}
|
||||
}
|
||||
template <size_t TableIdx,
|
||||
typename LookupT = internal_table,
|
||||
typename Tp,
|
||||
size_t OpIdx,
|
||||
size_t... OpIdxTail>
|
||||
void
|
||||
memory_allocation_save(Tp* _orig, uint64_t _tbl_instance, std::index_sequence<OpIdx, OpIdxTail...>)
|
||||
{
|
||||
memory_allocation_save<TableIdx, LookupT>(
|
||||
_orig, _tbl_instance, std::integral_constant<size_t, OpIdx>{});
|
||||
if constexpr(sizeof...(OpIdxTail) > 0)
|
||||
memory_allocation_save<TableIdx, LookupT>(
|
||||
_orig, _tbl_instance, std::index_sequence<OpIdxTail...>{});
|
||||
}
|
||||
|
||||
template <typename TableT>
|
||||
void
|
||||
memory_allocation_save(TableT* _orig, uint64_t _tbl_instance)
|
||||
{
|
||||
constexpr auto TableIdx = hsa_table_id_lookup<TableT>::value;
|
||||
if(_orig)
|
||||
memory_allocation_save<TableIdx, internal_table>(
|
||||
_orig, _tbl_instance, memory_allocation_seq<TableIdx>::memory_allocation_index_seq_t);
|
||||
}
|
||||
|
||||
template <size_t TableIdx, typename Tp, size_t OpIdx>
|
||||
void
|
||||
memory_allocation_wrap(Tp* _orig, std::integral_constant<size_t, OpIdx>)
|
||||
{
|
||||
auto _meta = hsa_api_meta<TableIdx, OpIdx>{};
|
||||
auto& _table = _meta.get_table(_orig);
|
||||
auto& _func = _meta.get_table_func(_table);
|
||||
|
||||
auto& _dispatch = get_next_dispatch<TableIdx, OpIdx>();
|
||||
CHECK_NOTNULL(_dispatch);
|
||||
_func = get_memory_allocation_impl<TableIdx, OpIdx>(_func);
|
||||
}
|
||||
|
||||
// Redirects the table entry of every function id in the sequence, in order.
template <size_t TableIdx, typename Tp, size_t OpIdx, size_t... OpIdxTail>
void
memory_allocation_wrap(Tp* _orig, std::index_sequence<OpIdx, OpIdxTail...>)
{
    // head of the sequence
    memory_allocation_wrap<TableIdx>(_orig, std::integral_constant<size_t, OpIdx>{});

    // remaining ids, left to right; expands to nothing when the tail is empty
    (memory_allocation_wrap<TableIdx>(_orig, std::integral_constant<size_t, OpIdxTail>{}), ...);
}
|
||||
|
||||
template <typename TableT>
|
||||
void
|
||||
memory_allocation_wrap(TableT* _orig)
|
||||
{
|
||||
constexpr auto TableIdx = hsa_table_id_lookup<TableT>::value;
|
||||
if(_orig)
|
||||
{
|
||||
memory_allocation_wrap<TableIdx>(
|
||||
_orig, memory_allocation_seq<TableIdx>::memory_allocation_index_seq_t);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace memory_allocation
|
||||
|
||||
template <typename TableT>
|
||||
void
|
||||
memory_allocation_init(TableT* _orig, uint64_t _tbl_instance)
|
||||
{
|
||||
constexpr auto TableIdx = hsa_table_id_lookup<TableT>::value;
|
||||
if(_orig)
|
||||
{
|
||||
memory_allocation::memory_allocation_save<TableIdx>(
|
||||
_orig,
|
||||
_tbl_instance,
|
||||
memory_allocation::memory_allocation_seq<TableIdx>::memory_allocation_index_seq_t);
|
||||
|
||||
auto ctxs = context::get_registered_contexts(memory_allocation::context_filter);
|
||||
if(!ctxs.empty())
|
||||
{
|
||||
memory_allocation::memory_allocation_wrap<TableIdx>(
|
||||
_orig,
|
||||
memory_allocation::memory_allocation_seq<TableIdx>::memory_allocation_index_seq_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANTIATE_MEMORY_ALLOC_FUNC(TABLE_TYPE, TABLE_IDX) \
|
||||
template void memory_allocation_init<TABLE_TYPE>(TABLE_TYPE * _tbl, uint64_t _instv); \
|
||||
template void memory_allocation::memory_allocation_save<TABLE_TYPE>(TABLE_TYPE * _tbl, \
|
||||
uint64_t _instv); \
|
||||
template void memory_allocation::memory_allocation_wrap<TABLE_TYPE>(TABLE_TYPE * _tbl);
|
||||
|
||||
INSTANTIATE_MEMORY_ALLOC_FUNC(hsa_core_table_t, ROCPROFILER_HSA_TABLE_ID_Core)
|
||||
INSTANTIATE_MEMORY_ALLOC_FUNC(hsa_amd_ext_table_t, ROCPROFILER_HSA_TABLE_ID_AmdExt)
|
||||
#undef INSTANTIATE_MEMORY_ALLOC_FUNC
|
||||
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
@@ -0,0 +1,53 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
|
||||
namespace rocprofiler
{
namespace hsa
{
namespace memory_allocation
{
/// Name registered for memory allocation operation `id`; nullptr if `id` is unknown.
const char*
name_by_id(uint32_t id);

/// Operation id whose registered name equals `name`.
uint32_t
id_by_name(const char* name);

/// Registered names of the memory allocation operations.
std::vector<const char*>
get_names();

/// Ids of the memory allocation operations.
std::vector<uint32_t>
get_ids();
}  // namespace memory_allocation

/// Prepares memory allocation tracing for the HSA API table `_orig`.
/// `_tbl_instance` is the instance number of the table being registered.
/// Defined and explicitly instantiated in memory_allocation.cpp.
template <typename TableT>
void
memory_allocation_init(TableT* _orig, uint64_t _tbl_instance);

}  // namespace hsa
}  // namespace rocprofiler
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "lib/rocprofiler-sdk/hip/hip.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/async_copy.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
@@ -790,6 +791,8 @@ rocprofiler_set_api_table(const char* name,
|
||||
rocprofiler::counters::device_counting_service_hsa_registration();
|
||||
|
||||
rocprofiler::hsa::async_copy_init(hsa_api_table, lib_instance);
|
||||
rocprofiler::hsa::memory_allocation_init(hsa_api_table->core_, lib_instance);
|
||||
rocprofiler::hsa::memory_allocation_init(hsa_api_table->amd_ext_, lib_instance);
|
||||
rocprofiler::code_object::initialize(hsa_api_table);
|
||||
rocprofiler::thread_trace::initialize(hsa_api_table);
|
||||
#if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0
|
||||
|
||||
@@ -54,6 +54,7 @@ add_subdirectory(bin)
|
||||
# validation tests
|
||||
add_subdirectory(kernel-tracing)
|
||||
add_subdirectory(async-copy-tracing)
|
||||
add_subdirectory(hsa-memory-allocation)
|
||||
add_subdirectory(scratch-memory-tracing)
|
||||
add_subdirectory(c-tool)
|
||||
add_subdirectory(page-migration)
|
||||
|
||||
@@ -24,3 +24,4 @@ add_subdirectory(scratch-memory)
|
||||
add_subdirectory(page-migration)
|
||||
add_subdirectory(hsa-queue-dependency)
|
||||
add_subdirectory(hip-graph)
|
||||
add_subdirectory(hsa-memory-allocation)
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)

# If no HIP compiler was provided, look for amdclang++ under the ROCm installation and
# use it when found.
if(NOT CMAKE_HIP_COMPILER)
    find_program(
        amdclangpp_EXECUTABLE
        NAMES amdclang++
        HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
        PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
        PATH_SUFFIXES bin llvm/bin NO_CACHE)
    mark_as_advanced(amdclangpp_EXECUTABLE)

    if(amdclangpp_EXECUTABLE)
        set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
    endif()
endif()

project(rocprofiler-tool-test-app-hsa-memory-allocation LANGUAGES CXX HIP)

# HIP build-type flags fall back to the C++ flags of the same build type when unset.
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
    if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
        set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
    endif()
endforeach()

# C++17 without compiler extensions, for both the C++ and HIP languages.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_EXTENSIONS OFF)
set(CMAKE_HIP_STANDARD_REQUIRED ON)

# The test application is a single source file compiled as HIP.
set_source_files_properties(hsa-memory-allocation.cpp PROPERTIES LANGUAGE HIP)
add_executable(hsa-memory-allocation)
target_sources(hsa-memory-allocation PRIVATE hsa-memory-allocation.cpp)
target_compile_options(hsa-memory-allocation PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow
                                                     -Werror)

find_package(Threads REQUIRED)
target_link_libraries(hsa-memory-allocation PRIVATE Threads::Threads hsa-runtime64
                                                    rocprofiler-sdk::tests-common-library)
|
||||
@@ -0,0 +1,267 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
// Checks the status of an HSA call. For any value other than HSA_STATUS_SUCCESS it
// prints the call site and the status text (or the status code in hex when
// hsa_status_string() cannot translate it) and then aborts the process.
// `err` is expanded more than once, so pass a variable rather than a call expression.
// The expansion is a complete brace-enclosed block; call sites in this file use it
// without a trailing semicolon.
#define RET_IF_HSA_ERR(err)                                                        \
    {                                                                              \
        if((err) != HSA_STATUS_SUCCESS)                                            \
        {                                                                          \
            char        hsa_err_hex_[12];                                          \
            const char* hsa_err_text_ = nullptr;                                   \
            if(hsa_status_string((err), &hsa_err_text_) != HSA_STATUS_SUCCESS)     \
            {                                                                      \
                sprintf(hsa_err_hex_, "%#x", (uint32_t)(err));                     \
                hsa_err_text_ = hsa_err_hex_;                                      \
            }                                                                      \
            printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__);        \
            printf("Call returned %s\n", hsa_err_text_);                           \
            abort();                                                               \
        }                                                                          \
    }
|
||||
|
||||
// Callback function to get the list of agents
|
||||
hsa_status_t
|
||||
get_agents(hsa_agent_t agent, void* data)
|
||||
{
|
||||
hsa_agent_t** agent_list = (hsa_agent_t**) data;
|
||||
**agent_list = agent;
|
||||
++(*agent_list);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Callback function to get the number of agents
|
||||
hsa_status_t
|
||||
get_num_agents(hsa_agent_t agent, void* data)
|
||||
{
|
||||
(void) agent;
|
||||
int* num_agents = (int*) data;
|
||||
++(*num_agents);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Callback function to get the number of regions of an agent
|
||||
hsa_status_t
|
||||
callback_get_num_regions(hsa_region_t region, void* data)
|
||||
{
|
||||
(void) region;
|
||||
int* num_regions = (int*) data;
|
||||
++(*num_regions);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Callback function to get the number of memory pools of an agent
|
||||
hsa_status_t
|
||||
callback_get_num_pools(hsa_amd_memory_pool_t memory_pool, void* data)
|
||||
{
|
||||
(void) memory_pool;
|
||||
int* num_pools = (int*) data;
|
||||
++(*num_pools);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Callback function to get the list of regions of an agent
|
||||
hsa_status_t
|
||||
callback_get_regions(hsa_region_t region, void* data)
|
||||
{
|
||||
hsa_region_t** region_list = (hsa_region_t**) data;
|
||||
**region_list = region;
|
||||
++(*region_list);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Callback function to get the list of memory pools of an agent
|
||||
hsa_status_t
|
||||
callback_get_memory_pools(hsa_amd_memory_pool_t memory_pool, void* data)
|
||||
{
|
||||
hsa_amd_memory_pool_t** pool_list = (hsa_amd_memory_pool_t**) data;
|
||||
**pool_list = memory_pool;
|
||||
++(*pool_list);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::vector<hsa_agent_t>
|
||||
get_agent_list()
|
||||
{
|
||||
size_t num_agents = 0;
|
||||
hsa_status_t status;
|
||||
// Get number of agents
|
||||
status = hsa_iterate_agents(get_num_agents, &num_agents);
|
||||
RET_IF_HSA_ERR(status)
|
||||
if(num_agents < 2)
|
||||
{
|
||||
printf("Not enough HSA agents available\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
// Create a array of size num_agents to store the agent list
|
||||
std::vector<hsa_agent_t> agents(num_agents);
|
||||
|
||||
// Get the agent list
|
||||
hsa_agent_t* agent_iter = &agents[0];
|
||||
status = hsa_iterate_agents(get_agents, &agent_iter);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
return agents;
|
||||
}
|
||||
|
||||
// Returns the first entry of `agents` whose HSA_AGENT_INFO_DEVICE attribute is
// HSA_DEVICE_TYPE_CPU. Aborts the process if the query fails or no such agent exists.
hsa_agent_t
get_cpu_agent(std::vector<hsa_agent_t>& agents)
{
    for(const hsa_agent_t& candidate : agents)
    {
        hsa_device_type_t  device_type;
        const hsa_status_t status =
            hsa_agent_get_info(candidate, HSA_AGENT_INFO_DEVICE, &device_type);
        RET_IF_HSA_ERR(status)

        if(device_type != HSA_DEVICE_TYPE_CPU) continue;
        return candidate;
    }
    std::cerr << "No CPU agents available" << std::endl;
    abort();
}
|
||||
|
||||
// Returns the first entry of `agents` whose HSA_AGENT_INFO_DEVICE attribute is
// HSA_DEVICE_TYPE_GPU. Aborts the process if the query fails or no such agent exists.
hsa_agent_t
get_gpu_agent(std::vector<hsa_agent_t>& agents)
{
    for(const hsa_agent_t& candidate : agents)
    {
        hsa_device_type_t  device_type;
        const hsa_status_t status =
            hsa_agent_get_info(candidate, HSA_AGENT_INFO_DEVICE, &device_type);
        RET_IF_HSA_ERR(status)

        if(device_type != HSA_DEVICE_TYPE_GPU) continue;
        return candidate;
    }
    std::cerr << "No GPU agents available" << std::endl;
    abort();
}
|
||||
|
||||
void
|
||||
call_hsa_memory_allocate(const size_t i, const size_t base_size, hsa_agent_t agent)
|
||||
{
|
||||
// Getting total number of regions for the agent
|
||||
int num_regions = 0;
|
||||
hsa_status_t status = hsa_agent_iterate_regions(agent, callback_get_num_regions, &num_regions);
|
||||
RET_IF_HSA_ERR(status)
|
||||
if(num_regions < 1)
|
||||
{
|
||||
printf("No HSA regions available\n");
|
||||
abort();
|
||||
}
|
||||
// Allocate memory to hold region list of an agent
|
||||
std::vector<hsa_region_t> region_list(num_regions);
|
||||
hsa_region_t* ptr_reg = ®ion_list[0];
|
||||
status = hsa_agent_iterate_regions(agent, callback_get_regions, &ptr_reg);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
for(size_t j = 0; j < i; ++j)
|
||||
{
|
||||
void* addr = 0;
|
||||
|
||||
status = hsa_memory_allocate(region_list[0], base_size, &addr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
call_hsa_memory_pool_allocate(const size_t i, const size_t base_size, hsa_agent_t agent)
|
||||
{
|
||||
// Getting total number of regions for the agent
|
||||
int num_pools = 0;
|
||||
hsa_status_t status =
|
||||
hsa_amd_agent_iterate_memory_pools(agent, callback_get_num_pools, &num_pools);
|
||||
RET_IF_HSA_ERR(status)
|
||||
if(num_pools < 1)
|
||||
{
|
||||
printf("No memory pools available\n");
|
||||
abort();
|
||||
}
|
||||
// Allocate memory to hold region list of an agent
|
||||
std::vector<hsa_amd_memory_pool_t> memory_pool_list(num_pools);
|
||||
hsa_amd_memory_pool_t* ptr_memory_pool = &memory_pool_list[0];
|
||||
status = hsa_amd_agent_iterate_memory_pools(agent, callback_get_memory_pools, &ptr_memory_pool);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
for(size_t j = 0; j < i; ++j)
|
||||
{
|
||||
void* addr = 0;
|
||||
uint32_t flags = 0;
|
||||
|
||||
status = hsa_amd_memory_pool_allocate(memory_pool_list[0], base_size, flags, &addr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
call_hsa_vmem_allocate(const size_t i, hsa_agent_t agent)
|
||||
{
|
||||
// Getting total number of regions for the agent
|
||||
int num_pools = 0;
|
||||
hsa_status_t status =
|
||||
hsa_amd_agent_iterate_memory_pools(agent, callback_get_num_pools, &num_pools);
|
||||
RET_IF_HSA_ERR(status)
|
||||
if(num_pools < 1)
|
||||
{
|
||||
printf("No memory pools available\n");
|
||||
abort();
|
||||
}
|
||||
// Allocate memory to hold region list of an agent
|
||||
std::vector<hsa_amd_memory_pool_t> memory_pool_list(num_pools);
|
||||
hsa_amd_memory_pool_t* ptr_memory_pool = &memory_pool_list[0];
|
||||
status = hsa_amd_agent_iterate_memory_pools(agent, callback_get_memory_pools, &ptr_memory_pool);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// Ensure Virtual Memory API is supported
|
||||
bool supp = false;
|
||||
status = hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*) &supp);
|
||||
RET_IF_HSA_ERR(status)
|
||||
if(!supp)
|
||||
{
|
||||
std::cerr << "Virtual Memory API not supported" << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
// Get runtime allocation granule size. Required for vmem_handle_create
|
||||
int size;
|
||||
status = hsa_amd_memory_pool_get_info(
|
||||
memory_pool_list[0], HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, (void*) &size);
|
||||
RET_IF_HSA_ERR(status)
|
||||
for(size_t j = 0; j < i; ++j)
|
||||
{
|
||||
hsa_amd_vmem_alloc_handle_t memory_handle{};
|
||||
|
||||
status = hsa_amd_vmem_handle_create(
|
||||
memory_pool_list[0], size, MEMORY_TYPE_NONE, 0, &memory_handle);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
hsa_status_t status;
|
||||
status = hsa_init();
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
std::vector<hsa_agent_t> agents = get_agent_list();
|
||||
hsa_agent_t cpu_agent = get_cpu_agent(agents);
|
||||
hsa_agent_t gpu_agent = get_gpu_agent(agents);
|
||||
call_hsa_memory_allocate(6, 1024, cpu_agent);
|
||||
call_hsa_memory_pool_allocate(9, 512, gpu_agent);
|
||||
// Virtual memory API not supported in CI. Will add back if this changes
|
||||
// call_hsa_vmem_allocate(3, gpu_agent);
|
||||
|
||||
status = hsa_shut_down();
|
||||
RET_IF_HSA_ERR(status)
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)

project(
    rocprofiler-tests-memory-allocation-tracing
    LANGUAGES CXX
    VERSION 0.0.0)

find_package(rocprofiler-sdk REQUIRED)

# Preload the JSON tool; when a memcheck preload environment is configured, the tool is
# appended to it instead of replacing it.
if(NOT ROCPROFILER_MEMCHECK_PRELOAD_ENV)
    set(PRELOAD_ENV "LD_PRELOAD=$<TARGET_FILE:rocprofiler-sdk-json-tool>")
else()
    set(PRELOAD_ENV
        "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$<TARGET_FILE:rocprofiler-sdk-json-tool>")
endif()

# Step 1: run the test application
add_test(
    NAME test-memory-allocation-tracing-execute
    COMMAND $<TARGET_FILE:hsa-memory-allocation>)

set(memory-allocation-tracing-env
    "${PRELOAD_ENV}"
    "ROCPROFILER_TOOL_OUTPUT_FILE=memory-allocation-tracing-test.json"
    "LD_LIBRARY_PATH=$<TARGET_FILE_DIR:rocprofiler-sdk::rocprofiler-sdk-shared-library>:$ENV{LD_LIBRARY_PATH}"
    )

set_tests_properties(
    test-memory-allocation-tracing-execute
    PROPERTIES TIMEOUT
               45
               LABELS
               "integration-tests"
               ENVIRONMENT
               "${memory-allocation-tracing-env}"
               FAIL_REGULAR_EXPRESSION
               "${ROCPROFILER_DEFAULT_FAIL_REGEX}")

# copy to binary directory
rocprofiler_configure_pytest_files(COPY validate.py conftest.py CONFIG pytest.ini)

# Step 2: validate the JSON file named by ROCPROFILER_TOOL_OUTPUT_FILE above
add_test(
    NAME test-memory-allocation-tracing-validate
    COMMAND
        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
        ${CMAKE_CURRENT_BINARY_DIR}/memory-allocation-tracing-test.json)

set_tests_properties(
    test-memory-allocation-tracing-validate
    PROPERTIES TIMEOUT
               45
               LABELS
               "integration-tests"
               DEPENDS
               test-memory-allocation-tracing-execute
               FAIL_REGULAR_EXPRESSION
               "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
|
||||
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
|
||||
|
||||
|
||||
def pytest_addoption(parser):
    """Register the ``--input`` option naming the JSON file to validate."""
    option_settings = {
        "action": "store",
        "default": "memory-allocation-tracing-test.json",
        "help": "Input JSON",
    }
    parser.addoption("--input", **option_settings)
|
||||
|
||||
|
||||
@pytest.fixture
def input_data(request):
    """Load the JSON file named by ``--input`` and return it wrapped in ``dotdict``."""
    json_path = request.config.getoption("--input")
    with open(json_path, "r") as stream:
        parsed = json.load(stream)
    return dotdict(parsed)
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
[pytest]
|
||||
addopts = --durations=20 -rA -s -vv
|
||||
testpaths = validate.py
|
||||
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
|
||||
@@ -0,0 +1,289 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
|
||||
def node_exists(name, data, min_len=1):
    """Assert that ``data[name]`` exists and is not None.

    When the value is a list, tuple, dict or set it must also hold at least
    ``min_len`` entries.
    """
    assert name in data
    node = data[name]
    assert node is not None
    if not isinstance(node, (list, tuple, dict, set)):
        return
    assert len(node) >= min_len, f"{name}:\n{data}"
|
||||
|
||||
|
||||
def test_data_structure(input_data):
    """verify minimum amount of expected data is present"""
    data = input_data

    node_exists("rocprofiler-sdk-json-tool", data)

    sdk_data = data["rocprofiler-sdk-json-tool"]

    node_exists("metadata", sdk_data)
    node_exists("pid", sdk_data["metadata"])
    node_exists("main_tid", sdk_data["metadata"])
    node_exists("init_time", sdk_data["metadata"])
    node_exists("fini_time", sdk_data["metadata"])

    node_exists("agents", sdk_data)
    node_exists("call_stack", sdk_data)
    node_exists("callback_records", sdk_data)
    node_exists("buffer_records", sdk_data)

    node_exists("names", sdk_data["callback_records"])
    node_exists("hsa_api_traces", sdk_data["callback_records"])
    node_exists("memory_allocations", sdk_data["callback_records"])

    node_exists("names", sdk_data["buffer_records"])
    # This section covers the buffered records, so the HSA API traces must be looked up
    # in buffer_records (the callback copy is already checked above).
    node_exists("hsa_api_traces", sdk_data["buffer_records"])
    node_exists("memory_allocations", sdk_data["buffer_records"])
|
||||
|
||||
|
||||
def test_size_entries(input_data):
|
||||
# check that size fields are > 0 but account for function arguments
|
||||
# which are named "size"
|
||||
def check_size(data, bt):
|
||||
if "size" in data.keys():
|
||||
if isinstance(data["size"], str) and bt.endswith('["args"]'):
|
||||
pass
|
||||
else:
|
||||
assert data["size"] > 0, f"origin: {bt}"
|
||||
|
||||
# recursively check the entire data structure
|
||||
def iterate_data(data, bt):
|
||||
if isinstance(data, (list, tuple)):
|
||||
for i, itr in enumerate(data):
|
||||
if isinstance(itr, dict):
|
||||
check_size(itr, f"{bt}[{i}]")
|
||||
iterate_data(itr, f"{bt}[{i}]")
|
||||
elif isinstance(data, dict):
|
||||
check_size(data, f"{bt}")
|
||||
for key, itr in data.items():
|
||||
iterate_data(itr, f'{bt}["{key}"]')
|
||||
|
||||
# start recursive check over entire JSON dict
|
||||
iterate_data(input_data, "input_data")
|
||||
|
||||
|
||||
def test_timestamps(input_data):
|
||||
"""Verify starting timestamps are less than ending timestamps"""
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
cb_start = {}
|
||||
cb_end = {}
|
||||
for titr in ["hsa_api_traces"]:
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
cid = itr["correlation_id"]["internal"]
|
||||
phase = itr["phase"]
|
||||
if phase == 1:
|
||||
cb_start[cid] = itr["timestamp"]
|
||||
elif phase == 2:
|
||||
cb_end[cid] = itr["timestamp"]
|
||||
assert cb_start[cid] <= itr["timestamp"]
|
||||
else:
|
||||
assert phase == 1 or phase == 2
|
||||
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["start_timestamp"] <= itr["end_timestamp"]
|
||||
|
||||
for titr in ["memory_allocations"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["start_timestamp"] < itr["end_timestamp"], f"[{titr}] {itr}"
|
||||
assert itr["correlation_id"]["internal"] > 0, f"[{titr}] {itr}"
|
||||
assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}"
|
||||
assert (
|
||||
sdk_data["metadata"]["init_time"] < itr["start_timestamp"]
|
||||
), f"[{titr}] {itr}"
|
||||
assert (
|
||||
sdk_data["metadata"]["init_time"] < itr["end_timestamp"]
|
||||
), f"[{titr}] {itr}"
|
||||
assert (
|
||||
sdk_data["metadata"]["fini_time"] > itr["start_timestamp"]
|
||||
), f"[{titr}] {itr}"
|
||||
assert (
|
||||
sdk_data["metadata"]["fini_time"] > itr["end_timestamp"]
|
||||
), f"[{titr}] {itr}"
|
||||
|
||||
api_start = cb_start[itr["correlation_id"]["internal"]]
|
||||
# api_end = cb_end[itr["correlation_id"]["internal"]]
|
||||
assert api_start < itr["start_timestamp"], f"[{titr}] {itr}"
|
||||
# assert api_end <= itr["end_timestamp"], f"[{titr}] {itr}"
|
||||
|
||||
|
||||
def test_internal_correlation_ids(input_data):
|
||||
"""Assure correlation ids are unique"""
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
api_corr_ids = []
|
||||
for titr in ["hsa_api_traces"]:
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
api_corr_ids.append(itr["correlation_id"]["internal"])
|
||||
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
api_corr_ids.append(itr["correlation_id"]["internal"])
|
||||
|
||||
api_corr_ids_sorted = sorted(api_corr_ids)
|
||||
api_corr_ids_unique = list(set(api_corr_ids))
|
||||
|
||||
for itr in sdk_data["buffer_records"]["memory_allocations"]:
|
||||
assert itr["correlation_id"]["internal"] in api_corr_ids_unique
|
||||
|
||||
len_corr_id_unq = len(api_corr_ids_unique)
|
||||
assert len(api_corr_ids) != len_corr_id_unq
|
||||
assert max(api_corr_ids_sorted) == len_corr_id_unq
|
||||
|
||||
|
||||
def test_external_correlation_ids(input_data):
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
extern_corr_ids = []
|
||||
for titr in ["hsa_api_traces"]:
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
assert itr["correlation_id"]["external"] > 0
|
||||
assert itr["thread_id"] == itr["correlation_id"]["external"]
|
||||
extern_corr_ids.append(itr["correlation_id"]["external"])
|
||||
|
||||
extern_corr_ids = list(set(sorted(extern_corr_ids)))
|
||||
for titr in ["hsa_api_traces"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}"
|
||||
assert (
|
||||
itr["thread_id"] == itr["correlation_id"]["external"]
|
||||
), f"[{titr}] {itr}"
|
||||
assert itr["thread_id"] in extern_corr_ids, f"[{titr}] {itr}"
|
||||
assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}"
|
||||
|
||||
for titr in ["memory_allocations"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}"
|
||||
assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}"
|
||||
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}"
|
||||
assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}"
|
||||
|
||||
|
||||
def test_memory_alloc_sizes(input_data):
|
||||
"""Ensure trace file memory allocation operations match up with the memory allocation operations performed in hsa-memory-allocation"""
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
# Op values:
|
||||
# 0 == ??? (unknown)
|
||||
# 1 == hsa_memory_allocate
|
||||
# 2 == hsa_amd_ext_memory_allocate
|
||||
# 3 == hsa_amd_vmem_handle_create
|
||||
memory_alloc_cnt = dict(
|
||||
[
|
||||
(idx, {"agent": set(), "starting_addr": set(), "size": set(), "count": 0})
|
||||
for idx in range(1, 4)
|
||||
]
|
||||
)
|
||||
for itr in sdk_data["buffer_records"]["memory_allocations"]:
|
||||
op_id = itr["operation"]
|
||||
assert op_id > 0 and op_id <= 3, f"{itr}"
|
||||
memory_alloc_cnt[op_id]["count"] += 1
|
||||
memory_alloc_cnt[op_id]["starting_addr"].add(itr.starting_address)
|
||||
memory_alloc_cnt[op_id]["size"].add(itr.allocation_size)
|
||||
memory_alloc_cnt[op_id]["agent"].add(itr.agent_id.handle)
|
||||
|
||||
for itr in sdk_data["callback_records"]["memory_copies"]:
|
||||
op_id = itr.operation
|
||||
assert op_id > 0 and op_id <= 3, f"{itr}"
|
||||
memory_alloc_cnt[op_id]["count"] += 1
|
||||
|
||||
phase = itr.phase
|
||||
pitr = itr.payload
|
||||
|
||||
assert phase is not None, f"{itr}"
|
||||
assert pitr is not None, f"{itr}"
|
||||
|
||||
if phase == 1:
|
||||
assert pitr.start_timestamp == 0, f"{itr}"
|
||||
assert pitr.end_timestamp == 0, f"{itr}"
|
||||
elif phase == 2:
|
||||
assert pitr.start_timestamp > 0, f"{itr}"
|
||||
assert pitr.end_timestamp > 0, f"{itr}"
|
||||
assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}"
|
||||
|
||||
memory_alloc_cnt[op_id]["starting_addr"].add(pitr.starting_address)
|
||||
memory_alloc_cnt[op_id]["size"].add(pitr.allocation_size)
|
||||
memory_alloc_cnt[op_id]["agent"].add(pitr.agent_id.handle)
|
||||
else:
|
||||
assert phase == 1 or phase == 2, f"{itr}"
|
||||
|
||||
# In the memory allocation test which generates this file
|
||||
# 6 hsa_memory_allocation calls with 1024 bytes were called
|
||||
# and 9 hsa_amd_memory_pool_allocations with 512 bytes
|
||||
# were called
|
||||
assert memory_alloc_cnt[1]["count"] == 6
|
||||
assert memory_alloc_cnt[2]["count"] == 9
|
||||
# assert memory_alloc_cnt[3]["count"] == 3
|
||||
assert len(memory_alloc_cnt[1]["starting_addr"]) == 6
|
||||
assert len(memory_alloc_cnt[2]["starting_addr"]) == 9
|
||||
# assert len(memory_alloc_cnt[3]["starting_addr"]) == 3
|
||||
assert len(memory_alloc_cnt[1]["size"]) == 1
|
||||
assert len(memory_alloc_cnt[2]["size"]) == 1
|
||||
# assert len(memory_alloc_cnt[3]["size"]) == 1
|
||||
assert 1024 in memory_alloc_cnt[1]["size"]
|
||||
assert 512 in memory_alloc_cnt[2]["size"]
|
||||
assert len(memory_alloc_cnt[1]["agent"]) == 1
|
||||
assert len(memory_alloc_cnt[2]["agent"]) == 1
|
||||
# assert len(memory_alloc_cnt[3]["agent"]) == 1
|
||||
assert memory_alloc_cnt[1]["agent"] != memory_alloc_cnt[2]["agent"]
|
||||
# assert memory_alloc_cnt[2]["agent"] == memory_alloc_cnt[3]["agent"]
|
||||
|
||||
|
||||
def test_retired_correlation_ids(input_data):
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
def _sort_dict(inp):
|
||||
return dict(sorted(inp.items()))
|
||||
|
||||
api_corr_ids = {}
|
||||
for titr in ["hsa_api_traces"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
corr_id = itr["correlation_id"]["internal"]
|
||||
assert corr_id not in api_corr_ids.keys()
|
||||
api_corr_ids[corr_id] = itr
|
||||
|
||||
alloc_corr_ids = {}
|
||||
for titr in ["memory_allocations"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
corr_id = itr["correlation_id"]["internal"]
|
||||
assert corr_id not in alloc_corr_ids.keys()
|
||||
alloc_corr_ids[corr_id] = itr
|
||||
|
||||
retired_corr_ids = {}
|
||||
for itr in sdk_data["buffer_records"]["retired_correlation_ids"]:
|
||||
corr_id = itr["internal_correlation_id"]
|
||||
assert corr_id not in retired_corr_ids.keys()
|
||||
retired_corr_ids[corr_id] = itr
|
||||
|
||||
api_corr_ids = _sort_dict(api_corr_ids)
|
||||
alloc_corr_ids = _sort_dict(alloc_corr_ids)
|
||||
retired_corr_ids = _sort_dict(retired_corr_ids)
|
||||
|
||||
for cid, itr in alloc_corr_ids.items():
|
||||
assert cid in retired_corr_ids.keys()
|
||||
retired_ts = retired_corr_ids[cid]["timestamp"]
|
||||
end_ts = itr["end_timestamp"]
|
||||
assert (retired_ts - end_ts) > 0, f"correlation-id: {cid}, data: {itr}"
|
||||
|
||||
for cid, itr in api_corr_ids.items():
|
||||
assert cid in retired_corr_ids.keys()
|
||||
retired_ts = retired_corr_ids[cid]["timestamp"]
|
||||
end_ts = itr["end_timestamp"]
|
||||
assert (retired_ts - end_ts) > 0, f"correlation-id: {cid}, data: {itr}"
|
||||
|
||||
assert len(api_corr_ids.keys()) == (len(retired_corr_ids.keys()))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this file under pytest, stopping at the first failure, and forward any extra
    # command-line arguments; pytest's result becomes the process exit code.
    pytest_args = ["-x", __file__, *sys.argv[1:]]
    sys.exit(pytest.main(pytest_args))
|
||||
@@ -24,7 +24,9 @@ from __future__ import absolute_import
|
||||
|
||||
|
||||
def test_perfetto_data(
|
||||
pftrace_data, json_data, categories=("hip", "hsa", "marker", "kernel", "memory_copy")
|
||||
pftrace_data,
|
||||
json_data,
|
||||
categories=("hip", "hsa", "marker", "kernel", "memory_copy", "memory_allocation"),
|
||||
):
|
||||
|
||||
mapping = {
|
||||
@@ -33,6 +35,7 @@ def test_perfetto_data(
|
||||
"marker": ("marker_api", "marker_api"),
|
||||
"kernel": ("kernel_dispatch", "kernel_dispatch"),
|
||||
"memory_copy": ("memory_copy", "memory_copy"),
|
||||
"memory_allocation": ("memory_allocation", "memory_allocation"),
|
||||
}
|
||||
|
||||
# make sure they specified valid categories
|
||||
@@ -70,6 +73,7 @@ def test_otf2_data(
|
||||
"marker": ("marker_api", "marker_api"),
|
||||
"kernel": ("kernel_dispatch", "kernel_dispatch"),
|
||||
"memory_copy": ("memory_copy", "memory_copy"),
|
||||
"memory_allocation": ("memory_allocation", "memory_allocation"),
|
||||
}
|
||||
|
||||
# make sure they specified valid categories
|
||||
|
||||
@@ -29,6 +29,7 @@ add_subdirectory(tracing-hip-in-libraries)
|
||||
add_subdirectory(counter-collection)
|
||||
add_subdirectory(hsa-queue-dependency)
|
||||
add_subdirectory(kernel-rename)
|
||||
add_subdirectory(memory-allocation)
|
||||
add_subdirectory(aborted-app)
|
||||
add_subdirectory(summary)
|
||||
add_subdirectory(roctracer-roctx)
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)

project(
    rocprofiler-tests-rocprofv3-memory-allocation-tracing
    LANGUAGES CXX
    VERSION 0.0.0)

find_package(rocprofiler-sdk REQUIRED)

rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py)

# The memcheck preload setting is handed over under the name ROCPROF_PRELOAD instead of
# LD_PRELOAD.
string(
    REPLACE "LD_PRELOAD="
            "ROCPROF_PRELOAD="
            PRELOAD_ENV
            "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")

set(memory-allocation-tracing-env "${PRELOAD_ENV}")

# Step 1: run the test application under rocprofv3 with memory allocation tracing,
# requesting JSON, Perfetto and OTF2 output.
add_test(
    NAME rocprofv3-test-memory-allocation-tracing-execute
    COMMAND
        $<TARGET_FILE:rocprofiler-sdk::rocprofv3>
        --memory-allocation-trace
        -d ${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace
        -o out
        --output-format json pftrace otf2
        --log-level env
        --
        $<TARGET_FILE:hsa-memory-allocation>)

set_tests_properties(
    rocprofv3-test-memory-allocation-tracing-execute
    PROPERTIES TIMEOUT
               45
               LABELS
               "integration-tests"
               ENVIRONMENT
               "${memory-allocation-tracing-env}"
               FAIL_REGULAR_EXPRESSION
               "threw an exception")

# Step 2: validate the three output files written by step 1
add_test(
    NAME rocprofv3-test-memory-allocation-tracing-validate
    COMMAND
        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py
        --json-input
        ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.json
        --pftrace-input
        ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.pftrace
        --otf2-input
        ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.otf2)

set_tests_properties(
    rocprofv3-test-memory-allocation-tracing-validate
    PROPERTIES TIMEOUT
               45
               LABELS
               "integration-tests"
               DEPENDS
               rocprofv3-test-memory-allocation-tracing-execute
               FAIL_REGULAR_EXPRESSION
               "AssertionError")
|
||||
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
|
||||
from rocprofiler_sdk.pytest_utils import collapse_dict_list
|
||||
from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader
|
||||
from rocprofiler_sdk.pytest_utils.otf2_reader import OTF2Reader
|
||||
|
||||
|
||||
def pytest_addoption(parser):
    """Register the options naming the three rocprofv3 output files to validate.

    ``--json-input``, ``--pftrace-input`` and ``--otf2-input`` each store a path and
    default to the matching ``out_results`` file under ``memory-allocation-tracing/``.
    """
    parser.addoption(
        "--json-input",
        action="store",
        default="memory-allocation-tracing/out_results.json",
        help="Input JSON",
    )
    # The help texts below used to read "Input JSON" for all three options.
    parser.addoption(
        "--pftrace-input",
        action="store",
        default="memory-allocation-tracing/out_results.pftrace",
        help="Input pftrace file",
    )
    parser.addoption(
        "--otf2-input",
        action="store",
        default="memory-allocation-tracing/out_results.otf2",
        help="Input OTF2 file",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def json_data(request):
    """Load the file named by ``--json-input``, pass it through ``collapse_dict_list`` and wrap it in ``dotdict``."""
    json_path = request.config.getoption("--json-input")
    with open(json_path, "r") as stream:
        parsed = json.load(stream)
        collapsed = collapse_dict_list(parsed)
        return dotdict(collapsed)
|
||||
|
||||
|
||||
@pytest.fixture
def pftrace_data(request):
    """Read the file named by ``--pftrace-input`` with ``PerfettoReader`` and return the first element of the result."""
    trace_path = request.config.getoption("--pftrace-input")
    reader = PerfettoReader(trace_path)
    return reader.read()[0]
|
||||
|
||||
|
||||
@pytest.fixture
def otf2_data(request):
    """Read the file named by ``--otf2-input`` with ``OTF2Reader`` and return the first element of the result.

    Raises:
        FileNotFoundError: if the path given by ``--otf2-input`` does not exist.
    """
    filename = request.config.getoption("--otf2-input")
    if not os.path.exists(filename):
        # A missing file is FileNotFoundError; FileExistsError means the opposite
        # (something already exists where it should not).
        raise FileNotFoundError(f"{filename} does not exist")
    return OTF2Reader(filename).read()[0]
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
[pytest]
|
||||
addopts = --durations=20 -rA -s -vv
|
||||
testpaths = validate.py
|
||||
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
|
||||
+89
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
import json
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def node_exists(name, data, min_len=1):
    """Assert that ``data[name]`` exists and is not None.

    When the value is a list, tuple, dict or set it must also hold at least
    ``min_len`` entries.
    """
    assert name in data
    node = data[name]
    assert node is not None
    if not isinstance(node, (list, tuple, dict, set)):
        return
    assert len(node) >= min_len
|
||||
|
||||
|
||||
def get_operation(record, kind_name, op_name=None):
    """Look up a buffer-record kind in ``record["strings"]["buffer_records"]``.

    With ``op_name`` omitted, returns ``(index of the kind, its list of operation
    names)`` for the first entry whose "kind" equals ``kind_name``. With ``op_name``
    given, returns the index of that operation within a matching kind. Returns None
    when nothing matches.
    """
    for kind_index, entry in enumerate(record["strings"]["buffer_records"]):
        if entry["kind"] != kind_name:
            continue
        operations = entry["operations"]
        if op_name is None:
            return kind_index, operations
        for op_index, candidate in enumerate(operations):
            if candidate == op_name:
                return op_index
    return None
|
||||
|
||||
|
||||
def test_memory_allocation(json_data):
    """Validate the buffered memory allocation records in the rocprofv3 JSON output.

    Each record must carry the expected fields with positive values, start before it
    ends, and refer to the MEMORY_ALLOCATION kind and one of its operation names. No
    record may report the agent handle 2**64 - 1.
    """
    data = json_data["rocprofiler-sdk-tool"]
    records = data["buffer_records"]["memory_allocation"]

    _, bf_op_names = get_operation(data, "MEMORY_ALLOCATION")

    assert len(bf_op_names) == 4

    required_fields = (
        "size",
        "kind",
        "operation",
        "correlation_id",
        "end_timestamp",
        "start_timestamp",
        "thread_id",
        "agent_id",
        "starting_address",
        "allocation_size",
    )

    reported_agent_handles = set()
    # check buffering data
    for node in records:
        for field in required_fields:
            assert field in node

        assert node.size > 0
        assert node.allocation_size > 0
        assert node.starting_address > 0
        assert node.thread_id > 0
        assert node.agent_id.handle > 0
        assert node.start_timestamp > 0
        assert node.end_timestamp > 0
        assert node.start_timestamp < node.end_timestamp

        kind_info = data.strings.buffer_records[node.kind]
        assert kind_info.kind == "MEMORY_ALLOCATION"
        assert (
            data.strings.buffer_records[node.kind].operations[node.operation]
            in bf_op_names
        )

        reported_agent_handles.add(node["agent_id"]["handle"])

    assert 2**64 - 1 not in reported_agent_handles
|
||||
|
||||
|
||||
def test_perfetto_data(pftrace_data, json_data):
    """Run the shared rocprofv3 Perfetto check for the "memory_allocation" category only."""
    import rocprofiler_sdk.tests.rocprofv3 as rocprofv3

    selected_categories = ("memory_allocation",)
    rocprofv3.test_perfetto_data(pftrace_data, json_data, selected_categories)
|
||||
|
||||
|
||||
def test_otf2_data(otf2_data, json_data):
    """Run the shared rocprofv3 OTF2 check for the "memory_allocation" category only."""
    import rocprofiler_sdk.tests.rocprofv3 as rocprofv3

    selected_categories = ("memory_allocation",)
    rocprofv3.test_otf2_data(otf2_data, json_data, selected_categories)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this file under pytest, stopping at the first failure, and forward any extra
    # command-line arguments; pytest's result becomes the process exit code.
    pytest_args = ["-x", __file__, *sys.argv[1:]]
    sys.exit(pytest.main(pytest_args))
|
||||
@@ -188,6 +188,8 @@ def test_summary_data(json_data):
|
||||
assert itr.stats.count >= 2130 and itr.stats.count <= 2150
|
||||
elif itr.domain == "MEMORY_COPY":
|
||||
assert itr.stats.count == 12
|
||||
elif itr.domain == "MEMORY_ALLOCATION":
|
||||
assert itr.stats.count >= 10 and itr.stats.count <= 30
|
||||
elif itr.domain == "MARKER_API":
|
||||
assert itr.stats.count == 1106
|
||||
expected = dict(
|
||||
@@ -231,6 +233,7 @@ def test_summary_display_data(json_data, summary_data):
|
||||
marker = get_df("MARKER_API")
|
||||
dispatch = get_df("KERNEL_DISPATCH")
|
||||
memcpy = get_df("MEMORY_COPY")
|
||||
memalloc = get_df("MEMORY_ALLOCATION")
|
||||
dispatch_and_copy = get_df("KERNEL_DISPATCH + MEMORY_COPY")
|
||||
hip_and_marker = get_df("HIP_API + MARKER_API") if num_summary_grps > 1 else None
|
||||
total = get_df("SUMMARY")
|
||||
@@ -239,18 +242,21 @@ def test_summary_display_data(json_data, summary_data):
|
||||
|
||||
assert get_dims(marker) == [7, 9], f"{marker}"
|
||||
assert get_dims(memcpy) == [2, 9], f"{memcpy}"
|
||||
assert get_dims(memalloc) == [1, 9], f"{memalloc}"
|
||||
assert get_dims(dispatch) == [3, 9], f"{dispatch}"
|
||||
assert get_dims(dispatch_and_copy) == [5, 9], f"{dispatch_and_copy}"
|
||||
assert get_dims(hip) == [14, 9], f"{hip}"
|
||||
assert get_dims(hip_and_marker) == expected_hip_and_marker_dims, f"{hip_and_marker}"
|
||||
assert get_dims(total) == [23, 9], f"{total}"
|
||||
assert get_dims(total) == [24, 9], f"{total}"
|
||||
|
||||
|
||||
def test_perfetto_data(pftrace_data, json_data):
|
||||
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
|
||||
|
||||
rocprofv3.test_perfetto_data(
|
||||
pftrace_data, json_data, ("hip", "marker", "kernel", "memory_copy")
|
||||
pftrace_data,
|
||||
json_data,
|
||||
("hip", "marker", "kernel", "memory_copy", "memory_allocation"),
|
||||
)
|
||||
|
||||
|
||||
@@ -258,7 +264,9 @@ def test_otf2_data(otf2_data, json_data):
|
||||
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
|
||||
|
||||
rocprofv3.test_otf2_data(
|
||||
otf2_data, json_data, ("hip", "marker", "kernel", "memory_copy")
|
||||
otf2_data,
|
||||
json_data,
|
||||
("hip", "marker", "kernel", "memory_copy", "memory_allocation"),
|
||||
)
|
||||
|
||||
|
||||
|
||||
+126
-20
@@ -391,6 +391,21 @@ struct memory_copy_callback_record_t
|
||||
}
|
||||
};
|
||||
|
||||
struct memory_allocation_callback_record_t
|
||||
{
|
||||
uint64_t timestamp = 0;
|
||||
rocprofiler_callback_tracing_record_t record = {};
|
||||
rocprofiler_callback_tracing_memory_allocation_data_t payload = {};
|
||||
|
||||
template <typename ArchiveT>
|
||||
void save(ArchiveT& ar) const
|
||||
{
|
||||
ar(cereal::make_nvp("timestamp", timestamp));
|
||||
cereal::save(ar, record);
|
||||
ar(cereal::make_nvp("payload", payload));
|
||||
}
|
||||
};
|
||||
|
||||
struct scratch_memory_callback_record_t
|
||||
{
|
||||
uint64_t timestamp = 0;
|
||||
@@ -483,6 +498,7 @@ auto hip_api_cb_records = std::deque<hip_api_callback_record_t>{};
|
||||
auto scratch_memory_cb_records = std::deque<scratch_memory_callback_record_t>{};
|
||||
auto kernel_dispatch_cb_records = std::deque<kernel_dispatch_callback_record_t>{};
|
||||
auto memory_copy_cb_records = std::deque<memory_copy_callback_record_t>{};
|
||||
auto memory_allocation_cb_records = std::deque<memory_allocation_callback_record_t>{};
|
||||
auto rccl_api_cb_records = std::deque<rccl_api_callback_record_t>{};
|
||||
|
||||
int
|
||||
@@ -696,6 +712,16 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
auto _lk = std::unique_lock<std::mutex>{_mutex};
|
||||
memory_copy_cb_records.emplace_back(memory_copy_callback_record_t{ts, record, *data});
|
||||
}
|
||||
else if(record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION)
|
||||
{
|
||||
auto* data =
|
||||
static_cast<rocprofiler_callback_tracing_memory_allocation_data_t*>(record.payload);
|
||||
|
||||
static auto _mutex = std::mutex{};
|
||||
auto _lk = std::unique_lock<std::mutex>{_mutex};
|
||||
memory_allocation_cb_records.emplace_back(
|
||||
memory_allocation_callback_record_t{ts, record, *data});
|
||||
}
|
||||
else if(record.kind == ROCPROFILER_CALLBACK_TRACING_RCCL_API)
|
||||
{
|
||||
auto* data = static_cast<rocprofiler_callback_tracing_rccl_api_data_t*>(record.payload);
|
||||
@@ -720,8 +746,10 @@ auto marker_api_bf_records = std::deque<rocprofiler_buffer_tracing_marker_a
|
||||
auto hip_api_bf_records = std::deque<rocprofiler_buffer_tracing_hip_api_record_t>{};
|
||||
auto kernel_dispatch_bf_records = std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>{};
|
||||
auto memory_copy_bf_records = std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>{};
|
||||
auto scratch_memory_records = std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>{};
|
||||
auto page_migration_records = std::deque<rocprofiler_buffer_tracing_page_migration_record_t>{};
|
||||
auto memory_allocation_bf_records =
|
||||
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>{};
|
||||
auto scratch_memory_records = std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>{};
|
||||
auto page_migration_records = std::deque<rocprofiler_buffer_tracing_page_migration_record_t>{};
|
||||
auto corr_id_retire_records =
|
||||
std::deque<rocprofiler_buffer_tracing_correlation_id_retirement_record_t>{};
|
||||
auto rccl_api_bf_records = std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>{};
|
||||
@@ -800,6 +828,13 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
|
||||
|
||||
memory_copy_bf_records.emplace_back(*record);
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION)
|
||||
{
|
||||
auto* record = static_cast<rocprofiler_buffer_tracing_memory_allocation_record_t*>(
|
||||
header->payload);
|
||||
|
||||
memory_allocation_bf_records.emplace_back(*record);
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* record = static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
|
||||
@@ -904,29 +939,32 @@ void
|
||||
pop_external_correlation();
|
||||
|
||||
// contexts
|
||||
rocprofiler_context_id_t hsa_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t hip_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t marker_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t code_object_ctx = {0};
|
||||
rocprofiler_context_id_t rccl_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t hsa_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t hip_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t marker_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t memory_copy_callback_ctx = {0};
|
||||
rocprofiler_context_id_t memory_copy_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t rccl_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t counter_collection_ctx = {0};
|
||||
rocprofiler_context_id_t scratch_memory_ctx = {0};
|
||||
rocprofiler_context_id_t corr_id_retire_ctx = {0};
|
||||
rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0};
|
||||
rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t page_migration_ctx = {0};
|
||||
rocprofiler_context_id_t hsa_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t hip_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t marker_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t code_object_ctx = {0};
|
||||
rocprofiler_context_id_t rccl_api_callback_ctx = {0};
|
||||
rocprofiler_context_id_t hsa_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t hip_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t marker_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t memory_copy_callback_ctx = {0};
|
||||
rocprofiler_context_id_t memory_copy_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t memory_allocation_callback_ctx = {0};
|
||||
rocprofiler_context_id_t memory_allocation_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t rccl_api_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t counter_collection_ctx = {0};
|
||||
rocprofiler_context_id_t scratch_memory_ctx = {0};
|
||||
rocprofiler_context_id_t corr_id_retire_ctx = {0};
|
||||
rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0};
|
||||
rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0};
|
||||
rocprofiler_context_id_t page_migration_ctx = {0};
|
||||
// buffers
|
||||
rocprofiler_buffer_id_t hsa_api_buffered_buffer = {};
|
||||
rocprofiler_buffer_id_t hip_api_buffered_buffer = {};
|
||||
rocprofiler_buffer_id_t marker_api_buffered_buffer = {};
|
||||
rocprofiler_buffer_id_t kernel_dispatch_buffer = {};
|
||||
rocprofiler_buffer_id_t memory_copy_buffer = {};
|
||||
rocprofiler_buffer_id_t memory_allocation_buffer = {};
|
||||
rocprofiler_buffer_id_t page_migration_buffer = {};
|
||||
rocprofiler_buffer_id_t counter_collection_buffer = {};
|
||||
rocprofiler_buffer_id_t scratch_memory_buffer = {};
|
||||
@@ -940,12 +978,14 @@ auto contexts = std::unordered_map<std::string_view, rocprofiler_context_id_t*>{
|
||||
{"CODE_OBJECT", &code_object_ctx},
|
||||
{"KERNEL_DISPATCH_CALLBACK", &kernel_dispatch_callback_ctx},
|
||||
{"MEMORY_COPY_CALLBACK", &memory_copy_callback_ctx},
|
||||
{"MEMORY_ALLOCATION_CALLBACK", &memory_allocation_callback_ctx},
|
||||
{"RCCL_API_CALLBACK", &rccl_api_callback_ctx},
|
||||
{"HSA_API_BUFFERED", &hsa_api_buffered_ctx},
|
||||
{"HIP_API_BUFFERED", &hip_api_buffered_ctx},
|
||||
{"MARKER_API_BUFFERED", &marker_api_buffered_ctx},
|
||||
{"KERNEL_DISPATCH_BUFFERED", &kernel_dispatch_buffered_ctx},
|
||||
{"MEMORY_COPY_BUFFERED", &memory_copy_buffered_ctx},
|
||||
{"MEMORY_ALLOCATION_BUFFERED", &memory_allocation_buffered_ctx},
|
||||
{"PAGE_MIGRATION", &page_migration_ctx},
|
||||
{"COUNTER_COLLECTION", &counter_collection_ctx},
|
||||
{"SCRATCH_MEMORY", &scratch_memory_ctx},
|
||||
@@ -953,11 +993,12 @@ auto contexts = std::unordered_map<std::string_view, rocprofiler_context_id_t*>{
|
||||
{"RCCL_API_BUFFERED", &rccl_api_buffered_ctx},
|
||||
};
|
||||
|
||||
auto buffers = std::array<rocprofiler_buffer_id_t*, 10>{&hsa_api_buffered_buffer,
|
||||
auto buffers = std::array<rocprofiler_buffer_id_t*, 11>{&hsa_api_buffered_buffer,
|
||||
&hip_api_buffered_buffer,
|
||||
&marker_api_buffered_buffer,
|
||||
&kernel_dispatch_buffer,
|
||||
&memory_copy_buffer,
|
||||
&memory_allocation_buffer,
|
||||
&scratch_memory_buffer,
|
||||
&page_migration_buffer,
|
||||
&counter_collection_buffer,
|
||||
@@ -1092,6 +1133,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
nullptr),
|
||||
"memory copy callback tracing service configure");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
|
||||
memory_allocation_callback_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION,
|
||||
nullptr,
|
||||
0,
|
||||
tool_tracing_callback,
|
||||
nullptr),
|
||||
"memory allocation callback tracing service configure");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_tracing_service(scratch_memory_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
|
||||
@@ -1158,6 +1208,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
&memory_copy_buffer),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(memory_allocation_buffered_ctx,
|
||||
buffer_size,
|
||||
watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
tool_tracing_buffered,
|
||||
tool_data,
|
||||
&memory_allocation_buffer),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(scratch_memory_ctx,
|
||||
buffer_size,
|
||||
watermark,
|
||||
@@ -1261,6 +1320,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
memory_copy_buffer),
|
||||
"buffer tracing service for memory copy configure");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(memory_allocation_buffered_ctx,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION,
|
||||
nullptr,
|
||||
0,
|
||||
memory_allocation_buffer),
|
||||
"buffer tracing service for memory allocation configure");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(scratch_memory_ctx,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
@@ -1448,9 +1515,11 @@ tool_fini(void* tool_data)
|
||||
<< ", scratch_memory_callback_records=" << scratch_memory_cb_records.size()
|
||||
<< ", kernel_dispatch_callback_records=" << kernel_dispatch_cb_records.size()
|
||||
<< ", memory_copy_callback_records=" << memory_copy_cb_records.size()
|
||||
<< ", memory_allocation_callback_records=" << memory_allocation_cb_records.size()
|
||||
<< ", rccl_api_callback_records=" << rccl_api_cb_records.size()
|
||||
<< ", kernel_dispatch_bf_records=" << kernel_dispatch_bf_records.size()
|
||||
<< ", memory_copy_bf_records=" << memory_copy_bf_records.size()
|
||||
<< ", memory_allocation_bf_records=" << memory_allocation_bf_records.size()
|
||||
<< ", scratch_memory_records=" << scratch_memory_records.size()
|
||||
<< ", page_migration=" << page_migration_records.size()
|
||||
<< ", hsa_api_bf_records=" << hsa_api_bf_records.size()
|
||||
@@ -1551,6 +1620,7 @@ write_json(call_stack_t* _call_stack)
|
||||
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records));
|
||||
json_ar(cereal::make_nvp("kernel_dispatch", kernel_dispatch_cb_records));
|
||||
json_ar(cereal::make_nvp("memory_copies", memory_copy_cb_records));
|
||||
json_ar(cereal::make_nvp("memory_allocations", memory_allocation_cb_records));
|
||||
} catch(std::exception& e)
|
||||
{
|
||||
std::cerr << "[" << getpid() << "][" << __FUNCTION__
|
||||
@@ -1566,6 +1636,7 @@ write_json(call_stack_t* _call_stack)
|
||||
json_ar(cereal::make_nvp("names", buffer_names));
|
||||
json_ar(cereal::make_nvp("kernel_dispatch", kernel_dispatch_bf_records));
|
||||
json_ar(cereal::make_nvp("memory_copies", memory_copy_bf_records));
|
||||
json_ar(cereal::make_nvp("memory_allocations", memory_allocation_bf_records));
|
||||
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records));
|
||||
json_ar(cereal::make_nvp("page_migration", page_migration_records));
|
||||
json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records));
|
||||
@@ -1650,6 +1721,12 @@ write_perfetto()
|
||||
agent_ids.emplace(itr.src_agent_id.handle);
|
||||
}
|
||||
|
||||
for(auto itr : memory_allocation_bf_records)
|
||||
{
|
||||
tids.emplace(itr.thread_id);
|
||||
agent_ids.emplace(itr.agent_id.handle);
|
||||
}
|
||||
|
||||
for(auto itr : kernel_dispatch_bf_records)
|
||||
{
|
||||
tids.emplace(itr.thread_id);
|
||||
@@ -1888,6 +1965,35 @@ write_perfetto()
|
||||
itr.end_timestamp);
|
||||
}
|
||||
|
||||
for(auto itr : memory_allocation_bf_records)
|
||||
{
|
||||
auto name = buffer_names.at(itr.kind, itr.operation);
|
||||
auto& track = agent_tracks.at(itr.agent_id.handle);
|
||||
|
||||
TRACE_EVENT_BEGIN(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
::perfetto::StaticString(name.data()),
|
||||
track,
|
||||
itr.start_timestamp,
|
||||
::perfetto::Flow::ProcessScoped(itr.correlation_id.internal),
|
||||
"begin_ns",
|
||||
itr.start_timestamp,
|
||||
"kind",
|
||||
itr.kind,
|
||||
"operation",
|
||||
itr.operation,
|
||||
"agent",
|
||||
agents_map.at(itr.agent_id).logical_node_id,
|
||||
"Allocation_size",
|
||||
itr.allocation_size,
|
||||
"Starting_address",
|
||||
itr.starting_address);
|
||||
TRACE_EVENT_END(sdk::perfetto_category<sdk::category::memory_allocation>::name,
|
||||
track,
|
||||
itr.end_timestamp,
|
||||
"end_ns",
|
||||
itr.end_timestamp);
|
||||
}
|
||||
|
||||
auto demangled = std::unordered_map<std::string_view, std::string>{};
|
||||
for(auto itr : kernel_dispatch_bf_records)
|
||||
{
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user