From 3bd7773cf77bd86e8f3adf8cb90a2cce8efc120f Mon Sep 17 00:00:00 2001 From: itrowbri Date: Mon, 18 Nov 2024 20:22:14 -0600 Subject: [PATCH] Memory Allocation Tracking (#1142) * Initial commit: Need to implement wrapper function to collect data and test that wrapper function is correctly replacing core HSA functions * Attempted to implement wrapper implementation for hsa memory allocation functions. Need to modify generate record files and test if implementation is working as expected * Debugging and implementing generateCSV function * Memory allocation size and starting address outputted to csv and json file formats * Formatting * Initial setup for OTF2 and Perfetto generation * Collecting agent id for memory_allocation and formatting * Modified memory_allocation.cpp to set up code for AMD_EXT commands * Support for memory_pool_allocate added * Removed accidentally added file * Made flag optional and added more OTF2 and Perfetto code. Needs testing to ensure perfetto and OTF2 works * Formatting * Fixed perfetto and otf2 output * Fixed flag issue due to incorrect buffer use * Updated documentation * Small cleaning and comments * Added test for HSA memory allocation tracing * Fixed summary test validation errors due to allocation tracing. Added type to location_base to create unique event ids for allocation due to OTF2 trace error * Decreased lower limit of hip calls for test * Modified summary tests to vary number of allocate requests * Minor fixes to address comments. Still need to address OTF2 comments * Fix docs and changed OTF2 to use enum for type specified in location_base construction * Fixed schema error * Added vmem command tracking. Need to add test * Updated test to work with vmem command and updated generateCSV to output int instead of hex string. * OTF2 enum update and misspelling fix * CI does not support Virtual Memory API. Removed vmem test. 
Will add back if CI is modified to support vmem API * Update CMakeLists.txt for memory allocation test * Updated summary test * Minor fixes to address comments * Moved domain_type.hpp enum to before LAST * Fixed compile errors and formatting * Fixed stats summary domain name error * Added rocprofv3 test * Page migration test fix * Undo page migration test changes. Failures do not appear to have to do with memory allocation --- CHANGELOG.md | 1 + source/bin/rocprofv3.py | 12 +- source/docs/data/memory_allocation_trace.csv | 4 + source/docs/how-to/using-rocprofv3.rst | 57 +- source/docs/rocprofv3-schema.json | 84 +++ source/docs/rocprofv3_input_schema.json | 7 +- .../include/rocprofiler-sdk/buffer_tracing.h | 22 + .../rocprofiler-sdk/callback_tracing.h | 13 + source/include/rocprofiler-sdk/cxx/hash.hpp | 2 + .../include/rocprofiler-sdk/cxx/operators.hpp | 11 + .../include/rocprofiler-sdk/cxx/perfetto.hpp | 4 +- .../rocprofiler-sdk/cxx/serialization.hpp | 28 + .../rocprofiler-sdk/external_correlation.h | 1 + source/include/rocprofiler-sdk/fwd.h | 24 +- source/lib/output/buffered_output.hpp | 3 + source/lib/output/csv.hpp | 1 + source/lib/output/domain_type.cpp | 4 + source/lib/output/domain_type.hpp | 1 + source/lib/output/generateCSV.cpp | 44 ++ source/lib/output/generateCSV.hpp | 6 + source/lib/output/generateJSON.cpp | 20 +- source/lib/output/generateJSON.hpp | 25 +- source/lib/output/generateOTF2.cpp | 155 ++++- source/lib/output/generateOTF2.hpp | 24 +- source/lib/output/generatePerfetto.cpp | 88 ++- source/lib/output/generatePerfetto.hpp | 21 +- source/lib/output/generateStats.cpp | 18 + source/lib/output/generateStats.hpp | 5 + source/lib/rocprofiler-sdk-tool/config.hpp | 2 + source/lib/rocprofiler-sdk-tool/tool.cpp | 73 +- source/lib/rocprofiler-sdk/buffer_tracing.cpp | 12 + .../lib/rocprofiler-sdk/callback_tracing.cpp | 13 + source/lib/rocprofiler-sdk/hsa/CMakeLists.txt | 2 + source/lib/rocprofiler-sdk/hsa/hsa.def.cpp | 9 + 
.../rocprofiler-sdk/hsa/memory_allocation.cpp | 635 ++++++++++++++++++ .../rocprofiler-sdk/hsa/memory_allocation.hpp | 53 ++ source/lib/rocprofiler-sdk/registration.cpp | 3 + tests/CMakeLists.txt | 1 + tests/bin/CMakeLists.txt | 1 + .../bin/hsa-memory-allocation/CMakeLists.txt | 43 ++ .../hsa-memory-allocation.cpp | 267 ++++++++ tests/hsa-memory-allocation/CMakeLists.txt | 46 ++ tests/hsa-memory-allocation/conftest.py | 22 + tests/hsa-memory-allocation/pytest.ini | 5 + tests/hsa-memory-allocation/validate.py | 289 ++++++++ tests/pytest-packages/tests/rocprofv3.py | 6 +- tests/rocprofv3/CMakeLists.txt | 1 + .../memory-allocation/CMakeLists.txt | 47 ++ tests/rocprofv3/memory-allocation/conftest.py | 52 ++ tests/rocprofv3/memory-allocation/pytest.ini | 5 + tests/rocprofv3/memory-allocation/validate.py | 89 +++ tests/rocprofv3/summary/validate.py | 14 +- tests/tools/json-tool.cpp | 146 +++- 53 files changed, 2387 insertions(+), 134 deletions(-) create mode 100644 source/docs/data/memory_allocation_trace.csv create mode 100644 source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp create mode 100644 source/lib/rocprofiler-sdk/hsa/memory_allocation.hpp create mode 100644 tests/bin/hsa-memory-allocation/CMakeLists.txt create mode 100644 tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp create mode 100644 tests/hsa-memory-allocation/CMakeLists.txt create mode 100644 tests/hsa-memory-allocation/conftest.py create mode 100644 tests/hsa-memory-allocation/pytest.ini create mode 100644 tests/hsa-memory-allocation/validate.py create mode 100644 tests/rocprofv3/memory-allocation/CMakeLists.txt create mode 100644 tests/rocprofv3/memory-allocation/conftest.py create mode 100644 tests/rocprofv3/memory-allocation/pytest.ini create mode 100755 tests/rocprofv3/memory-allocation/validate.py diff --git a/CHANGELOG.md b/CHANGELOG.md index adb12feae2..e5d1632686 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -108,6 +108,7 @@ Full documentation for ROCprofiler-SDK is available at 
[rocm.docs.amd.com/projec - Start and end timestamp columns to the counter collection csv output - Check to force tools to initialize context id with zero - Support to specify hardware counters for collection using rocprofv3 as `rocprofv3 --pmc [COUNTER [COUNTER ...]]` +- Memory Allocation Tracing ### Changed diff --git a/source/bin/rocprofv3.py b/source/bin/rocprofv3.py index ea4a378547..4196123543 100755 --- a/source/bin/rocprofv3.py +++ b/source/bin/rocprofv3.py @@ -141,13 +141,13 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins aggregate_tracing_options, "-r", "--runtime-trace", - help="Collect tracing data for HIP runtime API, Marker (ROCTx) API, RCCL API, Memory operations (copies and scratch), and Kernel dispatches. Similar to --sys-trace but without tracing HIP compiler API and the underlying HSA API.", + help="Collect tracing data for HIP runtime API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocation), and Kernel dispatches. Similar to --sys-trace but without tracing HIP compiler API and the underlying HSA API.", ) add_parser_bool_argument( aggregate_tracing_options, "-s", "--sys-trace", - help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies and scratch), and Kernel dispatches.", + help="Collect tracing data for HIP API, HSA API, Marker (ROCTx) API, RCCL API, Memory operations (copies, scratch, and allocations), and Kernel dispatches.", ) basic_tracing_options = parser.add_argument_group("Basic tracing options") @@ -173,6 +173,11 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins "--memory-copy-trace", help="For collecting Memory Copy Traces. This was part of HIP and HSA traces in previous rocprof versions but is now a separate option", ) + add_parser_bool_argument( + basic_tracing_options, + "--memory-allocation-trace", + help="For collecting Memory Allocation Traces. 
Displays starting address, allocation size, and agent where allocation occurred.", + ) add_parser_bool_argument( basic_tracing_options, "--scratch-memory-trace", @@ -686,6 +691,7 @@ def run(app_args, args, **kwargs): "marker_trace", "kernel_trace", "memory_copy_trace", + "memory_allocation_trace", "scratch_memory_trace", "rccl_trace", ): @@ -697,6 +703,7 @@ def run(app_args, args, **kwargs): "marker_trace", "kernel_trace", "memory_copy_trace", + "memory_allocation_trace", "scratch_memory_trace", "rccl_trace", ): @@ -724,6 +731,7 @@ def run(app_args, args, **kwargs): ["rccl_trace", "RCCL_API_TRACE"], ["kernel_trace", "KERNEL_TRACE"], ["memory_copy_trace", "MEMORY_COPY_TRACE"], + ["memory_allocation_trace", "MEMORY_ALLOCATION_TRACE"], ["scratch_memory_trace", "SCRATCH_MEMORY_TRACE"], ] ).items(): diff --git a/source/docs/data/memory_allocation_trace.csv b/source/docs/data/memory_allocation_trace.csv new file mode 100644 index 0000000000..d9733b2718 --- /dev/null +++ b/source/docs/data/memory_allocation_trace.csv @@ -0,0 +1,4 @@ +"Kind","Operation","Agent_Id","Allocation_Size","Starting_Address","Correlation_Id","Start_Timestamp","End_Timestamp" +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497356288,1,65788054621500,65788055678893 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497348096,1,65788055691832,65788056666844 +"MEMORY_ALLOCATION","MEMORY_ALLOCATION_ALLOCATE",0,1024,140341497339904,1,65788056672061,65788057643457 diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index 97b6ce408f..68b0236bf8 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -55,11 +55,11 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt - Output control * - ``-r`` \| ``--runtime-trace`` - - Collects HIP (runtime), memory copy, marker, scratch memory, and kernel dispatch traces. 
+ - Collects HIP (runtime), memory copy, memory allocation, marker, scratch memory, and kernel dispatch traces. - Application Tracing * - ``-s`` \| ``--sys-trace`` - - Collects HIP, HSA, memory copy, marker, scratch memory, and kernel dispatch traces. + - Collects HIP, HSA, memory copy, memory allocation, marker, scratch memory, and kernel dispatch traces. - Application Tracing * - ``--hip-trace`` @@ -78,6 +78,10 @@ Here is the sample of commonly used ``rocprofv3`` command-line options. Some opt - Collects memory copy traces. - Application tracing + * - ``--memory-allocation-trace`` + - Collects memory allocation traces. + - Application tracing + * - ``--scratch-memory-trace`` - Collects scratch memory operations traces. - Application tracing @@ -356,6 +360,30 @@ Here are the contents of ``memory_copy_trace.csv`` file: For the description of the fields in the output file, see :ref:`output-file-fields`. +Memory allocation trace ++++++++++++++++++++++++++ + +To trace memory allocations during the application run, use: + +.. code-block:: shell + + rocprofv3 --memory-allocation-trace -- < app_path > + +The above command generates a ``memory_allocation_trace.csv`` file prefixed with the process ID. + +.. code-block:: shell + + $ cat 6489_memory_allocation_trace.csv + +Here are the contents of ``memory_allocation_trace.csv`` file: + +.. csv-table:: Memory allocation trace + :file: /data/memory_allocation_trace.csv + :widths: 10,10,10,10,10,10,20,20 + :header-rows: 1 + +For the description of the fields in the output file, see :ref:`output-file-fields`. + Runtime trace +++++++++++++++ @@ -374,7 +402,7 @@ memory operations (copies and scratch). rocprofv3 –-runtime-trace -- < app_relative_path > -Running the above command generates ``hip_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, ``scratch_memory_trace.csv``,and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID. 
+Running the above command generates ``hip_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, ``scratch_memory_trace.csv``, ``memory_allocation_trace.csv``, and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID. System trace ++++++++++++++ @@ -385,7 +413,7 @@ This is an all-inclusive option to collect all the above-mentioned traces. rocprofv3 –-sys-trace -- < app_relative_path > -Running the above command generates ``hip_api_trace.csv``, ``hsa_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID. +Running the above command generates ``hip_api_trace.csv``, ``hsa_api_trace.csv``, ``kernel_trace.csv``, ``memory_copy_trace.csv``, ``memory_allocation_trace.csv``, and ``marker_api_trace.csv`` (if ``ROCTx`` APIs are specified in the application) files prefixed with the process ID. Scratch memory trace ++++++++++++++++++++++ @@ -464,6 +492,8 @@ Properties Dispatch Traces. - **``memory_copy_trace``** *(boolean)*: For Collecting Memory Copy Traces. + - **``memory_allocation_trace``** *(boolean)*: For Collecting Memory + Allocation Traces. - **``scratch_memory_trace``** *(boolean)*: For Collecting Scratch Memory operations Traces. - **``stats``** *(boolean)*: For Collecting statistics of enabled @@ -479,8 +509,8 @@ Properties - **``hsa_image_trace``** *(boolean)*: For Collecting HSA API Traces (Image-extension API). - **``sys_trace``** *(boolean)*: For Collecting HIP, HSA, Marker - (ROCTx), Memory copy, Scratch memory, and Kernel dispatch - traces. + (ROCTx), Memory copy, Memory allocation, Scratch memory, and + Kernel dispatch traces. - **``mangled_kernels``** *(boolean)*: Do not demangle the kernel names. - **``truncate_kernels``** *(boolean)*: Truncate the demangled @@ -990,3 +1020,18 @@ Properties - **`src_agent_id`** *(object, required)*: Source Agent ID. 
- **`handle`** *(integer, required)*: Handle of the agent. - **`bytes`** *(integer, required)*: Bytes copied. + - **`memory_allocation`** *(array)*: Memory allocation records. + - **Items** *(object)* + - **`size`** *(integer, required)*: Size of the memory allocation record. + - **`kind`** *(integer, required)*: Kind of the memory allocation record. + - **`operation`** *(integer, required)*: Operation of the memory allocation record. + - **`correlation_id`** *(object, required)*: Correlation ID information. + - **`internal`** *(integer, required)*: Internal correlation ID. + - **`external`** *(integer, required)*: External correlation ID. + - **`start_timestamp`** *(integer, required)*: Start timestamp. + - **`end_timestamp`** *(integer, required)*: End timestamp. + - **`thread_id`** *(integer, required)*: Thread ID. + - **`agent_id`** *(object, required)*: Agent ID. + - **`handle`** *(integer, required)*: Handle of the agent. + - **`starting_address`** *(integer, required)*: Starting address of allocation. + - **`allocation_size`** *(integer, required)*: Size of allocation. diff --git a/source/docs/rocprofv3-schema.json b/source/docs/rocprofv3-schema.json index f9e57d765d..f9438dc752 100644 --- a/source/docs/rocprofv3-schema.json +++ b/source/docs/rocprofv3-schema.json @@ -1374,6 +1374,90 @@ "bytes" ] } + }, + "memory_allocation": { + "type": "array", + "description": "Memory allocation records.", + "items": { + "type": "object", + "properties": { + "size": { + "type": "integer", + "description": "Size of the memory allocation record." + }, + "kind": { + "type": "integer", + "description": "Kind of the memory allocation record." + }, + "operation": { + "type": "integer", + "description": "Operation of the memory allocation record." + }, + "correlation_id": { + "type": "object", + "description": "Correlation ID information.", + "properties": { + "internal": { + "type": "integer", + "description": "Internal correlation ID." + }, + "external": { + "type": "integer", + "description": "External correlation ID." 
+ } + }, + "required": [ + "internal", + "external" + ] + }, + "start_timestamp": { + "type": "integer", + "description": "Start timestamp." + }, + "end_timestamp": { + "type": "integer", + "description": "End timestamp." + }, + "thread_id": { + "type": "integer", + "description": "Thread ID." + }, + "agent_id": { + "type": "object", + "description": "Agent ID.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the agent." + } + }, + "required": [ + "handle" + ] + }, + "starting_address": { + "type": "integer", + "description": "Starting address of allocation." + }, + "allocation_size": { + "type": "integer", + "description": "Size of allocation." + } + }, + "required": [ + "size", + "kind", + "operation", + "correlation_id", + "start_timestamp", + "end_timestamp", + "thread_id", + "agent_id", + "starting_address", + "allocation_size" + ] + } } } } diff --git a/source/docs/rocprofv3_input_schema.json b/source/docs/rocprofv3_input_schema.json index 58228ed28e..08f999a639 100644 --- a/source/docs/rocprofv3_input_schema.json +++ b/source/docs/rocprofv3_input_schema.json @@ -61,6 +61,11 @@ "description": "For Collecting Memory Copy Traces" }, + "memory_allocation_trace": { + "type": "boolean", + "description": "For Collecting Memory Allocation Traces" + }, + "scratch_memory_trace": { "type": "boolean", "description": "For Collecting Scratch Memory operations Traces" @@ -98,7 +103,7 @@ "sys_trace" : { "type": "boolean", - "description": "For Collecting HIP, HSA, Marker (ROCTx), Memory copy, Scratch memory, and Kernel dispatch traces" + "description": "For Collecting HIP, HSA, Marker (ROCTx), Memory copy, Memory allocation, Scratch memory, and Kernel dispatch traces" }, "mangled_kernels": { diff --git a/source/include/rocprofiler-sdk/buffer_tracing.h b/source/include/rocprofiler-sdk/buffer_tracing.h index 16b379bb17..3074ce12f4 100644 --- a/source/include/rocprofiler-sdk/buffer_tracing.h +++ b/source/include/rocprofiler-sdk/buffer_tracing.h @@ 
-203,6 +203,28 @@ typedef struct /// ::rocprofiler_memory_copy_operation_t) } rocprofiler_buffer_tracing_memory_copy_record_t; +/** + * @brief ROCProfiler Buffer Memory Allocation Tracer Record. + */ +typedef struct +{ + uint64_t size; ///< size of this struct + rocprofiler_buffer_tracing_kind_t kind; + rocprofiler_memory_allocation_operation_t operation; + rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record + rocprofiler_thread_id_t thread_id; ///< id for thread that triggered allocation + rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds + rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds + rocprofiler_agent_id_t agent_id; ///< agent information for memory allocation + uint64_t starting_address; ///< starting address for memory allocation + uint64_t allocation_size; ///< size for memory allocation + /// @var kind + /// @brief ::ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION + /// @var operation + /// @brief Specification of the memory allocation function (@see + /// ::rocprofiler_memory_allocation_operation_t) +} rocprofiler_buffer_tracing_memory_allocation_record_t; + /** * @brief ROCProfiler Buffer Kernel Dispatch Tracer Record. */ diff --git a/source/include/rocprofiler-sdk/callback_tracing.h b/source/include/rocprofiler-sdk/callback_tracing.h index 23ab96f995..633e7fb393 100644 --- a/source/include/rocprofiler-sdk/callback_tracing.h +++ b/source/include/rocprofiler-sdk/callback_tracing.h @@ -210,6 +210,19 @@ typedef struct uint64_t bytes; ///< bytes copied } rocprofiler_callback_tracing_memory_copy_data_t; +/** + * @brief ROCProfiler Memory Allocation Callback Data. 
+ */ +typedef struct +{ + uint64_t size; ///< size of this struct + rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds + rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds + rocprofiler_agent_id_t agent_id; ///< agent id for memory allocation + uint64_t starting_address; ///< starting address for memory allocation + uint64_t allocation_size; ///< size of memory allocation +} rocprofiler_callback_tracing_memory_allocation_data_t; + /** * @brief ROCProfiler Scratch Memory Callback Data. */ diff --git a/source/include/rocprofiler-sdk/cxx/hash.hpp b/source/include/rocprofiler-sdk/cxx/hash.hpp index f5e5306b47..5a3ca76154 100644 --- a/source/include/rocprofiler-sdk/cxx/hash.hpp +++ b/source/include/rocprofiler-sdk/cxx/hash.hpp @@ -66,6 +66,8 @@ ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(rocprofiler_callback_thread_t) ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_agent_t) ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_signal_t) ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_executable_t) +ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_region_t) +ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_amd_memory_pool_t) #undef ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER } // namespace std diff --git a/source/include/rocprofiler-sdk/cxx/operators.hpp b/source/include/rocprofiler-sdk/cxx/operators.hpp index dd8f59a71b..b988781567 100644 --- a/source/include/rocprofiler-sdk/cxx/operators.hpp +++ b/source/include/rocprofiler-sdk/cxx/operators.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,8 @@ ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_signal_t) ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_executable_t) ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_agent_v0_t&) ROCPROFILER_CXX_DECLARE_OPERATORS(rocprofiler_dim3_t) +ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_region_t) +ROCPROFILER_CXX_DECLARE_OPERATORS(hsa_amd_memory_pool_t) // definitions of operator== ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_context_id_t) @@ 
-116,6 +119,8 @@ ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_callback_thread_t) ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_agent_t) ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_signal_t) ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_executable_t) +ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_region_t) +ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_amd_memory_pool_t) inline bool operator==(const rocprofiler_agent_v0_t& lhs, const rocprofiler_agent_v0_t& rhs) @@ -142,6 +147,8 @@ ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_signal_t) ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_executable_t) ROCPROFILER_CXX_DEFINE_NE_OPERATOR(const rocprofiler_agent_v0_t&) ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_dim3_t) +ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_region_t) +ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_amd_memory_pool_t) // definitions of operator< ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_context_id_t) @@ -154,6 +161,8 @@ ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_callback_thread_t) ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_agent_t) ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_signal_t) ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_executable_t) +ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_region_t) +ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_amd_memory_pool_t) inline bool operator<(const rocprofiler_agent_v0_t& lhs, const rocprofiler_agent_v0_t& rhs) @@ -185,6 +194,8 @@ ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_signal_t) ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_executable_t) ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(const rocprofiler_agent_v0_t&) ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_dim3_t) +ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_region_t) +ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_amd_memory_pool_t) // cleanup defines #undef ROCPROFILER_CXX_DECLARE_OPERATORS diff --git a/source/include/rocprofiler-sdk/cxx/perfetto.hpp b/source/include/rocprofiler-sdk/cxx/perfetto.hpp index 2e39ef7daf..3a620a4667 100644 
--- a/source/include/rocprofiler-sdk/cxx/perfetto.hpp +++ b/source/include/rocprofiler-sdk/cxx/perfetto.hpp @@ -80,6 +80,7 @@ ROCPROFILER_DEFINE_CATEGORY(category, marker_api, "Marker API region") ROCPROFILER_DEFINE_CATEGORY(category, rccl_api, "RCCL API function") ROCPROFILER_DEFINE_CATEGORY(category, kernel_dispatch, "GPU kernel dispatch") ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy") +ROCPROFILER_DEFINE_CATEGORY(category, memory_allocation, "Memory Allocation") #define ROCPROFILER_PERFETTO_CATEGORIES \ ROCPROFILER_PERFETTO_CATEGORY(category::hsa_api), \ @@ -87,7 +88,8 @@ ROCPROFILER_DEFINE_CATEGORY(category, memory_copy, "Async memory copy") ROCPROFILER_PERFETTO_CATEGORY(category::marker_api), \ ROCPROFILER_PERFETTO_CATEGORY(category::rccl_api), \ ROCPROFILER_PERFETTO_CATEGORY(category::kernel_dispatch), \ - ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy) + ROCPROFILER_PERFETTO_CATEGORY(category::memory_copy), \ + ROCPROFILER_PERFETTO_CATEGORY(category::memory_allocation) #include diff --git a/source/include/rocprofiler-sdk/cxx/serialization.hpp b/source/include/rocprofiler-sdk/cxx/serialization.hpp index a8cdac371c..e8034fbddf 100644 --- a/source/include/rocprofiler-sdk/cxx/serialization.hpp +++ b/source/include/rocprofiler-sdk/cxx/serialization.hpp @@ -330,6 +330,18 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_memory_copy_data_t data) ROCP_SDK_SAVE_DATA_FIELD(bytes); } +template +void +save(ArchiveT& ar, rocprofiler_callback_tracing_memory_allocation_data_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); + ROCP_SDK_SAVE_DATA_FIELD(starting_address); + ROCP_SDK_SAVE_DATA_FIELD(allocation_size); +} + template void save(ArchiveT& ar, rocprofiler_rccl_api_retval_t data) @@ -461,6 +473,22 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data) ROCP_SDK_SAVE_DATA_FIELD(bytes); } +template 
+void +save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_allocation_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(kind); + ROCP_SDK_SAVE_DATA_FIELD(operation); + ROCP_SDK_SAVE_DATA_FIELD(thread_id); + ROCP_SDK_SAVE_DATA_FIELD(correlation_id); + ROCP_SDK_SAVE_DATA_FIELD(start_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(end_timestamp); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); + ROCP_SDK_SAVE_DATA_FIELD(starting_address); + ROCP_SDK_SAVE_DATA_FIELD(allocation_size); +} + template void save(ArchiveT& ar, const rocprofiler_page_migration_page_fault_start_t& data) diff --git a/source/include/rocprofiler-sdk/external_correlation.h b/source/include/rocprofiler-sdk/external_correlation.h index 755d31998c..9602e25b76 100644 --- a/source/include/rocprofiler-sdk/external_correlation.h +++ b/source/include/rocprofiler-sdk/external_correlation.h @@ -68,6 +68,7 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_SCRATCH_MEMORY, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_RCCL_API, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_OPENMP, ///< + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION, ///< ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_LAST, } rocprofiler_external_correlation_id_request_kind_t; diff --git a/source/include/rocprofiler-sdk/fwd.h b/source/include/rocprofiler-sdk/fwd.h index 1c2297b081..a4453a7ed8 100644 --- a/source/include/rocprofiler-sdk/fwd.h +++ b/source/include/rocprofiler-sdk/fwd.h @@ -167,10 +167,12 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, ///< @see ::rocprofiler_marker_name_api_id_t ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t - ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches - ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see 
::rocprofiler_memory_copy_operation_t - ROCPROFILER_CALLBACK_TRACING_RCCL_API, ///< @RCCL tracing - ROCPROFILER_CALLBACK_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t + ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ///< @see ::rocprofiler_memory_copy_operation_t + ROCPROFILER_CALLBACK_TRACING_RCCL_API, ///< @RCCL tracing + ROCPROFILER_CALLBACK_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, ///< @see + ///< ::rocprofiler_memory_allocation_operation_t ROCPROFILER_CALLBACK_TRACING_LAST, } rocprofiler_callback_tracing_kind_t; @@ -197,6 +199,8 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT, ///< Correlation ID in no longer in use ROCPROFILER_BUFFER_TRACING_RCCL_API, ///< RCCL tracing ROCPROFILER_BUFFER_TRACING_OPENMP, ///< @see ::rocprofiler_ompt_operation_t + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, ///< @see + ///< ::rocprofiler_memory_allocation_operation_t ROCPROFILER_BUFFER_TRACING_LAST, } rocprofiler_buffer_tracing_kind_t; @@ -224,6 +228,18 @@ typedef enum // NOLINT(performance-enum-size) ROCPROFILER_MEMORY_COPY_LAST, } rocprofiler_memory_copy_operation_t; +/** + * @brief Memory Allocation Operation. + */ +typedef enum // NOLINT(performance-enum-size) +{ + ROCPROFILER_MEMORY_ALLOCATION_NONE = 0, ///< Unknown memory allocation function + ROCPROFILER_MEMORY_ALLOCATION_ALLOCATE, ///< Allocate memory function + ROCPROFILER_MEMORY_ALLOCATION_MEMORY_POOL_ALLOCATE, ///< Allocate memory pool + ROCPROFILER_MEMORY_ALLOCATION_VMEM_HANDLE_CREATE, ///< Allocate vmem memory handle + ROCPROFILER_MEMORY_ALLOCATION_LAST, +} rocprofiler_memory_allocation_operation_t; + /** * @brief ROCProfiler Kernel Dispatch Tracing Operation Types. 
*/ diff --git a/source/lib/output/buffered_output.hpp b/source/lib/output/buffered_output.hpp index 7245e03dc5..8a495f5f3e 100644 --- a/source/lib/output/buffered_output.hpp +++ b/source/lib/output/buffered_output.hpp @@ -153,5 +153,8 @@ using counter_collection_buffered_output_t = using scratch_memory_buffered_output_t = buffered_output; +using memory_allocation_buffered_output_t = + buffered_output; } // namespace tool } // namespace rocprofiler diff --git a/source/lib/output/csv.hpp b/source/lib/output/csv.hpp index 64b5f12b05..f66a2561f6 100644 --- a/source/lib/output/csv.hpp +++ b/source/lib/output/csv.hpp @@ -104,6 +104,7 @@ using agent_info_csv_encoder = csv_encoder<53>; using kernel_trace_csv_encoder = csv_encoder<18>; using counter_collection_csv_encoder = csv_encoder<18>; using memory_copy_csv_encoder = csv_encoder<7>; +using memory_allocation_csv_encoder = csv_encoder<8>; using marker_csv_encoder = csv_encoder<7>; using list_basic_metrics_csv_encoder = csv_encoder<5>; using list_derived_metrics_csv_encoder = csv_encoder<5>; diff --git a/source/lib/output/domain_type.cpp b/source/lib/output/domain_type.cpp index d2fa6692d6..e903e15605 100644 --- a/source/lib/output/domain_type.cpp +++ b/source/lib/output/domain_type.cpp @@ -52,6 +52,10 @@ DEFINE_BUFFER_TYPE_NAME(COUNTER_COLLECTION, "counter_collection", "counter_collection_stats") DEFINE_BUFFER_TYPE_NAME(RCCL, "RCCL_API", "rccl_api_trace", "rccl_api_stats") +DEFINE_BUFFER_TYPE_NAME(MEMORY_ALLOCATION, + "MEMORY_ALLOCATION", + "memory_allocation", + "memory_allocation_stats") #undef DEFINE_BUFFER_TYPE_NAME diff --git a/source/lib/output/domain_type.hpp b/source/lib/output/domain_type.hpp index 25780b8bd3..7c23eed41c 100644 --- a/source/lib/output/domain_type.hpp +++ b/source/lib/output/domain_type.hpp @@ -34,6 +34,7 @@ enum class domain_type SCRATCH_MEMORY, COUNTER_COLLECTION, RCCL, + MEMORY_ALLOCATION, LAST, }; diff --git a/source/lib/output/generateCSV.cpp b/source/lib/output/generateCSV.cpp index 
7f588c78f2..d12167839b 100644 --- a/source/lib/output/generateCSV.cpp +++ b/source/lib/output/generateCSV.cpp @@ -437,6 +437,50 @@ generate_csv(const output_config& c } } +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats) +{ + if(data.empty()) return; + + if(cfg.stats && stats) + write_stats(get_stats_output_file(cfg, domain_type::MEMORY_ALLOCATION), stats.entries); + + auto ofs = tool::csv_output_file{cfg, + domain_type::MEMORY_ALLOCATION, + tool::csv::memory_allocation_csv_encoder{}, + {"Kind", + "Operation", + "Agent_Id", + "Allocation_Size", + "Starting_Address", + "Correlation_Id", + "Start_Timestamp", + "End_Timestamp"}}; + for(auto ditr : data) + { + for(auto record : data.get(ditr)) + { + auto api_name = tool_metadata.get_operation_name(record.kind, record.operation); + auto row_ss = std::stringstream{}; + rocprofiler::tool::csv::memory_allocation_csv_encoder::write_row( + row_ss, + tool_metadata.get_kind_name(record.kind), + api_name, + tool_metadata.get_node_id(record.agent_id), + record.allocation_size, + record.starting_address, + record.correlation_id.internal, + record.start_timestamp, + record.end_timestamp); + + ofs << row_ss.str(); + } + } +} + void generate_csv(const output_config& cfg, const metadata& tool_metadata, diff --git a/source/lib/output/generateCSV.hpp b/source/lib/output/generateCSV.hpp index 20beb36a53..c38f2d42f7 100644 --- a/source/lib/output/generateCSV.hpp +++ b/source/lib/output/generateCSV.hpp @@ -87,6 +87,12 @@ generate_csv(const output_config& cfg, const generator& data, const stats_entry_t& stats); +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); + void generate_csv(const output_config& cfg, const metadata& tool_metadata, diff --git a/source/lib/output/generateJSON.cpp b/source/lib/output/generateJSON.cpp index 98da33b4a0..e1dbac5b96 100644 --- 
a/source/lib/output/generateJSON.cpp +++ b/source/lib/output/generateJSON.cpp @@ -169,15 +169,16 @@ void write_json(json_output& json_ar, const output_config& /*cfg*/, const metadata& /*tool_metadata*/, - const domain_stats_vec_t& domain_stats, - generator&& hip_api_gen, - generator hsa_api_gen, - generator kernel_dispatch_gen, - generator memory_copy_gen, - generator counter_collection_gen, - generator marker_api_gen, - generator scratch_memory_gen, - generator rccl_api_gen) + const domain_stats_vec_t& domain_stats, + generator&& hip_api_gen, + generator hsa_api_gen, + generator kernel_dispatch_gen, + generator memory_copy_gen, + generator counter_collection_gen, + generator marker_api_gen, + generator scratch_memory_gen, + generator rccl_api_gen, + generator memory_allocation_gen) { // summary @@ -216,6 +217,7 @@ write_json(json_output& json_ar, json_ar(cereal::make_nvp("marker_api", marker_api_gen)); json_ar(cereal::make_nvp("rccl_api", rccl_api_gen)); json_ar(cereal::make_nvp("memory_copy", memory_copy_gen)); + json_ar(cereal::make_nvp("memory_allocation", memory_allocation_gen)); json_ar(cereal::make_nvp("scratch_memory", scratch_memory_gen)); json_ar.finishNode(); } diff --git a/source/lib/output/generateJSON.hpp b/source/lib/output/generateJSON.hpp index b2d18a2e74..03e1daff74 100644 --- a/source/lib/output/generateJSON.hpp +++ b/source/lib/output/generateJSON.hpp @@ -81,17 +81,18 @@ void write_json(json_output&, const output_config& cfg, const metadata& tool_metadata, uint64_t pid); void -write_json(json_output& json_ar, - const output_config& cfg, - const metadata& tool_metadata, - const domain_stats_vec_t& domain_stats, - generator&& hip_api_gen, - generator hsa_api_gen, - generator kernel_dispatch_gen, - generator memory_copy_gen, - generator counter_collection_gen, - generator marker_api_gen, - generator scratch_memory_gen, - generator rccl_api_gen); +write_json(json_output& json_ar, + const output_config& cfg, + const metadata& tool_metadata, + const 
domain_stats_vec_t& domain_stats, + generator&& hip_api_gen, + generator hsa_api_gen, + generator kernel_dispatch_gen, + generator memory_copy_gen, + generator counter_collection_gen, + generator marker_api_gen, + generator scratch_memory_gen, + generator rccl_api_gen, + generator memory_allocation_gen); } // namespace tool } // namespace rocprofiler diff --git a/source/lib/output/generateOTF2.cpp b/source/lib/output/generateOTF2.cpp index 62af356bef..7f9839de4d 100644 --- a/source/lib/output/generateOTF2.cpp +++ b/source/lib/output/generateOTF2.cpp @@ -127,43 +127,55 @@ auto main_tid = common::get_tid(); archive_t* archive = nullptr; auto flush_callbacks = OTF2_FlushCallbacks{pre_flush, post_flush}; +enum rocprofiler_location_type_t +{ + ROCPROFILER_AGENT_NO_TYPE = 0, + ROCPROFILER_AGENT_MEMORY_COPY_TYPE, + ROCPROFILER_AGENT_DISPATCH_TYPE, + ROCPROFILER_AGENT_MEMORY_ALLOC_TYPE +}; + struct location_base { - uint64_t pid = 0; - rocprofiler_thread_id_t tid = 0; - rocprofiler_agent_id_t agent = {.handle = 0}; - rocprofiler_queue_id_t queue = {.handle = 0}; + uint64_t pid = 0; + rocprofiler_thread_id_t tid = 0; + rocprofiler_agent_id_t agent = {.handle = 0}; + rocprofiler_queue_id_t queue = {.handle = 0}; + rocprofiler_location_type_t type = ROCPROFILER_AGENT_NO_TYPE; - location_base(uint64_t _pid, - rocprofiler_thread_id_t _tid, - rocprofiler_agent_id_t _agent = {.handle = 0}, - rocprofiler_queue_id_t _queue = {.handle = 0}) + location_base(uint64_t _pid, + rocprofiler_thread_id_t _tid, + rocprofiler_agent_id_t _agent = {.handle = 0}, + rocprofiler_location_type_t _type = ROCPROFILER_AGENT_NO_TYPE, + rocprofiler_queue_id_t _queue = {.handle = 0}) : pid{_pid} , tid{_tid} , agent{_agent} , queue{_queue} + , type{_type} {} auto hash() const { - return array_hash{}(pid, tid, agent.handle + 1, queue.handle + 1); + return array_hash{}(pid, tid, agent.handle + 1, queue.handle + 1, type); } }; bool operator<(const location_base& lhs, const location_base& rhs) { - return 
std::tie(lhs.pid, lhs.tid, lhs.agent.handle, lhs.queue.handle) < - std::tie(rhs.pid, rhs.tid, rhs.agent.handle, rhs.queue.handle); + return std::tie(lhs.pid, lhs.tid, lhs.agent.handle, lhs.queue.handle, lhs.type) < + std::tie(rhs.pid, rhs.tid, rhs.agent.handle, rhs.queue.handle, rhs.type); } struct location_data : location_base { - location_data(uint64_t _pid, - rocprofiler_thread_id_t _tid, - rocprofiler_agent_id_t _agent = {.handle = 0}, - rocprofiler_queue_id_t _queue = {.handle = 0}) - : location_base{_pid, _tid, _agent, _queue} + location_data(uint64_t _pid, + rocprofiler_thread_id_t _tid, + rocprofiler_agent_id_t _agent = {.handle = 0}, + rocprofiler_location_type_t _type = ROCPROFILER_AGENT_NO_TYPE, + rocprofiler_queue_id_t _queue = {.handle = 0}) + : location_base{_pid, _tid, _agent, _type, _queue} , index{++index_counter} , event_writer{OTF2_Archive_GetEvtWriter(CHECK_NOTNULL(archive), index)} { @@ -206,7 +218,7 @@ get_location(const location_base& _location, bool _init = false) if(_init) return get_locations() .emplace_back(std::make_unique( - _location.pid, _location.tid, _location.agent, _location.queue)) + _location.pid, _location.tid, _location.agent, _location.type, _location.queue)) .get(); return nullptr; @@ -299,7 +311,7 @@ setup(const output_config& cfg) OTF2_CHECK(OTF2_Pthread_Archive_SetLockingCallbacks(archive, nullptr)); OTF2_CHECK(OTF2_Archive_OpenEvtFiles(archive)); - ROCP_ERROR << "Opened result file: " << _filename << ".oft2"; + ROCP_ERROR << "Opened result file: " << _filename << ".otf2"; } void @@ -343,17 +355,19 @@ create_attribute_list() } // namespace void -write_otf2(const output_config& cfg, - const metadata& tool_metadata, - uint64_t pid, - const std::vector& agent_data, - std::deque* hip_api_data, - std::deque* hsa_api_data, - std::deque* kernel_dispatch_data, - std::deque* memory_copy_data, - std::deque* marker_api_data, - std::deque* /*scratch_memory_data*/, - std::deque* rccl_api_data) +write_otf2( + const output_config& cfg, 
+ const metadata& tool_metadata, + uint64_t pid, + const std::vector& agent_data, + std::deque* hip_api_data, + std::deque* hsa_api_data, + std::deque* kernel_dispatch_data, + std::deque* memory_copy_data, + std::deque* marker_api_data, + std::deque* /*scratch_memory_data*/, + std::deque* rccl_api_data, + std::deque* memory_allocation_data) { namespace sdk = ::rocprofiler::sdk; @@ -366,6 +380,8 @@ write_otf2(const output_config& cfg, const auto& buffer_names = tool_metadata.buffer_names; auto tids = std::set{}; auto agent_thread_ids = std::map>{}; + auto agent_thread_ids_alloc = + std::map>{}; auto agent_queue_ids = std::map>>{}; @@ -373,6 +389,8 @@ write_otf2(const output_config& cfg, auto thread_event_info = std::map{}; auto agent_memcpy_info = std::map>{}; + auto agent_memalloc_info = + std::map>{}; auto agent_dispatch_info = std::map>>{}; @@ -407,6 +425,12 @@ write_otf2(const output_config& cfg, agent_thread_ids[itr.thread_id].emplace(itr.dst_agent_id); } + for(auto itr : *memory_allocation_data) + { + tids.emplace(itr.thread_id); + agent_thread_ids_alloc[itr.thread_id].emplace(itr.agent_id); + } + for(auto itr : *kernel_dispatch_data) { tids.emplace(itr.thread_id); @@ -421,13 +445,20 @@ write_otf2(const output_config& cfg, for(const auto& [tid, itr] : agent_thread_ids) for(auto agent : itr) - agent_memcpy_info[tid].emplace(agent, location_base{pid, tid, agent}); + agent_memcpy_info[tid].emplace( + agent, location_base{pid, tid, agent, ROCPROFILER_AGENT_MEMORY_COPY_TYPE}); + + for(const auto& [tid, itr] : agent_thread_ids_alloc) + for(auto agent : itr) + agent_memalloc_info[tid].emplace( + agent, location_base{pid, tid, agent, ROCPROFILER_AGENT_MEMORY_ALLOC_TYPE}); for(const auto& [tid, itr] : agent_queue_ids) for(const auto& [agent, qitr] : itr) for(auto queue : qitr) - agent_dispatch_info[tid][agent].emplace(queue, - location_base{pid, tid, agent, queue}); + agent_dispatch_info[tid][agent].emplace( + queue, + location_base{pid, tid, agent, 
ROCPROFILER_AGENT_DISPATCH_TYPE, queue}); } for(auto& [tid, evt] : thread_event_info) @@ -451,6 +482,24 @@ write_otf2(const output_config& cfg, } } + for(auto& [tid, itr] : agent_memalloc_info) + { + for(auto& [agent, evt] : itr) + { + const auto* _agent = _get_agent(agent); + auto _type_name = std::string_view{"UNK"}; + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _type_name = "CPU"; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _type_name = "GPU"; + + evt.name = fmt::format("Thread {}, Memory Allocation at {} {}", + tid, + _type_name, + _agent->logical_node_type_id); + } + } + auto _queue_ids = std::map{}; for(auto& [tid, itr] : agent_dispatch_info) for(auto& [agent, qitr] : itr) @@ -583,6 +632,30 @@ write_otf2(const output_config& cfg, nullptr}); } + for(auto itr : *memory_allocation_data) + { + auto name = buffer_names.at(itr.kind, itr.operation); + _hash_data.emplace( + get_hash_id(name), + region_info{std::string{name}, OTF2_REGION_ROLE_ALLOCATE, OTF2_PARADIGM_HIP}); + + // TODO: add attributes for memory allocation parameters + + auto& _evt_info = agent_memalloc_info.at(itr.thread_id).at(itr.agent_id); + _evt_info.event_count += 1; + + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_ENTER, + name, + _evt_info.get_location(), + itr.start_timestamp, + get_attr(sdk::category::memory_allocation{})}); + _data.emplace_back(evt_data{ROCPROFILER_CALLBACK_PHASE_EXIT, + name, + _evt_info.get_location(), + itr.end_timestamp, + nullptr}); + } + for(auto itr : *kernel_dispatch_data) { const auto& info = itr.dispatch_info; @@ -781,6 +854,24 @@ write_otf2(const output_config& cfg, } } + // Memalloc Events + for(auto& [tid, itr] : agent_memalloc_info) + { + for(auto& [agent, evt] : itr) + { + auto _hash = get_hash_id(evt.name); + + add_write_string(_hash, evt.name); + OTF2_CHECK(OTF2_GlobalDefWriter_WriteLocation(global_def_writer, + evt.id(), // id + _hash, + OTF2_LOCATION_TYPE_ACCELERATOR_STREAM, + 2 * evt.event_count, // # events + agent.handle // 
location group + )); + } + } + // Dispatch Events for(auto& [tid, itr] : agent_dispatch_info) { diff --git a/source/lib/output/generateOTF2.hpp b/source/lib/output/generateOTF2.hpp index a4d19b8486..1b47b4c73b 100644 --- a/source/lib/output/generateOTF2.hpp +++ b/source/lib/output/generateOTF2.hpp @@ -34,16 +34,18 @@ namespace rocprofiler namespace tool { void -write_otf2(const output_config& cfg, - const metadata& tool_metadata, - uint64_t pid, - const std::vector& agent_data, - std::deque* hip_api_data, - std::deque* hsa_api_data, - std::deque* kernel_dispatch_data, - std::deque* memory_copy_data, - std::deque* marker_api_data, - std::deque* scratch_memory_data, - std::deque* rccl_api_data); +write_otf2( + const output_config& cfg, + const metadata& tool_metadata, + uint64_t pid, + const std::vector& agent_data, + std::deque* hip_api_data, + std::deque* hsa_api_data, + std::deque* kernel_dispatch_data, + std::deque* memory_copy_data, + std::deque* marker_api_data, + std::deque* scratch_memory_data, + std::deque* rccl_api_data, + std::deque* memory_allocation_data); } // namespace tool } // namespace rocprofiler diff --git a/source/lib/output/generatePerfetto.cpp b/source/lib/output/generatePerfetto.cpp index 0e184c91f6..fa16fe65ed 100644 --- a/source/lib/output/generatePerfetto.cpp +++ b/source/lib/output/generatePerfetto.cpp @@ -71,7 +71,8 @@ write_perfetto( const generator& memory_copy_gen, const generator& marker_api_gen, const generator& /*scratch_memory_gen*/, - const generator& rccl_api_gen) + const generator& rccl_api_gen, + const generator& memory_allocation_gen) { namespace sdk = ::rocprofiler::sdk; @@ -127,9 +128,10 @@ write_perfetto( tracing_session->Setup(cfg); tracing_session->StartBlocking(); - auto tids = std::set{}; - auto demangled = std::unordered_map{}; - auto agent_thread_ids = std::unordered_map>{}; + auto tids = std::set{}; + auto demangled = std::unordered_map{}; + auto agent_thread_ids = std::unordered_map>{}; + auto agent_thread_ids_alloc 
= std::unordered_map>{}; auto agent_queue_ids = std::unordered_map>{}; auto thread_indexes = std::unordered_map{}; @@ -138,6 +140,9 @@ write_perfetto( auto agent_thread_tracks = std::unordered_map>{}; + auto agent_thread_tracks_alloc = + std::unordered_map>{}; auto agent_queue_tracks = std::unordered_map>{}; @@ -171,6 +176,13 @@ write_perfetto( agent_thread_ids[itr.dst_agent_id].emplace(itr.thread_id); } + for(auto ditr : memory_allocation_gen) + for(auto itr : memory_allocation_gen.get(ditr)) + { + tids.emplace(itr.thread_id); + agent_thread_ids_alloc[itr.agent_id].emplace(itr.thread_id); + } + for(auto ditr : kernel_dispatch_gen) for(auto itr : kernel_dispatch_gen.get(ditr)) { @@ -229,6 +241,33 @@ write_perfetto( } } + for(const auto& itr : agent_thread_ids_alloc) + { + const auto* _agent = _get_agent(itr.first); + + for(auto titr : itr.second) + { + auto _namess = std::stringstream{}; + _namess << "MEMORY ALLOCATION on AGENT [" << _agent->logical_node_id << "] THREAD [" + << thread_indexes.at(titr) << "] "; + + if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) + _namess << "(CPU)"; + else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) + _namess << "(GPU)"; + else + _namess << "(UNK)"; + + auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; + auto _desc = _track.Serialize(); + _desc.set_name(_namess.str()); + + perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); + + agent_thread_tracks_alloc[itr.first].emplace(titr, _track); + } + } + for(const auto& aitr : agent_queue_ids) { uint32_t nqueue = 0; @@ -424,6 +463,47 @@ write_perfetto( tracing_session->FlushBlocking(); } + for(auto ditr : memory_allocation_gen) + for(auto itr : memory_allocation_gen.get(ditr)) + { + auto name = buffer_names.at(itr.kind, itr.operation); + auto& track = agent_thread_tracks_alloc.at(itr.agent_id).at(itr.thread_id); + std::stringstream hex_stream; + hex_stream << "0x" << std::hex << std::setw(16) << std::setfill('0') + << itr.starting_address; + std::string 
hex_starting_address(hex_stream.str()); + + TRACE_EVENT_BEGIN(sdk::perfetto_category::name, + ::perfetto::StaticString(name.data()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "end_ns", + itr.end_timestamp, + "delta_ns", + (itr.end_timestamp - itr.start_timestamp), + "kind", + itr.kind, + "operation", + itr.operation, + "agent", + agents_map.at(itr.agent_id).logical_node_id, + "allocation_size", + itr.allocation_size, + "starting_address", + hex_starting_address, + "corr_id", + itr.correlation_id.internal, + "tid", + itr.thread_id); + TRACE_EVENT_END(sdk::perfetto_category::name, + track, + itr.end_timestamp); + tracing_session->FlushBlocking(); + } + for(auto ditr : kernel_dispatch_gen) for(auto itr : kernel_dispatch_gen.get(ditr)) { diff --git a/source/lib/output/generatePerfetto.hpp b/source/lib/output/generatePerfetto.hpp index 0417416b79..9074dd6b3b 100644 --- a/source/lib/output/generatePerfetto.hpp +++ b/source/lib/output/generatePerfetto.hpp @@ -36,15 +36,16 @@ namespace tool { void write_perfetto( - const output_config& cfg, - const metadata& tool_metadata, - std::vector agent_data, - const generator& hip_api_gen, - const generator& hsa_api_gen, - const generator& kernel_dispatch_gen, - const generator& memory_copy_gen, - const generator& marker_api_gen, - const generator& scratch_memory_gen, - const generator& rccl_api_gen); + const output_config& cfg, + const metadata& tool_metadata, + std::vector agent_data, + const generator& hip_api_gen, + const generator& hsa_api_gen, + const generator& kernel_dispatch_gen, + const generator& memory_copy_gen, + const generator& marker_api_gen, + const generator& scratch_memory_gen, + const generator& rccl_api_gen, + const generator& memory_allocation_gen); } // namespace tool } // namespace rocprofiler diff --git a/source/lib/output/generateStats.cpp b/source/lib/output/generateStats.cpp index 7e55f3648d..50a7d338e6 100644 --- 
a/source/lib/output/generateStats.cpp +++ b/source/lib/output/generateStats.cpp @@ -166,6 +166,24 @@ generate_stats(const output_config& /*cfg*/, return get_stats(marker_stats); } +stats_entry_t +generate_stats(const output_config& /*cfg*/, + const metadata& tool_metadata, + const generator& data) +{ + auto memory_allocation_stats = stats_map_t{}; + for(auto ditr : data) + { + for(auto record : data.get(ditr)) + { + auto api_name = tool_metadata.get_operation_name(record.kind, record.operation); + memory_allocation_stats[api_name] += (record.end_timestamp - record.start_timestamp); + } + } + + return get_stats(memory_allocation_stats); +} + stats_entry_t generate_stats(const output_config& /*cfg*/, const metadata& /*tool_metadata*/, diff --git a/source/lib/output/generateStats.hpp b/source/lib/output/generateStats.hpp index bb46c31570..06f1597149 100644 --- a/source/lib/output/generateStats.hpp +++ b/source/lib/output/generateStats.hpp @@ -70,6 +70,11 @@ generate_stats(const output_config& cf const metadata& tool_metadata, const generator& data); +stats_entry_t +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); + void generate_stats(const output_config& cfg, const metadata& tool_metadata, diff --git a/source/lib/rocprofiler-sdk-tool/config.hpp b/source/lib/rocprofiler-sdk-tool/config.hpp index 2785f2154d..a2abd29750 100644 --- a/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/source/lib/rocprofiler-sdk-tool/config.hpp @@ -84,6 +84,7 @@ struct config : output_config bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false); bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false); bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false); + bool memory_allocation_trace = get_env("ROCPROF_MEMORY_ALLOCATION_TRACE", false); bool scratch_memory_trace = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false); bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", 
false); bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false); @@ -123,6 +124,7 @@ config::save(ArchiveT& ar) const CFG_SERIALIZE_MEMBER(hsa_finalizer_ext_api_trace); CFG_SERIALIZE_MEMBER(marker_api_trace); CFG_SERIALIZE_MEMBER(memory_copy_trace); + CFG_SERIALIZE_MEMBER(memory_allocation_trace); CFG_SERIALIZE_MEMBER(scratch_memory_trace); CFG_SERIALIZE_MEMBER(counter_collection); CFG_SERIALIZE_MEMBER(hip_runtime_api_trace); diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index ca1f797114..582b52aeb5 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -148,20 +148,22 @@ get_list_derived_metrics_file() struct buffer_ids { - rocprofiler_buffer_id_t hsa_api_trace = {}; - rocprofiler_buffer_id_t hip_api_trace = {}; - rocprofiler_buffer_id_t kernel_trace = {}; - rocprofiler_buffer_id_t memory_copy_trace = {}; - rocprofiler_buffer_id_t counter_collection = {}; - rocprofiler_buffer_id_t scratch_memory = {}; - rocprofiler_buffer_id_t rccl_api_trace = {}; + rocprofiler_buffer_id_t hsa_api_trace = {}; + rocprofiler_buffer_id_t hip_api_trace = {}; + rocprofiler_buffer_id_t kernel_trace = {}; + rocprofiler_buffer_id_t memory_copy_trace = {}; + rocprofiler_buffer_id_t memory_allocation_trace = {}; + rocprofiler_buffer_id_t counter_collection = {}; + rocprofiler_buffer_id_t scratch_memory = {}; + rocprofiler_buffer_id_t rccl_api_trace = {}; auto as_array() const { - return std::array{hsa_api_trace, + return std::array{hsa_api_trace, hip_api_trace, kernel_trace, memory_copy_trace, + memory_allocation_trace, counter_collection, scratch_memory, rccl_api_trace}; @@ -635,6 +637,13 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, tool::write_ring_buffer(*record, domain_type::MEMORY_COPY); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) + { + auto* record = static_cast( + header->payload); + + 
tool::write_ring_buffer(*record, domain_type::MEMORY_ALLOCATION); + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) { auto* record = static_cast( @@ -1110,6 +1119,26 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "buffer tracing service for memory copy configure"); } + if(tool::get_config().memory_allocation_trace) + { + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + buffer_size, + buffer_watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + buffered_tracing_callback, + nullptr, + &get_buffers().memory_allocation_trace), + "create memory allocation buffer"); + + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + get_client_ctx(), + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + nullptr, + 0, + get_buffers().memory_allocation_trace), + "buffer tracing service for memory allocation configure"); + } + if(tool::get_config().scratch_memory_trace) { ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), @@ -1357,6 +1386,8 @@ tool_fini(void* /*tool_data*/) auto scratch_memory_output = tool::scratch_memory_buffered_output_t{tool::get_config().scratch_memory_trace}; auto rccl_output = tool::rccl_buffered_output_t{tool::get_config().rccl_api_trace}; + auto memory_allocation_output = + tool::memory_allocation_buffered_output_t{tool::get_config().memory_allocation_trace}; auto node_id_sort = [](const auto& lhs, const auto& rhs) { return lhs.node_id < rhs.node_id; }; @@ -1374,6 +1405,7 @@ tool_fini(void* /*tool_data*/) generate_output(hsa_output, contributions); generate_output(hip_output, contributions); generate_output(memory_copy_output, contributions); + generate_output(memory_allocation_output, contributions); generate_output(marker_output, contributions); generate_output(rccl_output, contributions); generate_output(counters_output, contributions); @@ -1401,7 +1433,8 @@ tool_fini(void* /*tool_data*/) counters_output.get_generator(), marker_output.get_generator(), 
scratch_memory_output.get_generator(), - rccl_output.get_generator()); + rccl_output.get_generator(), + memory_allocation_output.get_generator()); json_ar.finish_process(); tool::close_json(json_ar); @@ -1418,18 +1451,20 @@ tool_fini(void* /*tool_data*/) memory_copy_output.get_generator(), marker_output.get_generator(), scratch_memory_output.get_generator(), - rccl_output.get_generator()); + rccl_output.get_generator(), + memory_allocation_output.get_generator()); } if(tool::get_config().otf2_output) { - auto hip_elem_data = hip_output.load_all(); - auto hsa_elem_data = hsa_output.load_all(); - auto kernel_dispatch_elem_data = kernel_dispatch_output.load_all(); - auto memory_copy_elem_data = memory_copy_output.load_all(); - auto marker_elem_data = marker_output.load_all(); - auto scratch_memory_elem_data = scratch_memory_output.load_all(); - auto rccl_elem_data = rccl_output.load_all(); + auto hip_elem_data = hip_output.load_all(); + auto hsa_elem_data = hsa_output.load_all(); + auto kernel_dispatch_elem_data = kernel_dispatch_output.load_all(); + auto memory_copy_elem_data = memory_copy_output.load_all(); + auto marker_elem_data = marker_output.load_all(); + auto scratch_memory_elem_data = scratch_memory_output.load_all(); + auto rccl_elem_data = rccl_output.load_all(); + auto memory_allocation_elem_data = memory_allocation_output.load_all(); tool::write_otf2(tool::get_config(), *tool_metadata, @@ -1441,7 +1476,8 @@ tool_fini(void* /*tool_data*/) &memory_copy_elem_data, &marker_elem_data, &scratch_memory_elem_data, - &rccl_elem_data); + &rccl_elem_data, + &memory_allocation_elem_data); } if(tool::get_config().summary_output) @@ -1455,6 +1491,7 @@ tool_fini(void* /*tool_data*/) destroy_output(hsa_output); destroy_output(hip_output); destroy_output(memory_copy_output); + destroy_output(memory_allocation_output); destroy_output(marker_output); destroy_output(counters_output); destroy_output(scratch_memory_output); diff --git 
a/source/lib/rocprofiler-sdk/buffer_tracing.cpp b/source/lib/rocprofiler-sdk/buffer_tracing.cpp index 14dc73d034..52ea8d43da 100644 --- a/source/lib/rocprofiler-sdk/buffer_tracing.cpp +++ b/source/lib/rocprofiler-sdk/buffer_tracing.cpp @@ -33,6 +33,7 @@ #include "lib/rocprofiler-sdk/hip/hip.hpp" #include "lib/rocprofiler-sdk/hsa/async_copy.hpp" #include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp" #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" @@ -80,6 +81,7 @@ ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_CORE_API) ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_CONTROL_API) ROCPROFILER_BUFFER_TRACING_KIND_STRING(MARKER_NAME_API) ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_COPY) +ROCPROFILER_BUFFER_TRACING_KIND_STRING(MEMORY_ALLOCATION) ROCPROFILER_BUFFER_TRACING_KIND_STRING(KERNEL_DISPATCH) ROCPROFILER_BUFFER_TRACING_KIND_STRING(PAGE_MIGRATION) ROCPROFILER_BUFFER_TRACING_KIND_STRING(SCRATCH_MEMORY) @@ -218,6 +220,11 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_ val = rocprofiler::hsa::async_copy::name_by_id(operation); break; } + case ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION: + { + val = rocprofiler::hsa::memory_allocation::name_by_id(operation); + break; + } case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: { val = rocprofiler::hsa::scratch_memory::name_by_id(operation); @@ -339,6 +346,11 @@ rocprofiler_iterate_buffer_tracing_kind_operations( ops = rocprofiler::hsa::async_copy::get_ids(); break; } + case ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION: + { + ops = rocprofiler::hsa::memory_allocation::get_ids(); + break; + } case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY: { ops = rocprofiler::hsa::scratch_memory::get_ids(); diff --git a/source/lib/rocprofiler-sdk/callback_tracing.cpp b/source/lib/rocprofiler-sdk/callback_tracing.cpp index 
83d64d3e4d..e3ce91b635 100644 --- a/source/lib/rocprofiler-sdk/callback_tracing.cpp +++ b/source/lib/rocprofiler-sdk/callback_tracing.cpp @@ -34,6 +34,7 @@ #include "lib/rocprofiler-sdk/hip/hip.hpp" #include "lib/rocprofiler-sdk/hsa/async_copy.hpp" #include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp" #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" #include "lib/rocprofiler-sdk/kernel_dispatch/kernel_dispatch.hpp" #include "lib/rocprofiler-sdk/marker/marker.hpp" @@ -81,6 +82,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(CODE_OBJECT) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(SCRATCH_MEMORY) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_COPY) +ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MEMORY_ALLOCATION) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(RCCL_API) ROCPROFILER_CALLBACK_TRACING_KIND_STRING(OPENMP) @@ -256,6 +258,11 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; break; } + case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: + { + val = rocprofiler::hsa::memory_allocation::name_by_id(operation); + break; + } }; if(!val) @@ -375,6 +382,11 @@ rocprofiler_iterate_callback_tracing_kind_operations( return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; break; } + case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: + { + ops = rocprofiler::hsa::memory_allocation::get_ids(); + break; + } }; for(const auto& itr : ops) @@ -504,6 +516,7 @@ rocprofiler_iterate_callback_tracing_kind_operation_args( case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT: case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + case ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION: case ROCPROFILER_CALLBACK_TRACING_RCCL_API: case ROCPROFILER_CALLBACK_TRACING_OPENMP: { diff --git a/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt b/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt 
index 935aee5715..13f21429dd 100644 --- a/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk/hsa/CMakeLists.txt @@ -6,6 +6,7 @@ set(ROCPROFILER_LIB_HSA_SOURCES async_copy.cpp hsa_barrier.cpp hsa.cpp + memory_allocation.cpp pc_sampling.hpp profile_serializer.cpp queue_controller.cpp @@ -16,6 +17,7 @@ set(ROCPROFILER_LIB_HSA_HEADERS agent_cache.hpp aql_packet.hpp async_copy.hpp + memory_allocation.hpp defines.hpp hsa_barrier.hpp hsa.hpp diff --git a/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp b/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp index 0ee22b5634..4d0c1dfd20 100644 --- a/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp +++ b/source/lib/rocprofiler-sdk/hsa/hsa.def.cpp @@ -474,6 +474,15 @@ HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_async_copy_rect, hsa_amd_memory_async_copy_rect, hsa_amd_memory_async_copy_rect_fn) // clang-format on +#elif defined(ROCPROFILER_LIB_ROCPROFILER_HSA_MEMORY_ALLOCATION_CPP_IMPL) && \ + ROCPROFILER_LIB_ROCPROFILER_HSA_MEMORY_ALLOCATION_CPP_IMPL == 1 + +// clang-format off +HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, hsa_memory_allocate, hsa_memory_allocate_fn) +HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate, hsa_amd_memory_pool_allocate_fn) +HSA_API_META_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create, hsa_amd_vmem_handle_create_fn) +// clang-format on + #else # error "Do not compile this file directly. 
It is included by lib/rocprofiler/hsa/hsa.cpp" #endif diff --git a/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp b/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp new file mode 100644 index 0000000000..8226ed2e77 --- /dev/null +++ b/source/lib/rocprofiler-sdk/hsa/memory_allocation.cpp @@ -0,0 +1,635 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp" + +#include "lib/common/logging.hpp" +#include "lib/common/scope_destructor.hpp" +#include "lib/common/static_object.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" +#include "lib/rocprofiler-sdk/tracing/fwd.hpp" +#include "lib/rocprofiler-sdk/tracing/tracing.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define ROCPROFILER_LIB_ROCPROFILER_HSA_MEMORY_ALLOCATION_CPP_IMPL 1 + +// template specializations +#include "hsa.def.cpp" + +namespace rocprofiler +{ +namespace hsa +{ +namespace memory_allocation +{ +namespace +{ +using context_t = context::context; +using external_corr_id_map_t = std::unordered_map; +using region_to_agent_map = std::unordered_map; +using memory_pool_to_agent_map = std::unordered_map; +using region_to_agent_pair = std::pair; +using map_pool_to_agent_pair = std::pair; + +// Set up information to identify agent from regions/pool +template +struct memory_allocation_info; + +#define SPECIALIZE_MEMORY_ALLOCATION_INFO(FUNCTION, MAPTYPE, PAIRTYPE, SEARCHTYPE, ITERATEFUNC) \ + template <> \ + struct memory_allocation_info \ + { \ + using maptype = MAPTYPE; \ + using pairtype = PAIRTYPE; \ + using searchtype = SEARCHTYPE; \ + auto& operator()() const { return ITERATEFUNC; } \ + static constexpr auto operation_idx = ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \ + static constexpr auto name = "MEMORY_ALLOCATION_" #FUNCTION; \ + }; + +SPECIALIZE_MEMORY_ALLOCATION_INFO(NONE, + region_to_agent_map, + region_to_agent_pair, + hsa_region_t, + get_core_table()->hsa_agent_iterate_regions_fn) +SPECIALIZE_MEMORY_ALLOCATION_INFO(ALLOCATE, + region_to_agent_map, + region_to_agent_pair, + 
hsa_region_t, + get_core_table()->hsa_agent_iterate_regions_fn) +SPECIALIZE_MEMORY_ALLOCATION_INFO(MEMORY_POOL_ALLOCATE, + memory_pool_to_agent_map, + map_pool_to_agent_pair, + hsa_amd_memory_pool_t, + get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn) +SPECIALIZE_MEMORY_ALLOCATION_INFO(VMEM_HANDLE_CREATE, + memory_pool_to_agent_map, + map_pool_to_agent_pair, + hsa_amd_memory_pool_t, + get_amd_ext_table()->hsa_amd_agent_iterate_memory_pools_fn) +#undef SPECIALIZE_MEMORY_ALLOCATION_INFO + +template +const char* +name_by_id(const uint32_t id, std::index_sequence) +{ + if(Idx == id) return memory_allocation_info::name; + if constexpr(sizeof...(IdxTail) > 0) + return name_by_id(id, std::index_sequence{}); + else + return nullptr; +} + +template +uint32_t +id_by_name(const char* name, std::index_sequence) +{ + if(std::string_view{memory_allocation_info::name} == std::string_view{name}) + return memory_allocation_info::operation_idx; + if constexpr(sizeof...(IdxTail) > 0) + return id_by_name(name, std::index_sequence{}); + else + return ROCPROFILER_MEMORY_ALLOCATION_LAST; +} + +template +void +get_ids(std::vector& _id_list, std::index_sequence) +{ + auto _emplace = [](auto& _vec, uint32_t _v) { + if(_v < static_cast(ROCPROFILER_MEMORY_ALLOCATION_LAST)) _vec.emplace_back(_v); + }; + + (_emplace(_id_list, memory_allocation_info::operation_idx), ...); +} + +template +void +get_names(std::vector& _name_list, std::index_sequence) +{ + auto _emplace = [](auto& _vec, const char* _v) { + if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v); + }; + + (_emplace(_name_list, memory_allocation_info::name), ...); +} + +bool +context_filter(const context::context* ctx) +{ + auto has_buffered = + (ctx->buffered_tracer && + (ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION))); + + auto has_callback = + (ctx->callback_tracer && + (ctx->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION))); + + return (has_buffered || 
has_callback); +} + +// Sequence of HSA functions being tracked. Add to these to trace new commands +enum memory_allocation_core_id +{ + memory_allocation_core_allocate_id = ROCPROFILER_HSA_CORE_API_ID_hsa_memory_allocate, +}; +using memory_allocation_core_index_seq_t = std::index_sequence; + +enum memory_allocation_amd_ext_id +{ + memory_allocation_amd_ext_allocate_id = + ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_memory_pool_allocate, + memory_allocation_vmem_allocate_id = ROCPROFILER_HSA_AMD_EXT_API_ID_hsa_amd_vmem_handle_create +}; +using memory_allocation_amd_ext_index_seq_t = + std::index_sequence; + +template +struct memory_allocation_seq; + +#define MEMORY_ALLOCATION_DEFINE_SEQ(TABLE_TYPE, SEQ) \ + template <> \ + struct memory_allocation_seq \ + { \ + static constexpr auto memory_allocation_index_seq_t = SEQ{}; \ + }; + +MEMORY_ALLOCATION_DEFINE_SEQ(ROCPROFILER_HSA_TABLE_ID_Core, memory_allocation_core_index_seq_t) +MEMORY_ALLOCATION_DEFINE_SEQ(ROCPROFILER_HSA_TABLE_ID_AmdExt, memory_allocation_amd_ext_index_seq_t) + +// Set argument indices for tracked functions +template +struct arg_indices; + +#define HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES( \ + ENUM_ID, STARTING_ADDRESS_IDX, SIZE_IDX, REGION_IDX) \ + template <> \ + struct arg_indices \ + { \ + static constexpr auto starting_address_idx = STARTING_ADDRESS_IDX; \ + static constexpr auto size_idx = SIZE_IDX; \ + static constexpr auto region_idx = REGION_IDX; \ + }; + +HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_core_allocate_id, 2, 1, 0) +HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_amd_ext_allocate_id, 3, 1, 0) +HSA_MEMORY_ALLOCATE_DEFINE_ARG_INDICES(memory_allocation_vmem_allocate_id, 4, 1, 0) + +// Define operation indices for each tracked functions +template +struct memory_allocation_op; + +#define MEMORY_ALLOCATE_OPERATION_IDX(ENUM_ID, FUNCTION) \ + template <> \ + struct memory_allocation_op \ + { \ + static constexpr auto operation_idx = 
ROCPROFILER_MEMORY_ALLOCATION_##FUNCTION; \ + }; + +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_core_allocate_id, ALLOCATE); +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_amd_ext_allocate_id, MEMORY_POOL_ALLOCATE); +MEMORY_ALLOCATE_OPERATION_IDX(memory_allocation_vmem_allocate_id, VMEM_HANDLE_CREATE); + +template +decltype(auto) +invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence) +{ + return std::forward(_func)(std::get(_args)...); +} + +template +auto& +get_next_dispatch() +{ + using function_t = typename hsa_api_meta::function_type; + static function_t _v = nullptr; + return _v; +} + +constexpr auto null_rocp_agent_id = + rocprofiler_agent_id_t{.handle = std::numeric_limits::max()}; + +struct memory_allocation_data +{ + using timestamp_t = rocprofiler_timestamp_t; + using callback_data_t = rocprofiler_callback_tracing_memory_allocation_data_t; + using buffered_data_t = rocprofiler_buffer_tracing_memory_allocation_record_t; + + rocprofiler_thread_id_t tid = common::get_tid(); + rocprofiler_agent_id_t agent = null_rocp_agent_id; + uint64_t size_allocated = 0; + uint64_t starting_addr = 0; + uint64_t start_ts = 0; + context::correlation_id* correlation_id = nullptr; + tracing::tracing_data tracing_data = {}; + rocprofiler_memory_allocation_operation_t func = ROCPROFILER_MEMORY_ALLOCATION_NONE; + + callback_data_t get_callback_data(timestamp_t _beg = 0, timestamp_t _end = 0) const; + buffered_data_t get_buffered_record(const context_t* _ctx, + timestamp_t _beg = 0, + timestamp_t _end = 0) const; +}; + +memory_allocation_data::callback_data_t +memory_allocation_data::get_callback_data(timestamp_t _beg, timestamp_t _end) const +{ + return common::init_public_api_struct( + callback_data_t{}, _beg, _end, agent, starting_addr, size_allocated); +} + +memory_allocation_data::buffered_data_t +memory_allocation_data::get_buffered_record(const context_t* _ctx, + timestamp_t _beg, + timestamp_t _end) const +{ + auto _external_corr_id = + (_ctx) ? 
tracing_data.external_correlation_ids.at(_ctx) : context::null_user_data; + auto _corr_id = rocprofiler_correlation_id_t{correlation_id->internal, _external_corr_id}; + + return common::init_public_api_struct(buffered_data_t{}, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + func, + _corr_id, + correlation_id->thread_idx, + _beg, + _end, + agent, + starting_addr, + size_allocated); +} + +// Callback function to populate the mapping of agents to regions +template +hsa_status_t +callback_populate_map(T region_or_pool, void* data) +{ + auto _agent_map_pair = static_cast::pairtype*>(data); + auto _rocprof_agent = _agent_map_pair->second; + auto existing_map = _agent_map_pair->first; + + existing_map->insert({region_or_pool, _rocprof_agent}); + return HSA_STATUS_SUCCESS; +} + +// Returns the rocprofiler agent when given the region/pool +template +rocprofiler_agent_id_t +get_agent(T val, IterateFunc iterate_func, CallbackFunc callback) +{ + static auto existing = typename memory_allocation_info::maptype(); + + if(existing.count(val) == 0) + { + auto agents = rocprofiler::agent::get_agents(); + for(auto itr : agents) + { + auto hsa_agent = rocprofiler::agent::get_hsa_agent(itr); + if(hsa_agent) + { + auto rocprof_agent = rocprofiler::agent::get_rocprofiler_agent(*hsa_agent); + if(rocprof_agent) + { + auto data = typename memory_allocation_info::pairtype{&existing, + rocprof_agent->id}; + iterate_func(*hsa_agent, callback, &data); + } + } + } + } + return existing.count(val) == 0 ? null_rocp_agent_id : existing.at(val); +} + +uint64_t +handle_starting_addr(void** starting_addr_pointer) +{ + return reinterpret_cast(*starting_addr_pointer); +} + +uint64_t +handle_starting_addr(hsa_amd_vmem_alloc_handle_t* vmem_alloc_handle) +{ + return vmem_alloc_handle->handle; +} + +// Wrapper implementation that stores memory allocation information +template +hsa_status_t +memory_allocation_impl(Args... 
args) +{ + constexpr auto N = sizeof...(Args); + constexpr auto starting_address_idx = arg_indices::starting_address_idx; + constexpr auto size_idx = arg_indices::size_idx; + constexpr auto region_idx = arg_indices::region_idx; + constexpr auto operation = memory_allocation_op::operation_idx; + + auto&& _tied_args = std::tie(args...); + memory_allocation_data _data{}; + + { + auto tracing_data = tracing::tracing_data{}; + + tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + OpIdx, + tracing_data); + // if no contexts are tracing memory copies for this direction, execute as usual + if(tracing_data.empty()) + { + return invoke(get_next_dispatch(), + std::move(_tied_args), + std::make_index_sequence{}); + } + _data.tracing_data = std::move(tracing_data); + } + + auto& tracing_data = _data.tracing_data; + auto starting_addr_pointer = std::get(_tied_args); + auto region_or_pool = std::get(_tied_args); + + _data.tid = common::get_tid(); + _data.agent = get_agent( + region_or_pool, + memory_allocation_info{}(), + callback_populate_map::searchtype>); + _data.size_allocated = std::get(_tied_args); + _data.func = operation; + _data.correlation_id = context::get_latest_correlation_id(); + + if(!_data.correlation_id) + { + constexpr auto ref_count = 1; + _data.correlation_id = context::correlation_tracing_service::construct(ref_count); + } + + // increase the reference count to denote that this correlation id is being used in a kernel + _data.correlation_id->add_ref_count(); + auto thr_id = _data.correlation_id->thread_idx; + tracing::populate_external_correlation_ids( + tracing_data.external_correlation_ids, + thr_id, + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION, + operation, + _data.correlation_id->internal); + + if(!tracing_data.callback_contexts.empty()) + { + auto _tracer_data = _data.get_callback_data(); + + tracing::execute_phase_enter_callbacks(tracing_data.callback_contexts, + 
thr_id, + _data.correlation_id->internal, + tracing_data.external_correlation_ids, + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + operation, + _tracer_data); + // enter callback may update the external correlation id field + tracing::update_external_correlation_ids( + tracing_data.external_correlation_ids, + thr_id, + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_ALLOCATION); + } + auto start_ts = common::timestamp_ns(); + auto _ret = invoke( + get_next_dispatch(), std::move(_tied_args), std::make_index_sequence{}); + auto end_ts = common::timestamp_ns(); + // Starting address is set after memory_allocation function is run. May need additional safety + // checks before retrieving starting address? + if(starting_addr_pointer != nullptr) + { + _data.starting_addr = handle_starting_addr(starting_addr_pointer); + } + + if(!tracing_data.empty()) + { + if(!_data.tracing_data.callback_contexts.empty()) + { + auto _tracer_data = _data.get_callback_data(start_ts, end_ts); + + tracing::execute_phase_exit_callbacks(_data.tracing_data.callback_contexts, + _data.tracing_data.external_correlation_ids, + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + operation, + _tracer_data); + } + + if(!_data.tracing_data.buffered_contexts.empty()) + { + auto record = _data.get_buffered_record(nullptr, start_ts, end_ts); + + tracing::execute_buffer_record_emplace(_data.tracing_data.buffered_contexts, + _data.tid, + _data.correlation_id->internal, + _data.tracing_data.external_correlation_ids, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + operation, + record); + } + } + + // decrement the reference count after usage in the callback/buffers + _data.correlation_id->sub_ref_count(); + return _ret; +} + +template +auto get_memory_allocation_impl(RetT (*)(Args...)) +{ + return &memory_allocation_impl; +} +} // namespace +// check out the assembly here... 
this compiles to a switch statement +const char* +name_by_id(uint32_t id) +{ + return name_by_id(id, std::make_index_sequence{}); +} + +uint32_t +id_by_name(const char* name) +{ + return id_by_name(name, std::make_index_sequence{}); +} + +std::vector +get_ids() +{ + auto _data = std::vector{}; + _data.reserve(ROCPROFILER_MEMORY_ALLOCATION_LAST); + get_ids(_data, std::make_index_sequence{}); + return _data; +} + +std::vector +get_names() +{ + auto _data = std::vector{}; + _data.reserve(ROCPROFILER_MEMORY_ALLOCATION_LAST); + get_names(_data, std::make_index_sequence{}); + return _data; +} + +template +void +memory_allocation_save(Tp* _orig, uint64_t _tbl_instance, std::integral_constant) +{ + using table_type = typename hsa_table_lookup::type; + + if constexpr(std::is_same::value) + { + auto _meta = hsa_api_meta{}; + + // original table and function + auto& _orig_table = _meta.get_table(_orig); + auto& _orig_func = _meta.get_table_func(_orig_table); + + // table with copy function + auto& _allocate_func = get_next_dispatch(); + + ROCP_FATAL_IF(_allocate_func && _tbl_instance == 0) + << _meta.name << " has non-null function pointer " << _allocate_func + << " despite this being the first instance of the library being copies"; + + if(!_allocate_func) + { + ROCP_TRACE << "copying table entry for " << _meta.name; + _allocate_func = _orig_func; + } + else + { + ROCP_TRACE << "skipping copying table entry for " << _meta.name + << " from table instance " << _tbl_instance; + } + } +} +template +void +memory_allocation_save(Tp* _orig, uint64_t _tbl_instance, std::index_sequence) +{ + memory_allocation_save( + _orig, _tbl_instance, std::integral_constant{}); + if constexpr(sizeof...(OpIdxTail) > 0) + memory_allocation_save( + _orig, _tbl_instance, std::index_sequence{}); +} + +template +void +memory_allocation_save(TableT* _orig, uint64_t _tbl_instance) +{ + constexpr auto TableIdx = hsa_table_id_lookup::value; + if(_orig) + memory_allocation_save( + _orig, _tbl_instance, 
memory_allocation_seq::memory_allocation_index_seq_t); +} + +template +void +memory_allocation_wrap(Tp* _orig, std::integral_constant) +{ + auto _meta = hsa_api_meta{}; + auto& _table = _meta.get_table(_orig); + auto& _func = _meta.get_table_func(_table); + + auto& _dispatch = get_next_dispatch(); + CHECK_NOTNULL(_dispatch); + _func = get_memory_allocation_impl(_func); +} + +template +void +memory_allocation_wrap(Tp* _orig, std::index_sequence) +{ + memory_allocation_wrap(_orig, std::integral_constant{}); + if constexpr(sizeof...(OpIdxTail) > 0) + memory_allocation_wrap(_orig, std::index_sequence{}); +} + +template +void +memory_allocation_wrap(TableT* _orig) +{ + constexpr auto TableIdx = hsa_table_id_lookup::value; + if(_orig) + { + memory_allocation_wrap( + _orig, memory_allocation_seq::memory_allocation_index_seq_t); + } +} + +} // namespace memory_allocation + +template +void +memory_allocation_init(TableT* _orig, uint64_t _tbl_instance) +{ + constexpr auto TableIdx = hsa_table_id_lookup::value; + if(_orig) + { + memory_allocation::memory_allocation_save( + _orig, + _tbl_instance, + memory_allocation::memory_allocation_seq::memory_allocation_index_seq_t); + + auto ctxs = context::get_registered_contexts(memory_allocation::context_filter); + if(!ctxs.empty()) + { + memory_allocation::memory_allocation_wrap( + _orig, + memory_allocation::memory_allocation_seq::memory_allocation_index_seq_t); + } + } +} + +#define INSTANTIATE_MEMORY_ALLOC_FUNC(TABLE_TYPE, TABLE_IDX) \ + template void memory_allocation_init(TABLE_TYPE * _tbl, uint64_t _instv); \ + template void memory_allocation::memory_allocation_save(TABLE_TYPE * _tbl, \ + uint64_t _instv); \ + template void memory_allocation::memory_allocation_wrap(TABLE_TYPE * _tbl); + +INSTANTIATE_MEMORY_ALLOC_FUNC(hsa_core_table_t, ROCPROFILER_HSA_TABLE_ID_Core) +INSTANTIATE_MEMORY_ALLOC_FUNC(hsa_amd_ext_table_t, ROCPROFILER_HSA_TABLE_ID_AmdExt) +#undef INSTANTIATE_MEMORY_ALLOC_FUNC + +} // namespace hsa +} // namespace 
rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/memory_allocation.hpp b/source/lib/rocprofiler-sdk/hsa/memory_allocation.hpp new file mode 100644 index 0000000000..49dc04c567 --- /dev/null +++ b/source/lib/rocprofiler-sdk/hsa/memory_allocation.hpp @@ -0,0 +1,53 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#pragma once + +#include + +#include "lib/rocprofiler-sdk/hsa/hsa.hpp" + +namespace rocprofiler +{ +namespace hsa +{ +namespace memory_allocation +{ +const char* +name_by_id(uint32_t id); + +uint32_t +id_by_name(const char* name); + +std::vector +get_names(); + +std::vector +get_ids(); +} // namespace memory_allocation + +template +void +memory_allocation_init(TableT* _orig, uint64_t _tbl_instance); + +} // namespace hsa +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/registration.cpp b/source/lib/rocprofiler-sdk/registration.cpp index efb7fca394..192ddb3c0b 100644 --- a/source/lib/rocprofiler-sdk/registration.cpp +++ b/source/lib/rocprofiler-sdk/registration.cpp @@ -34,6 +34,7 @@ #include "lib/rocprofiler-sdk/hip/hip.hpp" #include "lib/rocprofiler-sdk/hsa/async_copy.hpp" #include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/memory_allocation.hpp" #include "lib/rocprofiler-sdk/hsa/queue.hpp" #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" #include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp" @@ -790,6 +791,8 @@ rocprofiler_set_api_table(const char* name, rocprofiler::counters::device_counting_service_hsa_registration(); rocprofiler::hsa::async_copy_init(hsa_api_table, lib_instance); + rocprofiler::hsa::memory_allocation_init(hsa_api_table->core_, lib_instance); + rocprofiler::hsa::memory_allocation_init(hsa_api_table->amd_ext_, lib_instance); rocprofiler::code_object::initialize(hsa_api_table); rocprofiler::thread_trace::initialize(hsa_api_table); #if ROCPROFILER_SDK_HSA_PC_SAMPLING > 0 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6bbec14e81..d3e914ff90 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -54,6 +54,7 @@ add_subdirectory(bin) # validation tests add_subdirectory(kernel-tracing) add_subdirectory(async-copy-tracing) +add_subdirectory(hsa-memory-allocation) add_subdirectory(scratch-memory-tracing) add_subdirectory(c-tool) add_subdirectory(page-migration) diff --git 
a/tests/bin/CMakeLists.txt b/tests/bin/CMakeLists.txt index 62b0faa9f9..59ed2ebb59 100644 --- a/tests/bin/CMakeLists.txt +++ b/tests/bin/CMakeLists.txt @@ -24,3 +24,4 @@ add_subdirectory(scratch-memory) add_subdirectory(page-migration) add_subdirectory(hsa-queue-dependency) add_subdirectory(hip-graph) +add_subdirectory(hsa-memory-allocation) diff --git a/tests/bin/hsa-memory-allocation/CMakeLists.txt b/tests/bin/hsa-memory-allocation/CMakeLists.txt new file mode 100644 index 0000000000..485b866209 --- /dev/null +++ b/tests/bin/hsa-memory-allocation/CMakeLists.txt @@ -0,0 +1,43 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-tool-test-app-hsa-memory-allocation LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(hsa-memory-allocation.cpp PROPERTIES LANGUAGE HIP) +add_executable(hsa-memory-allocation) +target_sources(hsa-memory-allocation PRIVATE hsa-memory-allocation.cpp) +target_compile_options(hsa-memory-allocation PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow + -Werror) + +find_package(Threads REQUIRED) +target_link_libraries(hsa-memory-allocation PRIVATE Threads::Threads hsa-runtime64 + rocprofiler-sdk::tests-common-library) diff --git a/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp 
b/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp new file mode 100644 index 0000000000..2b94eb3291 --- /dev/null +++ b/tests/bin/hsa-memory-allocation/hsa-memory-allocation.cpp @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include +#include + +#define RET_IF_HSA_ERR(err) \ + { \ + if((err) != HSA_STATUS_SUCCESS) \ + { \ + char err_val[12]; \ + char* err_str = nullptr; \ + if(hsa_status_string(err, (const char**) &err_str) != HSA_STATUS_SUCCESS) \ + { \ + sprintf(&(err_val[0]), "%#x", (uint32_t) err); \ + err_str = &(err_val[0]); \ + } \ + printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__); \ + printf("Call returned %s\n", err_str); \ + abort(); \ + } \ + } + +// Callback function to get the list of agents +hsa_status_t +get_agents(hsa_agent_t agent, void* data) +{ + hsa_agent_t** agent_list = (hsa_agent_t**) data; + **agent_list = agent; + ++(*agent_list); + + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the number of agents +hsa_status_t +get_num_agents(hsa_agent_t agent, void* data) +{ + (void) agent; + int* num_agents = (int*) data; + ++(*num_agents); + + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the number of regions of an agent +hsa_status_t +callback_get_num_regions(hsa_region_t region, void* data) +{ + (void) region; + int* num_regions = (int*) data; + ++(*num_regions); + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the number of memory pools of an agent +hsa_status_t +callback_get_num_pools(hsa_amd_memory_pool_t memory_pool, void* data) +{ + (void) memory_pool; + int* num_pools = (int*) data; + ++(*num_pools); + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the list of regions of an agent +hsa_status_t +callback_get_regions(hsa_region_t region, void* data) +{ + hsa_region_t** region_list = (hsa_region_t**) data; + **region_list = region; + ++(*region_list); + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the list of memory pools of 
an agent
+hsa_status_t
+callback_get_memory_pools(hsa_amd_memory_pool_t memory_pool, void* data)
+{
+    hsa_amd_memory_pool_t** pool_list = (hsa_amd_memory_pool_t**) data;
+    **pool_list = memory_pool;
+    ++(*pool_list);
+    return HSA_STATUS_SUCCESS;
+}
+
+std::vector<hsa_agent_t>
+get_agent_list()
+{
+    int num_agents = 0;  // int on purpose: get_num_agents() counts through an int*
+    hsa_status_t status;
+    // Get number of agents
+    status = hsa_iterate_agents(get_num_agents, &num_agents);
+    RET_IF_HSA_ERR(status)
+    if(num_agents < 2)
+    {
+        printf("Not enough HSA agents available\n");
+        abort();
+    }
+
+    // Create an array of size num_agents to store the agent list
+    std::vector<hsa_agent_t> agents(static_cast<size_t>(num_agents));
+
+    // Get the agent list (get_agents() stores each agent and advances the cursor)
+    hsa_agent_t* agent_iter = &agents[0];
+    status = hsa_iterate_agents(get_agents, &agent_iter);
+    RET_IF_HSA_ERR(status)
+
+    return agents;
+}
+
+hsa_agent_t
+get_cpu_agent(std::vector<hsa_agent_t>& agents)  // first CPU agent in the list; aborts if none
+{
+    for(hsa_agent_t agent : agents)
+    {
+        hsa_device_type_t ag_type;
+        hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &ag_type);
+        RET_IF_HSA_ERR(status)
+
+        if(ag_type == HSA_DEVICE_TYPE_CPU)
+        {
+            return agent;
+        }
+    }
+    std::cerr << "No CPU agents available" << std::endl;
+    abort();
+}
+
+hsa_agent_t
+get_gpu_agent(std::vector<hsa_agent_t>& agents)  // first GPU agent in the list; aborts if none
+{
+    for(hsa_agent_t agent : agents)
+    {
+        hsa_device_type_t ag_type;
+        hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &ag_type);
+        RET_IF_HSA_ERR(status)
+
+        if(ag_type == HSA_DEVICE_TYPE_GPU)
+        {
+            return agent;
+        }
+    }
+    std::cerr << "No GPU agents available" << std::endl;
+    abort();
+}
+
+void
+call_hsa_memory_allocate(const size_t i, const size_t base_size, hsa_agent_t agent)
+{
+    // Getting total number of regions for the agent
+    int num_regions = 0;
+    hsa_status_t status = hsa_agent_iterate_regions(agent, callback_get_num_regions, &num_regions);
+    RET_IF_HSA_ERR(status)
+    if(num_regions < 1)
+    {
+        printf("No HSA regions available\n");
+        abort();
+    }
+    // Allocate memory to hold region list of an agent
+    std::vector<hsa_region_t> region_list(num_regions);
+
hsa_region_t* ptr_reg = &region_list[0];
+    status = hsa_agent_iterate_regions(agent, callback_get_regions, &ptr_reg);
+    RET_IF_HSA_ERR(status)
+
+    for(size_t j = 0; j < i; ++j)
+    {
+        void* addr = nullptr;  // NOTE(review): never released; presumably deliberate - confirm
+
+        status = hsa_memory_allocate(region_list[0], base_size, &addr);
+        RET_IF_HSA_ERR(status)
+    }
+}
+
+void
+call_hsa_memory_pool_allocate(const size_t i, const size_t base_size, hsa_agent_t agent)
+{
+    // Getting total number of memory pools for the agent
+    int num_pools = 0;
+    hsa_status_t status =
+        hsa_amd_agent_iterate_memory_pools(agent, callback_get_num_pools, &num_pools);
+    RET_IF_HSA_ERR(status)
+    if(num_pools < 1)
+    {
+        printf("No memory pools available\n");
+        abort();
+    }
+    // Allocate memory to hold the memory pool list of an agent
+    std::vector<hsa_amd_memory_pool_t> memory_pool_list(num_pools);
+    hsa_amd_memory_pool_t* ptr_memory_pool = &memory_pool_list[0];
+    status = hsa_amd_agent_iterate_memory_pools(agent, callback_get_memory_pools, &ptr_memory_pool);
+    RET_IF_HSA_ERR(status)
+
+    for(size_t j = 0; j < i; ++j)
+    {
+        void* addr = nullptr;  // NOTE(review): never released; presumably deliberate - confirm
+        uint32_t flags = 0;
+
+        status = hsa_amd_memory_pool_allocate(memory_pool_list[0], base_size, flags, &addr);
+        RET_IF_HSA_ERR(status)
+    }
+}
+
+void
+call_hsa_vmem_allocate(const size_t i, hsa_agent_t agent)
+{
+    // Getting total number of memory pools for the agent
+    int num_pools = 0;
+    hsa_status_t status =
+        hsa_amd_agent_iterate_memory_pools(agent, callback_get_num_pools, &num_pools);
+    RET_IF_HSA_ERR(status)
+    if(num_pools < 1)
+    {
+        printf("No memory pools available\n");
+        abort();
+    }
+    // Allocate memory to hold the memory pool list of an agent
+    std::vector<hsa_amd_memory_pool_t> memory_pool_list(num_pools);
+    hsa_amd_memory_pool_t* ptr_memory_pool = &memory_pool_list[0];
+    status = hsa_amd_agent_iterate_memory_pools(agent, callback_get_memory_pools, &ptr_memory_pool);
+    RET_IF_HSA_ERR(status)
+
+    // Ensure Virtual Memory API is supported
+    bool supp = false;
+    status = hsa_system_get_info(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED, (void*) &supp);
+    RET_IF_HSA_ERR(status)
+    if(!supp)
+
{
+        std::cerr << "Virtual Memory API not supported" << std::endl;
+        abort();
+    }
+
+    // Get runtime allocation granule size. Required for vmem_handle_create
+    size_t size = 0;  // the granule attribute is size_t-typed; an int is too small to receive it
+    status = hsa_amd_memory_pool_get_info(
+        memory_pool_list[0], HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, (void*) &size);
+    RET_IF_HSA_ERR(status)
+    for(size_t j = 0; j < i; ++j)
+    {
+        hsa_amd_vmem_alloc_handle_t memory_handle{};
+
+        status = hsa_amd_vmem_handle_create(
+            memory_pool_list[0], size, MEMORY_TYPE_NONE, 0, &memory_handle);
+        RET_IF_HSA_ERR(status)
+    }
+}
+
+int
+main()
+{
+    hsa_status_t status;
+    status = hsa_init();
+    RET_IF_HSA_ERR(status)
+
+    std::vector<hsa_agent_t> agents = get_agent_list();
+    hsa_agent_t cpu_agent = get_cpu_agent(agents);
+    hsa_agent_t gpu_agent = get_gpu_agent(agents);
+    call_hsa_memory_allocate(6, 1024, cpu_agent);
+    call_hsa_memory_pool_allocate(9, 512, gpu_agent);
+    // Virtual memory API not supported in CI. Will add back if this changes
+    // call_hsa_vmem_allocate(3, gpu_agent);
+
+    status = hsa_shut_down();
+    RET_IF_HSA_ERR(status)
+    return 0;
+}
diff --git a/tests/hsa-memory-allocation/CMakeLists.txt b/tests/hsa-memory-allocation/CMakeLists.txt
new file mode 100644
index 0000000000..987e30a11f
--- /dev/null
+++ b/tests/hsa-memory-allocation/CMakeLists.txt
@@ -0,0 +1,46 @@
+#
+#
+#
+cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
+
+project(
+    rocprofiler-tests-memory-allocation-tracing
+    LANGUAGES CXX
+    VERSION 0.0.0)
+
+find_package(rocprofiler-sdk REQUIRED)
+
+if(ROCPROFILER_MEMCHECK_PRELOAD_ENV)
+    set(PRELOAD_ENV
+        "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$")
+else()
+    set(PRELOAD_ENV "LD_PRELOAD=$")
+endif()
+
+add_test(NAME test-memory-allocation-tracing-execute
+         COMMAND $)
+
+set(memory-allocation-tracing-env
+    "${PRELOAD_ENV}"
+    "ROCPROFILER_TOOL_OUTPUT_FILE=memory-allocation-tracing-test.json"
+    "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH}"
+    )
+
+set_tests_properties(
+    test-memory-allocation-tracing-execute
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests"
ENVIRONMENT + "${memory-allocation-tracing-env}" FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +# copy to binary directory +rocprofiler_configure_pytest_files(COPY validate.py conftest.py CONFIG pytest.ini) + +add_test(NAME test-memory-allocation-tracing-validate + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input + ${CMAKE_CURRENT_BINARY_DIR}/memory-allocation-tracing-test.json) + +set_tests_properties( + test-memory-allocation-tracing-validate + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + test-memory-allocation-tracing-execute FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/tests/hsa-memory-allocation/conftest.py b/tests/hsa-memory-allocation/conftest.py new file mode 100644 index 0000000000..9856dd1a78 --- /dev/null +++ b/tests/hsa-memory-allocation/conftest.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +import json +import pytest + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict + + +def pytest_addoption(parser): + parser.addoption( + "--input", + action="store", + default="memory-allocation-tracing-test.json", + help="Input JSON", + ) + + +@pytest.fixture +def input_data(request): + filename = request.config.getoption("--input") + with open(filename, "r") as inp: + return dotdict(json.load(inp)) diff --git a/tests/hsa-memory-allocation/pytest.ini b/tests/hsa-memory-allocation/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/tests/hsa-memory-allocation/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/tests/hsa-memory-allocation/validate.py b/tests/hsa-memory-allocation/validate.py new file mode 100644 index 0000000000..56ec4bcdaa --- /dev/null +++ b/tests/hsa-memory-allocation/validate.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 + +import sys +import pytest + + +# helper function +def node_exists(name, 
data, min_len=1): + assert name in data + assert data[name] is not None + if isinstance(data[name], (list, tuple, dict, set)): + assert len(data[name]) >= min_len, f"{name}:\n{data}" + + +def test_data_structure(input_data): + """verify minimum amount of expected data is present""" + data = input_data + + node_exists("rocprofiler-sdk-json-tool", data) + + sdk_data = data["rocprofiler-sdk-json-tool"] + + node_exists("metadata", sdk_data) + node_exists("pid", sdk_data["metadata"]) + node_exists("main_tid", sdk_data["metadata"]) + node_exists("init_time", sdk_data["metadata"]) + node_exists("fini_time", sdk_data["metadata"]) + + node_exists("agents", sdk_data) + node_exists("call_stack", sdk_data) + node_exists("callback_records", sdk_data) + node_exists("buffer_records", sdk_data) + + node_exists("names", sdk_data["callback_records"]) + node_exists("hsa_api_traces", sdk_data["callback_records"]) + node_exists("memory_allocations", sdk_data["callback_records"]) + + node_exists("names", sdk_data["buffer_records"]) + node_exists("hsa_api_traces", sdk_data["buffer_records"]) + node_exists("memory_allocations", sdk_data["buffer_records"]) + + +def test_size_entries(input_data): + # check that size fields are > 0 but account for function arguments + # which are named "size" + def check_size(data, bt): + if "size" in data.keys(): + if isinstance(data["size"], str) and bt.endswith('["args"]'): + pass + else: + assert data["size"] > 0, f"origin: {bt}" + + # recursively check the entire data structure + def iterate_data(data, bt): + if isinstance(data, (list, tuple)): + for i, itr in enumerate(data): + if isinstance(itr, dict): + check_size(itr, f"{bt}[{i}]") + iterate_data(itr, f"{bt}[{i}]") + elif isinstance(data, dict): + check_size(data, f"{bt}") + for key, itr in data.items(): + iterate_data(itr, f'{bt}["{key}"]') + + # start recursive check over entire JSON dict + iterate_data(input_data, "input_data") + + +def test_timestamps(input_data): + """Verify starting 
timestamps are less than ending timestamps""" + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + cb_start = {} + cb_end = {} + for titr in ["hsa_api_traces"]: + for itr in sdk_data["callback_records"][titr]: + cid = itr["correlation_id"]["internal"] + phase = itr["phase"] + if phase == 1: + cb_start[cid] = itr["timestamp"] + elif phase == 2: + cb_end[cid] = itr["timestamp"] + assert cb_start[cid] <= itr["timestamp"] + else: + assert phase == 1 or phase == 2 + + for itr in sdk_data["buffer_records"][titr]: + assert itr["start_timestamp"] <= itr["end_timestamp"] + + for titr in ["memory_allocations"]: + for itr in sdk_data["buffer_records"][titr]: + assert itr["start_timestamp"] < itr["end_timestamp"], f"[{titr}] {itr}" + assert itr["correlation_id"]["internal"] > 0, f"[{titr}] {itr}" + assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}" + assert ( + sdk_data["metadata"]["init_time"] < itr["start_timestamp"] + ), f"[{titr}] {itr}" + assert ( + sdk_data["metadata"]["init_time"] < itr["end_timestamp"] + ), f"[{titr}] {itr}" + assert ( + sdk_data["metadata"]["fini_time"] > itr["start_timestamp"] + ), f"[{titr}] {itr}" + assert ( + sdk_data["metadata"]["fini_time"] > itr["end_timestamp"] + ), f"[{titr}] {itr}" + + api_start = cb_start[itr["correlation_id"]["internal"]] + # api_end = cb_end[itr["correlation_id"]["internal"]] + assert api_start < itr["start_timestamp"], f"[{titr}] {itr}" + # assert api_end <= itr["end_timestamp"], f"[{titr}] {itr}" + + +def test_internal_correlation_ids(input_data): + """Assure correlation ids are unique""" + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + api_corr_ids = [] + for titr in ["hsa_api_traces"]: + for itr in sdk_data["callback_records"][titr]: + api_corr_ids.append(itr["correlation_id"]["internal"]) + + for itr in sdk_data["buffer_records"][titr]: + api_corr_ids.append(itr["correlation_id"]["internal"]) + + api_corr_ids_sorted = sorted(api_corr_ids) + api_corr_ids_unique = 
list(set(api_corr_ids)) + + for itr in sdk_data["buffer_records"]["memory_allocations"]: + assert itr["correlation_id"]["internal"] in api_corr_ids_unique + + len_corr_id_unq = len(api_corr_ids_unique) + assert len(api_corr_ids) != len_corr_id_unq + assert max(api_corr_ids_sorted) == len_corr_id_unq + + +def test_external_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + extern_corr_ids = [] + for titr in ["hsa_api_traces"]: + for itr in sdk_data["callback_records"][titr]: + assert itr["correlation_id"]["external"] > 0 + assert itr["thread_id"] == itr["correlation_id"]["external"] + extern_corr_ids.append(itr["correlation_id"]["external"]) + + extern_corr_ids = list(set(sorted(extern_corr_ids))) + for titr in ["hsa_api_traces"]: + for itr in sdk_data["buffer_records"][titr]: + assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}" + assert ( + itr["thread_id"] == itr["correlation_id"]["external"] + ), f"[{titr}] {itr}" + assert itr["thread_id"] in extern_corr_ids, f"[{titr}] {itr}" + assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}" + + for titr in ["memory_allocations"]: + for itr in sdk_data["buffer_records"][titr]: + assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}" + assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}" + + for itr in sdk_data["callback_records"][titr]: + assert itr["correlation_id"]["external"] > 0, f"[{titr}] {itr}" + assert itr["correlation_id"]["external"] in extern_corr_ids, f"[{titr}] {itr}" + + +def test_memory_alloc_sizes(input_data): + """Ensure trace file memory allocation operations match up with the memory allocation operations performed in hsa-memory-allocation""" + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + # Op values: + # 0 == ??? 
(unknown) + # 1 == hsa_memory_allocate + # 2 == hsa_amd_ext_memory_allocate + # 3 == hsa_amd_vmem_handle_create + memory_alloc_cnt = dict( + [ + (idx, {"agent": set(), "starting_addr": set(), "size": set(), "count": 0}) + for idx in range(1, 4) + ] + ) + for itr in sdk_data["buffer_records"]["memory_allocations"]: + op_id = itr["operation"] + assert op_id > 0 and op_id <= 3, f"{itr}" + memory_alloc_cnt[op_id]["count"] += 1 + memory_alloc_cnt[op_id]["starting_addr"].add(itr.starting_address) + memory_alloc_cnt[op_id]["size"].add(itr.allocation_size) + memory_alloc_cnt[op_id]["agent"].add(itr.agent_id.handle) + + for itr in sdk_data["callback_records"]["memory_allocations"]: + op_id = itr.operation + assert op_id > 0 and op_id <= 3, f"{itr}" + # "count" tallies buffer records only; callbacks arrive once per phase + + phase = itr.phase + pitr = itr.payload + + assert phase is not None, f"{itr}" + assert pitr is not None, f"{itr}" + + if phase == 1: + assert pitr.start_timestamp == 0, f"{itr}" + assert pitr.end_timestamp == 0, f"{itr}" + elif phase == 2: + assert pitr.start_timestamp > 0, f"{itr}" + assert pitr.end_timestamp > 0, f"{itr}" + assert pitr.end_timestamp >= pitr.start_timestamp, f"{itr}" + + memory_alloc_cnt[op_id]["starting_addr"].add(pitr.starting_address) + memory_alloc_cnt[op_id]["size"].add(pitr.allocation_size) + memory_alloc_cnt[op_id]["agent"].add(pitr.agent_id.handle) + else: + assert phase == 1 or phase == 2, f"{itr}" + + # In the memory allocation test which generates this file + # 6 hsa_memory_allocation calls with 1024 bytes were called + # and 9 hsa_amd_memory_pool_allocations with 512 bytes + # were called + assert memory_alloc_cnt[1]["count"] == 6 + assert memory_alloc_cnt[2]["count"] == 9 + # assert memory_alloc_cnt[3]["count"] == 3 + assert len(memory_alloc_cnt[1]["starting_addr"]) == 6 + assert len(memory_alloc_cnt[2]["starting_addr"]) == 9 + # assert len(memory_alloc_cnt[3]["starting_addr"]) == 3 + assert len(memory_alloc_cnt[1]["size"]) == 1 + assert 
len(memory_alloc_cnt[2]["size"]) == 1 + # assert len(memory_alloc_cnt[3]["size"]) == 1 + assert 1024 in memory_alloc_cnt[1]["size"] + assert 512 in memory_alloc_cnt[2]["size"] + assert len(memory_alloc_cnt[1]["agent"]) == 1 + assert len(memory_alloc_cnt[2]["agent"]) == 1 + # assert len(memory_alloc_cnt[3]["agent"]) == 1 + assert memory_alloc_cnt[1]["agent"] != memory_alloc_cnt[2]["agent"] + # assert memory_alloc_cnt[2]["agent"] == memory_alloc_cnt[3]["agent"] + + +def test_retired_correlation_ids(input_data): + data = input_data + sdk_data = data["rocprofiler-sdk-json-tool"] + + def _sort_dict(inp): + return dict(sorted(inp.items())) + + api_corr_ids = {} + for titr in ["hsa_api_traces"]: + for itr in sdk_data["buffer_records"][titr]: + corr_id = itr["correlation_id"]["internal"] + assert corr_id not in api_corr_ids.keys() + api_corr_ids[corr_id] = itr + + alloc_corr_ids = {} + for titr in ["memory_allocations"]: + for itr in sdk_data["buffer_records"][titr]: + corr_id = itr["correlation_id"]["internal"] + assert corr_id not in alloc_corr_ids.keys() + alloc_corr_ids[corr_id] = itr + + retired_corr_ids = {} + for itr in sdk_data["buffer_records"]["retired_correlation_ids"]: + corr_id = itr["internal_correlation_id"] + assert corr_id not in retired_corr_ids.keys() + retired_corr_ids[corr_id] = itr + + api_corr_ids = _sort_dict(api_corr_ids) + alloc_corr_ids = _sort_dict(alloc_corr_ids) + retired_corr_ids = _sort_dict(retired_corr_ids) + + for cid, itr in alloc_corr_ids.items(): + assert cid in retired_corr_ids.keys() + retired_ts = retired_corr_ids[cid]["timestamp"] + end_ts = itr["end_timestamp"] + assert (retired_ts - end_ts) > 0, f"correlation-id: {cid}, data: {itr}" + + for cid, itr in api_corr_ids.items(): + assert cid in retired_corr_ids.keys() + retired_ts = retired_corr_ids[cid]["timestamp"] + end_ts = itr["end_timestamp"] + assert (retired_ts - end_ts) > 0, f"correlation-id: {cid}, data: {itr}" + + assert len(api_corr_ids.keys()) == 
(len(retired_corr_ids.keys())) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/tests/pytest-packages/tests/rocprofv3.py b/tests/pytest-packages/tests/rocprofv3.py index 017c813055..3f809506b5 100644 --- a/tests/pytest-packages/tests/rocprofv3.py +++ b/tests/pytest-packages/tests/rocprofv3.py @@ -24,7 +24,9 @@ from __future__ import absolute_import def test_perfetto_data( - pftrace_data, json_data, categories=("hip", "hsa", "marker", "kernel", "memory_copy") + pftrace_data, + json_data, + categories=("hip", "hsa", "marker", "kernel", "memory_copy", "memory_allocation"), ): mapping = { @@ -33,6 +35,7 @@ def test_perfetto_data( "marker": ("marker_api", "marker_api"), "kernel": ("kernel_dispatch", "kernel_dispatch"), "memory_copy": ("memory_copy", "memory_copy"), + "memory_allocation": ("memory_allocation", "memory_allocation"), } # make sure they specified valid categories @@ -70,6 +73,7 @@ def test_otf2_data( "marker": ("marker_api", "marker_api"), "kernel": ("kernel_dispatch", "kernel_dispatch"), "memory_copy": ("memory_copy", "memory_copy"), + "memory_allocation": ("memory_allocation", "memory_allocation"), } # make sure they specified valid categories diff --git a/tests/rocprofv3/CMakeLists.txt b/tests/rocprofv3/CMakeLists.txt index 4b1200fde3..dac9c12e1f 100644 --- a/tests/rocprofv3/CMakeLists.txt +++ b/tests/rocprofv3/CMakeLists.txt @@ -29,6 +29,7 @@ add_subdirectory(tracing-hip-in-libraries) add_subdirectory(counter-collection) add_subdirectory(hsa-queue-dependency) add_subdirectory(kernel-rename) +add_subdirectory(memory-allocation) add_subdirectory(aborted-app) add_subdirectory(summary) add_subdirectory(roctracer-roctx) diff --git a/tests/rocprofv3/memory-allocation/CMakeLists.txt b/tests/rocprofv3/memory-allocation/CMakeLists.txt new file mode 100644 index 0000000000..b05546f440 --- /dev/null +++ b/tests/rocprofv3/memory-allocation/CMakeLists.txt @@ -0,0 +1,47 @@ +# +# +# 
+cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-rocprofv3-memory-allocation-tracing + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(memory-allocation-tracing-env "${PRELOAD_ENV}") + +add_test( + NAME rocprofv3-test-memory-allocation-tracing-execute + COMMAND + $ --memory-allocation-trace -d + ${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json pftrace otf2 + --log-level env -- $) + +set_tests_properties( + rocprofv3-test-memory-allocation-tracing-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT + "${memory-allocation-tracing-env}" FAIL_REGULAR_EXPRESSION + "threw an exception") + +add_test( + NAME rocprofv3-test-memory-allocation-tracing-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input + ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.json + --pftrace-input + ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.pftrace + --otf2-input + ${CMAKE_CURRENT_BINARY_DIR}/hsa-memory-allocation-trace/out_results.otf2) + +set_tests_properties( + rocprofv3-test-memory-allocation-tracing-validate + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS + rocprofv3-test-memory-allocation-tracing-execute FAIL_REGULAR_EXPRESSION + "AssertionError") diff --git a/tests/rocprofv3/memory-allocation/conftest.py b/tests/rocprofv3/memory-allocation/conftest.py new file mode 100644 index 0000000000..8504f44cef --- /dev/null +++ b/tests/rocprofv3/memory-allocation/conftest.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import json +import os +import pytest + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list +from 
rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader +from rocprofiler_sdk.pytest_utils.otf2_reader import OTF2Reader + + +def pytest_addoption(parser): + parser.addoption( + "--json-input", + action="store", + default="memory-allocation-tracing/out_results.json", + help="Input JSON", + ) + parser.addoption( + "--pftrace-input", + action="store", + default="memory-allocation-tracing/out_results.pftrace", + help="Input Perfetto trace (pftrace)", + ) + parser.addoption( + "--otf2-input", + action="store", + default="memory-allocation-tracing/out_results.otf2", + help="Input OTF2 trace", + ) + + +@pytest.fixture +def json_data(request): + filename = request.config.getoption("--json-input") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) + + +@pytest.fixture +def pftrace_data(request): + filename = request.config.getoption("--pftrace-input") + return PerfettoReader(filename).read()[0] + + +@pytest.fixture +def otf2_data(request): + filename = request.config.getoption("--otf2-input") + if not os.path.exists(filename): + raise FileNotFoundError(f"{filename} does not exist") + return OTF2Reader(filename).read()[0] diff --git a/tests/rocprofv3/memory-allocation/pytest.ini b/tests/rocprofv3/memory-allocation/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/tests/rocprofv3/memory-allocation/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/tests/rocprofv3/memory-allocation/validate.py b/tests/rocprofv3/memory-allocation/validate.py new file mode 100755 index 0000000000..b2448106c3 --- /dev/null +++ b/tests/rocprofv3/memory-allocation/validate.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import json + +from collections import defaultdict + + +# helper function +def node_exists(name, data, min_len=1): + assert name in data + assert data[name] is not None + if 
isinstance(data[name], (list, tuple, dict, set)): + assert len(data[name]) >= min_len + + +def get_operation(record, kind_name, op_name=None): + for idx, itr in enumerate(record["strings"]["buffer_records"]): + if kind_name == itr["kind"]: + if op_name is None: + return idx, itr["operations"] + else: + for oidx, oname in enumerate(itr["operations"]): + if op_name == oname: + return oidx + return None + + +def test_memory_allocation(json_data): + data = json_data["rocprofiler-sdk-tool"] + buffer_records = data["buffer_records"] + + memory_allocation_data = buffer_records["memory_allocation"] + + _, bf_op_names = get_operation(data, "MEMORY_ALLOCATION") + + assert len(bf_op_names) == 4 + + allocation_reported_agent_ids = set() + # check buffering data + for node in memory_allocation_data: + assert "size" in node + assert "kind" in node + assert "operation" in node + assert "correlation_id" in node + assert "end_timestamp" in node + assert "start_timestamp" in node + assert "thread_id" in node + + assert "agent_id" in node + assert "starting_address" in node + assert "allocation_size" in node + + assert node.size > 0 + assert node.allocation_size > 0 + assert node.starting_address > 0 + assert node.thread_id > 0 + assert node.agent_id.handle > 0 + assert node.start_timestamp > 0 + assert node.end_timestamp > 0 + assert node.start_timestamp < node.end_timestamp + + assert data.strings.buffer_records[node.kind].kind == "MEMORY_ALLOCATION" + assert ( + data.strings.buffer_records[node.kind].operations[node.operation] + in bf_op_names + ) + + allocation_reported_agent_ids.add(node["agent_id"]["handle"]) + + assert 2**64 - 1 not in allocation_reported_agent_ids + + +def test_perfetto_data(pftrace_data, json_data): + import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 + + rocprofv3.test_perfetto_data(pftrace_data, json_data, ("memory_allocation",)) + + +def test_otf2_data(otf2_data, json_data): + import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 + + 
rocprofv3.test_otf2_data(otf2_data, json_data, ("memory_allocation",)) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/tests/rocprofv3/summary/validate.py b/tests/rocprofv3/summary/validate.py index 1c51598bc0..2adc1666c0 100644 --- a/tests/rocprofv3/summary/validate.py +++ b/tests/rocprofv3/summary/validate.py @@ -188,6 +188,8 @@ def test_summary_data(json_data): assert itr.stats.count >= 2130 and itr.stats.count <= 2150 elif itr.domain == "MEMORY_COPY": assert itr.stats.count == 12 + elif itr.domain == "MEMORY_ALLOCATION": + assert itr.stats.count >= 10 and itr.stats.count <= 30 elif itr.domain == "MARKER_API": assert itr.stats.count == 1106 expected = dict( @@ -231,6 +233,7 @@ def test_summary_display_data(json_data, summary_data): marker = get_df("MARKER_API") dispatch = get_df("KERNEL_DISPATCH") memcpy = get_df("MEMORY_COPY") + memalloc = get_df("MEMORY_ALLOCATION") dispatch_and_copy = get_df("KERNEL_DISPATCH + MEMORY_COPY") hip_and_marker = get_df("HIP_API + MARKER_API") if num_summary_grps > 1 else None total = get_df("SUMMARY") @@ -239,18 +242,21 @@ def test_summary_display_data(json_data, summary_data): assert get_dims(marker) == [7, 9], f"{marker}" assert get_dims(memcpy) == [2, 9], f"{memcpy}" + assert get_dims(memalloc) == [1, 9], f"{memalloc}" assert get_dims(dispatch) == [3, 9], f"{dispatch}" assert get_dims(dispatch_and_copy) == [5, 9], f"{dispatch_and_copy}" assert get_dims(hip) == [14, 9], f"{hip}" assert get_dims(hip_and_marker) == expected_hip_and_marker_dims, f"{hip_and_marker}" - assert get_dims(total) == [23, 9], f"{total}" + assert get_dims(total) == [24, 9], f"{total}" def test_perfetto_data(pftrace_data, json_data): import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 rocprofv3.test_perfetto_data( - pftrace_data, json_data, ("hip", "marker", "kernel", "memory_copy") + pftrace_data, + json_data, + ("hip", "marker", "kernel", "memory_copy", "memory_allocation"), ) @@ 
-258,7 +264,9 @@ def test_otf2_data(otf2_data, json_data): import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 rocprofv3.test_otf2_data( - otf2_data, json_data, ("hip", "marker", "kernel", "memory_copy") + otf2_data, + json_data, + ("hip", "marker", "kernel", "memory_copy", "memory_allocation"), ) diff --git a/tests/tools/json-tool.cpp b/tests/tools/json-tool.cpp index 597bf20844..f1b2d2dcea 100644 --- a/tests/tools/json-tool.cpp +++ b/tests/tools/json-tool.cpp @@ -391,6 +391,21 @@ struct memory_copy_callback_record_t } }; +struct memory_allocation_callback_record_t +{ + uint64_t timestamp = 0; + rocprofiler_callback_tracing_record_t record = {}; + rocprofiler_callback_tracing_memory_allocation_data_t payload = {}; + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("timestamp", timestamp)); + cereal::save(ar, record); + ar(cereal::make_nvp("payload", payload)); + } +}; + struct scratch_memory_callback_record_t { uint64_t timestamp = 0; @@ -483,6 +498,7 @@ auto hip_api_cb_records = std::deque{}; auto scratch_memory_cb_records = std::deque{}; auto kernel_dispatch_cb_records = std::deque{}; auto memory_copy_cb_records = std::deque{}; +auto memory_allocation_cb_records = std::deque{}; auto rccl_api_cb_records = std::deque{}; int @@ -696,6 +712,16 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record, auto _lk = std::unique_lock{_mutex}; memory_copy_cb_records.emplace_back(memory_copy_callback_record_t{ts, record, *data}); } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION) + { + auto* data = + static_cast(record.payload); + + static auto _mutex = std::mutex{}; + auto _lk = std::unique_lock{_mutex}; + memory_allocation_cb_records.emplace_back( + memory_allocation_callback_record_t{ts, record, *data}); + } else if(record.kind == ROCPROFILER_CALLBACK_TRACING_RCCL_API) { auto* data = static_cast(record.payload); @@ -720,8 +746,10 @@ auto marker_api_bf_records = std::deque{}; auto kernel_dispatch_bf_records = 
std::deque{}; auto memory_copy_bf_records = std::deque{}; -auto scratch_memory_records = std::deque{}; -auto page_migration_records = std::deque{}; +auto memory_allocation_bf_records = + std::deque{}; +auto scratch_memory_records = std::deque{}; +auto page_migration_records = std::deque{}; auto corr_id_retire_records = std::deque{}; auto rccl_api_bf_records = std::deque{}; @@ -800,6 +828,13 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, memory_copy_bf_records.emplace_back(*record); } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) + { + auto* record = static_cast( + header->payload); + + memory_allocation_bf_records.emplace_back(*record); + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) { auto* record = static_cast( @@ -904,29 +939,32 @@ void pop_external_correlation(); // contexts -rocprofiler_context_id_t hsa_api_callback_ctx = {0}; -rocprofiler_context_id_t hip_api_callback_ctx = {0}; -rocprofiler_context_id_t marker_api_callback_ctx = {0}; -rocprofiler_context_id_t code_object_ctx = {0}; -rocprofiler_context_id_t rccl_api_callback_ctx = {0}; -rocprofiler_context_id_t hsa_api_buffered_ctx = {0}; -rocprofiler_context_id_t hip_api_buffered_ctx = {0}; -rocprofiler_context_id_t marker_api_buffered_ctx = {0}; -rocprofiler_context_id_t memory_copy_callback_ctx = {0}; -rocprofiler_context_id_t memory_copy_buffered_ctx = {0}; -rocprofiler_context_id_t rccl_api_buffered_ctx = {0}; -rocprofiler_context_id_t counter_collection_ctx = {0}; -rocprofiler_context_id_t scratch_memory_ctx = {0}; -rocprofiler_context_id_t corr_id_retire_ctx = {0}; -rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0}; -rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0}; -rocprofiler_context_id_t page_migration_ctx = {0}; +rocprofiler_context_id_t hsa_api_callback_ctx = {0}; +rocprofiler_context_id_t hip_api_callback_ctx = {0}; +rocprofiler_context_id_t marker_api_callback_ctx = {0}; +rocprofiler_context_id_t 
code_object_ctx = {0}; +rocprofiler_context_id_t rccl_api_callback_ctx = {0}; +rocprofiler_context_id_t hsa_api_buffered_ctx = {0}; +rocprofiler_context_id_t hip_api_buffered_ctx = {0}; +rocprofiler_context_id_t marker_api_buffered_ctx = {0}; +rocprofiler_context_id_t memory_copy_callback_ctx = {0}; +rocprofiler_context_id_t memory_copy_buffered_ctx = {0}; +rocprofiler_context_id_t memory_allocation_callback_ctx = {0}; +rocprofiler_context_id_t memory_allocation_buffered_ctx = {0}; +rocprofiler_context_id_t rccl_api_buffered_ctx = {0}; +rocprofiler_context_id_t counter_collection_ctx = {0}; +rocprofiler_context_id_t scratch_memory_ctx = {0}; +rocprofiler_context_id_t corr_id_retire_ctx = {0}; +rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0}; +rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0}; +rocprofiler_context_id_t page_migration_ctx = {0}; // buffers rocprofiler_buffer_id_t hsa_api_buffered_buffer = {}; rocprofiler_buffer_id_t hip_api_buffered_buffer = {}; rocprofiler_buffer_id_t marker_api_buffered_buffer = {}; rocprofiler_buffer_id_t kernel_dispatch_buffer = {}; rocprofiler_buffer_id_t memory_copy_buffer = {}; +rocprofiler_buffer_id_t memory_allocation_buffer = {}; rocprofiler_buffer_id_t page_migration_buffer = {}; rocprofiler_buffer_id_t counter_collection_buffer = {}; rocprofiler_buffer_id_t scratch_memory_buffer = {}; @@ -940,12 +978,14 @@ auto contexts = std::unordered_map{ {"CODE_OBJECT", &code_object_ctx}, {"KERNEL_DISPATCH_CALLBACK", &kernel_dispatch_callback_ctx}, {"MEMORY_COPY_CALLBACK", &memory_copy_callback_ctx}, + {"MEMORY_ALLOCATION_CALLBACK", &memory_allocation_callback_ctx}, {"RCCL_API_CALLBACK", &rccl_api_callback_ctx}, {"HSA_API_BUFFERED", &hsa_api_buffered_ctx}, {"HIP_API_BUFFERED", &hip_api_buffered_ctx}, {"MARKER_API_BUFFERED", &marker_api_buffered_ctx}, {"KERNEL_DISPATCH_BUFFERED", &kernel_dispatch_buffered_ctx}, {"MEMORY_COPY_BUFFERED", &memory_copy_buffered_ctx}, + {"MEMORY_ALLOCATION_BUFFERED", 
&memory_allocation_buffered_ctx}, {"PAGE_MIGRATION", &page_migration_ctx}, {"COUNTER_COLLECTION", &counter_collection_ctx}, {"SCRATCH_MEMORY", &scratch_memory_ctx}, @@ -953,11 +993,12 @@ auto contexts = std::unordered_map{ {"RCCL_API_BUFFERED", &rccl_api_buffered_ctx}, }; -auto buffers = std::array{&hsa_api_buffered_buffer, +auto buffers = std::array{&hsa_api_buffered_buffer, &hip_api_buffered_buffer, &marker_api_buffered_buffer, &kernel_dispatch_buffer, &memory_copy_buffer, + &memory_allocation_buffer, &scratch_memory_buffer, &page_migration_buffer, &counter_collection_buffer, @@ -1092,6 +1133,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "memory copy callback tracing service configure"); + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + memory_allocation_callback_ctx, + ROCPROFILER_CALLBACK_TRACING_MEMORY_ALLOCATION, + nullptr, + 0, + tool_tracing_callback, + nullptr), + "memory allocation callback tracing service configure"); + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service(scratch_memory_ctx, ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, @@ -1158,6 +1208,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &memory_copy_buffer), "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(memory_allocation_buffered_ctx, + buffer_size, + watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_buffered, + tool_data, + &memory_allocation_buffer), + "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(scratch_memory_ctx, buffer_size, watermark, @@ -1261,6 +1320,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) memory_copy_buffer), "buffer tracing service for memory copy configure"); + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service(memory_allocation_buffered_ctx, + ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION, + nullptr, + 0, + memory_allocation_buffer), + "buffer tracing service for memory allocation 
configure"); + ROCPROFILER_CALL( rocprofiler_configure_buffer_tracing_service(scratch_memory_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, @@ -1448,9 +1515,11 @@ tool_fini(void* tool_data) << ", scratch_memory_callback_records=" << scratch_memory_cb_records.size() << ", kernel_dispatch_callback_records=" << kernel_dispatch_cb_records.size() << ", memory_copy_callback_records=" << memory_copy_cb_records.size() + << ", memory_allocation_callback_records=" << memory_allocation_cb_records.size() << ", rccl_api_callback_records=" << rccl_api_cb_records.size() << ", kernel_dispatch_bf_records=" << kernel_dispatch_bf_records.size() << ", memory_copy_bf_records=" << memory_copy_bf_records.size() + << ", memory_allocation_bf_records=" << memory_allocation_bf_records.size() << ", scratch_memory_records=" << scratch_memory_records.size() << ", page_migration=" << page_migration_records.size() << ", hsa_api_bf_records=" << hsa_api_bf_records.size() @@ -1551,6 +1620,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records)); json_ar(cereal::make_nvp("kernel_dispatch", kernel_dispatch_cb_records)); json_ar(cereal::make_nvp("memory_copies", memory_copy_cb_records)); + json_ar(cereal::make_nvp("memory_allocations", memory_allocation_cb_records)); } catch(std::exception& e) { std::cerr << "[" << getpid() << "][" << __FUNCTION__ @@ -1566,6 +1636,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("names", buffer_names)); json_ar(cereal::make_nvp("kernel_dispatch", kernel_dispatch_bf_records)); json_ar(cereal::make_nvp("memory_copies", memory_copy_bf_records)); + json_ar(cereal::make_nvp("memory_allocations", memory_allocation_bf_records)); json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records)); json_ar(cereal::make_nvp("page_migration", page_migration_records)); json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records)); @@ -1650,6 +1721,12 @@ write_perfetto() 
agent_ids.emplace(itr.src_agent_id.handle); } + for(const auto& itr : memory_allocation_bf_records) + { + tids.emplace(itr.thread_id); + agent_ids.emplace(itr.agent_id.handle); + } + for(auto itr : kernel_dispatch_bf_records) { tids.emplace(itr.thread_id); @@ -1888,6 +1965,35 @@ write_perfetto() itr.end_timestamp); } + for(const auto& itr : memory_allocation_bf_records) + { + auto name = buffer_names.at(itr.kind, itr.operation); + auto& track = agent_tracks.at(itr.agent_id.handle); + + TRACE_EVENT_BEGIN(sdk::perfetto_category::name, + ::perfetto::StaticString(name.data()), + track, + itr.start_timestamp, + ::perfetto::Flow::ProcessScoped(itr.correlation_id.internal), + "begin_ns", + itr.start_timestamp, + "kind", + itr.kind, + "operation", + itr.operation, + "agent", + agents_map.at(itr.agent_id).logical_node_id, + "Allocation_size", + itr.allocation_size, + "Starting_address", + itr.starting_address); + TRACE_EVENT_END(sdk::perfetto_category::name, + track, + itr.end_timestamp, + "end_ns", + itr.end_timestamp); + } + auto demangled = std::unordered_map{}; for(auto itr : kernel_dispatch_bf_records) {