Add support for scratch reporting (#523)
* Add ToolsApiTable Add ToolsApiTable wrapping for scratch memory tracking * Add initial support for scratch memory tracking Buffering is implemented * cmake formatting (cmake-format) (#525) Co-authored-by: MythreyaK <MythreyaK@users.noreply.github.com> * source formatting (clang-format v11) (#524) Co-authored-by: MythreyaK <MythreyaK@users.noreply.github.com> * Add callback tracing for scratch Fixed the error where scratch tracking init was called irrespective of whether any client requested for it * Apply suggestions from code review Co-authored-by: Jonathan R. Madsen <jrmadsen@users.noreply.github.com> * Fix tools api copy/update Table were saved/updated incorrectly in previous commit. Also adds passing user data through the callback * Fix OpKind sequence for scratch tracking Previously scratch was using OpKind from rocprofiler-sdk, but templates were instantiated using API ID. These differ by 1 * Integration tests for scratch reporting Added buffer and callback integration tests for scratch reporting * source formatting (clang-format v11) (#550) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * cmake formatting (cmake-format) (#551) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * python formatting (black) (#549) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * CI fixes * source formatting (clang-format v11) (#554) Co-authored-by: MythreyaK <26112391+MythreyaK@users.noreply.github.com> * Update api Rebase on main and updates based on PR feedback * Update scratch reporting and address PR comments - Added agent id to buffer records - Updated `test_internal_correlation_ids` - Is almost identical to one in async-copy - Updated scratch test to check for agent id - Updated queue id serialization in callback records (prints handle as nested key) - Remove `marker_api_traces` from scratch `test_internal_correlation_ids` validation test - Rename `amd_tools_api` to `scratch_memory` - Added doxygen comments - Remove scratch callback from `tool.cpp` - Replace assert with `LOF_IF` in `scratch_memory.cpp` * Update tools table Changed to match up with changes to hsa tables in main branch * Rework scratch memory structure * Update tests - Added suggestions from PR review, and updated tests accordingly * Misc cleanup * Update scratch test As of Apr 4th, `hsa_amd_agent_set_async_scratch_limit` is disabled. Note, > This API: `hsa_amd_agent_set_async_scratch_limit` is currently > disabled. We need some changes in CP firmware to be able to do this > and these changes are not ready yet. > With the current code, you will also not get notifications for > alternate-scratch allocations because this feature has been disabled > while CP firmware is making additional changes > We are hoping to have that feature enabled by ROCm-6.3 * Minor update to lib/rocprofiler-sdk/internal_threading.* - delay destruction of shared_ptrs of the tasks to prevent rare (but possible) data race on the destruction of the shared_ptr --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: MythreyaK <MythreyaK@users.noreply.github.com> Co-authored-by: Jonathan R. Madsen <jrmadsen@users.noreply.github.com> Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Этот коммит содержится в:
@@ -34,6 +34,7 @@ usage() {
|
||||
echo -e "${GREEN}--marker-trace ${RESET} For Collecting Marker (ROCTx) Traces"
|
||||
echo -e "${GREEN}--kernel-trace ${RESET} For Collecting Kernel Dispatch Traces"
|
||||
echo -e "${GREEN}--memory-copy-trace ${RESET} For Collecting Memory Copy Traces"
|
||||
echo -e "${GREEN}--scratch-memory-trace ${RESET} For Collecting Scratch Memory operations Traces"
|
||||
echo -e ""
|
||||
echo -e "${GREEN}--hsa-trace ${RESET} For Collecting HSA API Traces (core + amd + image + finalizer)"
|
||||
echo -e "${GREEN}--hsa-core-trace ${RESET} For Collecting HSA API Traces (core API)"
|
||||
@@ -150,6 +151,9 @@ while true; do
|
||||
elif [ "$1" == "--memory-copy-trace" ]; then
|
||||
export ROCPROF_MEMORY_COPY_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--scratch-memory-trace" ]; then
|
||||
export ROCPROF_SCRATCH_MEMORY_TRACE=1
|
||||
shift
|
||||
elif [ "$1" == "--marker-trace" ]; then
|
||||
export ROCPROF_MARKER_API_TRACE=1
|
||||
shift
|
||||
|
||||
@@ -152,16 +152,21 @@ typedef struct
|
||||
} rocprofiler_buffer_tracing_page_migration_record_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Buffer Scratch Memory Tracer Record. Not implemented.
|
||||
* @brief ROCProfiler Buffer Scratch Memory Tracer Record
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_buffer_tracing_kind_t kind; ///< ::ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY
|
||||
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
// Not Sure What is the info needed here?
|
||||
rocprofiler_scratch_memory_operation_t
|
||||
operation; ///< @see rocprofiler_scratch_memory_operation_t
|
||||
rocprofiler_agent_id_t agent_id; ///< agent kernel was dispatched on
|
||||
rocprofiler_queue_id_t queue_id; ///< queue kernel was dispatched on
|
||||
rocprofiler_thread_id_t thread_id; ///< id for thread generating this record
|
||||
rocprofiler_timestamp_t start_timestamp; ///< start time in nanoseconds
|
||||
rocprofiler_timestamp_t end_timestamp; ///< end time in nanoseconds
|
||||
rocprofiler_correlation_id_t correlation_id; ///< correlation ids for record
|
||||
rocprofiler_scratch_alloc_flag_t flags;
|
||||
} rocprofiler_buffer_tracing_scratch_memory_record_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -28,6 +28,9 @@
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
#include <rocprofiler-sdk/marker.h>
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_amd_tool.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include <stdint.h>
|
||||
@@ -153,6 +156,19 @@ typedef struct
|
||||
|
||||
} rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
|
||||
/**
|
||||
* @brief ROCProfiler Scratch Memory Callback Data.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint64_t size; ///< size of this struct
|
||||
rocprofiler_agent_id_t agent_id;
|
||||
rocprofiler_queue_id_t queue_id;
|
||||
rocprofiler_scratch_alloc_flag_t flags;
|
||||
hsa_amd_tool_event_kind_t args_kind;
|
||||
rocprofiler_scratch_memory_args_t args;
|
||||
} rocprofiler_callback_tracing_scratch_memory_data_t;
|
||||
|
||||
/**
|
||||
* @brief API Tracing callback function. This function is invoked twice per API function: once
|
||||
* before the function is invoked and once after the function is invoked. The external correlation
|
||||
|
||||
@@ -24,6 +24,8 @@
|
||||
|
||||
#include <rocprofiler-sdk/defines.h>
|
||||
|
||||
#include <hsa/hsa_amd_tool.h>
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -146,7 +148,8 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, ///< @see
|
||||
///< ::rocprofiler_marker_control_api_id_t
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API, ///< @see ::rocprofiler_marker_name_api_id_t
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ///< @see ::rocprofiler_code_object_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, ///< @see ::rocprofiler_scratch_memory_operation_t
|
||||
ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ///< Callbacks for kernel dispatches
|
||||
ROCPROFILER_CALLBACK_TRACING_LAST,
|
||||
} rocprofiler_callback_tracing_kind_t;
|
||||
@@ -233,6 +236,31 @@ typedef enum // NOLINT(performance-enum-size)
|
||||
ROCPROFILER_BUFFER_POLICY_LAST,
|
||||
} rocprofiler_buffer_policy_t;
|
||||
|
||||
/**
|
||||
* @brief Scratch event kind
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
ROCPROFILER_SCRATCH_MEMORY_NONE = 0, ///< Unknown scratch operation
|
||||
ROCPROFILER_SCRATCH_MEMORY_ALLOC, ///< Scratch memory allocation event
|
||||
ROCPROFILER_SCRATCH_MEMORY_FREE, ///< Scratch memory free event
|
||||
ROCPROFILER_SCRATCH_MEMORY_ASYNC_RECLAIM, ///< Scratch memory asynchronously reclaimed
|
||||
ROCPROFILER_SCRATCH_MEMORY_LAST,
|
||||
} rocprofiler_scratch_memory_operation_t;
|
||||
|
||||
/**
|
||||
* @brief Allocation flags for @see rocprofiler_buffer_tracing_scratch_memory_record_t
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
ROCPROFILER_SCRATCH_ALLOC_FLAG_NONE = 0,
|
||||
ROCPROFILER_SCRATCH_ALLOC_FLAG_USE_ONCE =
|
||||
HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_USE_ONCE, ///< This scratch allocation is only valid for 1
|
||||
///< dispatch.
|
||||
ROCPROFILER_SCRATCH_ALLOC_FLAG_ALT =
|
||||
HSA_AMD_EVENT_SCRATCH_ALLOC_FLAG_ALT, ///< Used alternate scratch instead of main scratch
|
||||
} rocprofiler_scratch_alloc_flag_t;
|
||||
|
||||
/**
|
||||
* @brief Enumeration for specifying runtime libraries supported by rocprofiler. This enumeration is
|
||||
* used for thread creation callbacks. @see INTERNAL_THREADING.
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
#include <rocprofiler-sdk/hsa/api_args.h>
|
||||
#include <rocprofiler-sdk/hsa/api_id.h>
|
||||
#include <rocprofiler-sdk/hsa/scratch_memory_args.h>
|
||||
#include <rocprofiler-sdk/hsa/table_id.h>
|
||||
|
||||
#if defined(ROCPROFILER_DEFINED_AMD_INTERNAL_BUILD) && ROCPROFILER_DEFINED_AMD_INTERNAL_BUILD > 0
|
||||
|
||||
@@ -4,8 +4,15 @@
|
||||
#
|
||||
#
|
||||
set(ROCPROFILER_HSA_HEADER_FILES
|
||||
amd_ext_api_id.h api_args.h api_id.h api_trace_version.h core_api_id.h
|
||||
finalize_ext_api_id.h image_ext_api_id.h table_id.h)
|
||||
amd_ext_api_id.h
|
||||
api_args.h
|
||||
api_id.h
|
||||
api_trace_version.h
|
||||
core_api_id.h
|
||||
finalize_ext_api_id.h
|
||||
image_ext_api_id.h
|
||||
scratch_memory_args.h
|
||||
table_id.h)
|
||||
|
||||
install(
|
||||
FILES ${ROCPROFILER_HSA_HEADER_FILES}
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <rocprofiler-sdk/defines.h>
|
||||
#include <rocprofiler-sdk/hsa/api_trace_version.h>
|
||||
#include <rocprofiler-sdk/version.h>
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_amd_tool.h>
|
||||
|
||||
ROCPROFILER_EXTERN_C_INIT
|
||||
|
||||
// Empty struct has a size of 0 in C but size of 1 in C++.
|
||||
// This struct is added to the union members which represent
|
||||
// functions with no arguments to ensure ABI compatibility
|
||||
typedef struct rocprofiler_scratch_memory_no_args
|
||||
{
|
||||
char empty;
|
||||
} rocprofiler_scratch_memory_no_args;
|
||||
|
||||
typedef union rocprofiler_scratch_memory_args_t
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint64_t dispatch_id;
|
||||
} alloc_start;
|
||||
struct
|
||||
{
|
||||
uint64_t dispatch_id;
|
||||
size_t size;
|
||||
size_t num_slots;
|
||||
} alloc_end;
|
||||
struct
|
||||
{
|
||||
rocprofiler_scratch_memory_no_args no_args;
|
||||
} free_start;
|
||||
struct
|
||||
{
|
||||
rocprofiler_scratch_memory_no_args no_args;
|
||||
} free_end;
|
||||
struct
|
||||
{
|
||||
rocprofiler_scratch_memory_no_args no_args;
|
||||
} async_reclaim_start;
|
||||
struct
|
||||
{
|
||||
rocprofiler_scratch_memory_no_args no_args;
|
||||
} async_reclaim_end;
|
||||
} rocprofiler_scratch_memory_args_t;
|
||||
|
||||
ROCPROFILER_EXTERN_C_FINI
|
||||
@@ -30,5 +30,6 @@ typedef enum
|
||||
ROCPROFILER_HSA_TABLE_ID_AmdExt,
|
||||
ROCPROFILER_HSA_TABLE_ID_ImageExt,
|
||||
ROCPROFILER_HSA_TABLE_ID_FinalizeExt,
|
||||
ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
ROCPROFILER_HSA_TABLE_ID_LAST,
|
||||
} rocprofiler_hsa_table_id_t;
|
||||
|
||||
@@ -66,6 +66,7 @@ struct config
|
||||
bool hsa_finalizer_ext_api_trace = get_env("ROCPROF_HSA_FINALIZER_EXT_API_TRACE", false);
|
||||
bool marker_api_trace = get_env("ROCPROF_MARKER_API_TRACE", false);
|
||||
bool memory_copy_trace = get_env("ROCPROF_MEMORY_COPY_TRACE", false);
|
||||
bool scratch_memory = get_env("ROCPROF_SCRATCH_MEMORY_TRACE", false);
|
||||
bool counter_collection = get_env("ROCPROF_COUNTER_COLLECTION", false);
|
||||
bool hip_runtime_api_trace = get_env("ROCPROF_HIP_RUNTIME_API_TRACE", false);
|
||||
bool hip_compiler_api_trace = get_env("ROCPROF_HIP_COMPILER_API_TRACE", false);
|
||||
|
||||
@@ -81,6 +81,7 @@ using memory_copy_csv_encoder = csv_encoder<7>;
|
||||
using marker_csv_encoder = csv_encoder<7>;
|
||||
using list_basic_metrics_csv_encoder = csv_encoder<5>;
|
||||
using list_derived_metrics_csv_encoder = csv_encoder<5>;
|
||||
using scratch_memory_encoder = csv_encoder<8>;
|
||||
} // namespace csv
|
||||
} // namespace tool
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -48,6 +48,7 @@ get_buffer_id_names()
|
||||
ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API,
|
||||
ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
};
|
||||
|
||||
auto cb_name_info = rocprofiler_tool_buffer_name_info_t{};
|
||||
|
||||
@@ -208,6 +208,25 @@ get_memory_copy_trace_file()
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto&
|
||||
get_scratch_memory_trace_file()
|
||||
{
|
||||
static auto* _v = new tool::output_file{"scratch_memory_trace",
|
||||
tool::csv::scratch_memory_encoder{},
|
||||
{
|
||||
"Kind",
|
||||
"Operation",
|
||||
"Agent_Id",
|
||||
"Queue_Id",
|
||||
"Thread_Id",
|
||||
"Alloc_flags",
|
||||
"Start_Timestamp",
|
||||
"End_Timestamp",
|
||||
}};
|
||||
ADD_DESTRUCTOR(_v);
|
||||
return _v;
|
||||
}
|
||||
|
||||
tool::output_file*&
|
||||
get_marker_api_file()
|
||||
{
|
||||
@@ -264,11 +283,16 @@ struct buffer_ids
|
||||
rocprofiler_buffer_id_t kernel_trace = {};
|
||||
rocprofiler_buffer_id_t memory_copy_trace = {};
|
||||
rocprofiler_buffer_id_t counter_collection = {};
|
||||
rocprofiler_buffer_id_t scratch_memory = {};
|
||||
|
||||
auto as_array() const
|
||||
{
|
||||
return std::array<rocprofiler_buffer_id_t, 5>{
|
||||
hsa_api_trace, hip_api_trace, kernel_trace, memory_copy_trace, counter_collection};
|
||||
return std::array<rocprofiler_buffer_id_t, 6>{hsa_api_trace,
|
||||
hip_api_trace,
|
||||
kernel_trace,
|
||||
memory_copy_trace,
|
||||
counter_collection,
|
||||
scratch_memory};
|
||||
}
|
||||
};
|
||||
|
||||
@@ -728,6 +752,27 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
|
||||
|
||||
get_dereference(get_memory_copy_trace_file()) << memory_copy_trace_ss.str();
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* record = static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
|
||||
header->payload);
|
||||
|
||||
auto scratch_memory_trace = std::stringstream{};
|
||||
tool::csv::scratch_memory_encoder::write_row(
|
||||
scratch_memory_trace,
|
||||
CHECK_NOTNULL(buffered_name_info)->kind_names.at(record->kind),
|
||||
CHECK_NOTNULL(buffered_name_info)
|
||||
->operation_names.at(record->kind)
|
||||
.at(record->operation),
|
||||
agent_info->at(record->agent_id)->node_id,
|
||||
record->queue_id.handle,
|
||||
record->thread_id,
|
||||
record->flags,
|
||||
record->start_timestamp,
|
||||
record->end_timestamp);
|
||||
|
||||
get_dereference(get_scratch_memory_trace_file()) << scratch_memory_trace.str();
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API ||
|
||||
header->kind == ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API)
|
||||
{
|
||||
@@ -1162,6 +1207,26 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
"buffer tracing service for memory copy configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().scratch_memory)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
buffer_size,
|
||||
buffer_watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
buffered_tracing_callback,
|
||||
tool_data,
|
||||
&get_buffers().scratch_memory),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(get_client_ctx(),
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
nullptr,
|
||||
0,
|
||||
get_buffers().scratch_memory),
|
||||
"buffer tracing service for scratch memory configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().hsa_core_api_trace || tool::get_config().hsa_amd_ext_api_trace ||
|
||||
tool::get_config().hsa_image_ext_api_trace || tool::get_config().hsa_finalizer_ext_api_trace)
|
||||
{
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "lib/rocprofiler-sdk/hip/hip.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/async_copy.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/rocprofiler-sdk/marker/marker.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
|
||||
@@ -104,7 +105,7 @@ rocprofiler_configure_buffer_tracing_service(rocprofiler_context_id_t c
|
||||
return ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED;
|
||||
|
||||
static auto unsupported = std::unordered_set<rocprofiler_buffer_tracing_kind_t>{
|
||||
ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY};
|
||||
ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION};
|
||||
if(unsupported.count(kind) > 0) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id);
|
||||
@@ -194,6 +195,11 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_
|
||||
val = rocprofiler::hsa::async_copy::name_by_id(operation);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY:
|
||||
{
|
||||
val = rocprofiler::hsa::scratch_memory::name_by_id(operation);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API:
|
||||
{
|
||||
val = rocprofiler::marker::name_by_id<ROCPROFILER_MARKER_TABLE_ID_RoctxCore>(operation);
|
||||
@@ -222,7 +228,6 @@ rocprofiler_query_buffer_tracing_kind_operation_name(rocprofiler_buffer_tracing_
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH:
|
||||
case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION:
|
||||
case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY:
|
||||
case ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT:
|
||||
{
|
||||
return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
@@ -294,6 +299,11 @@ rocprofiler_iterate_buffer_tracing_kind_operations(
|
||||
ops = rocprofiler::hsa::async_copy::get_ids();
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY:
|
||||
{
|
||||
ops = rocprofiler::hsa::scratch_memory::get_ids();
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_MARKER_CORE_API:
|
||||
{
|
||||
ops = rocprofiler::marker::get_ids<ROCPROFILER_MARKER_TABLE_ID_RoctxCore>();
|
||||
@@ -321,7 +331,6 @@ rocprofiler_iterate_buffer_tracing_kind_operations(
|
||||
}
|
||||
case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH:
|
||||
case ROCPROFILER_BUFFER_TRACING_PAGE_MIGRATION:
|
||||
case ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY:
|
||||
case ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT:
|
||||
{
|
||||
return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "lib/rocprofiler-sdk/hip/hip.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/code_object.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/rocprofiler-sdk/marker/marker.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
|
||||
@@ -74,6 +75,7 @@ ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_CORE_API)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_CONTROL_API)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(MARKER_NAME_API)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(CODE_OBJECT)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(SCRATCH_MEMORY)
|
||||
ROCPROFILER_CALLBACK_TRACING_KIND_STRING(KERNEL_DISPATCH)
|
||||
|
||||
template <size_t Idx, size_t... Tail>
|
||||
@@ -179,6 +181,11 @@ rocprofiler_query_callback_tracing_kind_operation_name(rocprofiler_callback_trac
|
||||
val = rocprofiler::hsa::name_by_id<ROCPROFILER_HSA_TABLE_ID_FinalizeExt>(operation);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
|
||||
{
|
||||
val = rocprofiler::hsa::scratch_memory::name_by_id(operation);
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API:
|
||||
{
|
||||
val = rocprofiler::marker::name_by_id<ROCPROFILER_MARKER_TABLE_ID_RoctxCore>(operation);
|
||||
@@ -278,6 +285,11 @@ rocprofiler_iterate_callback_tracing_kind_operations(
|
||||
ops = rocprofiler::hsa::get_ids<ROCPROFILER_HSA_TABLE_ID_FinalizeExt>();
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
|
||||
{
|
||||
ops = rocprofiler::hsa::scratch_memory::get_ids();
|
||||
break;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API:
|
||||
{
|
||||
ops = rocprofiler::marker::get_ids<ROCPROFILER_MARKER_TABLE_ID_RoctxCore>();
|
||||
@@ -437,6 +449,7 @@ rocprofiler_iterate_callback_tracing_kind_operation_args(
|
||||
user_data);
|
||||
return ROCPROFILER_STATUS_SUCCESS;
|
||||
}
|
||||
case ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY:
|
||||
case ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT:
|
||||
case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH:
|
||||
{
|
||||
|
||||
@@ -8,7 +8,8 @@ set(ROCPROFILER_LIB_HSA_SOURCES
|
||||
queue_controller.cpp
|
||||
queue.cpp
|
||||
hsa_barrier.cpp
|
||||
profile_serializer.cpp)
|
||||
profile_serializer.cpp
|
||||
scratch_memory.cpp)
|
||||
|
||||
set(ROCPROFILER_LIB_HSA_HEADERS
|
||||
agent_cache.hpp
|
||||
@@ -21,6 +22,7 @@ set(ROCPROFILER_LIB_HSA_HEADERS
|
||||
profile_serializer.hpp
|
||||
queue_controller.hpp
|
||||
queue.hpp
|
||||
scratch_memory.hpp
|
||||
types.hpp
|
||||
utils.hpp)
|
||||
|
||||
|
||||
@@ -62,6 +62,51 @@
|
||||
} \
|
||||
}
|
||||
|
||||
// meta definition for non-public APIs (i.e. only in table)
|
||||
#define HSA_API_META_DEFINITION_NP(HSA_TABLE, HSA_API_ID, HSA_FUNC, HSA_FUNC_PTR) \
|
||||
namespace rocprofiler \
|
||||
{ \
|
||||
namespace hsa \
|
||||
{ \
|
||||
template <> \
|
||||
struct hsa_api_meta<HSA_TABLE, HSA_API_ID> \
|
||||
{ \
|
||||
static constexpr auto table_idx = HSA_TABLE; \
|
||||
static constexpr auto operation_idx = HSA_API_ID; \
|
||||
static constexpr auto name = #HSA_FUNC; \
|
||||
\
|
||||
using this_type = hsa_api_meta<table_idx, operation_idx>; \
|
||||
using function_type = hsa_api_func<decltype( \
|
||||
std::declval<hsa_table_lookup<table_idx>::type>().HSA_FUNC_PTR)>::function_type; \
|
||||
\
|
||||
static constexpr auto offset() \
|
||||
{ \
|
||||
return offsetof(hsa_table_lookup<table_idx>::type, HSA_FUNC_PTR); \
|
||||
} \
|
||||
\
|
||||
template <typename TableT> \
|
||||
static auto& get_table(TableT& _v) \
|
||||
{ \
|
||||
return hsa_table_lookup<table_idx>{}(_v); \
|
||||
} \
|
||||
\
|
||||
template <typename TableT> \
|
||||
static auto& get_table_func(TableT& _table) \
|
||||
{ \
|
||||
if constexpr(std::is_pointer<TableT>::value) \
|
||||
{ \
|
||||
assert(_table != nullptr && "nullptr to HSA table for " #HSA_FUNC " function"); \
|
||||
return _table->HSA_FUNC_PTR; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
return _table.HSA_FUNC_PTR; \
|
||||
} \
|
||||
} \
|
||||
}; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define HSA_API_INFO_DEFINITION_0(HSA_TABLE, HSA_API_ID, HSA_FUNC, HSA_FUNC_PTR) \
|
||||
namespace rocprofiler \
|
||||
{ \
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/details/ostream.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/types.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/utils.hpp"
|
||||
#include "lib/rocprofiler-sdk/registration.hpp"
|
||||
@@ -104,6 +105,7 @@ DEFINE_TABLE_VERSION(core, CORE_API)
|
||||
DEFINE_TABLE_VERSION(amd_ext, AMD_EXT_API)
|
||||
DEFINE_TABLE_VERSION(fini_ext, FINALIZER_API)
|
||||
DEFINE_TABLE_VERSION(img_ext, IMAGE_API)
|
||||
DEFINE_TABLE_VERSION(amd_tool, TOOLS_API)
|
||||
|
||||
#undef DEFINE_TABLE_VERSION
|
||||
#undef DEFINE_TABLE_VERSION_IMPL
|
||||
@@ -153,6 +155,13 @@ get_tracing_img_ext_table()
|
||||
return val;
|
||||
}
|
||||
|
||||
hsa_amd_tool_table_t*
|
||||
get_tracing_amd_tool_table() // table is never traced
|
||||
{
|
||||
static auto*& val = GET_TABLE_IMPL(amd_tool, tracing_table);
|
||||
return val;
|
||||
}
|
||||
|
||||
hsa_table_version_t
|
||||
get_table_version()
|
||||
{
|
||||
@@ -187,6 +196,13 @@ get_img_ext_table()
|
||||
return val;
|
||||
}
|
||||
|
||||
hsa_amd_tool_table_t*
|
||||
get_amd_tool_table()
|
||||
{
|
||||
static auto*& val = GET_TABLE_IMPL(amd_tool, internal_table);
|
||||
return val;
|
||||
}
|
||||
|
||||
#undef GET_TABLE_IMPL
|
||||
|
||||
hsa_api_table_t&
|
||||
@@ -196,7 +212,8 @@ get_table()
|
||||
.core_ = get_core_table(),
|
||||
.amd_ext_ = get_amd_ext_table(),
|
||||
.finalizer_ext_ = get_fini_ext_table(),
|
||||
.image_ext_ = get_img_ext_table()};
|
||||
.image_ext_ = get_img_ext_table(),
|
||||
.tools_ = get_amd_tool_table()};
|
||||
return tbl;
|
||||
}
|
||||
|
||||
@@ -773,6 +790,20 @@ INSTANTIATE_HSA_TABLE_FUNC(hsa_amd_ext_table_t, ROCPROFILER_HSA_TABLE_ID_AmdExt)
|
||||
INSTANTIATE_HSA_TABLE_FUNC(hsa_img_ext_table_t, ROCPROFILER_HSA_TABLE_ID_ImageExt)
|
||||
INSTANTIATE_HSA_TABLE_FUNC(hsa_fini_ext_table_t, ROCPROFILER_HSA_TABLE_ID_FinalizeExt)
|
||||
|
||||
template <>
|
||||
void
|
||||
copy_table<hsa_amd_tool_table_t>(hsa_amd_tool_table_t* _tbl, uint64_t _instv)
|
||||
{
|
||||
scratch_memory::copy_table(_tbl, _instv);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
update_table<hsa_amd_tool_table_t>(hsa_amd_tool_table_t* _tbl, uint64_t _instv)
|
||||
{
|
||||
scratch_memory::update_table(_tbl, _instv);
|
||||
}
|
||||
|
||||
#undef INSTANTIATE_HSA_TABLE_FUNC
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -26,11 +26,15 @@
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
#include <rocprofiler-sdk/hsa/table_id.h>
|
||||
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_Core, ::CoreApiTable, core)
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdExt, ::AmdExtTable, amd_ext)
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_ImageExt, ::ImageExtTable, img_ext)
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_FinalizeExt, ::FinalizerExtTable, fini_ext)
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdTool, ::ToolsApiTable, amd_tool)
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
@@ -49,44 +53,54 @@ template <>
|
||||
struct hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_Core>
|
||||
: hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_LAST>
|
||||
{
|
||||
using enum_type = rocprofiler_hsa_core_api_id_t;
|
||||
static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_CORE_API;
|
||||
static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_CORE_API;
|
||||
static constexpr auto none = ROCPROFILER_HSA_CORE_API_ID_NONE;
|
||||
static constexpr auto last = ROCPROFILER_HSA_CORE_API_ID_LAST;
|
||||
using enum_type = rocprofiler_hsa_core_api_id_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_AmdExt>
|
||||
: hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_LAST>
|
||||
{
|
||||
using enum_type = rocprofiler_hsa_amd_ext_api_id_t;
|
||||
static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_AMD_EXT_API;
|
||||
static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_AMD_EXT_API;
|
||||
static constexpr auto none = ROCPROFILER_HSA_AMD_EXT_API_ID_NONE;
|
||||
static constexpr auto last = ROCPROFILER_HSA_AMD_EXT_API_ID_LAST;
|
||||
using enum_type = rocprofiler_hsa_amd_ext_api_id_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_ImageExt>
|
||||
: hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_LAST>
|
||||
{
|
||||
using enum_type = rocprofiler_hsa_image_ext_api_id_t;
|
||||
static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_IMAGE_EXT_API;
|
||||
static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_IMAGE_EXT_API;
|
||||
static constexpr auto none = ROCPROFILER_HSA_IMAGE_EXT_API_ID_NONE;
|
||||
static constexpr auto last = ROCPROFILER_HSA_IMAGE_EXT_API_ID_LAST;
|
||||
using enum_type = rocprofiler_hsa_image_ext_api_id_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_FinalizeExt>
|
||||
: hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_LAST>
|
||||
{
|
||||
using enum_type = rocprofiler_hsa_finalize_ext_api_id_t;
|
||||
static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_HSA_FINALIZE_EXT_API;
|
||||
static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_HSA_FINALIZE_EXT_API;
|
||||
static constexpr auto none = ROCPROFILER_HSA_FINALIZE_EXT_API_ID_NONE;
|
||||
static constexpr auto last = ROCPROFILER_HSA_FINALIZE_EXT_API_ID_LAST;
|
||||
using enum_type = rocprofiler_hsa_finalize_ext_api_id_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_AmdTool>
|
||||
: hsa_domain_info<ROCPROFILER_HSA_TABLE_ID_LAST>
|
||||
{
|
||||
static constexpr auto callback_domain_idx = ROCPROFILER_CALLBACK_TRACING_NONE;
|
||||
static constexpr auto buffered_domain_idx = ROCPROFILER_BUFFER_TRACING_NONE;
|
||||
static constexpr auto none = 0;
|
||||
static constexpr auto last = 0;
|
||||
};
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -44,6 +44,7 @@ using hsa_core_table_t = ::CoreApiTable;
|
||||
using hsa_amd_ext_table_t = ::AmdExtTable;
|
||||
using hsa_fini_ext_table_t = ::FinalizerExtTable;
|
||||
using hsa_img_ext_table_t = ::ImageExtTable;
|
||||
using hsa_amd_tool_table_t = ::ToolsApiTable;
|
||||
|
||||
hsa_api_table_t&
|
||||
get_table();
|
||||
@@ -63,6 +64,9 @@ get_fini_ext_table();
|
||||
hsa_img_ext_table_t*
|
||||
get_img_ext_table();
|
||||
|
||||
hsa_amd_tool_table_t*
|
||||
get_amd_tool_table();
|
||||
|
||||
hsa_core_table_t*
|
||||
get_tracing_core_table();
|
||||
|
||||
@@ -75,6 +79,9 @@ get_tracing_fini_ext_table();
|
||||
hsa_img_ext_table_t*
|
||||
get_tracing_img_ext_table();
|
||||
|
||||
hsa_amd_tool_table_t*
|
||||
get_tracing_amd_tool_table();
|
||||
|
||||
template <size_t Idx>
|
||||
struct hsa_table_lookup;
|
||||
|
||||
|
||||
@@ -217,7 +217,7 @@ private:
|
||||
inline rocprofiler_queue_id_t
|
||||
Queue::get_id() const
|
||||
{
|
||||
return {.handle = reinterpret_cast<uint64_t>(intercept_queue())};
|
||||
return {.handle = intercept_queue()->id};
|
||||
};
|
||||
|
||||
template <typename FuncT>
|
||||
|
||||
@@ -0,0 +1,720 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/common/defines.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
#include "lib/rocprofiler-sdk/buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/defines.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa/api_id.h>
|
||||
#include <rocprofiler-sdk/hsa/table_id.h>
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include <hsa/amd_hsa_signal.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_amd_tool.h>
|
||||
#include <hsa/hsa_api_trace.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
HSA_API_TABLE_LOOKUP_DEFINITION(ROCPROFILER_HSA_TABLE_ID_AmdTool, ::ToolsApiTable, amd_tool)
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
namespace
|
||||
{
|
||||
template <typename T>
|
||||
using remove_cvref_t = std::remove_cv_t<std::remove_reference_t<T>>;
|
||||
|
||||
enum scratch_event_kind
|
||||
{
|
||||
hsa_amd_tool_id_none = 0,
|
||||
// scratch reporting
|
||||
hsa_amd_tool_id_scratch_event_alloc_start,
|
||||
hsa_amd_tool_id_scratch_event_alloc_end,
|
||||
hsa_amd_tool_id_scratch_event_free_start,
|
||||
hsa_amd_tool_id_scratch_event_free_end,
|
||||
hsa_amd_tool_id_scratch_event_async_reclaim_start,
|
||||
hsa_amd_tool_id_scratch_event_async_reclaim_end,
|
||||
hsa_amd_tool_id_scratch_event_last,
|
||||
};
|
||||
} // namespace
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
hsa_amd_tool_id_scratch_event_alloc_start,
|
||||
hsa_amd_tool_scratch_event_alloc_start,
|
||||
hsa_amd_tool_scratch_event_alloc_start_fn);
|
||||
HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
hsa_amd_tool_id_scratch_event_alloc_end,
|
||||
hsa_amd_tool_scratch_event_alloc_end,
|
||||
hsa_amd_tool_scratch_event_alloc_end_fn);
|
||||
HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
hsa_amd_tool_id_scratch_event_free_start,
|
||||
hsa_amd_tool_scratch_event_free_start,
|
||||
hsa_amd_tool_scratch_event_free_start_fn);
|
||||
HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
hsa_amd_tool_id_scratch_event_free_end,
|
||||
hsa_amd_tool_scratch_event_free_end,
|
||||
hsa_amd_tool_scratch_event_free_end_fn);
|
||||
HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
hsa_amd_tool_id_scratch_event_async_reclaim_start,
|
||||
hsa_amd_tool_scratch_event_async_reclaim_start,
|
||||
hsa_amd_tool_scratch_event_async_reclaim_start_fn);
|
||||
HSA_API_META_DEFINITION_NP(ROCPROFILER_HSA_TABLE_ID_AmdTool,
|
||||
hsa_amd_tool_id_scratch_event_async_reclaim_end,
|
||||
hsa_amd_tool_scratch_event_async_reclaim_end,
|
||||
hsa_amd_tool_scratch_event_async_reclaim_end_fn);
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
namespace scratch_memory
|
||||
{
|
||||
using context_t = context::context;
|
||||
using context_array_t = common::container::small_vector<const context_t*>;
|
||||
using correlation_service = context::correlation_tracing_service;
|
||||
|
||||
bool
|
||||
context_filter(const context::context* ctx)
|
||||
{
|
||||
const auto need_buffering =
|
||||
(ctx->buffered_tracer &&
|
||||
(ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)));
|
||||
const auto need_callbacks =
|
||||
(ctx->callback_tracer &&
|
||||
(ctx->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY)));
|
||||
|
||||
return need_buffering || need_callbacks;
|
||||
}
|
||||
|
||||
bool
|
||||
should_wrap_functor(const context::context_array_t& _contexts, int _operation)
|
||||
{
|
||||
// we loop over all the *registered* contexts and see if any of them, at any point in time,
|
||||
// might require callback or buffered API tracing
|
||||
for(const auto& itr : _contexts)
|
||||
{
|
||||
if(!itr) continue;
|
||||
|
||||
// if there is a callback tracer enabled for the given domain and op, we need to wrap
|
||||
if(itr->callback_tracer &&
|
||||
itr->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY, _operation))
|
||||
return true;
|
||||
|
||||
// if there is a buffered tracer enabled for the given domain and op, we need to wrap
|
||||
if(itr->buffered_tracer &&
|
||||
itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, _operation))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx>
|
||||
auto&
|
||||
get_next_dispatch()
|
||||
{
|
||||
using function_t = typename hsa_api_meta<TableIdx, OpIdx>::function_type;
|
||||
static function_t _v = nullptr;
|
||||
return _v;
|
||||
}
|
||||
|
||||
template <typename FuncT, typename ArgsT, size_t... Idx>
|
||||
decltype(auto)
|
||||
invoke(FuncT&& _func, ArgsT&& _args, std::index_sequence<Idx...>)
|
||||
{
|
||||
using RetT = decltype(std::forward<FuncT>(_func)(std::get<Idx>(_args)...));
|
||||
|
||||
// Scratch function pointers that we saved could've been null
|
||||
if constexpr(std::is_same_v<RetT, hsa_status_t>)
|
||||
{
|
||||
if(_func != nullptr)
|
||||
{
|
||||
return std::forward<FuncT>(_func)(std::get<Idx>(_args)...);
|
||||
}
|
||||
else
|
||||
{
|
||||
return hsa_status_t::HSA_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(sizeof(RetT) < 0, "Unexpected types for HSA tools table invoke");
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
struct amd_tool_api_info;
|
||||
|
||||
template <size_t N>
|
||||
struct scratch_op_info;
|
||||
|
||||
template <>
|
||||
struct scratch_op_info<hsa_amd_tool_id_none>
|
||||
{
|
||||
static constexpr auto operation = ROCPROFILER_SCRATCH_MEMORY_NONE;
|
||||
static constexpr auto phase = ROCPROFILER_CALLBACK_PHASE_NONE;
|
||||
};
|
||||
|
||||
#define SPECIALIZE_AMD_TOOL(TOOL_OP, STARTPHASE, ENDPHASE) \
|
||||
template <> \
|
||||
struct scratch_op_info<hsa_amd_tool_id_##STARTPHASE> \
|
||||
{ \
|
||||
static constexpr auto operation = ROCPROFILER_##TOOL_OP; \
|
||||
static constexpr rocprofiler_callback_phase_t phase = ROCPROFILER_CALLBACK_PHASE_ENTER; \
|
||||
using function_t = \
|
||||
decltype(ToolsApiTable::IMPL_DETAIL_CONCATENATE(hsa_amd_tool_##STARTPHASE, _fn)); \
|
||||
}; \
|
||||
template <> \
|
||||
struct scratch_op_info<hsa_amd_tool_id_##ENDPHASE> \
|
||||
{ \
|
||||
static constexpr auto operation = ROCPROFILER_##TOOL_OP; \
|
||||
static constexpr rocprofiler_callback_phase_t phase = ROCPROFILER_CALLBACK_PHASE_EXIT; \
|
||||
using function_t = \
|
||||
decltype(ToolsApiTable::IMPL_DETAIL_CONCATENATE(hsa_amd_tool_##ENDPHASE, _fn)); \
|
||||
}; \
|
||||
template <> \
|
||||
struct amd_tool_api_info<ROCPROFILER_##TOOL_OP> \
|
||||
{ \
|
||||
using start_fn_t = scratch_op_info<hsa_amd_tool_id_##STARTPHASE>::function_t; \
|
||||
using end_fn_t = scratch_op_info<hsa_amd_tool_id_##ENDPHASE>::function_t; \
|
||||
static constexpr auto operation_idx = ROCPROFILER_##TOOL_OP; \
|
||||
static constexpr auto name = #TOOL_OP; \
|
||||
}
|
||||
|
||||
SPECIALIZE_AMD_TOOL(SCRATCH_MEMORY_ALLOC, scratch_event_alloc_start, scratch_event_alloc_end);
|
||||
SPECIALIZE_AMD_TOOL(SCRATCH_MEMORY_FREE, scratch_event_free_start, scratch_event_free_end);
|
||||
SPECIALIZE_AMD_TOOL(SCRATCH_MEMORY_ASYNC_RECLAIM,
|
||||
scratch_event_async_reclaim_start,
|
||||
scratch_event_async_reclaim_end);
|
||||
|
||||
template <>
|
||||
struct amd_tool_api_info<ROCPROFILER_SCRATCH_MEMORY_NONE>
|
||||
{
|
||||
using start_fn_t = std::nullptr_t;
|
||||
using end_fn_t = std::nullptr_t;
|
||||
static constexpr auto operation_idx = ROCPROFILER_SCRATCH_MEMORY_NONE;
|
||||
static constexpr auto name = "SCRATCH_MEMORY_NONE";
|
||||
static constexpr auto start_phase = nullptr;
|
||||
static constexpr auto end_phase = nullptr;
|
||||
};
|
||||
|
||||
#undef SPECIALIZE_AMD_TOOL
|
||||
template <size_t S>
|
||||
struct event_info_t;
|
||||
|
||||
template <size_t OpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
impl(Args... args);
|
||||
|
||||
namespace
|
||||
{
|
||||
template <size_t Idx, size_t... IdxTail>
|
||||
const char*
|
||||
name_by_id(const uint32_t id, std::index_sequence<Idx, IdxTail...>)
|
||||
{
|
||||
if(Idx == id) return amd_tool_api_info<Idx>::name;
|
||||
if constexpr(sizeof...(IdxTail) > 0)
|
||||
return name_by_id(id, std::index_sequence<IdxTail...>{});
|
||||
else
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <size_t Idx, size_t... IdxTail>
|
||||
uint32_t
|
||||
id_by_name(const char* name, std::index_sequence<Idx, IdxTail...>)
|
||||
{
|
||||
if(std::string_view{amd_tool_api_info<Idx>::name} == std::string_view{name})
|
||||
return amd_tool_api_info<Idx>::operation_idx;
|
||||
if constexpr(sizeof...(IdxTail) > 0)
|
||||
return id_by_name(name, std::index_sequence<IdxTail...>{});
|
||||
else
|
||||
return ROCPROFILER_SCRATCH_MEMORY_NONE;
|
||||
}
|
||||
|
||||
template <size_t... Idx>
|
||||
void
|
||||
get_ids(std::vector<uint32_t>& _id_list, std::index_sequence<Idx...>)
|
||||
{
|
||||
auto _emplace = [](auto& _vec, uint32_t _v) {
|
||||
if(_v < static_cast<uint32_t>(ROCPROFILER_SCRATCH_MEMORY_LAST)) _vec.emplace_back(_v);
|
||||
};
|
||||
|
||||
(_emplace(_id_list, amd_tool_api_info<Idx>::operation_idx), ...);
|
||||
}
|
||||
|
||||
template <size_t... Idx>
|
||||
void
|
||||
get_names(std::vector<const char*>& _name_list, std::index_sequence<Idx...>)
|
||||
{
|
||||
auto _emplace = [](auto& _vec, const char* _v) {
|
||||
if(_v != nullptr && strnlen(_v, 1) > 0) _vec.emplace_back(_v);
|
||||
};
|
||||
|
||||
(_emplace(_name_list, amd_tool_api_info<Idx>::name), ...);
|
||||
}
|
||||
|
||||
template <size_t TableIdx, typename LookupT = internal_table, size_t OpIdx>
|
||||
void
|
||||
copy_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance)
|
||||
{
|
||||
using table_type = typename hsa_table_lookup<TableIdx>::type;
|
||||
|
||||
static_assert(std::is_same<hsa_amd_tool_table_t, table_type>::value);
|
||||
|
||||
if constexpr(OpIdx > hsa_amd_tool_id_none)
|
||||
{
|
||||
auto _info = hsa_api_meta<TableIdx, OpIdx>{};
|
||||
|
||||
auto& _orig_table = _info.get_table(_orig);
|
||||
auto& _orig_func = _info.get_table_func(_orig_table);
|
||||
// make sure we don't access a field that doesn't exist in input table
|
||||
if(_info.offset() >= _orig->version.minor_id) return;
|
||||
|
||||
auto& _copy_table = _info.get_table(hsa_table_lookup<TableIdx>{}(LookupT{}));
|
||||
auto& _copy_func = _info.get_table_func(_copy_table);
|
||||
|
||||
LOG_IF(FATAL, _copy_func && _tbl_instance == 0)
|
||||
<< _info.name << " has non-null function pointer " << _copy_func
|
||||
<< " despite this being the first instance of the library being copies";
|
||||
|
||||
if(!_copy_func)
|
||||
{
|
||||
LOG(INFO) << "copying table entry for " << _info.name;
|
||||
_copy_func = _orig_func;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG(INFO) << "skipping copying table entry for " << _info.name
|
||||
<< " from table instance " << _tbl_instance;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct buffered_context_data
|
||||
{
|
||||
const context::context* ctx = nullptr;
|
||||
};
|
||||
|
||||
struct callback_context_data
|
||||
{
|
||||
const context::context* ctx = nullptr;
|
||||
rocprofiler_callback_tracing_record_t record = {};
|
||||
rocprofiler_user_data_t user_data = {.value = 0};
|
||||
};
|
||||
|
||||
void
|
||||
populate_contexts(int operation_idx,
|
||||
std::vector<callback_context_data>& callback_contexts,
|
||||
std::vector<buffered_context_data>& buffered_contexts)
|
||||
{
|
||||
callback_contexts.clear();
|
||||
buffered_contexts.clear();
|
||||
|
||||
auto active_contexts = context::context_array_t{};
|
||||
for(const auto* itr : context::get_active_contexts(active_contexts))
|
||||
{
|
||||
if(!itr) continue;
|
||||
|
||||
if(itr->callback_tracer)
|
||||
{
|
||||
// if the given domain + op is not enabled, skip this context
|
||||
if(itr->callback_tracer->domains(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
|
||||
operation_idx))
|
||||
callback_contexts.emplace_back(
|
||||
callback_context_data{itr, rocprofiler_callback_tracing_record_t{}});
|
||||
}
|
||||
|
||||
if(itr->buffered_tracer)
|
||||
{
|
||||
// if the given domain + op is not enabled, skip this context
|
||||
if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
operation_idx))
|
||||
buffered_contexts.emplace_back(buffered_context_data{itr});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
static_assert(ROCPROFILER_SCRATCH_MEMORY_ALLOC ==
|
||||
scratch_op_info<hsa_amd_tool_id_scratch_event_alloc_end>::operation);
|
||||
static_assert(ROCPROFILER_SCRATCH_MEMORY_FREE ==
|
||||
scratch_op_info<hsa_amd_tool_id_scratch_event_free_start>::operation);
|
||||
|
||||
static_assert(ROCPROFILER_CALLBACK_PHASE_EXIT ==
|
||||
scratch_op_info<hsa_amd_tool_id_scratch_event_alloc_end>::phase);
|
||||
static_assert(ROCPROFILER_CALLBACK_PHASE_ENTER ==
|
||||
scratch_op_info<hsa_amd_tool_id_scratch_event_free_start>::phase);
|
||||
|
||||
#define ASSERT_SAME_OFFSET(S) \
|
||||
static_assert(offsetof(hsa_amd_event_scratch_alloc_start_t, flags) == \
|
||||
offsetof(hsa_amd_event_scratch_##S, flags));
|
||||
|
||||
ASSERT_SAME_OFFSET(alloc_start_t);
|
||||
ASSERT_SAME_OFFSET(alloc_end_t);
|
||||
ASSERT_SAME_OFFSET(free_start_t);
|
||||
ASSERT_SAME_OFFSET(free_end_t);
|
||||
ASSERT_SAME_OFFSET(async_reclaim_start_t);
|
||||
ASSERT_SAME_OFFSET(async_reclaim_end_t);
|
||||
|
||||
#undef ASSERT_SAME_OFFSET
|
||||
|
||||
template <typename T, typename... Ts>
|
||||
constexpr bool have_same_offset(T /*m*/)
|
||||
{
|
||||
return (offsetof(Ts, m) == ...);
|
||||
}
|
||||
|
||||
template <typename T, typename... Ts>
|
||||
struct same_flags_offset
|
||||
{
|
||||
static constexpr auto value = ((offsetof(T, flags) == offsetof(Ts, flags)) && ...);
|
||||
};
|
||||
|
||||
auto
|
||||
get_flags(hsa_amd_tool_event_t event)
|
||||
{
|
||||
static_assert(same_flags_offset<hsa_amd_event_scratch_alloc_start_t,
|
||||
hsa_amd_event_scratch_alloc_end_t,
|
||||
hsa_amd_event_scratch_free_start_t,
|
||||
hsa_amd_event_scratch_free_end_t,
|
||||
hsa_amd_event_scratch_async_reclaim_start_t,
|
||||
hsa_amd_event_scratch_async_reclaim_end_t>::value);
|
||||
return static_cast<rocprofiler_scratch_alloc_flag_t>(event.scratch_alloc_start->flags);
|
||||
}
|
||||
|
||||
/*
|
||||
Template instantiation per start/stop pairs to track event data through thread local storage
|
||||
*/
|
||||
template <size_t OpIdx>
|
||||
auto&
|
||||
get_tls_pair(rocprofiler_callback_phase_t phase)
|
||||
{
|
||||
// Tony and Laurent's suggestion
|
||||
// To pair up a start event with an end event because we get them as separate callback
|
||||
// invocations, use thread local storage to track the item through a single callback
|
||||
// function for both start and end. Template on the buffer types instead of the callback
|
||||
// types
|
||||
// OpIdx = rocprofiler_callback_phase_t
|
||||
static_assert(
|
||||
(OpIdx > ROCPROFILER_SCRATCH_MEMORY_NONE) && (OpIdx < ROCPROFILER_SCRATCH_MEMORY_LAST),
|
||||
"Invalid event pair OpIdx");
|
||||
|
||||
using callback_data_t = rocprofiler_callback_tracing_scratch_memory_data_t;
|
||||
using buffered_data_t = rocprofiler_buffer_tracing_scratch_memory_record_t;
|
||||
|
||||
struct tls_data
|
||||
{
|
||||
callback_data_t callback_data = common::init_public_api_struct(callback_data_t{});
|
||||
buffered_data_t buffered_data = common::init_public_api_struct(buffered_data_t{});
|
||||
std::vector<callback_context_data> callback_ctxs = {};
|
||||
std::vector<buffered_context_data> buffered_ctxs = {};
|
||||
};
|
||||
|
||||
static thread_local auto tls = tls_data{};
|
||||
static thread_local auto held = false;
|
||||
|
||||
if(phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
|
||||
{
|
||||
LOG_IF(FATAL, held) << "Overwriting scratch memory TLS data";
|
||||
held = true;
|
||||
populate_contexts(OpIdx, tls.callback_ctxs, tls.buffered_ctxs);
|
||||
}
|
||||
else
|
||||
{
|
||||
held = false;
|
||||
}
|
||||
|
||||
return tls;
|
||||
}
|
||||
|
||||
template <size_t ScratchOpIdx, typename... Args>
|
||||
hsa_status_t
|
||||
impl(Args... args)
|
||||
{
|
||||
using arg_event_t =
|
||||
common::mpl::unqualified_type_t<decltype(std::get<0>(std::make_tuple(args...)))>;
|
||||
static_assert(std::is_same_v<arg_event_t, hsa_amd_tool_event_t>, "unexpected type");
|
||||
|
||||
constexpr auto OpIdx = scratch_op_info<ScratchOpIdx>::operation;
|
||||
constexpr auto OpPhase = scratch_op_info<ScratchOpIdx>::phase;
|
||||
|
||||
auto&& _tied_args = std::tie(args...);
|
||||
auto& event_data = std::get<0>(_tied_args);
|
||||
|
||||
// this lets start and end of the same type have the same thread local storage
|
||||
auto& tls = get_tls_pair<OpIdx>(OpPhase);
|
||||
|
||||
if(tls.callback_ctxs.empty() && tls.buffered_ctxs.empty()) return HSA_STATUS_SUCCESS;
|
||||
|
||||
const auto tid = common::get_tid();
|
||||
|
||||
const auto get_agent_id = [](const hsa_queue_t* hsa_queue) -> rocprofiler_agent_id_t {
|
||||
rocprofiler_agent_id_t _agent_id{static_cast<uint64_t>(-1)};
|
||||
bool found_agent{false};
|
||||
|
||||
rocprofiler::hsa::get_queue_controller()->iterate_queues(
|
||||
[&](const rocprofiler::hsa::Queue* queue_ptr) {
|
||||
if(queue_ptr->intercept_queue()->id == hsa_queue->id)
|
||||
{
|
||||
_agent_id = queue_ptr->get_agent().get_rocp_agent()->id;
|
||||
found_agent = true;
|
||||
}
|
||||
});
|
||||
|
||||
LOG_IF(FATAL, !found_agent) << fmt::format(
|
||||
"Scratch memory tracing: Could not find a valid agent for queue id {}", hsa_queue->id);
|
||||
return _agent_id;
|
||||
};
|
||||
|
||||
auto* corr_id = context::get_latest_correlation_id();
|
||||
const auto invoke_callbacks = [&](const rocprofiler_callback_phase_t _phase) {
|
||||
for(auto& itr : tls.callback_ctxs)
|
||||
{
|
||||
auto corr_id_v = rocprofiler_correlation_id_t{};
|
||||
if(corr_id)
|
||||
{
|
||||
corr_id_v.internal = corr_id->internal;
|
||||
corr_id_v.external = itr.ctx->correlation_tracer.external_correlator.get(tid);
|
||||
}
|
||||
|
||||
auto record = rocprofiler_callback_tracing_record_t{
|
||||
.context_id = rocprofiler_context_id_t{itr.ctx->context_idx},
|
||||
.thread_id = tid,
|
||||
.correlation_id = corr_id_v,
|
||||
.kind = ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
|
||||
.operation = OpIdx,
|
||||
.phase = _phase,
|
||||
.payload = static_cast<void*>(&tls.callback_data),
|
||||
};
|
||||
|
||||
auto& cb_info = itr.ctx->callback_tracer->callback_data.at(
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY);
|
||||
|
||||
cb_info.callback(record, &itr.user_data, &cb_info.data);
|
||||
}
|
||||
};
|
||||
|
||||
if constexpr(OpPhase == ROCPROFILER_CALLBACK_PHASE_ENTER)
|
||||
{
|
||||
if(!tls.callback_ctxs.empty())
|
||||
{
|
||||
tls.callback_data.agent_id = get_agent_id(event_data.scratch_alloc_start->queue);
|
||||
tls.callback_data.queue_id = {event_data.scratch_alloc_start->queue->id};
|
||||
tls.callback_data.args_kind = event_data.none->kind;
|
||||
tls.callback_data.flags = get_flags(event_data);
|
||||
|
||||
if constexpr(OpIdx == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
|
||||
{
|
||||
tls.callback_data.args.alloc_start.dispatch_id =
|
||||
event_data.scratch_alloc_start->dispatch_id;
|
||||
}
|
||||
|
||||
invoke_callbacks(OpPhase); // NOLINT readability-misleading-indentation
|
||||
}
|
||||
|
||||
if(!tls.buffered_ctxs.empty())
|
||||
{
|
||||
tls.buffered_data.kind = ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY;
|
||||
tls.buffered_data.operation = OpIdx;
|
||||
tls.buffered_data.agent_id = get_agent_id(event_data.scratch_alloc_start->queue);
|
||||
tls.buffered_data.queue_id = {event_data.scratch_alloc_start->queue->id};
|
||||
tls.buffered_data.thread_id = tid;
|
||||
tls.buffered_data.start_timestamp = common::timestamp_ns();
|
||||
}
|
||||
}
|
||||
else if constexpr(OpPhase == ROCPROFILER_CALLBACK_PHASE_EXIT)
|
||||
{
|
||||
if(!tls.buffered_ctxs.empty())
|
||||
{
|
||||
tls.buffered_data.flags = get_flags(event_data);
|
||||
tls.buffered_data.end_timestamp = common::timestamp_ns();
|
||||
}
|
||||
|
||||
if(!tls.callback_ctxs.empty())
|
||||
{
|
||||
tls.callback_data.flags = get_flags(event_data);
|
||||
tls.callback_data.args_kind = event_data.none->kind;
|
||||
if constexpr(OpIdx == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
|
||||
{
|
||||
auto& data_args = tls.callback_data.args.alloc_end;
|
||||
data_args.dispatch_id = event_data.scratch_alloc_end->dispatch_id;
|
||||
data_args.size = event_data.scratch_alloc_end->size;
|
||||
data_args.num_slots = event_data.scratch_alloc_end->num_slots;
|
||||
}
|
||||
|
||||
invoke_callbacks(OpPhase); // NOLINT readability-misleading-indentation
|
||||
}
|
||||
|
||||
if(!tls.buffered_ctxs.empty())
|
||||
{
|
||||
for(const auto& itr : tls.buffered_ctxs)
|
||||
{
|
||||
auto* _buffer = buffer::get_buffer(itr.ctx->buffered_tracer->buffer_data.at(
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY));
|
||||
|
||||
auto corr_id_v = rocprofiler_correlation_id_t{};
|
||||
if(corr_id)
|
||||
{
|
||||
// TODO(mkuriche) should the id be generated at entry?
|
||||
corr_id_v.internal = corr_id->internal;
|
||||
corr_id_v.external = itr.ctx->correlation_tracer.external_correlator.get(tid);
|
||||
}
|
||||
|
||||
auto _record = tls.buffered_data;
|
||||
_record.correlation_id = corr_id_v;
|
||||
|
||||
CHECK_NOTNULL(_buffer)->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
_record);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return invoke(get_next_dispatch<ROCPROFILER_HSA_TABLE_ID_AmdTool, ScratchOpIdx>(),
|
||||
std::move(_tied_args),
|
||||
std::make_index_sequence<sizeof...(Args)>{});
|
||||
}
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx, typename RetT, typename... Args>
|
||||
auto get_hsa_amd_tool_api_impl(RetT (*)(Args...))
|
||||
{
|
||||
return &scratch_memory::impl<OpIdx, Args...>;
|
||||
}
|
||||
|
||||
template <size_t TableIdx, size_t OpIdx>
|
||||
void
|
||||
update_table(const context_array_t& ctxs, hsa_amd_tool_table_t* _orig)
|
||||
{
|
||||
if constexpr(OpIdx > hsa_amd_tool_id_none)
|
||||
{
|
||||
auto _info = hsa_api_meta<TableIdx, OpIdx>{};
|
||||
|
||||
if(!should_wrap_functor(ctxs, OpIdx)) return;
|
||||
|
||||
LOG(INFO) << "updating table entry for " << _info.name;
|
||||
|
||||
auto _meta = hsa_api_meta<TableIdx, OpIdx>{};
|
||||
auto& _table = _meta.get_table(_orig);
|
||||
auto& _func = _meta.get_table_func(_table);
|
||||
|
||||
_func = get_hsa_amd_tool_api_impl<TableIdx, OpIdx>(_func);
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t TableIdx, size_t... OpIdx>
|
||||
void
|
||||
update_table(context_array_t ctxs, hsa_amd_tool_table_t* _orig, std::index_sequence<OpIdx...>)
|
||||
{
|
||||
static_assert(
|
||||
std::is_same<hsa_amd_tool_table_t, typename hsa_table_lookup<TableIdx>::type>::value,
|
||||
"unexpected type");
|
||||
|
||||
(update_table<TableIdx, OpIdx>(ctxs, _orig), ...);
|
||||
}
|
||||
|
||||
template <size_t TableIdx, typename LookupT = internal_table, size_t... OpIdx>
|
||||
void
|
||||
copy_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance, std::index_sequence<OpIdx...>)
|
||||
{
|
||||
static_assert(
|
||||
std::is_same<hsa_amd_tool_table_t, typename hsa_table_lookup<TableIdx>::type>::value,
|
||||
"unexpected type");
|
||||
|
||||
(copy_table<TableIdx, LookupT, OpIdx>(_orig, _tbl_instance), ...);
|
||||
}
|
||||
|
||||
void
|
||||
copy_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance)
|
||||
{
|
||||
if(_orig)
|
||||
copy_table<ROCPROFILER_HSA_TABLE_ID_AmdTool, internal_table>(
|
||||
_orig, _tbl_instance, std::make_index_sequence<hsa_amd_tool_id_scratch_event_last>{});
|
||||
}
|
||||
|
||||
void
|
||||
update_table(hsa_amd_tool_table_t* _orig, uint64_t _tbl_instance)
|
||||
{
|
||||
if(_orig)
|
||||
{
|
||||
auto ctxs = context::get_registered_contexts(context_filter);
|
||||
if(!ctxs.empty())
|
||||
{
|
||||
copy_table<ROCPROFILER_HSA_TABLE_ID_AmdTool, tracing_table>(
|
||||
_orig,
|
||||
_tbl_instance,
|
||||
std::make_index_sequence<hsa_amd_tool_id_scratch_event_last>{});
|
||||
|
||||
update_table<ROCPROFILER_HSA_TABLE_ID_AmdTool>(
|
||||
ctxs, _orig, std::make_index_sequence<hsa_amd_tool_id_scratch_event_last>{});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const char*
|
||||
name_by_id(uint32_t id)
|
||||
{
|
||||
return name_by_id(id, std::make_index_sequence<ROCPROFILER_SCRATCH_MEMORY_LAST>{});
|
||||
}
|
||||
|
||||
uint32_t
|
||||
id_by_name(const char* name)
|
||||
{
|
||||
return id_by_name(name, std::make_index_sequence<ROCPROFILER_SCRATCH_MEMORY_LAST>{});
|
||||
}
|
||||
|
||||
std::vector<uint32_t>
|
||||
get_ids()
|
||||
{
|
||||
auto _data = std::vector<uint32_t>{};
|
||||
_data.reserve(ROCPROFILER_SCRATCH_MEMORY_LAST);
|
||||
get_ids(_data, std::make_index_sequence<ROCPROFILER_SCRATCH_MEMORY_LAST>{});
|
||||
return _data;
|
||||
}
|
||||
|
||||
std::vector<const char*>
|
||||
get_names()
|
||||
{
|
||||
auto _data = std::vector<const char*>{};
|
||||
_data.reserve(ROCPROFILER_SCRATCH_MEMORY_LAST);
|
||||
get_names(_data, std::make_index_sequence<ROCPROFILER_SCRATCH_MEMORY_LAST>{});
|
||||
return _data;
|
||||
}
|
||||
} // namespace scratch_memory
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
@@ -0,0 +1,57 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
namespace scratch_memory
|
||||
{
|
||||
const char*
|
||||
name_by_id(uint32_t id);
|
||||
|
||||
uint32_t
|
||||
id_by_name(const char* name);
|
||||
|
||||
std::vector<const char*>
|
||||
get_names();
|
||||
|
||||
std::vector<uint32_t>
|
||||
get_ids();
|
||||
|
||||
void
|
||||
copy_table(hsa_amd_tool_table_t* _orig, uint64_t lib_instance);
|
||||
|
||||
void
|
||||
update_table(hsa_amd_tool_table_t* _orig, uint64_t lib_instance);
|
||||
} // namespace scratch_memory
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
@@ -42,15 +42,19 @@ static_assert(HSA_FINALIZER_API_TABLE_MAJOR_VERSION == 0x01,
|
||||
"Change in the major version of HSA finalizer API table");
|
||||
static_assert(HSA_IMAGE_API_TABLE_MAJOR_VERSION == 0x01,
|
||||
"Change in the major version of HSA image API table");
|
||||
static_assert(HSA_TOOLS_API_TABLE_MAJOR_VERSION == 0x01,
|
||||
"Change in the major version of HSA AMD tool API table");
|
||||
|
||||
static_assert(HSA_CORE_API_TABLE_STEP_VERSION == 0x00,
|
||||
"Change in the major version of HSA core API table");
|
||||
"Change in the step version of HSA core API table");
|
||||
static_assert(HSA_AMD_EXT_API_TABLE_STEP_VERSION == 0x00,
|
||||
"Change in the major version of HSA amd-extended API table");
|
||||
"Change in the step version of HSA amd-extended API table");
|
||||
static_assert(HSA_FINALIZER_API_TABLE_STEP_VERSION == 0x00,
|
||||
"Change in the major version of HSA finalizer API table");
|
||||
"Change in the step version of HSA finalizer API table");
|
||||
static_assert(HSA_IMAGE_API_TABLE_STEP_VERSION == 0x00,
|
||||
"Change in the major version of HSA image API table");
|
||||
"Change in the step version of HSA image API table");
|
||||
static_assert(HSA_TOOLS_API_TABLE_STEP_VERSION == 0x00,
|
||||
"Change in the step version of HSA AMD tool API table");
|
||||
|
||||
// this should always be updated to latest table size
|
||||
template <size_t VersionCode>
|
||||
@@ -121,6 +125,8 @@ static_assert(HSA_FINALIZER_API_TABLE_MAJOR_VERSION == 0x02,
|
||||
"Change in the major version of HSA finalizer API table");
|
||||
static_assert(HSA_IMAGE_API_TABLE_MAJOR_VERSION == 0x02,
|
||||
"Change in the major version of HSA image API table");
|
||||
static_assert(HSA_TOOLS_API_TABLE_MAJOR_VERSION == 0x01,
|
||||
"Change in the major version of HSA AMD tool API table");
|
||||
|
||||
// this should always be updated to latest table size
|
||||
template <size_t VersionCode>
|
||||
@@ -151,6 +157,23 @@ struct table_size<ROCPROFILER_COMPUTE_VERSION(1, 12, 0)>
|
||||
# endif
|
||||
};
|
||||
|
||||
// specialization for v1.13
|
||||
template <>
|
||||
struct table_size<ROCPROFILER_COMPUTE_VERSION(1, 13, 0)>
|
||||
{
|
||||
static constexpr size_t finalizer_ext = 64;
|
||||
static constexpr size_t image_ext = 120;
|
||||
static constexpr size_t core_api_ext = 1016;
|
||||
static constexpr size_t amd_tool = 64;
|
||||
|
||||
// TODO(jomadsen): come up with a better way of handling this
|
||||
# if HSA_AMD_EXT_API_TABLE_STEP_VERSION == 0x00
|
||||
static constexpr size_t amd_ext = 552;
|
||||
# else
|
||||
static constexpr size_t amd_ext = 560;
|
||||
# endif
|
||||
};
|
||||
|
||||
// default static asserts to check against latest version
|
||||
// e.g. v1.12 might have the same table sizes as v1.11 so
|
||||
// we don't want to fail to compile if nothing has changed
|
||||
|
||||
@@ -99,7 +99,12 @@ TaskGroup::wait()
|
||||
auto lk = std::unique_lock<std::mutex>{m_mutex};
|
||||
for(auto& itr : m_tasks)
|
||||
itr->wait();
|
||||
m_tasks.clear();
|
||||
// we hold the handles for the completed tasks to prevent a rare (but possible) data race on the
|
||||
// destruction of the shared_ptr
|
||||
m_completed_tasks.clear();
|
||||
// makes m_tasks empty but delays the destruction of the shared_ptrs until the next wait or the
|
||||
// destruction of the task group
|
||||
std::swap(m_tasks, m_completed_tasks);
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -62,9 +63,10 @@ public:
|
||||
void join();
|
||||
|
||||
private:
|
||||
std::mutex m_mutex = {};
|
||||
thread_pool_t* m_pool = nullptr;
|
||||
std::deque<std::shared_ptr<task_type>> m_tasks = {};
|
||||
std::mutex m_mutex = {};
|
||||
thread_pool_t* m_pool = nullptr;
|
||||
std::deque<std::shared_ptr<task_type>> m_tasks = {};
|
||||
std::deque<std::shared_ptr<task_type>> m_completed_tasks = {};
|
||||
};
|
||||
|
||||
using task_group_t = TaskGroup;
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
|
||||
#include "lib/rocprofiler-sdk/hsa/scratch_memory.hpp"
|
||||
#include "lib/rocprofiler-sdk/intercept_table.hpp"
|
||||
#include "lib/rocprofiler-sdk/internal_threading.hpp"
|
||||
#include "lib/rocprofiler-sdk/marker/marker.hpp"
|
||||
@@ -725,6 +726,7 @@ rocprofiler_set_api_table(const char* name,
|
||||
rocprofiler::hsa::copy_table(hsa_api_table->amd_ext_, lib_instance);
|
||||
rocprofiler::hsa::copy_table(hsa_api_table->image_ext_, lib_instance);
|
||||
rocprofiler::hsa::copy_table(hsa_api_table->finalizer_ext_, lib_instance);
|
||||
rocprofiler::hsa::copy_table(hsa_api_table->tools_, lib_instance);
|
||||
|
||||
// need to construct agent mappings before initializing the queue controller
|
||||
rocprofiler::agent::construct_agent_cache(hsa_api_table);
|
||||
@@ -737,6 +739,7 @@ rocprofiler_set_api_table(const char* name,
|
||||
rocprofiler::hsa::update_table(hsa_api_table->amd_ext_, lib_instance);
|
||||
rocprofiler::hsa::update_table(hsa_api_table->image_ext_, lib_instance);
|
||||
rocprofiler::hsa::update_table(hsa_api_table->finalizer_ext_, lib_instance);
|
||||
rocprofiler::hsa::update_table(hsa_api_table->tools_, lib_instance);
|
||||
|
||||
// allow tools to install API wrappers
|
||||
rocprofiler::intercept_table::notify_intercept_table_registration(
|
||||
|
||||
@@ -49,6 +49,7 @@ add_subdirectory(bin)
|
||||
# validation tests
|
||||
add_subdirectory(kernel-tracing)
|
||||
add_subdirectory(async-copy-tracing)
|
||||
add_subdirectory(scratch-memory-tracing)
|
||||
add_subdirectory(c-tool)
|
||||
|
||||
# rocprofv3 validation tests
|
||||
|
||||
@@ -12,6 +12,7 @@ add_subdirectory(simple-transpose)
|
||||
add_subdirectory(multistream)
|
||||
add_subdirectory(vector-operations)
|
||||
add_subdirectory(hip-in-libraries)
|
||||
add_subdirectory(scratch-memory)
|
||||
|
||||
set(CMAKE_BUILD_RPATH
|
||||
"\$ORIGIN:\$ORIGIN/../lib:$<TARGET_FILE_DIR:rocprofiler-sdk-roctx::rocprofiler-sdk-roctx-shared-library>"
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-tool-test-app-scratch-memory LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
set_source_files_properties(scratch-memory.cpp PROPERTIES LANGUAGE HIP)
|
||||
add_executable(scratch-memory)
|
||||
target_sources(scratch-memory PRIVATE scratch-memory.cpp)
|
||||
target_compile_options(scratch-memory PRIVATE -W -Wall -Wextra -Wpedantic -Wshadow
|
||||
-Werror)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(scratch-memory PRIVATE Threads::Threads hsa-runtime64)
|
||||
|
||||
install(
|
||||
TARGETS scratch-memory
|
||||
DESTINATION bin
|
||||
COMPONENT tests)
|
||||
@@ -0,0 +1,239 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#define hipCheckErr(errval) \
|
||||
do \
|
||||
{ \
|
||||
hipCheckAndFail((errval), __FILE__, __LINE__); \
|
||||
} while(0)
|
||||
|
||||
#define hipCheckLastError() \
|
||||
do \
|
||||
{ \
|
||||
hipCheckErr(hipGetLastError()); \
|
||||
} while(0)
|
||||
|
||||
#define HSA_CALL2(cmd) \
|
||||
do \
|
||||
{ \
|
||||
hsa_status_t error = (cmd); \
|
||||
if(error != HSA_STATUS_SUCCESS) \
|
||||
{ \
|
||||
const char* errorStr; \
|
||||
hsa_status_string(error, &errorStr); \
|
||||
std::cout << "Encountered HSA error (" << errorStr << ") at line " << __LINE__ \
|
||||
<< " in file " << __FILE__ << "\n"; \
|
||||
exit(-1); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
namespace
|
||||
{
|
||||
inline void
|
||||
hipCheckAndFail(hipError_t errval, const char* file, int line)
|
||||
{
|
||||
if(errval != hipSuccess)
|
||||
{
|
||||
std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
|
||||
std::cerr << " Location: " << file << ":" << line << std::endl;
|
||||
exit(errval);
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t
|
||||
find_gpu_agents(hsa_agent_t agent, void* data)
|
||||
{
|
||||
hsa_status_t status;
|
||||
hsa_device_type_t device_type;
|
||||
status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
|
||||
if(status == HSA_STATUS_SUCCESS && device_type == HSA_DEVICE_TYPE_GPU)
|
||||
{
|
||||
std::vector<hsa_agent_t>* agents = reinterpret_cast<std::vector<hsa_agent_t>*>(data);
|
||||
agents->push_back(agent);
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
__global__ void
|
||||
test_kern_large(uint64_t* output)
|
||||
{
|
||||
uint64_t result = 0;
|
||||
int test[4000];
|
||||
memset(test, 5, 4000);
|
||||
for(int& i : test)
|
||||
{
|
||||
i = i + 7;
|
||||
*output += i;
|
||||
result += i;
|
||||
}
|
||||
*output ^= result;
|
||||
*output ^= result;
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kern_medium(uint64_t* output)
|
||||
{
|
||||
uint64_t result = 0;
|
||||
int test[175];
|
||||
memset(test, 5, 175);
|
||||
for(int& i : test)
|
||||
{
|
||||
i = i + 7;
|
||||
*output += i;
|
||||
result += i;
|
||||
}
|
||||
*output ^= result;
|
||||
*output ^= result;
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kern_small(uint64_t* output)
|
||||
{
|
||||
uint64_t result = 0;
|
||||
int test[2];
|
||||
for(int& i : test)
|
||||
{
|
||||
i = i + 7;
|
||||
*output += i;
|
||||
result += i;
|
||||
}
|
||||
*output ^= result;
|
||||
*output ^= result;
|
||||
}
|
||||
|
||||
// Checks whether we get a request-more-scratch when grid-x is incremented
|
||||
int
|
||||
test_gridx(uint64_t* data_ptr)
|
||||
{
|
||||
*data_ptr = 0;
|
||||
printf("Running Medium\n");
|
||||
test_kern_medium<<<1000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Medium - done\n");
|
||||
|
||||
printf("Running Medium-2 - should trigger more-scratch requests\n");
|
||||
test_kern_medium<<<1500, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
|
||||
printf("Running Medium-2 - done\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 1st allocation should go to primary, then large should still trigger a USO
|
||||
int
|
||||
test_primary_then_uso(uint64_t* data_ptr)
|
||||
{
|
||||
printf("Running Medium - all slots\n");
|
||||
test_kern_medium<<<10000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Medium - done\n");
|
||||
|
||||
printf("Running Large - should trigger USO\n");
|
||||
test_kern_large<<<1100, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Large - done\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
test_scratch()
|
||||
{
|
||||
uint64_t* data_ptr;
|
||||
hipCheckErr(hipHostMalloc(&data_ptr, sizeof(uint64_t), 0));
|
||||
|
||||
std::vector<float> host_floats(1024);
|
||||
float* dev;
|
||||
|
||||
hipCheckErr(hipMalloc((void**) &dev, host_floats.size() * sizeof(float)));
|
||||
hipCheckErr(hipMemcpy(
|
||||
dev, host_floats.data(), host_floats.size() * sizeof(float), hipMemcpyHostToDevice));
|
||||
|
||||
*data_ptr = 0;
|
||||
|
||||
printf("Running test_primary_then_uso========================\n");
|
||||
test_primary_then_uso(data_ptr);
|
||||
printf("=====================================================\n");
|
||||
|
||||
printf("Running test_gridx===================================\n");
|
||||
test_gridx(data_ptr);
|
||||
printf("=====================================================\n");
|
||||
|
||||
printf("Running Small\n");
|
||||
test_kern_small<<<1000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Small - done\n");
|
||||
|
||||
printf("Running Medium\n");
|
||||
test_kern_medium<<<1000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Medium - done\n");
|
||||
|
||||
printf("Running Small\n");
|
||||
test_kern_small<<<1000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Small - done\n");
|
||||
|
||||
printf("Running Large\n");
|
||||
test_kern_large<<<1100, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Large - done\n");
|
||||
|
||||
printf("Running Large\n");
|
||||
test_kern_large<<<1000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Large - done\n");
|
||||
|
||||
printf("Running Large\n");
|
||||
test_kern_large<<<1000, 1>>>(data_ptr);
|
||||
hipCheckErr(hipFree(dev));
|
||||
hipCheckErr(hipDeviceSynchronize());
|
||||
printf("Running Large - done\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
hipCheckErr(hipInit(0));
|
||||
|
||||
std::vector<hsa_agent_t> agents;
|
||||
HSA_CALL2(hsa_iterate_agents(find_gpu_agents, &agents));
|
||||
size_t numAgents = agents.size();
|
||||
printf("Detected %ld agents\n", numAgents);
|
||||
|
||||
for(size_t i = 0; i < agents.size(); ++i)
|
||||
{
|
||||
hipCheckErr(hipSetDevice(i));
|
||||
test_scratch();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -160,6 +160,62 @@ save(ArchiveT& ar, rocprofiler_hsa_api_retval_t data)
|
||||
SAVE_DATA_FIELD(uint64_t_retval);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, const hsa_queue_t& data)
|
||||
{
|
||||
ar(make_nvp("queue_id", data.id));
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, hsa_amd_event_scratch_alloc_start_t data)
|
||||
{
|
||||
ar(make_nvp("queue_id", *data.queue));
|
||||
SAVE_DATA_FIELD(dispatch_id);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, hsa_amd_event_scratch_alloc_end_t data)
|
||||
{
|
||||
ar(make_nvp("queue_id", *data.queue));
|
||||
SAVE_DATA_FIELD(dispatch_id);
|
||||
SAVE_DATA_FIELD(size);
|
||||
SAVE_DATA_FIELD(num_slots);
|
||||
SAVE_DATA_FIELD(flags);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, hsa_amd_event_scratch_free_start_t data)
|
||||
{
|
||||
ar(make_nvp("queue_id", *data.queue));
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, hsa_amd_event_scratch_free_end_t data)
|
||||
{
|
||||
ar(make_nvp("queue_id", *data.queue));
|
||||
SAVE_DATA_FIELD(flags);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, hsa_amd_event_scratch_async_reclaim_start_t data)
|
||||
{
|
||||
ar(make_nvp("queue_id", *data.queue));
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, hsa_amd_event_scratch_async_reclaim_end_t data)
|
||||
{
|
||||
ar(make_nvp("queue_id", *data.queue));
|
||||
SAVE_DATA_FIELD(flags);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_marker_api_retval_t data)
|
||||
@@ -201,6 +257,17 @@ save(ArchiveT& ar, rocprofiler_callback_tracing_hip_api_data_t data)
|
||||
SAVE_DATA_FIELD(retval);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_callback_tracing_scratch_memory_data_t data)
|
||||
{
|
||||
SAVE_DATA_FIELD(size);
|
||||
SAVE_DATA_FIELD(agent_id);
|
||||
SAVE_DATA_FIELD(queue_id);
|
||||
SAVE_DATA_FIELD(flags);
|
||||
SAVE_DATA_FIELD(args_kind);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_callback_tracing_record_t data)
|
||||
@@ -288,6 +355,22 @@ save(ArchiveT& ar, rocprofiler_buffer_tracing_memory_copy_record_t data)
|
||||
SAVE_DATA_FIELD(src_agent_id);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_buffer_tracing_scratch_memory_record_t data)
|
||||
{
|
||||
SAVE_DATA_FIELD(size);
|
||||
SAVE_DATA_FIELD(kind);
|
||||
SAVE_DATA_FIELD(operation);
|
||||
SAVE_DATA_FIELD(agent_id);
|
||||
SAVE_DATA_FIELD(queue_id);
|
||||
SAVE_DATA_FIELD(thread_id);
|
||||
SAVE_DATA_FIELD(start_timestamp);
|
||||
SAVE_DATA_FIELD(end_timestamp);
|
||||
SAVE_DATA_FIELD(correlation_id);
|
||||
SAVE_DATA_FIELD(flags);
|
||||
}
|
||||
|
||||
template <typename ArchiveT>
|
||||
void
|
||||
save(ArchiveT& ar, rocprofiler_buffer_tracing_correlation_id_retirement_record_t data)
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(
|
||||
rocprofiler-tests-scratch-memory-tracing
|
||||
LANGUAGES CXX
|
||||
VERSION 0.0.0)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
if(ROCPROFILER_MEMCHECK_PRELOAD_ENV)
|
||||
set(PRELOAD_ENV
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$<TARGET_FILE:rocprofiler-sdk-json-tool>")
|
||||
else()
|
||||
set(PRELOAD_ENV "LD_PRELOAD=$<TARGET_FILE:rocprofiler-sdk-json-tool>")
|
||||
endif()
|
||||
|
||||
add_test(NAME test-scratch-memory-tracing-execute COMMAND $<TARGET_FILE:scratch-memory>)
|
||||
|
||||
set(scratch-memory-tracing-env
|
||||
"${PRELOAD_ENV}"
|
||||
"HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
|
||||
"ROCPROFILER_TOOL_OUTPUT_FILE=scratch-memory-tracing-test.json"
|
||||
"LD_LIBRARY_PATH=$<TARGET_FILE_DIR:rocprofiler::rocprofiler-shared-library>:$ENV{LD_LIBRARY_PATH}"
|
||||
)
|
||||
|
||||
set_tests_properties(
|
||||
test-scratch-memory-tracing-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
|
||||
"${scratch-memory-tracing-env}" FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
|
||||
foreach(FILENAME validate.py pytest.ini conftest.py)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}
|
||||
${CMAKE_CURRENT_BINARY_DIR}/${FILENAME} COPYONLY)
|
||||
endforeach()
|
||||
|
||||
add_test(NAME test-scratch-memory-tracing-validate
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/scratch-memory-tracing-test.json)
|
||||
|
||||
set_tests_properties(
|
||||
test-scratch-memory-tracing-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
test-scratch-memory-tracing-execute FAIL_REGULAR_EXPRESSION
|
||||
"threw an exception")
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--input",
|
||||
action="store",
|
||||
default="scratch-memory-tracing-test.json",
|
||||
help="Input JSON",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def input_data(request):
|
||||
filename = request.config.getoption("--input")
|
||||
with open(filename, "r") as inp:
|
||||
return json.load(inp)
|
||||
@@ -0,0 +1,4 @@
|
||||
|
||||
[pytest]
|
||||
addopts = --durations=20 -rA -s -vv
|
||||
testpaths = validate.py
|
||||
Исполняемый файл
+293
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
import json
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# helper function
|
||||
def node_exists(name, data, min_len=1):
|
||||
assert name in data
|
||||
assert data[name] is not None
|
||||
if isinstance(data[name], (list, tuple, dict, set)):
|
||||
assert len(data[name]) >= min_len
|
||||
|
||||
|
||||
def test_data_structure(input_data):
|
||||
"""verify minimum amount of expected data is present"""
|
||||
data = input_data
|
||||
sdk_data = input_data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
node_exists("rocprofiler-sdk-json-tool", data)
|
||||
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
num_agents = len([agent for agent in sdk_data["agents"] if agent["type"] == 2])
|
||||
|
||||
node_exists("metadata", sdk_data)
|
||||
node_exists("pid", sdk_data["metadata"])
|
||||
node_exists("main_tid", sdk_data["metadata"])
|
||||
node_exists("init_time", sdk_data["metadata"])
|
||||
node_exists("fini_time", sdk_data["metadata"])
|
||||
|
||||
node_exists("agents", sdk_data)
|
||||
node_exists("call_stack", sdk_data)
|
||||
node_exists("callback_records", sdk_data)
|
||||
node_exists("buffer_records", sdk_data)
|
||||
|
||||
node_exists("names", sdk_data["callback_records"])
|
||||
node_exists("code_objects", sdk_data["callback_records"])
|
||||
node_exists("kernel_symbols", sdk_data["callback_records"])
|
||||
node_exists("hsa_api_traces", sdk_data["callback_records"])
|
||||
node_exists("hip_api_traces", sdk_data["callback_records"], 0)
|
||||
node_exists("scratch_memory_traces", sdk_data["callback_records"], min_len=8)
|
||||
|
||||
node_exists("names", sdk_data["buffer_records"])
|
||||
node_exists("kernel_dispatches", sdk_data["buffer_records"])
|
||||
node_exists("memory_copies", sdk_data["buffer_records"], num_agents)
|
||||
node_exists("hsa_api_traces", sdk_data["buffer_records"])
|
||||
node_exists("hip_api_traces", sdk_data["buffer_records"], 0)
|
||||
node_exists("retired_correlation_ids", sdk_data["buffer_records"])
|
||||
node_exists("scratch_memory_traces", sdk_data["buffer_records"], min_len=8)
|
||||
|
||||
|
||||
def test_timestamps(input_data):
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
cb_start = {}
|
||||
cb_end = {}
|
||||
for titr in ["hsa_api_traces", "hip_api_traces"]:
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
cid = itr["record"]["correlation_id"]["internal"]
|
||||
phase = itr["record"]["phase"]
|
||||
if phase == 1:
|
||||
cb_start[cid] = itr["timestamp"]
|
||||
elif phase == 2:
|
||||
cb_end[cid] = itr["timestamp"]
|
||||
assert cb_start[cid] <= itr["timestamp"]
|
||||
else:
|
||||
assert phase == 1 or phase == 2
|
||||
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["start_timestamp"] <= itr["end_timestamp"]
|
||||
|
||||
for titr in ["kernel_dispatches", "memory_copies"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["start_timestamp"] < itr["end_timestamp"]
|
||||
assert itr["correlation_id"]["internal"] > 0
|
||||
assert itr["correlation_id"]["external"] > 0
|
||||
assert sdk_data["metadata"]["init_time"] < itr["start_timestamp"]
|
||||
assert sdk_data["metadata"]["init_time"] < itr["end_timestamp"]
|
||||
assert sdk_data["metadata"]["fini_time"] > itr["start_timestamp"]
|
||||
assert sdk_data["metadata"]["fini_time"] > itr["end_timestamp"]
|
||||
|
||||
# TODO(Is this check applicable for scratch, which doesn't use any correlation id?)
|
||||
# api_start = cb_start[itr["correlation_id"]["internal"]]
|
||||
# api_end = cb_end[itr["correlation_id"]["internal"]]
|
||||
# assert api_start < itr["start_timestamp"]
|
||||
# assert api_end <= itr["end_timestamp"]
|
||||
|
||||
|
||||
def test_internal_correlation_ids(input_data):
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
api_corr_ids = []
|
||||
for titr in ["hsa_api_traces", "hip_api_traces"]:
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
api_corr_ids.append(itr["record"]["correlation_id"]["internal"])
|
||||
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
api_corr_ids.append(itr["correlation_id"]["internal"])
|
||||
|
||||
api_corr_ids_sorted = sorted(api_corr_ids)
|
||||
api_corr_ids_unique = list(set(api_corr_ids))
|
||||
|
||||
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
|
||||
assert itr["correlation_id"]["internal"] in api_corr_ids_unique
|
||||
|
||||
for itr in sdk_data["buffer_records"]["memory_copies"]:
|
||||
assert itr["correlation_id"]["internal"] in api_corr_ids_unique
|
||||
|
||||
len_corr_id_unq = len(api_corr_ids_unique)
|
||||
assert len(api_corr_ids) != len_corr_id_unq
|
||||
assert max(api_corr_ids_sorted) == len_corr_id_unq
|
||||
|
||||
|
||||
def test_external_correlation_ids(input_data):
|
||||
data = input_data
|
||||
sdk_data = data["rocprofiler-sdk-json-tool"]
|
||||
|
||||
extern_corr_ids = []
|
||||
for titr in ["hsa_api_traces", "hip_api_traces"]:
|
||||
for itr in sdk_data["callback_records"][titr]:
|
||||
assert itr["record"]["correlation_id"]["external"] > 0
|
||||
assert (
|
||||
itr["record"]["thread_id"] == itr["record"]["correlation_id"]["external"]
|
||||
)
|
||||
extern_corr_ids.append(itr["record"]["correlation_id"]["external"])
|
||||
|
||||
extern_corr_ids = list(set(sorted(extern_corr_ids)))
|
||||
for titr in ["hsa_api_traces", "hip_api_traces"]:
|
||||
for itr in sdk_data["buffer_records"][titr]:
|
||||
assert itr["correlation_id"]["external"] > 0
|
||||
assert itr["thread_id"] == itr["correlation_id"]["external"]
|
||||
assert itr["thread_id"] in extern_corr_ids
|
||||
assert itr["correlation_id"]["external"] in extern_corr_ids
|
||||
|
||||
for itr in sdk_data["buffer_records"]["kernel_dispatches"]:
|
||||
assert itr["correlation_id"]["external"] > 0
|
||||
assert itr["correlation_id"]["external"] in extern_corr_ids
|
||||
|
||||
for itr in sdk_data["buffer_records"]["memory_copies"]:
|
||||
assert itr["correlation_id"]["external"] > 0
|
||||
assert itr["correlation_id"]["external"] in extern_corr_ids
|
||||
|
||||
|
||||
def op_name(op_name, record):
|
||||
found_op = False
|
||||
op_key = None
|
||||
|
||||
for kind_node in record["names"]["kind_names"]:
|
||||
if kind_node["value"] == op_name:
|
||||
op_key = kind_node["key"]
|
||||
|
||||
for op_node in record["names"]["operation_names"]:
|
||||
if op_node["key"] == op_key:
|
||||
return op_node
|
||||
|
||||
|
||||
# Tests above are identical to async-copy. Update as needed
|
||||
|
||||
|
||||
def test_scratch_memory_tracking(input_data):
|
||||
sdk_data = input_data["rocprofiler-sdk-json-tool"]
|
||||
callback_records = sdk_data["callback_records"]
|
||||
buffer_records = sdk_data["buffer_records"]
|
||||
|
||||
scratch_callback_data = sdk_data["callback_records"]["scratch_memory_traces"]
|
||||
scratch_buffer_data = sdk_data["buffer_records"]["scratch_memory_traces"]
|
||||
|
||||
cb_op_names = op_name("SCRATCH_MEMORY", callback_records)["value"]
|
||||
bf_op_names = op_name("SCRATCH_MEMORY", buffer_records)["value"]
|
||||
|
||||
assert len(cb_op_names) == 4
|
||||
assert len(bf_op_names) == 4
|
||||
|
||||
# op name -> enum value
|
||||
scratch_cb_op_map = {node["value"]: node["key"] for node in cb_op_names}
|
||||
scratch_bf_op_map = {node["value"]: node["key"] for node in bf_op_names}
|
||||
assert scratch_cb_op_map == scratch_bf_op_map
|
||||
|
||||
scratch_reported_agent_ids = set()
|
||||
detected_agents_ids = set(
|
||||
agent["id"]["handle"] for agent in sdk_data["agents"] if agent["type"] == 2
|
||||
)
|
||||
# check buffering data
|
||||
for node in scratch_buffer_data:
|
||||
assert "size" in node
|
||||
assert "kind" in node
|
||||
assert "flags" in node
|
||||
assert "thread_id" in node
|
||||
assert "end_timestamp" in node
|
||||
assert "start_timestamp" in node
|
||||
|
||||
assert "queue_id" in node
|
||||
assert "agent_id" in node
|
||||
assert "operation" in node
|
||||
assert "handle" in node["queue_id"]
|
||||
|
||||
assert node["start_timestamp"] > 0
|
||||
assert node["start_timestamp"] < node["end_timestamp"]
|
||||
|
||||
scratch_reported_agent_ids.add(node["agent_id"]["handle"])
|
||||
|
||||
assert 2**64 - 1 not in scratch_reported_agent_ids
|
||||
assert scratch_reported_agent_ids == detected_agents_ids
|
||||
|
||||
# { thread-id -> [ events ], ... }
|
||||
cb_threads = defaultdict(list)
|
||||
bf_threads = defaultdict(list)
|
||||
|
||||
# fetch node["payload"]
|
||||
pl = lambda x: x["payload"]
|
||||
# fetch node["record"]
|
||||
rc = lambda x: x["record"]
|
||||
|
||||
for node in scratch_callback_data:
|
||||
cb_threads[rc(node)["thread_id"]].append(node)
|
||||
|
||||
for node in scratch_buffer_data:
|
||||
bf_threads[node["thread_id"]].append(node)
|
||||
|
||||
for thread_id, nodes in cb_threads.items():
|
||||
assert thread_id > 0
|
||||
|
||||
# start must be followed by end
|
||||
for inx in range(0, len(nodes), 2):
|
||||
this_node = nodes[inx]
|
||||
next_node = nodes[inx + 1]
|
||||
|
||||
assert rc(this_node)["phase"] + 1 == rc(next_node)["phase"]
|
||||
assert rc(this_node)["thread_id"] == rc(next_node)["thread_id"]
|
||||
assert this_node["timestamp"] < next_node["timestamp"]
|
||||
|
||||
# alloc has more data vs free and async reclaim
|
||||
scratch_alloc_node = (
|
||||
this_node["record"]["operation"]
|
||||
== scratch_cb_op_map["SCRATCH_MEMORY_ALLOC"]
|
||||
)
|
||||
if scratch_alloc_node:
|
||||
assert (
|
||||
pl(this_node)["queue_id"]["handle"]
|
||||
== pl(next_node)["queue_id"]["handle"]
|
||||
)
|
||||
assert (
|
||||
this_node["args"]["dispatch_id"] == next_node["args"]["dispatch_id"]
|
||||
)
|
||||
assert "size" in pl(next_node) and pl(next_node)["size"] > 0
|
||||
assert (
|
||||
"num_slots" in next_node["args"]
|
||||
and next_node["args"]["num_slots"] > 0
|
||||
)
|
||||
assert "flags" in pl(next_node)
|
||||
|
||||
# callback data and buffer data must agree with each other
|
||||
for bf_thr, bf_nodes in bf_threads.items():
|
||||
cb_nodes = cb_threads[bf_thr]
|
||||
|
||||
for bf_node_inx in range(len(bf_nodes)):
|
||||
# All these 3 should have same data
|
||||
# timestamps are not same as callback records them at
|
||||
# a different instant in time. Callback timestamp
|
||||
# should be more than buffer timestamp
|
||||
bf_node = bf_nodes[bf_node_inx]
|
||||
cb_enter = cb_nodes[bf_node_inx * 2]
|
||||
cb_exit = cb_nodes[bf_node_inx * 2 + 1]
|
||||
|
||||
assert (
|
||||
bf_node["operation"]
|
||||
== rc(cb_enter)["operation"]
|
||||
== rc(cb_exit)["operation"]
|
||||
)
|
||||
assert (
|
||||
bf_op_names[bf_node["operation"]]
|
||||
== cb_op_names[rc(cb_enter)["operation"]]
|
||||
== cb_op_names[rc(cb_exit)["operation"]]
|
||||
)
|
||||
|
||||
assert bf_node["flags"] == pl(cb_exit)["flags"]
|
||||
|
||||
assert (
|
||||
bf_node["thread_id"]
|
||||
== rc(cb_enter)["thread_id"]
|
||||
== rc(cb_exit)["thread_id"]
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
|
||||
sys.exit(exit_code)
|
||||
@@ -239,6 +239,7 @@ get_callback_tracing_names()
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_MARKER_NAME_API,
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
};
|
||||
|
||||
@@ -302,6 +303,7 @@ get_buffer_tracing_names()
|
||||
ROCPROFILER_BUFFER_TRACING_MARKER_CONTROL_API,
|
||||
ROCPROFILER_BUFFER_TRACING_MARKER_NAME_API,
|
||||
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
};
|
||||
|
||||
auto cb_name_info = buffer_name_info{};
|
||||
@@ -488,12 +490,48 @@ struct marker_api_callback_record_t
|
||||
}
|
||||
};
|
||||
|
||||
struct scratch_memory_callback_record_t
|
||||
{
|
||||
uint64_t timestamp = 0;
|
||||
rocprofiler_callback_tracing_record_t record = {};
|
||||
rocprofiler_callback_tracing_scratch_memory_data_t payload = {};
|
||||
|
||||
template <typename ArchiveT>
|
||||
void save(ArchiveT& ar) const
|
||||
{
|
||||
ar(cereal::make_nvp("timestamp", timestamp));
|
||||
ar(cereal::make_nvp("record", record));
|
||||
ar(cereal::make_nvp("payload", payload));
|
||||
|
||||
if constexpr(std::is_same<ArchiveT, cereal::BinaryOutputArchive>::value ||
|
||||
std::is_same<ArchiveT, cereal::PortableBinaryOutputArchive>::value)
|
||||
{}
|
||||
else
|
||||
{
|
||||
ar.setNextName("args");
|
||||
ar.startNode();
|
||||
if(payload.args_kind == HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_START)
|
||||
{
|
||||
ar(cereal::make_nvp("dispatch_id", payload.args.alloc_start.dispatch_id));
|
||||
}
|
||||
else if(payload.args_kind == HSA_AMD_TOOL_EVENT_SCRATCH_ALLOC_END)
|
||||
{
|
||||
ar(cereal::make_nvp("dispatch_id", payload.args.alloc_end.dispatch_id));
|
||||
ar(cereal::make_nvp("size", payload.args.alloc_end.size));
|
||||
ar(cereal::make_nvp("num_slots", payload.args.alloc_end.num_slots));
|
||||
}
|
||||
ar.finishNode();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto code_object_records = std::deque<code_object_callback_record_t>{};
|
||||
auto kernel_symbol_records = std::deque<kernel_symbol_callback_record_t>{};
|
||||
auto hsa_api_cb_records = std::deque<hsa_api_callback_record_t>{};
|
||||
auto marker_api_cb_records = std::deque<marker_api_callback_record_t>{};
|
||||
auto counter_collection_bf_records = std::deque<rocprofiler_record_counter_t>{};
|
||||
auto hip_api_cb_records = std::deque<hip_api_callback_record_t>{};
|
||||
auto scratch_memory_cb_records = std::deque<scratch_memory_callback_record_t>{};
|
||||
|
||||
rocprofiler_thread_id_t
|
||||
push_external_correlation();
|
||||
@@ -688,6 +726,15 @@ tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
marker_api_cb_records.emplace_back(
|
||||
marker_api_callback_record_t{ts, record, *data, std::move(args)});
|
||||
}
|
||||
else if(record.kind == ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* data =
|
||||
static_cast<rocprofiler_callback_tracing_scratch_memory_data_t*>(record.payload);
|
||||
|
||||
static auto _mutex = std::mutex{};
|
||||
auto _lk = std::unique_lock<std::mutex>{_mutex};
|
||||
scratch_memory_cb_records.emplace_back(scratch_memory_callback_record_t{ts, record, *data});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error{"unsupported callback kind"};
|
||||
@@ -699,6 +746,7 @@ auto marker_api_bf_records = std::deque<rocprofiler_buffer_tracing_marker_api_
|
||||
auto hip_api_bf_records = std::deque<rocprofiler_buffer_tracing_hip_api_record_t>{};
|
||||
auto kernel_dispatch_records = std::deque<rocprofiler_buffer_tracing_kernel_dispatch_record_t>{};
|
||||
auto memory_copy_records = std::deque<rocprofiler_buffer_tracing_memory_copy_record_t>{};
|
||||
auto scratch_memory_records = std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>{};
|
||||
auto corr_id_retire_records =
|
||||
std::deque<rocprofiler_buffer_tracing_correlation_id_retirement_record_t>{};
|
||||
|
||||
@@ -783,6 +831,13 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
|
||||
|
||||
memory_copy_records.emplace_back(*record);
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* record = static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
|
||||
header->payload);
|
||||
|
||||
scratch_memory_records.emplace_back(*record);
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT)
|
||||
{
|
||||
auto* record =
|
||||
@@ -854,6 +909,7 @@ rocprofiler_context_id_t marker_api_buffered_ctx = {};
|
||||
rocprofiler_context_id_t kernel_dispatch_ctx = {};
|
||||
rocprofiler_context_id_t memory_copy_ctx = {};
|
||||
rocprofiler_context_id_t counter_collection_ctx = {};
|
||||
rocprofiler_context_id_t scratch_memory_ctx = {};
|
||||
rocprofiler_context_id_t corr_id_retire_ctx = {};
|
||||
// buffers
|
||||
rocprofiler_buffer_id_t hsa_api_buffered_buffer = {};
|
||||
@@ -862,6 +918,7 @@ rocprofiler_buffer_id_t marker_api_buffered_buffer = {};
|
||||
rocprofiler_buffer_id_t kernel_dispatch_buffer = {};
|
||||
rocprofiler_buffer_id_t memory_copy_buffer = {};
|
||||
rocprofiler_buffer_id_t counter_collection_buffer = {};
|
||||
rocprofiler_buffer_id_t scratch_memory_buffer = {};
|
||||
rocprofiler_buffer_id_t corr_id_retire_buffer = {};
|
||||
|
||||
auto contexts = std::unordered_map<std::string_view, rocprofiler_context_id_t*>{
|
||||
@@ -875,18 +932,18 @@ auto contexts = std::unordered_map<std::string_view, rocprofiler_context_id_t*>{
|
||||
{"KERNEL_DISPATCH", &kernel_dispatch_ctx},
|
||||
{"MEMORY_COPY", &memory_copy_ctx},
|
||||
{"COUNTER_COLLECTION", &counter_collection_ctx},
|
||||
{"SCRATCH_MEMORY", &scratch_memory_ctx},
|
||||
{"CORRELATION_ID_RETIREMENT", &corr_id_retire_ctx},
|
||||
};
|
||||
|
||||
auto buffers = std::array<rocprofiler_buffer_id_t*, 7>{
|
||||
&hsa_api_buffered_buffer,
|
||||
&hip_api_buffered_buffer,
|
||||
&marker_api_buffered_buffer,
|
||||
&kernel_dispatch_buffer,
|
||||
&memory_copy_buffer,
|
||||
&counter_collection_buffer,
|
||||
&corr_id_retire_buffer,
|
||||
};
|
||||
auto buffers = std::array<rocprofiler_buffer_id_t*, 8>{&hsa_api_buffered_buffer,
|
||||
&hip_api_buffered_buffer,
|
||||
&marker_api_buffered_buffer,
|
||||
&kernel_dispatch_buffer,
|
||||
&memory_copy_buffer,
|
||||
&scratch_memory_buffer,
|
||||
&counter_collection_buffer,
|
||||
&corr_id_retire_buffer};
|
||||
|
||||
auto agents = std::vector<rocprofiler_agent_t>{};
|
||||
|
||||
@@ -987,6 +1044,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
nullptr),
|
||||
"hsa api tracing service configure");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_callback_tracing_service(scratch_memory_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY,
|
||||
nullptr,
|
||||
0,
|
||||
tool_tracing_callback,
|
||||
nullptr),
|
||||
"scratch memory tracing service configure");
|
||||
|
||||
constexpr auto buffer_size = 8192;
|
||||
constexpr auto watermark = 7936;
|
||||
|
||||
@@ -1035,6 +1101,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
&memory_copy_buffer),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(scratch_memory_ctx,
|
||||
buffer_size,
|
||||
watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS,
|
||||
tool_tracing_buffered,
|
||||
tool_data,
|
||||
&scratch_memory_buffer),
|
||||
"buffer creation");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(corr_id_retire_ctx,
|
||||
buffer_size,
|
||||
watermark,
|
||||
@@ -1111,6 +1186,14 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
memory_copy_buffer),
|
||||
"buffer tracing service for memory copy configure");
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_buffer_tracing_service(scratch_memory_ctx,
|
||||
ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY,
|
||||
nullptr,
|
||||
0,
|
||||
scratch_memory_buffer),
|
||||
"buffer tracing service for scratch memory configure");
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
corr_id_retire_ctx,
|
||||
ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT,
|
||||
@@ -1247,6 +1330,7 @@ tool_fini(void* tool_data)
|
||||
<< ", marker_api_callback_records=" << marker_api_cb_records.size()
|
||||
<< ", kernel_dispatch_records=" << kernel_dispatch_records.size()
|
||||
<< ", memory_copy_records=" << memory_copy_records.size()
|
||||
<< ", scratch_memory_records=" << scratch_memory_records.size()
|
||||
<< ", hsa_api_bf_records=" << hsa_api_bf_records.size()
|
||||
<< ", hip_api_bf_records=" << hip_api_bf_records.size()
|
||||
<< ", marker_api_bf_records=" << marker_api_bf_records.size()
|
||||
@@ -1334,6 +1418,7 @@ write_json(call_stack_t* _call_stack)
|
||||
json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_cb_records));
|
||||
json_ar(cereal::make_nvp("hip_api_traces", hip_api_cb_records));
|
||||
json_ar(cereal::make_nvp("marker_api_traces", marker_api_cb_records));
|
||||
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_cb_records));
|
||||
} catch(std::exception& e)
|
||||
{
|
||||
std::cerr << "[" << getpid() << "][" << __FUNCTION__
|
||||
@@ -1349,6 +1434,7 @@ write_json(call_stack_t* _call_stack)
|
||||
json_ar(cereal::make_nvp("names", buffer_name_info));
|
||||
json_ar(cereal::make_nvp("kernel_dispatches", kernel_dispatch_records));
|
||||
json_ar(cereal::make_nvp("memory_copies", memory_copy_records));
|
||||
json_ar(cereal::make_nvp("scratch_memory_traces", scratch_memory_records));
|
||||
json_ar(cereal::make_nvp("hsa_api_traces", hsa_api_bf_records));
|
||||
json_ar(cereal::make_nvp("hip_api_traces", hip_api_bf_records));
|
||||
json_ar(cereal::make_nvp("marker_api_traces", marker_api_bf_records));
|
||||
|
||||
Ссылка в новой задаче
Block a user