From f4e9846e1ce6654cd1e3b7ce5518aa469781153f Mon Sep 17 00:00:00 2001 From: Sajina PK Date: Mon, 21 Jul 2025 14:07:00 -0400 Subject: [PATCH 1/6] Manually search for rocdecode and rocjpeg libraries in cmake (#294) * Manually search for rocdecode ad rocjpeg libraries * Update examples/jpegdecode/CMakeLists.txt Fix typo. Co-authored-by: David Galiffi --------- Co-authored-by: David Galiffi --- examples/jpegdecode/CMakeLists.txt | 48 ++++++++++++++++++++++++++--- examples/videodecode/CMakeLists.txt | 44 ++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/examples/jpegdecode/CMakeLists.txt b/examples/jpegdecode/CMakeLists.txt index e92c8f5e34..bc1eed1d55 100644 --- a/examples/jpegdecode/CMakeLists.txt +++ b/examples/jpegdecode/CMakeLists.txt @@ -60,7 +60,47 @@ if(ROCPROFSYS_DISABLE_EXAMPLES) endif() endif() -find_package(rocjpeg QUIET) +# find rocJPEG - library and headers +find_path( + rocjpeg_ROOT_DIR + NAMES include/rocjpeg/rocjpeg.h + HINTS ${ROCmVersion_DIR} ${ROCM_PATH} + PATHS ${ROCmVersion_DIR} ${ROCM_PATH} +) + +mark_as_advanced(rocjpeg_ROOT_DIR) + +find_path( + rocjpeg_INCLUDE_DIR + NAMES rocjpeg/rocjpeg.h + HINTS ${rocjpeg_ROOT_DIR} + PATHS ${rocjpeg_ROOT_DIR} + PATH_SUFFIXES include +) + +find_library( + rocjpeg_LIBRARY + NAMES rocjpeg + HINTS ${rocjpeg_ROOT_DIR} + PATHS ${rocjpeg_ROOT_DIR} + PATH_SUFFIXES lib +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + rocjpeg + FOUND_VAR rocjpeg_FOUND + REQUIRED_VARS rocjpeg_INCLUDE_DIR rocjpeg_LIBRARY +) + +if(rocjpeg_FOUND) + if(NOT TARGET rocjpeg::rocjpeg) + add_library(rocjpeg::rocjpeg INTERFACE IMPORTED) + target_link_libraries(rocjpeg::rocjpeg INTERFACE ${rocjpeg_LIBRARY}) + target_include_directories(rocjpeg::rocjpeg INTERFACE ${rocjpeg_INCLUDE_DIR}) + endif() +endif() + find_package(rocprofiler-register QUIET) # Copy image files to build directory @@ -107,7 +147,7 @@ if(HIP_FOUND AND rocjpeg_FOUND AND Threads_FOUND AND rocprofiler-register_FOUND) set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} rocprofiler-register::rocprofiler-register) # rocJPEG - message(STATUS "RocJPEG library found: ${rocjpeg_LIBRARIES}") + message(STATUS "RocJPEG library found: ${rocjpeg_LIBRARY}") include_directories(${rocjpeg_INCLUDE_DIR}) set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} rocjpeg::rocjpeg) list(APPEND SOURCES ${PROJECT_SOURCE_DIR} jpegdecodeperf.cpp) @@ -136,11 +176,11 @@ else() message(WARNING "-- ERROR!: rocJPEG Not Found! - please install rocJPEG!") endif() if(NOT Threads_FOUND) - message(FATAL_ERROR "-- ERROR!: Threads Not Found! - please insatll Threads!") + message(WARNING "-- ERROR!: Threads Not Found! - please install Threads!") endif() if(NOT rocprofiler-register_FOUND) message( - FATAL_ERROR + WARNING "-- ERROR!: rocprofiler-register Not Found! - please install rocprofiler-register!" ) endif() diff --git a/examples/videodecode/CMakeLists.txt b/examples/videodecode/CMakeLists.txt index f8e8fcbf09..4e4a5bb5a7 100644 --- a/examples/videodecode/CMakeLists.txt +++ b/examples/videodecode/CMakeLists.txt @@ -54,8 +54,48 @@ function(videodecode_message _MSG_TYPE) endfunction() # Find RocDecode -find_package(rocdecode QUIET) -if(NOT rocdecode_FOUND) +find_path( + rocdecode_ROOT_DIR + NAMES include/rocdecode/rocdecode.h + HINTS ${ROCmVersion_DIR} ${ROCM_PATH} + PATHS ${ROCmVersion_DIR} ${ROCM_PATH} +) + +mark_as_advanced(rocdecode_ROOT_DIR) + +find_path( + rocdecode_INCLUDE_DIR + NAMES rocdecode/rocdecode.h + HINTS ${rocdecode_ROOT_DIR} + PATHS ${rocdecode_ROOT_DIR} + PATH_SUFFIXES include +) + +find_library( + rocdecode_LIBRARY + NAMES rocdecode + HINTS ${rocdecode_ROOT_DIR} + PATHS ${rocdecode_ROOT_DIR} + PATH_SUFFIXES lib +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + rocdecode + FOUND_VAR rocdecode_FOUND + REQUIRED_VARS rocdecode_INCLUDE_DIR rocdecode_LIBRARY +) + +if(rocdecode_FOUND) + if(NOT TARGET rocdecode::rocdecode) + add_library(rocdecode::rocdecode INTERFACE IMPORTED) + target_link_libraries(rocdecode::rocdecode INTERFACE ${rocdecode_LIBRARY}) + target_include_directories( + rocdecode::rocdecode + INTERFACE ${rocdecode_INCLUDE_DIR} + ) + endif() +else() videodecode_message(AUTHOR_WARNING "${PROJECT_NAME} skipped. Missing RocDecode...") return() endif() From 67ec52b523b89a3a0758a5aee64149a819ef4b94 Mon Sep 17 00:00:00 2001 From: Sajina PK Date: Wed, 23 Jul 2025 16:02:05 -0400 Subject: [PATCH 2/6] Fix to find MPI symbols from undefined symbols (#293) * Fix to find MPI symbols from undefined symbols * Moved condition checks before * Fixing format --------- Co-authored-by: Anuj Shukla --- source/bin/rocprof-sys-instrument/details.cpp | 71 +++++++++++++++++++ source/bin/rocprof-sys-instrument/fwd.hpp | 3 + .../rocprof-sys-instrument.cpp | 8 +++ 3 files changed, 82 insertions(+) diff --git a/source/bin/rocprof-sys-instrument/details.cpp b/source/bin/rocprof-sys-instrument/details.cpp index 698e8adf14..5da3bd4ba1 100644 --- a/source/bin/rocprof-sys-instrument/details.cpp +++ b/source/bin/rocprof-sys-instrument/details.cpp @@ -488,6 +488,77 @@ find_function(image_t* app_image, const std::string& _name, const strset_t& _ext return _func; } +//======================================================================================// +// +// Find undefined function symbols (external references) in the binary +// +symtab_symbol_t* +find_undefined_function_symbol(image_t* app_image, const std::string& _name) +{ + if(_name.empty()) return nullptr; + + // Get all objects from the image + BPatch_Vector app_objects; + app_image->getObjects(app_objects); + + if(app_objects.empty()) + { + verbprintf(3, "No objects found in image for symbol search\n"); + return nullptr; + } + // Search helper lambda for code reuse + auto _find_symbol = [](SymTab::Symtab* symtab, + const std::string& target_name) -> symtab_symbol_t* { + if(!symtab) return nullptr; + + std::vector all_symbols; + if(!symtab->getAllSymbols(all_symbols)) return nullptr; + + for(auto* symbol : all_symbols) + { + if(!symbol || symbol->getType() != SymTab::Symbol::ST_FUNCTION || + symbol->getRegion()) + continue; + + // Try all possible symbol name representations + std::string symbol_name = symbol->getPrettyName(); + if(symbol_name.empty()) symbol_name = symbol->getMangledName(); + if(symbol_name.empty()) symbol_name = symbol->getTypedName(); + + // Check for exact match and undefined function criteria + if(symbol_name == target_name) return symbol; + } + return nullptr; + }; + + // Search through each object + for(auto* app_object : app_objects) + { + if(!app_object) continue; + + std::string binary_path = app_object->name(); + // Open Symtab directly for comprehensive symbol access + SymTab::Symtab* symtab = nullptr; + if(!SymTab::Symtab::openFile(symtab, binary_path)) + { + verbprintf(3, "Failed to open Symtab for: %s\n", binary_path.c_str()); + continue; + } + + // Search for the primary symbol name + auto* result = _find_symbol(symtab, _name); + if(result) + { + verbprintf(1, "Found undefined function symbol: '%s' in %s\n", _name.c_str(), + binary_path.c_str()); + return result; + } + } + + verbprintf(1, "Undefined function symbol: '%s' ... not found\n", _name.c_str()); + return nullptr; +} + //======================================================================================// // // Get the realpath to this exe diff --git a/source/bin/rocprof-sys-instrument/fwd.hpp b/source/bin/rocprof-sys-instrument/fwd.hpp index 900cd8f17e..ca9a6e7b48 100644 --- a/source/bin/rocprof-sys-instrument/fwd.hpp +++ b/source/bin/rocprof-sys-instrument/fwd.hpp @@ -360,6 +360,9 @@ insert_instr(address_space_t* mutatee, Tp traceFunc, procedure_loc_t traceLoc, procedure_t* find_function(image_t* appImage, const string_t& functionName, const strset_t& = {}); +symtab_symbol_t* +find_undefined_function_symbol(image_t* app_image, const std::string& _name); + void error_func_real(error_level_t level, int num, const char* const* params); diff --git a/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp b/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp index 3c0f9877fd..852473eca7 100644 --- a/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp +++ b/source/bin/rocprof-sys-instrument/rocprof-sys-instrument.cpp @@ -1721,6 +1721,14 @@ main(int argc, char** argv) use_mpi = true; break; } + else if(find_undefined_function_symbol(app_image, itr) != nullptr) + { + verbprintf(0, + "Found undefined symbol '%s' in '%s'. Enabling MPI support...\n", + itr, _cmdv[0]); + use_mpi = true; + break; + } } #endif From 4b4a846b58c57c2df94f00687f614b9dba426309 Mon Sep 17 00:00:00 2001 From: ajanicijamd Date: Wed, 23 Jul 2025 21:28:26 -0400 Subject: [PATCH 3/6] Allow events to be grouped by HIP stream ID (#274) - Corelate memory_copy and kernel_dispatch events with their HIP stream_id and add stream_id as an annotation in Perfetto. - By default, group memory_copy and kernel_dispatch events in Perfetto output by their stream_id. - Add option, with the configuration setting ROCPROFSYS_ROCM_GROUP_BY_QUEUE, to group by HSA queue instead. --------- Signed-off-by: David Galiffi Co-authored-by: David Galiffi --- CHANGELOG.md | 2 + docs/how-to/configuring-runtime-options.rst | 8 + docs/reference/development-guide.rst | 2 +- source/lib/core/categories.hpp | 2 + source/lib/core/rocprofiler-sdk.cpp | 26 ++ source/lib/core/rocprofiler-sdk.hpp | 3 + .../rocprofiler-systems/categories.h | 1 + .../rocprof-sys/library/rocprofiler-sdk.cpp | 414 +++++++++++++----- tests/CMakeLists.txt | 1 + tests/rocprof-sys-rocm-hip-stream.cmake | 65 +++ tests/rocprof-sys-testing.cmake | 12 +- 11 files changed, 416 insertions(+), 120 deletions(-) create mode 100644 tests/rocprof-sys-rocm-hip-stream.cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index d849c5628e..263c1572c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs. - How-to document for VCN and JPEG activity sampling and tracing. - Support for tracing Fortran applications. - Support for tracing MPI API in Fortran. +- By default, group "kernel dispatch" and "memory copy" events by HIP stream ID in Perfetto traces. + - Add the "ROCPROFSYS_ROCM_GROUP_BY_QUEUE" configuration setting to group events by queue, instead. ### Changed diff --git a/docs/how-to/configuring-runtime-options.rst b/docs/how-to/configuring-runtime-options.rst index 59c1b9099e..9e540243f0 100644 --- a/docs/how-to/configuring-runtime-options.rst +++ b/docs/how-to/configuring-runtime-options.rst @@ -217,6 +217,14 @@ The following example: ROCPROFSYS_ROCM_EVENTS = GPUBusy SQ_WAVES:device=0 SQ_INSTS_VALU:device=1 +ROCPROFSYS_ROCM_GROUP_BY_QUEUE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, Perfetto trace will show the HIP streams to which kernel +and memory copy operations submitted. With the +``ROCPROFSYS_ROCM_GROUP_BY_QUEUE=ON`` setting, the trace will display HSA queues +to which these kernel and memory operations were submitted. + Exploring GPU Metrics --------------------- diff --git a/docs/reference/development-guide.rst b/docs/reference/development-guide.rst index 6003021484..5bc7385641 100644 --- a/docs/reference/development-guide.rst +++ b/docs/reference/development-guide.rst @@ -32,7 +32,7 @@ rocprof-sys-sample: `source/bin/rocprof-sys-sample `` and a modified environment -rocprof-sys-casual: `source/bin/rocprof-sys-causal `_ +rocprof-sys-causal: `source/bin/rocprof-sys-causal `_ --------------------------------------------------------------------------------------------------------------------------------------------------- When there is exactly one causal profiling configuration variant (which enables debugging), diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index bb9db10b03..57326e2b66 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -96,6 +96,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip_api, ROCPROFSYS_CATEGORY_ROCM_HIP_ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hsa_api, ROCPROFSYS_CATEGORY_ROCM_HSA_API, "rocm_hsa_api", "ROCm HSA functions") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_kernel_dispatch, ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH, "rocm_kernel_dispatch", "ROCm Kernel dispatch") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_memory_copy, ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY, "rocm_memory_copy", "ROCm Async Memory Copy") +ROCPROFSYS_DEFINE_CATEGORY(category, rocm_hip_stream, ROCPROFSYS_CATEGORY_ROCM_HIP_STREAM, "rocm_hip_stream", "ROCm HIP Stream") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_scratch_memory, ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY, "rocm_scratch_memory", "ROCm kernel scratch memory reallocations") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_page_migration, ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION, "rocm_page_migration", "ROCm memory page migration") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION, "rocm_counter_collection", "ROCm device counter collection") @@ -166,6 +167,7 @@ using name = perfetto_category; ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hsa_api), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_kernel_dispatch), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_memory_copy), \ + ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_hip_stream), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_scratch_memory), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_page_migration), \ ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_counter_collection), \ diff --git a/source/lib/core/rocprofiler-sdk.cpp b/source/lib/core/rocprofiler-sdk.cpp index 917efaf0fc..c889fc2526 100644 --- a/source/lib/core/rocprofiler-sdk.cpp +++ b/source/lib/core/rocprofiler-sdk.cpp @@ -372,6 +372,22 @@ config_settings(const std::shared_ptr& _config) for(const auto& itr : buffered_tracing_info) _add_operation_settings(itr.name, itr, buffered_operation_option_names); + + // Add the ROCPROFSYS_ROCM_GROUP_BY_QUEUE setting if the hip_stream domain is present + // in supported ROCProfiler-SDK domains. + auto _has_hip_stream = std::find(_domain_choices.begin(), _domain_choices.end(), + "hip_stream") != _domain_choices.end(); + + if(_has_hip_stream) + { + ROCPROFSYS_CONFIG_SETTING( + bool, "ROCPROFSYS_ROCM_GROUP_BY_QUEUE", + "By default, Perfetto trace will show the HIP streams to which kernel " + "and memory copy operations submitted. With the " + "`ROCPROFSYS_ROCM_GROUP_BY_QUEUE` option, the trace will display HSA queues " + "to which these kernel and memory operations were submitted.", + false, "rocm", "perfetto"); + } } std::unordered_set @@ -550,6 +566,16 @@ get_rocm_events() " ,;\t\n"); } +bool +get_group_by_queue(void) +{ + std::optional _group_by_queue = + config::get_setting_value("ROCPROFSYS_ROCM_GROUP_BY_QUEUE"); + bool _ret = _group_by_queue.value_or(true); + + return _ret; +} + std::vector get_operations(rocprofiler_callback_tracing_kind_t kindv) { diff --git a/source/lib/core/rocprofiler-sdk.hpp b/source/lib/core/rocprofiler-sdk.hpp index b70c1af633..f4d21a37d1 100644 --- a/source/lib/core/rocprofiler-sdk.hpp +++ b/source/lib/core/rocprofiler-sdk.hpp @@ -70,6 +70,9 @@ get_operations(rocprofiler_buffer_tracing_kind_t kindv); std::vector get_rocm_events(); +bool +get_group_by_queue(); + std::unordered_set get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv); diff --git a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h index 2084113032..27fb063ff3 100644 --- a/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h +++ b/source/lib/rocprof-sys-user/rocprofiler-systems/categories.h @@ -49,6 +49,7 @@ extern "C" ROCPROFSYS_CATEGORY_ROCM_KERNEL_DISPATCH, ROCPROFSYS_CATEGORY_ROCM_MEMORY_COPY, ROCPROFSYS_CATEGORY_ROCM_SCRATCH_MEMORY, + ROCPROFSYS_CATEGORY_ROCM_HIP_STREAM, ROCPROFSYS_CATEGORY_ROCM_PAGE_MIGRATION, ROCPROFSYS_CATEGORY_ROCM_COUNTER_COLLECTION, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, diff --git a/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index 26af5b6261..324b6941b7 100644 --- a/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -92,6 +92,70 @@ thread_postcreate(rocprofiler_runtime_library_t /*lib*/, void* /*tool_data*/) pop_thread_state(); } +#if(ROCPROFILER_VERSION < 700) +/** + * @brief Stream ID. + */ +typedef struct rocprofiler_stream_id_t +{ + uint64_t handle; +} rocprofiler_stream_id_t; + +#endif + +auto& +get_stream_stack() +{ + static thread_local std::vector _v{ rocprofiler_stream_id_t{ + 0 } }; + return _v; +} + +void +stream_id_push(rocprofiler_stream_id_t stream_id) +{ + get_stream_stack().emplace_back(stream_id); +} + +rocprofiler_stream_id_t +stream_id_top() +{ + auto stream_id = get_stream_stack().back(); + return stream_id; +} + +void +stream_id_pop() +{ + get_stream_stack().pop_back(); +} + +// Stores stream ids and kernel region ids for kernel-rename service and hip stream +// display service +struct kernel_rename_and_stream_data +{ + uint64_t region_id = 0; // roctx region correlation id + rocprofiler_stream_id_t stream_id = { 0 }; +}; + +template +rocprofiler_stream_id_t +get_stream_id(Tp* _record) +{ + auto _stream_id = rocprofiler_stream_id_t{ 0 }; + if(_record->correlation_id.external.ptr != nullptr) + { + // Extract the stream id + auto* _ecid_data = static_cast( + _record->correlation_id.external.ptr); + _stream_id = _ecid_data->stream_id; + auto _region_id = _ecid_data->region_id; + _record->correlation_id.external.value = _region_id; + delete _ecid_data; + } + return _stream_id; +} + // this function creates a rocprofiler profile config on the first entry std::vector create_agent_profile(rocprofiler_agent_id_t agent_id, @@ -369,8 +433,9 @@ tool_tracing_callback_stop( &args); } - uint64_t _beg_ts = begin_ts; - uint64_t _end_ts = ts; + uint64_t _beg_ts = begin_ts; + uint64_t _end_ts = ts; + auto stream_id = stream_id_top(); tracing::push_perfetto_ts( CategoryT{}, _name.data(), _beg_ts, @@ -381,6 +446,9 @@ tool_tracing_callback_stop( tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ts); tracing::add_perfetto_annotation(ctx, "corr_id", record.correlation_id.internal); + if(stream_id.handle != 0) + tracing::add_perfetto_annotation(ctx, "stream_id", + stream_id.handle); for(const auto& [key, val] : args) tracing::add_perfetto_annotation(ctx, key, val); @@ -717,6 +785,15 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, { if(num_headers == 0 || headers == nullptr) return; + auto _track_desc_stream = [](uint64_t _stream_id) { + return JOIN("", "HIP Activity Stream ", _stream_id); + }; + + bool _group_by_queue = get_group_by_queue(); + + static auto _mtx = std::mutex{}; + auto _lk = std::unique_lock{ _mtx }; + for(size_t i = 0; i < num_headers; ++i) { auto* header = headers[i]; @@ -740,6 +817,12 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, auto _queue_id = record->dispatch_info.queue_id; const auto* _agent = tool_data->get_gpu_tool_agent(_agent_id); + uint64_t _stream_id = get_stream_id(record).handle; + ROCPROFSYS_CI_THROW( + _stream_id == 0, + "Unexpected zero stream_id in kernel dispatch record: %s.", + _name.c_str()); + if(get_use_timemory()) { const auto& _tinfo = thread_info::get(record->thread_id, SystemTID); @@ -757,59 +840,77 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, if(get_use_perfetto()) { - auto _track_desc = [](int32_t _device_id_v, int64_t _queue_id_v) { - return JOIN("", "GPU Kernel Dispatch [", _device_id_v, "] Queue ", - _queue_id_v); + // Lambda to add common perfetto annotations for kernel dispatch + auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns); + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); + tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id); + tracing::add_perfetto_annotation(ctx, "stream_id", + _stream_id); + + tracing::add_perfetto_annotation(ctx, "queue", + _queue_id.handle); + tracing::add_perfetto_annotation( + ctx, "dispatch_id", record->dispatch_info.dispatch_id); + tracing::add_perfetto_annotation( + ctx, "kernel_id", record->dispatch_info.kernel_id); + tracing::add_perfetto_annotation( + ctx, "private_segment_size", + record->dispatch_info.private_segment_size); + tracing::add_perfetto_annotation( + ctx, "group_segment_size", + record->dispatch_info.group_segment_size); + tracing::add_perfetto_annotation( + ctx, "workgroup_size", + JOIN("", "(", + JOIN(',', record->dispatch_info.workgroup_size.x, + record->dispatch_info.workgroup_size.y, + record->dispatch_info.workgroup_size.z), + ")")); + tracing::add_perfetto_annotation( + ctx, "grid_size", + JOIN("", "(", + JOIN(',', record->dispatch_info.grid_size.x, + record->dispatch_info.grid_size.y, + record->dispatch_info.grid_size.z), + ")")); + } }; - const auto _track = tracing::get_perfetto_track( - category::rocm_kernel_dispatch{}, _track_desc, _agent->device_id, - _queue_id.handle); + if(_group_by_queue) + { + auto _track_desc = [](int32_t _device_id_v, int64_t _queue_id_v) { + return JOIN("", "GPU Kernel Dispatch [", _device_id_v, + "] Queue ", _queue_id_v); + }; - tracing::push_perfetto( - category::rocm_kernel_dispatch{}, _name.c_str(), _track, _beg_ns, - ::perfetto::Flow::ProcessScoped(_corr_id), - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", - _beg_ns); - tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); - tracing::add_perfetto_annotation(ctx, "corr_id", - _corr_id); - tracing::add_perfetto_annotation( - ctx, "node_id", _agent->agent->logical_node_id); - tracing::add_perfetto_annotation(ctx, "queue", - _queue_id.handle); - tracing::add_perfetto_annotation( - ctx, "dispatch_id", - record->dispatch_info.dispatch_id); - tracing::add_perfetto_annotation( - ctx, "kernel_id", record->dispatch_info.kernel_id); - tracing::add_perfetto_annotation( - ctx, "private_segment_size", - record->dispatch_info.private_segment_size); - tracing::add_perfetto_annotation( - ctx, "group_segment_size", - record->dispatch_info.group_segment_size); - tracing::add_perfetto_annotation( - ctx, "workgroup_size", - JOIN("", "(", - JOIN(',', record->dispatch_info.workgroup_size.x, - record->dispatch_info.workgroup_size.y, - record->dispatch_info.workgroup_size.z), - ")")); - tracing::add_perfetto_annotation( - ctx, "grid_size", - JOIN("", "(", - JOIN(',', record->dispatch_info.grid_size.x, - record->dispatch_info.grid_size.y, - record->dispatch_info.grid_size.z), - ")")); - } - }); - tracing::pop_perfetto(category::rocm_kernel_dispatch{}, _name.c_str(), - _track, _end_ns); + const auto _track = tracing::get_perfetto_track( + category::rocm_kernel_dispatch{}, _track_desc, + _agent->device_id, _queue_id.handle); + + tracing::push_perfetto(category::rocm_kernel_dispatch{}, + _name.c_str(), _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_kernel_dispatch{}, + _name.c_str(), _track, _end_ns); + } + else + { + const auto _track = tracing::get_perfetto_track( + category::rocm_hip_stream{}, _track_desc_stream, _stream_id); + + tracing::push_perfetto(category::rocm_hip_stream{}, _name.c_str(), + _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_hip_stream{}, _name.c_str(), + _track, _end_ns); + } } } else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) @@ -828,6 +929,11 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, auto _name = tool_data->buffered_tracing_info.at(record->kind, record->operation); + uint64_t _stream_id = get_stream_id(record).handle; + ROCPROFSYS_CI_THROW( + _stream_id == 0, + "Unexpected zero stream_id in memory copy record: %s.", _name.data()); + if(get_use_timemory()) { const auto& _tinfo = thread_info::get(record->thread_id, SystemTID); @@ -845,36 +951,55 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, if(get_use_perfetto()) { - auto _track_desc = [](int32_t _device_id_v, - rocprofiler_thread_id_t _tid) { - const auto& _tid_v = thread_info::get(_tid, SystemTID); - return JOIN("", "GPU Memory Copy to Agent [", _device_id_v, - "] Thread ", _tid_v->index_data->sequent_value); + auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns); + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); + tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id); + tracing::add_perfetto_annotation(ctx, "stream_id", + _stream_id); + tracing::add_perfetto_annotation(ctx, "dst_agent", + _dst_agent->logical_node_id); + tracing::add_perfetto_annotation(ctx, "src_agent", + _src_agent->logical_node_id); + } }; - const auto _track = tracing::get_perfetto_track( - category::rocm_memory_copy{}, _track_desc, - _dst_agent->logical_node_id, record->thread_id); + if(_group_by_queue) + { + auto _track_desc = [](int32_t _device_id_v, + rocprofiler_thread_id_t _tid) { + const auto& _tid_v = thread_info::get(_tid, SystemTID); + return JOIN("", "GPU Memory Copy to Agent [", _device_id_v, + "] Thread ", _tid_v->index_data->sequent_value); + }; - tracing::push_perfetto( - category::rocm_memory_copy{}, _name.data(), _track, _beg_ns, - ::perfetto::Flow::ProcessScoped(_corr_id), - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", - _beg_ns); - tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); - tracing::add_perfetto_annotation(ctx, "corr_id", - _corr_id); - tracing::add_perfetto_annotation( - ctx, "dst_agent", _dst_agent->logical_node_id); - tracing::add_perfetto_annotation( - ctx, "src_agent", _src_agent->logical_node_id); - } - }); - tracing::pop_perfetto(category::rocm_memory_copy{}, "", _track, - _end_ns); + const auto _track = tracing::get_perfetto_track( + category::rocm_memory_copy{}, _track_desc, + _dst_agent->logical_node_id, record->thread_id); + + tracing::push_perfetto(category::rocm_memory_copy{}, _name.data(), + _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_memory_copy{}, "", _track, + _end_ns); + } + else + { + const auto _track = tracing::get_perfetto_track( + category::rocm_hip_stream{}, _track_desc_stream, _stream_id); + + tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(), + _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track, + _end_ns); + } } } else @@ -1005,30 +1130,6 @@ dispatch_counting_service_callback( } } -// int -// external_correlation_id_callback( -// rocprofiler_thread_id_t /*thr_id*/, rocprofiler_context_id_t /*ctx_id*/, -// rocprofiler_external_correlation_id_request_kind_t /*kind*/, -// rocprofiler_tracing_operation_t /*op*/, uint64_t /*internal_corr_id*/, -// rocprofiler_user_data_t* external_corr_id, void* /*user_data*/) -// { -// auto* _data = new kernel_dispatch_bundle_t{ "kernel_dispatch" }; -// _data->push(); -// external_corr_id->ptr = _data; -// return 0; -// } - -// void -// agent_counter_profile_callback(rocprofiler_context_id_t context_id, -// rocprofiler_agent_id_t agent, -// rocprofiler_agent_set_profile_callback_t set_config, void*) -// { -// if(!agent_counter_profiles) return; -// if(auto itr = agent_counter_profiles->find(agent); -// itr != agent_counter_profiles->end() && itr->second) -// set_config(context_id, *itr->second); -// } - bool is_initialized(rocprofiler_context_id_t ctx) { @@ -1069,6 +1170,73 @@ flush() } } +int +set_kernel_rename_and_stream_correlation_id( + rocprofiler_thread_id_t /* thr_id */, rocprofiler_context_id_t /* ctx_id */, + rocprofiler_external_correlation_id_request_kind_t /* kind */, + rocprofiler_tracing_operation_t /* op */, uint64_t /* internal_corr_id */, + rocprofiler_user_data_t* external_corr_id, void* /* user_data */) +{ + auto* _info = new kernel_rename_and_stream_data{}; + + _info->stream_id = stream_id_top(); + + // Set the external correlation id service to point to struct + external_corr_id->ptr = _info; + + return 0; +} + +#if(ROCPROFILER_VERSION >= 700) +void +tool_hip_stream_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, void* /* data */) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_HIP_STREAM) return; + // Extract stream ID from record + auto* stream_handle_data = + static_cast(record.payload); + auto stream_id = stream_handle_data->stream_id; + + // STREAM_HANDLE_CREATE and DESTROY are no-ops + if(record.operation == ROCPROFILER_HIP_STREAM_CREATE) + { + ROCPROFSYS_VERBOSE_F( + 2, "Entered hip_streams_callback function for ROCPROFILER_HIP_STREAM_CREATE"); + } + else if(record.operation == ROCPROFILER_HIP_STREAM_DESTROY) + { + ROCPROFSYS_VERBOSE_F( + 2, + "Entered hip_streams_callback function for ROCPROFILER_HIP_STREAM_DESTROY"); + } + else if(record.operation == ROCPROFILER_HIP_STREAM_SET) + { + // Push the stream ID onto the stream stack before underlying HIP function is + // called + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + ROCPROFSYS_VERBOSE_F( + 2, "Entered hip_streams_callback function for ROCPROFILER_HIP_STREAM_SET " + "with ROCPROFILER_CALLBACK_PHASE_ENTER"); + stream_id_push(stream_id); + } + // Pop stream ID off of stream stack after underlying HIP function is completed + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT) + { + ROCPROFSYS_VERBOSE_F( + 2, "Entered hip_stream_callback function for ROCPROFILER_HIP_STREAM_SET " + "with ROCPROFILER_CALLBACK_PHASE_EXIT"); + stream_id_pop(); + } + } + else + { + ROCPROFSYS_FAIL_F("Unknown operation for hip_stream_callback!"); + } +} +#endif + int tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) { @@ -1128,6 +1296,30 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) constexpr auto buffer_size = 8192; constexpr auto watermark = 7936; + // Configure external correlation id request service for kernel dispatch + // and memory copy. + + auto external_corr_id_request_kinds = + std::array{ + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH, + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY + }; + + ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service( + _data->primary_ctx, external_corr_id_request_kinds.data(), + external_corr_id_request_kinds.size(), + set_kernel_rename_and_stream_correlation_id, _data)); + +#if(ROCPROFILER_VERSION >= 700) + if((_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) > 0) || + (_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) > 0)) + { + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + _data->primary_ctx, ROCPROFILER_CALLBACK_TRACING_HIP_STREAM, nullptr, 0, + tool_hip_stream_callback, nullptr)); + } +#endif + if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) > 0) { ROCPROFILER_CALL(rocprofiler_create_buffer( @@ -1138,16 +1330,6 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, _data->kernel_dispatch_buffer)); - - // auto external_corr_id_request_kinds = - // std::array{ - // ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH - // }; - - // ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service( - // _data->primary_ctx, external_corr_id_request_kinds.data(), - // external_corr_id_request_kinds.size(), external_correlation_id_callback, - // _data)); } if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) > 0) @@ -1157,12 +1339,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, &_data->memory_copy_buffer)); - auto _ops = - rocprofiler_sdk::get_operations(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY); - ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( - _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, - (_ops.empty()) ? nullptr : _ops.data(), _ops.size(), + _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, _data->memory_copy_buffer)); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6e3241fb86..b2e46cee4c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -48,5 +48,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-python-tests.cmake) include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-decode-tests.cmake) include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-nic-perf.cmake) include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-roctx-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-rocm-hip-stream.cmake) add_subdirectory(source) diff --git a/tests/rocprof-sys-rocm-hip-stream.cmake b/tests/rocprof-sys-rocm-hip-stream.cmake new file mode 100644 index 0000000000..84e3a855c7 --- /dev/null +++ b/tests/rocprof-sys-rocm-hip-stream.cmake @@ -0,0 +1,65 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# -------------------------------------------------------------------------------------- # +# +# ROCm tests +# +# -------------------------------------------------------------------------------------- # + +find_package(ROCmVersion) + +if(NOT ROCmVersion_FOUND) + message( + WARNING + "ROCmVersion_FOUND not found, skipping tests in ${CMAKE_CURRENT_LIST_FILE}" + ) + return() +endif() + +if(${ROCmVersion_FULL_VERSION} VERSION_GREATER_EQUAL "7.0") + message(STATUS "Adding Group-By Tests") + + rocprofiler_systems_add_test( + SKIP_REWRITE SKIP_RUNTIME SKIP_BASELINE + NAME transpose-group-by-queue + TARGET transpose + MPI ${TRANSPOSE_USE_MPI} + GPU ON + NUM_PROCS ${NUM_PROCS} + ENVIRONMENT "${_base_environment};ROCPROFSYS_ROCM_GROUP_BY_QUEUE=YES" + LABEL "group-by-queue" + RUNTIME_TIMEOUT 480 + ) + + rocprofiler_systems_add_test( + SKIP_REWRITE SKIP_RUNTIME SKIP_BASELINE + NAME transpose-group-by-stream + TARGET transpose + MPI ${TRANSPOSE_USE_MPI} + GPU ON + NUM_PROCS ${NUM_PROCS} + ENVIRONMENT "${_base_environment};ROCPROFSYS_ROCM_GROUP_BY_QUEUE=NO" + LABEL "group-by-queue" + RUNTIME_TIMEOUT 480 + ) +endif() diff --git a/tests/rocprof-sys-testing.cmake b/tests/rocprof-sys-testing.cmake index 98886fe90e..6e16c96717 100644 --- a/tests/rocprof-sys-testing.cmake +++ b/tests/rocprof-sys-testing.cmake @@ -532,7 +532,7 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) cmake_parse_arguments( TEST "SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME" - "NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" + "NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;WILL_FAIL;DISABLED" "${_KWARGS}" ${ARGN} ) @@ -584,6 +584,14 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) set(TEST_SAMPLING_TIMEOUT 120) endif() + if(NOT TEST_DISABLED) + set(TEST_DISABLED OFF) + endif() + + if(NOT TEST_WILL_FAIL) + set(TEST_WILL_FAIL OFF) + endif() + if(NOT DEFINED TEST_ENVIRONMENT OR "${TEST_ENVIRONMENT}" STREQUAL "") set(TEST_ENVIRONMENT "${_test_environment}") endif() @@ -777,6 +785,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST) PASS_REGULAR_EXPRESSION "${${_PASS_REGEX}}" FAIL_REGULAR_EXPRESSION "${${_FAIL_REGEX}}" SKIP_REGULAR_EXPRESSION "${${_SKIP_REGEX}}" + WILL_FAIL ${TEST_WILL_FAIL} + DISABLED ${TEST_DISABLED} ${_props} ) endif() From 26ae543012c4ef876bc40e306f0a11916736e64b Mon Sep 17 00:00:00 2001 From: Aleksandar Djordjevic Date: Mon, 28 Jul 2025 17:33:52 +0200 Subject: [PATCH 4/6] ROCpd support [Part 1] (#279) - Add rocpd support for - cpu_frequency - amd_smi - sampling --- .gitmodules | 3 + CHANGELOG.md | 1 + CMakeLists.txt | 10 + cmake/Packages.cmake | 11 + cmake/SQLite3.cmake | 48 ++ source/bin/rocprof-sys-avail/avail.cpp | 3 +- source/lib/CMakeLists.txt | 1 + source/lib/common/CMakeLists.txt | 2 + source/lib/common/md5sum.hpp | 469 ++++++++++++ source/lib/common/traits.hpp | 83 ++ source/lib/core/CMakeLists.txt | 28 +- source/lib/core/agent.hpp | 62 ++ source/lib/core/agent_manager.cpp | 137 ++++ source/lib/core/agent_manager.hpp | 62 ++ source/lib/core/benchmark/benchmark.hpp | 353 +++++++++ source/lib/core/benchmark/category.hpp | 68 ++ source/lib/core/config.cpp | 44 ++ source/lib/core/config.hpp | 6 + source/lib/core/cpu.cpp | 168 ++++ source/lib/core/cpu.hpp | 57 ++ source/lib/core/gpu.cpp | 71 +- source/lib/core/node_info.cpp | 72 ++ source/lib/core/node_info.hpp | 58 ++ source/lib/core/rocpd/CMakeLists.txt | 13 + source/lib/core/rocpd/data_processor.cpp | 674 ++++++++++++++++ source/lib/core/rocpd/data_processor.hpp | 252 ++++++ .../core/rocpd/data_storage/CMakeLists.txt | 14 + .../lib/core/rocpd/data_storage/database.cpp | 170 +++++ .../lib/core/rocpd/data_storage/database.hpp | 204 +++++ .../data_storage/insert_query_builders.hpp | 126 +++ .../rocpd/data_storage/schema/data_views.sql | 722 ++++++++++++++++++ .../data_storage/schema/marker_views.sql | 3 + .../data_storage/schema/rocpd_indexes.sql | 45 ++ .../data_storage/schema/rocpd_tables.sql | 373 +++++++++ .../rocpd/data_storage/schema/rocpd_views.sql | 139 ++++ .../data_storage/schema/summary_views.sql | 376 +++++++++ .../rocpd/data_storage/table_insert_query.hpp | 57 ++ source/lib/core/rocpd/json.cpp | 99 +++ source/lib/core/rocpd/json.hpp | 57 ++ source/lib/rocprof-sys/library.cpp | 101 ++- source/lib/rocprof-sys/library/amd_smi.cpp | 407 ++++++---- .../library/components/backtrace_metrics.cpp | 269 +++++++ .../library/components/backtrace_metrics.hpp | 3 + .../library/components/comm_data.cpp | 459 ++++++++--- .../library/components/comm_data.hpp | 2 +- source/lib/rocprof-sys/library/cpu_freq.cpp | 314 ++++++-- source/lib/rocprof-sys/library/kokkosp.cpp | 74 ++ .../library/rocprofiler-sdk/counters.cpp | 18 +- source/lib/rocprof-sys/library/sampling.cpp | 347 ++++++++- 49 files changed, 6770 insertions(+), 365 deletions(-) create mode 100644 cmake/SQLite3.cmake create mode 100644 source/lib/common/md5sum.hpp create mode 100644 source/lib/common/traits.hpp create mode 100644 source/lib/core/agent.hpp create mode 100644 source/lib/core/agent_manager.cpp create mode 100644 source/lib/core/agent_manager.hpp create mode 100644 source/lib/core/benchmark/benchmark.hpp create mode 100644 source/lib/core/benchmark/category.hpp create mode 100644 source/lib/core/cpu.cpp create mode 100644 source/lib/core/cpu.hpp create mode 100644 source/lib/core/node_info.cpp create mode 100644 source/lib/core/node_info.hpp create mode 100644 source/lib/core/rocpd/CMakeLists.txt create mode 100644 source/lib/core/rocpd/data_processor.cpp create mode 100644 source/lib/core/rocpd/data_processor.hpp create mode 100644 source/lib/core/rocpd/data_storage/CMakeLists.txt create mode 100644 source/lib/core/rocpd/data_storage/database.cpp create mode 100644 source/lib/core/rocpd/data_storage/database.hpp create mode 100644 source/lib/core/rocpd/data_storage/insert_query_builders.hpp create mode 100644 source/lib/core/rocpd/data_storage/schema/data_views.sql create mode 100644 source/lib/core/rocpd/data_storage/schema/marker_views.sql create mode 100644 source/lib/core/rocpd/data_storage/schema/rocpd_indexes.sql create mode 100644 source/lib/core/rocpd/data_storage/schema/rocpd_tables.sql create mode 100644 source/lib/core/rocpd/data_storage/schema/rocpd_views.sql create mode 100644 source/lib/core/rocpd/data_storage/schema/summary_views.sql create mode 100644 source/lib/core/rocpd/data_storage/table_insert_query.hpp create mode 100644 source/lib/core/rocpd/json.cpp create mode 100644 source/lib/core/rocpd/json.hpp diff --git a/.gitmodules b/.gitmodules index 89b551a769..c70439d0ab 100644 --- a/.gitmodules +++ b/.gitmodules @@ -24,3 +24,6 @@ [submodule "external/pybind11"] path = external/pybind11 url = https://github.com/jrmadsen/pybind11.git +[submodule "external/sqlite"] + path = external/sqlite + url = https://github.com/sqlite/sqlite.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 263c1572c8..88de0f6a25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs. - How-to document for VCN and JPEG activity sampling and tracing. - Support for tracing Fortran applications. - Support for tracing MPI API in Fortran. +- Initial support for rocPD database output with the `ROCPROFSYS_USE_ROCPD` configuration setting. - By default, group "kernel dispatch" and "memory copy" events by HIP stream ID in Perfetto traces. - Add the "ROCPROFSYS_ROCM_GROUP_BY_QUEUE" configuration setting to group events by queue, instead. diff --git a/CMakeLists.txt b/CMakeLists.txt index 04bb87d551..605b070f0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,6 +221,12 @@ rocprofiler_systems_add_option(ROCPROFSYS_BUILD_CODECOV "Build for code coverage rocprofiler_systems_add_option(ROCPROFSYS_INSTALL_PERFETTO_TOOLS "Install perfetto tools (i.e. traced, perfetto, etc.)" OFF ) +rocprofiler_systems_add_option(ROCPROFILER_BUILD_SQLITE3 + "Enable building sqlite3 library internally" OFF +) +rocprofiler_systems_add_option(ROCPROFSYS_ENABLE_BENCHMARK + "Enable performance benchmarking capabilities for the project" OFF +) if(ROCPROFSYS_USE_PAPI) rocprofiler_systems_add_option(ROCPROFSYS_BUILD_PAPI "Build PAPI from submodule" ON) @@ -328,6 +334,10 @@ if(ROCPROFSYS_BUILD_TESTING OR "$ENV{ROCPROFSYS_CI}" MATCHES "[1-9]+|ON|on|y|yes include(CTest) endif() +if(ROCPROFSYS_ENABLE_BENCHMARK) + add_compile_definitions(-DROCPROFSYS_USE_BENCHMARK=1) +endif() + # ------------------------------------------------------------------------------# # # library and executables diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index f24c2c653a..fd9b302a0b 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -53,6 +53,9 @@ rocprofiler_systems_add_interface_library(rocprofiler-systems-python rocprofiler_systems_add_interface_library(rocprofiler-systems-perfetto "Enables Perfetto support" ) +rocprofiler_systems_add_interface_library(rocprofiler-systems-sqlite3 + "Use SQLite3 for rocpd data storage" +) rocprofiler_systems_add_interface_library(rocprofiler-systems-timemory "Provides timemory libraries" ) @@ -532,6 +535,14 @@ rocprofiler_systems_checkout_git_submodule( include(Perfetto) +# ----------------------------------------------------------------------------------------# +# +# SQLite3 +# +# ----------------------------------------------------------------------------------------# + +include(SQLite3) + # ----------------------------------------------------------------------------------------# # # ELFIO diff --git a/cmake/SQLite3.cmake b/cmake/SQLite3.cmake new file mode 100644 index 0000000000..44669c3280 --- /dev/null +++ b/cmake/SQLite3.cmake @@ -0,0 +1,48 @@ +include_guard(GLOBAL) + +if(ROCPROFILER_BUILD_SQLITE3) + message(STATUS "Building SQLite3 from source!") + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${PROJECT_BINARY_DIR}/external/sqlite + ) + # checkout submodule if not already checked out or clone repo if no .gitmodules file + rocprofiler_systems_checkout_git_submodule( + RELATIVE_PATH external/sqlite + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + TEST_FILE configure + REPO_URL https://github.com/sqlite/sqlite.git + REPO_BRANCH "version-3.45.3" + ) + + find_program(MAKE_COMMAND NAMES make gmake PATH_SUFFIXES bin REQUIRED) + + include(ExternalProject) + ExternalProject_Add( + rocprofiler-systems-sqlite-build + PREFIX ${PROJECT_BINARY_DIR}/external/sqlite/build + SOURCE_DIR ${PROJECT_SOURCE_DIR}/external/sqlite + BUILD_IN_SOURCE 0 + CONFIGURE_COMMAND + /configure --prefix=${PROJECT_BINARY_DIR}/external/sqlite/install + --libdir=${PROJECT_BINARY_DIR}/external/sqlite/install/lib --disable-shared + --with-tempstore=yes --enable-all --disable-tcl CFLAGS=-O3\ -g1 + BUILD_COMMAND ${MAKE_COMMAND} install -s + INSTALL_COMMAND "" + ) + + target_link_libraries( + rocprofiler-systems-sqlite3 + INTERFACE + $ + ) + target_include_directories( + rocprofiler-systems-sqlite3 + SYSTEM + INTERFACE $ + ) + add_dependencies(rocprofiler-systems-sqlite3 rocprofiler-systems-sqlite-build) +else() + message(STATUS "Using system SQLite3 library") + find_package(SQLite3 REQUIRED) + target_link_libraries(rocprofiler-systems-sqlite3 INTERFACE SQLite::SQLite3) +endif() diff --git a/source/bin/rocprof-sys-avail/avail.cpp b/source/bin/rocprof-sys-avail/avail.cpp index 93128f8651..9313a06fe9 100644 --- a/source/bin/rocprof-sys-avail/avail.cpp +++ b/source/bin/rocprof-sys-avail/avail.cpp @@ -118,7 +118,7 @@ write_hw_counter_info(std::ostream&, const array_t& = {}, namespace { // initialize HIP before main so that librocprof-sys is not HSA_TOOLS_LIB -int gpu_count = rocprofsys::gpu::device_count(); +int gpu_count = 0; // statically allocated shared_ptrs to prevent use after free errors auto timemory_manager = tim::manager::master_instance(); @@ -138,6 +138,7 @@ main(int argc, char** argv) tim::unwind::set_bfd_verbose(3); tim::set_env("ROCPROFSYS_INIT_TOOLING", "OFF", 1); rocprofsys_init_library(); + gpu_count = rocprofsys::gpu::device_count(); std::set _category_options = component_categories{}(); { diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 14e81112e7..e6699c833d 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -40,6 +40,7 @@ target_link_libraries( $ $ $ + $ $ $ $ diff --git a/source/lib/common/CMakeLists.txt b/source/lib/common/CMakeLists.txt index d096ebf933..8ea90f2aed 100644 --- a/source/lib/common/CMakeLists.txt +++ b/source/lib/common/CMakeLists.txt @@ -26,6 +26,8 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/invoke.hpp ${CMAKE_CURRENT_SOURCE_DIR}/join.hpp ${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/traits.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/md5sum.hpp ${CMAKE_CURRENT_SOURCE_DIR}/static_object.hpp ${CMAKE_CURRENT_SOURCE_DIR}/synchronized.hpp ) diff --git a/source/lib/common/md5sum.hpp b/source/lib/common/md5sum.hpp new file mode 100644 index 0000000000..6de09280e6 --- /dev/null +++ b/source/lib/common/md5sum.hpp @@ -0,0 +1,469 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "traits.hpp" + +namespace rocprofsys +{ +inline namespace common +{ + +class md5sum +{ +public: + using size_type = uint32_t; // must be 32bit + using raw_digest_t = std::array; + static constexpr int blocksize = 64; + + template + explicit md5sum(Tp&& arg, Args&&... args); + + md5sum() = default; + ~md5sum() = default; + md5sum(const md5sum&) = default; + md5sum(md5sum&&) = default; + + md5sum& operator=(const md5sum&) = default; + md5sum& operator=(md5sum&&) = default; + + md5sum& update(std::string_view inp); + md5sum& update(const unsigned char* buf, size_type length); + md5sum& update(const char* buf, size_type length); + md5sum& finalize(); + std::string hexdigest() const; + std::string hexliteral() const; + raw_digest_t rawdigest() const { return digest; } + + template ::value, int>> + md5sum& update(Tp inp); + + friend std::ostream& operator<<(std::ostream&, md5sum md5); + +private: + void transform(const uint8_t block[blocksize]); + + bool finalized = false; + // 64bit counter for number of bits (lo, hi) + std::array count = { 0, 0 }; + std::array buffer{}; // overflow bytes from last 64 byte chunk + // digest so far, initialized to magic initialization constants. + std::array state = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 }; + std::array digest{}; // result +}; + +template +md5sum::md5sum(Tp&& arg, Args&&... args) +{ + auto _update = [&](auto&& _val) { + using value_type = + std::remove_reference_t>>; + static_assert(!std::is_pointer::value, + "constructor cannot be called with pointer argument"); + update(std::forward(_val)); + }; + + _update(std::forward(arg)); + (_update(std::forward(args)), ...); + finalize(); +} + +template +md5sum& +md5sum::update(Tp inp) +{ + static_assert(std::is_arithmetic::value, "expected arithmetic type"); + return update(reinterpret_cast(&inp), sizeof(Tp)); +} + +template