Use rocprofiler-sdk for RCCL-API tracing (#126)

- Add support for RCCL API tracing through rocprofiler-sdk.
- Refactor the comm_data code to use the SDK RCCL_API callbacks.
- Add a runtime version check for the SDK to gate callback enablement, instead of relying only on the compile-time check.
- Fix SAMPLING_TIMEOUT not being handled correctly in add_test.

[ROCm/rocprofiler-systems commit: af77d93f75]
This commit is contained in:
David Galiffi
2025-06-06 11:36:17 -04:00
committed by GitHub
vanhempi f718bd907c
commit c7c3c3f97e
31 muutettua tiedostoa jossa 350 lisäystä ja 1252 poistoa
@@ -562,10 +562,9 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_data.processed_environs.emplace("periods");
}
strset_t _backend_choices = {
"all", "kokkosp", "mpip", "ompt", "rcclp",
"amd-smi", "rocprofiler-sdk", "mutex-locks", "spin-locks", "rw-locks"
};
strset_t _backend_choices = { "all", "kokkosp", "mpip", "ompt",
"rcclp", "amd-smi", "rocm", "mutex-locks",
"spin-locks", "rw-locks" };
#if !defined(ROCPROFSYS_USE_MPI) && !defined(ROCPROFSYS_USE_MPI_HEADERS)
_backend_choices.erase("mpip");
@@ -575,14 +574,10 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
_backend_choices.erase("ompt");
#endif
#if !defined(ROCPROFSYS_USE_RCCL)
_backend_choices.erase("rcclp");
#endif
#if !defined(ROCPROFSYS_USE_ROCM)
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rocm");
_backend_choices.erase("rcclp");
#endif
if(gpu::device_count() == 0)
@@ -590,13 +585,8 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
// remove GPU-specific backends
_backend_choices.erase("rcclp");
_backend_choices.erase("amd-smi");
_backend_choices.erase("rocprofiler-sdk");
_backend_choices.erase("rocm");
#if defined(ROCPROFSYS_USE_RCCL)
update_env(_data, "ROCPROFSYS_USE_RCCLP", false);
#endif
#if defined(ROCPROFSYS_USE_ROCM)
update_env(_data, "ROCPROFSYS_USE_AMD_SMI", false);
update_env(_data, "ROCPROFSYS_USE_ROCM", false);