SDK: create CMake option for strict checks on CPU vs. GPU timestamps (#1159)

* SDK: create CMake option for strict checks on CPU vs. GPU timestamps

- Configuring CMake with `ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS=ON` will enable fatal errors if dispatch/memcpy timestamps on the GPU fall outside of the start/end time recorded on the CPU
- `ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS` defaults to the value of `ROCPROFILER_BUILD_CI`

* Formatting

* Disable async_copy frequency scaling

* Disable profiling dispatch time frequency scaling

* Support runtime configuration via env variables

- ROCPROFILER_CI_FREQ_SCALE_TIMESTAMPS env variable will enable scaling the timestamps based on the hsa timestamp period
- ROCPROFILER_CI_STRICT_TIMESTAMPS env variable will enable strict timestamp checks
  - when cmake is configured with ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS=ON, this env variable defaults to true

* ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS defaults to OFF

* Update cmake-target

* Common tracing::adjust_profiling_time

---------

Co-authored-by: Gopesh Bhardwaj <gopesh.bhardwaj@amd.com>

[ROCm/rocprofiler-sdk commit: ad48201912]
이 커밋은 다음에 포함됨:
Jonathan R. Madsen
2024-11-01 23:12:51 -05:00
커밋한 사람 GitHub
부모 d8cfdd2887
커밋 3e64cedc0c
9개의 변경된 파일282개의 추가작업 그리고 131개의 파일을 삭제
+5
파일 보기
@@ -198,6 +198,11 @@ if(ROCPROFILER_UNSAFE_NO_VERSION_CHECK)
INTERFACE ROCPROFILER_UNSAFE_NO_VERSION_CHECK)
endif()
if(ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS)
rocprofiler_target_compile_definitions(rocprofiler-sdk-build-flags
INTERFACE ROCPROFILER_CI_STRICT_TIMESTAMPS)
endif()
# ----------------------------------------------------------------------------------------#
# user customization
#
+3
파일 보기
@@ -38,6 +38,9 @@ rocprofiler_add_option(ROCPROFILER_BUILD_TESTS "Enable building the tests"
${ROCPROFILER_BUILD_CI})
rocprofiler_add_option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples"
${ROCPROFILER_BUILD_CI})
rocprofiler_add_option(
ROCPROFILER_BUILD_CI_STRICT_TIMESTAMPS
"Disable adjusting for clock skew b/t CPU and GPU timestamps" OFF ADVANCED)
rocprofiler_add_option(ROCPROFILER_BUILD_CODECOV
"Enable building for code coverage analysis" OFF)
rocprofiler_add_option(ROCPROFILER_BUILD_DOCS
+20 -44
파일 보기
@@ -21,6 +21,8 @@
// THE SOFTWARE.
#include "lib/rocprofiler-sdk/hsa/async_copy.hpp"
#include "lib/common/defines.hpp"
#include "lib/common/environment.hpp"
#include "lib/common/logging.hpp"
#include "lib/common/scope_destructor.hpp"
#include "lib/common/static_object.hpp"
@@ -28,9 +30,9 @@
#include "lib/rocprofiler-sdk/agent.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "lib/rocprofiler-sdk/tracing/fwd.hpp"
#include "lib/rocprofiler-sdk/tracing/profiling_time.hpp"
#include "lib/rocprofiler-sdk/tracing/tracing.hpp"
#include <rocprofiler-sdk/callback_tracing.h>
@@ -317,30 +319,6 @@ convert_hsa_handle(Up _hsa_object)
return reinterpret_cast<Tp*>(_hsa_object.handle);
}
hsa_amd_profiling_async_copy_time_t&
operator+=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
lhs.start += rhs;
lhs.end += rhs;
return lhs;
}
hsa_amd_profiling_async_copy_time_t&
operator-=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
lhs.start -= rhs;
lhs.end -= rhs;
return lhs;
}
hsa_amd_profiling_async_copy_time_t&
operator*=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
lhs.start *= rhs;
lhs.end *= rhs;
return lhs;
}
bool
async_copy_handler(hsa_signal_value_t signal_value, void* arg)
{
@@ -352,41 +330,38 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg)
return false;
}
static auto sysclock_period = hsa::get_hsa_timestamp_period();
auto ts = common::timestamp_ns();
auto* _data = static_cast<async_copy_data*>(arg);
auto copy_time = hsa_amd_profiling_async_copy_time_t{};
auto copy_time_status = get_amd_ext_table()->hsa_amd_profiling_get_async_copy_time_fn(
_data->rocp_signal, &copy_time);
// normalize
copy_time *= sysclock_period;
auto _profile_time = tracing::profiling_time{copy_time_status, copy_time.start, copy_time.end};
// below is a hack for clock skew issues:
// the timestamp of this handler for the copy will always be after when the copy ended
if(ts < copy_time.end) copy_time -= (copy_time.end - ts);
if(_profile_time.status == HSA_STATUS_SUCCESS)
{
_profile_time = tracing::adjust_profiling_time(
"memcpy",
_profile_time,
tracing::profiling_time{HSA_STATUS_SUCCESS, _data->start_ts, ts});
// below is a hack for clock skew issues:
// the timestamp of the function call triggering the copy will always be before when the copy
// started
if(copy_time.start < _data->start_ts) copy_time += (_data->start_ts - copy_time.start);
// if we encounter this in CI, it will cause test to fail
ROCP_CI_LOG_IF(ERROR, copy_time_status == HSA_STATUS_SUCCESS && copy_time.end < copy_time.start)
<< "hsa_amd_profiling_get_async_copy_time for returned async times where the end time ("
<< copy_time.end << ") was less than the start time (" << copy_time.start << ")";
// if we encounter this in CI, it will cause test to fail
ROCP_CI_LOG_IF(ERROR, _profile_time.end < _profile_time.start)
<< "hsa_amd_profiling_get_async_copy_time for returned async times where the end time ("
<< _profile_time.end << ") was less than the start time (" << _profile_time.start
<< ")";
}
// get the contexts that were active when the signal was created
const auto& tracing_data = _data->tracing_data;
// we need to decrement this reference count at the end of the functions
auto* _corr_id = _data->correlation_id;
if(copy_time_status == HSA_STATUS_SUCCESS && !tracing_data.empty())
if(_profile_time.status == HSA_STATUS_SUCCESS && !tracing_data.empty())
{
if(!_data->tracing_data.callback_contexts.empty())
{
auto _tracer_data = _data->get_callback_data(copy_time.start, copy_time.end);
auto _tracer_data = _data->get_callback_data(_profile_time.start, _profile_time.end);
tracing::execute_phase_exit_callbacks(_data->tracing_data.callback_contexts,
_data->tracing_data.external_correlation_ids,
@@ -397,7 +372,8 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg)
if(!_data->tracing_data.buffered_contexts.empty())
{
auto record = _data->get_buffered_record(nullptr, copy_time.start, copy_time.end);
auto record =
_data->get_buffered_record(nullptr, _profile_time.start, _profile_time.end);
tracing::execute_buffer_record_emplace(_data->tracing_data.buffered_contexts,
_data->tid,
@@ -21,10 +21,13 @@
// THE SOFTWARE.
#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp"
#include "lib/common/defines.hpp"
#include "lib/common/environment.hpp"
#include "lib/common/logging.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/agent.hpp"
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
#include "lib/rocprofiler-sdk/tracing/profiling_time.hpp"
#include <rocprofiler-sdk/fwd.h>
@@ -36,93 +39,35 @@ namespace rocprofiler
{
namespace kernel_dispatch
{
namespace
{
hsa_amd_profiling_dispatch_time_t&
operator+=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
lhs.start += rhs;
lhs.end += rhs;
return lhs;
}
hsa_amd_profiling_dispatch_time_t&
operator-=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
lhs.start -= rhs;
lhs.end -= rhs;
return lhs;
}
hsa_amd_profiling_dispatch_time_t&
operator*=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
lhs.start *= rhs;
lhs.end *= rhs;
return lhs;
}
} // namespace
profiling_time&
profiling_time::operator+=(uint64_t offset)
{
start += offset;
end += offset;
return *this;
}
profiling_time&
profiling_time::operator-=(uint64_t offset)
{
start -= offset;
end -= offset;
return *this;
}
profiling_time&
profiling_time::operator*=(uint64_t scale)
{
start *= scale;
end *= scale;
return *this;
}
profiling_time
get_dispatch_time(hsa_agent_t _hsa_agent,
hsa_signal_t _signal,
rocprofiler_kernel_id_t _kernel_id,
std::optional<uint64_t> _baseline)
{
static auto sysclock_period = hsa::get_hsa_timestamp_period();
auto ts = common::timestamp_ns();
auto dispatch_time = hsa_amd_profiling_dispatch_time_t{};
auto dispatch_time_status = hsa::get_amd_ext_table()->hsa_amd_profiling_get_dispatch_time_fn(
_hsa_agent, _signal, &dispatch_time);
if(dispatch_time_status == HSA_STATUS_SUCCESS)
auto _profile_time =
tracing::profiling_time{dispatch_time_status, dispatch_time.start, dispatch_time.end};
if(_profile_time.status == HSA_STATUS_SUCCESS)
{
// if we encounter this in CI, it will cause test to fail
ROCP_CI_LOG_IF(ERROR, dispatch_time.end < dispatch_time.start)
ROCP_CI_LOG_IF(ERROR, _profile_time.end < _profile_time.start)
<< "hsa_amd_profiling_get_dispatch_time for kernel_id=" << _kernel_id
<< " on rocprofiler_agent="
<< CHECK_NOTNULL(agent::get_rocprofiler_agent(_hsa_agent))->id.handle
<< " returned dispatch times where the end time (" << dispatch_time.end
<< ") was less than the start time (" << dispatch_time.start << ")";
<< CHECK_NOTNULL(agent::get_rocprofiler_agent(_hsa_agent))->node_id
<< " returned dispatch times where the end time (" << _profile_time.end
<< ") was less than the start time (" << _profile_time.start << ")";
// normalize
dispatch_time *= sysclock_period;
// below is a hack for clock skew issues:
// the timestamp of this handler for the kernel dispatch will always be after when the
// kernel completed
if(ts < dispatch_time.end) dispatch_time -= (dispatch_time.end - ts);
// below is a hack for clock skew issues:
// the timestamp of the packet rewriter for the kernel packet will always be before when the
// kernel started
if(_baseline && dispatch_time.start < *_baseline)
dispatch_time += (*_baseline - dispatch_time.start);
_profile_time = tracing::adjust_profiling_time(
"dispatch",
_profile_time,
tracing::profiling_time{
HSA_STATUS_SUCCESS, _baseline.value_or(dispatch_time.start), ts});
}
else
{
@@ -133,8 +78,7 @@ get_dispatch_time(hsa_agent_t _hsa_agent,
<< " :: " << hsa::get_hsa_status_string(dispatch_time_status);
}
return profiling_time{
.status = dispatch_time_status, .start = dispatch_time.start, .end = dispatch_time.end};
return _profile_time;
}
} // namespace kernel_dispatch
} // namespace rocprofiler
@@ -22,6 +22,8 @@
#pragma once
#include "lib/rocprofiler-sdk/tracing/profiling_time.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
@@ -34,16 +36,7 @@ namespace rocprofiler
{
namespace kernel_dispatch
{
struct profiling_time
{
hsa_status_t status = HSA_STATUS_ERROR_INVALID_SIGNAL;
uint64_t start = 0;
uint64_t end = 0;
profiling_time& operator+=(uint64_t offset);
profiling_time& operator-=(uint64_t offset);
profiling_time& operator*=(uint64_t scale);
};
using profiling_time = tracing::profiling_time;
// get the profiling time for a signal on an agent, if start time is less than baseline, correct to
// start at baseline. If kernel_id is provided, it will be included in error log message if there is
+2 -2
파일 보기
@@ -23,7 +23,7 @@
#pragma once
#include "lib/rocprofiler-sdk/hsa/queue_info_session.hpp"
// #include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp"
#include "lib/rocprofiler-sdk/tracing/profiling_time.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
@@ -45,7 +45,7 @@ using context_t = context::context;
using user_data_map_t = std::unordered_map<const context_t*, rocprofiler_user_data_t>;
using external_corr_id_map_t = user_data_map_t;
struct profiling_time;
using profiling_time = tracing::profiling_time;
profiling_time
get_dispatch_time(const hsa::queue_info_session& session);
+2 -2
파일 보기
@@ -1,6 +1,6 @@
#
set(ROCPROFILER_LIB_TRACING_SOURCES)
set(ROCPROFILER_LIB_TRACING_HEADERS fwd.hpp tracing.hpp)
set(ROCPROFILER_LIB_TRACING_SOURCES profiling_time.cpp)
set(ROCPROFILER_LIB_TRACING_HEADERS fwd.hpp profiling_time.hpp tracing.hpp)
target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_TRACING_SOURCES}
${ROCPROFILER_LIB_TRACING_HEADERS})
+101
파일 보기
@@ -0,0 +1,101 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "lib/rocprofiler-sdk/tracing/profiling_time.hpp"
namespace rocprofiler
{
namespace tracing
{
/// Shift a dispatch time interval forward: adds @p rhs to both the `end` and
/// `start` fields of @p lhs and returns @p lhs to allow chaining.
hsa_amd_profiling_dispatch_time_t&
operator+=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
    // the two fields are independent, so the update order is irrelevant
    lhs.end += rhs;
    lhs.start += rhs;
    return lhs;
}
/// Shift a dispatch time interval backward: subtracts @p rhs from both the
/// `end` and `start` fields of @p lhs and returns @p lhs to allow chaining.
hsa_amd_profiling_dispatch_time_t&
operator-=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
    // the two fields are independent, so the update order is irrelevant
    lhs.end -= rhs;
    lhs.start -= rhs;
    return lhs;
}
/// Scale a dispatch time interval: multiplies both the `end` and `start`
/// fields of @p lhs by @p rhs and returns @p lhs to allow chaining.
hsa_amd_profiling_dispatch_time_t&
operator*=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
    // the two fields are independent, so the update order is irrelevant
    lhs.end *= rhs;
    lhs.start *= rhs;
    return lhs;
}
/// Shift an async-copy time interval forward: adds @p rhs to both the `end`
/// and `start` fields of @p lhs and returns @p lhs to allow chaining.
hsa_amd_profiling_async_copy_time_t&
operator+=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
    // the two fields are independent, so the update order is irrelevant
    lhs.end += rhs;
    lhs.start += rhs;
    return lhs;
}
/// Shift an async-copy time interval backward: subtracts @p rhs from both the
/// `end` and `start` fields of @p lhs and returns @p lhs to allow chaining.
hsa_amd_profiling_async_copy_time_t&
operator-=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
    // the two fields are independent, so the update order is irrelevant
    lhs.end -= rhs;
    lhs.start -= rhs;
    return lhs;
}
/// Scale an async-copy time interval: multiplies both the `end` and `start`
/// fields of @p lhs by @p rhs and returns @p lhs to allow chaining.
hsa_amd_profiling_async_copy_time_t&
operator*=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
    // the two fields are independent, so the update order is irrelevant
    lhs.end *= rhs;
    lhs.start *= rhs;
    return lhs;
}
/// Shift this interval forward: adds @p offset to both `end` and `start`
/// and returns a reference to this object to allow chaining.
profiling_time&
profiling_time::operator+=(uint64_t offset)
{
    // the two members are independent, so the update order is irrelevant
    this->end += offset;
    this->start += offset;
    return *this;
}
/// Shift this interval backward: subtracts @p offset from both `end` and
/// `start` and returns a reference to this object to allow chaining.
profiling_time&
profiling_time::operator-=(uint64_t offset)
{
    // the two members are independent, so the update order is irrelevant
    this->end -= offset;
    this->start -= offset;
    return *this;
}
/// Scale this interval: multiplies both `end` and `start` by @p scale and
/// returns a reference to this object to allow chaining.
profiling_time&
profiling_time::operator*=(uint64_t scale)
{
    // the two members are independent, so the update order is irrelevant
    this->end *= scale;
    this->start *= scale;
    return *this;
}
} // namespace tracing
} // namespace rocprofiler
+129
파일 보기
@@ -0,0 +1,129 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "lib/common/environment.hpp"
#include "lib/common/logging.hpp"
#include "lib/common/mpl.hpp"
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <fmt/format.h>
#include <cstdint>
namespace rocprofiler
{
namespace tracing
{
// Compound-assignment helpers for the HSA profiling time structs. Each one
// applies the same operation with `rhs` to both the `start` and `end` fields
// of `lhs` and returns `lhs`.

// add `rhs` to both fields of a dispatch time
hsa_amd_profiling_dispatch_time_t&
operator+=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs);
// subtract `rhs` from both fields of a dispatch time
hsa_amd_profiling_dispatch_time_t&
operator-=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs);
// multiply both fields of a dispatch time by `rhs`
hsa_amd_profiling_dispatch_time_t&
operator*=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs);
// add `rhs` to both fields of an async-copy time
hsa_amd_profiling_async_copy_time_t&
operator+=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs);
// subtract `rhs` from both fields of an async-copy time
hsa_amd_profiling_async_copy_time_t&
operator-=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs);
// multiply both fields of an async-copy time by `rhs`
hsa_amd_profiling_async_copy_time_t&
operator*=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs);
// Fallback: when the build does not define ROCPROFILER_CI_STRICT_TIMESTAMPS,
// define it as 0 so that it can always be used in the numeric comparison
// (ROCPROFILER_CI_STRICT_TIMESTAMPS > 0) that selects the default value of the
// ROCPROFILER_CI_STRICT_TIMESTAMPS environment variable.
#if !defined(ROCPROFILER_CI_STRICT_TIMESTAMPS)
# define ROCPROFILER_CI_STRICT_TIMESTAMPS 0
#endif
/// A status code paired with a start/end timestamp interval.
///
/// The member order (status, start, end) is relied upon by positional
/// aggregate initialization, e.g. `profiling_time{status, start, end}`.
struct profiling_time
{
    /// status associated with the interval; a default-constructed instance
    /// carries HSA_STATUS_ERROR_INVALID_SIGNAL
    hsa_status_t status{HSA_STATUS_ERROR_INVALID_SIGNAL};
    /// beginning of the interval
    uint64_t start{0};
    /// end of the interval
    uint64_t end{0};

    /// add `offset` to both start and end
    profiling_time& operator+=(uint64_t offset);
    /// subtract `offset` from both start and end
    profiling_time& operator-=(uint64_t offset);
    /// multiply both start and end by `scale`
    profiling_time& operator*=(uint64_t scale);
};
/// Optionally validate a profiling interval against CPU-side bounds and then
/// shift it so that it lies within those bounds.
///
/// @param _label  name of the operation used in diagnostic messages (the
///                callers introduced alongside this function pass "memcpy"
///                and "dispatch")
/// @param _value  the interval to adjust; taken by value, the adjusted copy is
///                returned
/// @param _bounds CPU-side interval: `_bounds.start` is the lower limit for
///                `_value.start` and `_bounds.end` the upper limit for
///                `_value.end`
/// @return the (possibly scaled and shifted) interval; `status` is carried
///         over unchanged
///
/// Runtime configuration (each environment variable is read once, on the
/// first call):
///  - ROCPROFILER_CI_FREQ_SCALE_TIMESTAMPS (default: false): multiply the
///    interval by hsa::get_hsa_timestamp_period() before any other step
///  - ROCPROFILER_CI_STRICT_TIMESTAMPS (default: true when the
///    ROCPROFILER_CI_STRICT_TIMESTAMPS macro is > 0): raise ROCP_FATAL_IF
///    diagnostics when the interval is inverted or lies outside `_bounds`
inline profiling_time
adjust_profiling_time(std::string_view _label, profiling_time _value, profiling_time&& _bounds)
{
    static auto sysclock_period = hsa::get_hsa_timestamp_period();
    static auto normalize_env   = common::get_env("ROCPROFILER_CI_FREQ_SCALE_TIMESTAMPS", false);
    static auto strict_ts_env   = common::get_env(
        "ROCPROFILER_CI_STRICT_TIMESTAMPS", (ROCPROFILER_CI_STRICT_TIMESTAMPS > 0) ? true : false);

    // normalize
    if(ROCPROFILER_UNLIKELY(normalize_env)) _value *= sysclock_period;

    if(strict_ts_env)
    {
        // NOTE: every "difference" below subtracts the smaller value from the
        // larger one (as established by the failing condition) so that the
        // unsigned arithmetic cannot wrap around, and the argument lists match
        // the number of "{}" placeholders in the corresponding format string.
        ROCP_FATAL_IF(ROCPROFILER_UNLIKELY(_value.end < _value.start))
            << fmt::format("Invalid {} time value: {} end time ({}) is less than the {} start time "
                           "({}) :: difference={}",
                           _label,
                           _label,
                           _value.end,
                           _label,
                           _value.start,
                           (_value.start - _value.end));

        ROCP_FATAL_IF(ROCPROFILER_UNLIKELY(_value.start < _bounds.start))
            << fmt::format("Invalid {} time value: {} start time ({}) is less than the enqueue "
                           "time on the CPU ({}) :: difference={}",
                           _label,
                           _label,
                           _value.start,
                           _bounds.start,
                           (_bounds.start - _value.start));

        ROCP_FATAL_IF(ROCPROFILER_UNLIKELY(_value.end > _bounds.end))
            << fmt::format("Invalid {} time value: {} end time ({}) is greater than the current "
                           "time on the CPU ({}) :: difference={}",
                           _label,
                           _label,
                           _value.end,
                           _bounds.end,
                           (_value.end - _bounds.end));
    }

    // below are hacks for clock skew issues:
    //
    // the timestamp of this handler will always be after when the profiling time ended,
    // so shift the whole interval back until its end coincides with the upper bound
    if(_bounds.end < _value.end) _value -= (_value.end - _bounds.end);

    // the timestamp of the enqueue will always be before when the profiling time started,
    // so shift the whole interval forward until its start coincides with the lower bound.
    // This shift is applied last: if the interval is longer than the bounds window, the
    // start is aligned and the end extends past _bounds.end
    if(_value.start < _bounds.start) _value += (_bounds.start - _value.start);

    return _value;
}
} // namespace tracing
} // namespace rocprofiler