Fix misaligned stores in buffer (#1063)
* Fix misaligned read/write to buffer
- causes undefined behavior
* Update run-ci.py
- fix spurious CDash submission failure warning
* Improve run-ci.py support for UBSan
* Relax rocprofv3 summary stats count expectation
* Update CHANGELOG
[ROCm/rocprofiler-sdk commit: 37e0d7efce]
此提交包含在:
@@ -4,7 +4,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
|
||||
## ROCprofiler-SDK for AFAR I
|
||||
|
||||
## Additions
|
||||
### Additions
|
||||
|
||||
- HSA API Tracing
|
||||
- Kernel Dispatch Tracing
|
||||
@@ -14,7 +14,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
|
||||
## ROCprofiler-SDK for AFAR II
|
||||
|
||||
## Additions
|
||||
### Additions
|
||||
|
||||
- HIP API Tracing
|
||||
- ROCTx Tracing
|
||||
@@ -25,7 +25,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
|
||||
## ROCprofiler-SDK for AFAR III
|
||||
|
||||
## Additions
|
||||
### Additions
|
||||
|
||||
- Kernel Dispatch Counter Collection – (includes serialization and multidimensional instances)
|
||||
- Kernel serialization
|
||||
@@ -43,7 +43,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
|
||||
## ROCprofiler-SDK for AFAR IV
|
||||
|
||||
## Additions
|
||||
### Additions
|
||||
|
||||
- Page Migration Reporting (API)
|
||||
- Scratch Memory Reporting (API)
|
||||
@@ -55,7 +55,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
|
||||
## ROCprofiler-SDK for AFAR V
|
||||
|
||||
## Additions
|
||||
### Additions
|
||||
|
||||
- Agent/Device Counter Collection (API)
|
||||
- Single JSON output format support (tool)
|
||||
@@ -67,17 +67,17 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
- ROCProf V3 Multi-GPU Support:
|
||||
- Multi-process (multiple files)
|
||||
|
||||
## Fixes
|
||||
### Fixes
|
||||
|
||||
- SQ_ACCUM_PREV and SQ_ACCUM_PREV_HIRE overwriting issue
|
||||
|
||||
## Changes
|
||||
### Changes
|
||||
|
||||
- rocprofv3 tool now needs `--` in front of application. For detailed uses, please [Click Here](source/docs/rocprofv3.md)
|
||||
|
||||
|
||||
## ROCprofiler-SDK for AFAR VI
|
||||
|
||||
## Additions
|
||||
### Additions
|
||||
|
||||
- OTF2 Tool Support
|
||||
- Kernel and Range Filtering
|
||||
@@ -90,6 +90,16 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
|
||||
- Dispatch_Id
|
||||
- Added CSV column for counter_collection
|
||||
|
||||
## Fixes
|
||||
### Fixes
|
||||
|
||||
- Miscellaneous bug fixes
|
||||
|
||||
## ROCprofiler-SDK for AFAR VII
|
||||
|
||||
### Additions
|
||||
|
||||
### Fixes
|
||||
|
||||
### Changes
|
||||
|
||||
- Fix misaligned stores (undefined behavior) for buffer records
|
||||
|
||||
@@ -235,6 +235,7 @@ record_header_buffer::emplace(uint64_t _hash, Tp& _v)
|
||||
if(m_headers.empty()) return false;
|
||||
|
||||
constexpr auto request_size = sizeof(Tp);
|
||||
constexpr auto align_size = alignof(Tp);
|
||||
|
||||
// notify there was a request
|
||||
m_requested.fetch_add(1);
|
||||
@@ -242,7 +243,7 @@ record_header_buffer::emplace(uint64_t _hash, Tp& _v)
|
||||
// in theory, we shouldn't need to lock here but the thread sanitizer says there is a race.
|
||||
// the lock will be short-lived so hopefully, it will scale fine
|
||||
write_lock();
|
||||
auto* _addr = m_buffer.request(request_size, false);
|
||||
auto* _addr = m_buffer.request(request_size, align_size, false);
|
||||
write_unlock();
|
||||
|
||||
read_lock();
|
||||
@@ -277,6 +278,7 @@ record_header_buffer::emplace(uint32_t _category, uint32_t _kind, Tp& _v)
|
||||
if(m_headers.empty()) return false;
|
||||
|
||||
constexpr auto request_size = sizeof(Tp);
|
||||
constexpr auto align_size = alignof(Tp);
|
||||
|
||||
// notify there was a request
|
||||
m_requested.fetch_add(1);
|
||||
@@ -284,7 +286,7 @@ record_header_buffer::emplace(uint32_t _category, uint32_t _kind, Tp& _v)
|
||||
// in theory, we shouldn't need to lock here but the thread sanitizer says there is a race.
|
||||
// the lock will be short-lived so hopefully, it will scale fine
|
||||
write_lock();
|
||||
auto* _addr = m_buffer.request(request_size, false);
|
||||
auto* _addr = m_buffer.request(request_size, align_size, false);
|
||||
write_unlock();
|
||||
|
||||
read_lock();
|
||||
|
||||
@@ -134,30 +134,35 @@ ring_buffer::as_string() const
|
||||
//
|
||||
|
||||
void*
|
||||
ring_buffer::request(size_t _length, bool _wrap)
|
||||
ring_buffer::request(size_t _length, size_t _align, bool _wrap)
|
||||
{
|
||||
if(m_ptr == nullptr || m_size == 0) return nullptr;
|
||||
|
||||
if(is_full()) return (_wrap) ? retrieve(_length) : nullptr;
|
||||
|
||||
LOG_IF(FATAL, _align == 0) << "alignment must be non-zero";
|
||||
|
||||
// if write count is at the tail of buffer, bump to the end of buffer
|
||||
size_t _write_count = 0;
|
||||
size_t _offset = 0;
|
||||
size_t _write_pos = 0;
|
||||
do
|
||||
{
|
||||
// Make sure we don't put in more than there's room for, by writing no
|
||||
// more than there is free.
|
||||
if(_length > free()) return nullptr;
|
||||
|
||||
_offset = 0;
|
||||
_write_count = m_write_count.load(std::memory_order_acquire);
|
||||
auto _modulo = m_size - (_write_count % m_size);
|
||||
if(_modulo < _length) _offset = _modulo;
|
||||
auto _align_modulo = (_write_count % _align);
|
||||
auto _align_offset = (_align_modulo > 0) ? (_align - _align_modulo) : 0;
|
||||
_write_pos = _write_count + _align_offset;
|
||||
} while(!m_write_count.compare_exchange_strong(
|
||||
_write_count, _write_count + _length + _offset, std::memory_order_seq_cst));
|
||||
_write_count, _write_pos + _length + _offset, std::memory_order_seq_cst));
|
||||
|
||||
// pointer in buffer
|
||||
void* _out = write_ptr(_write_count);
|
||||
void* _out = write_ptr(_write_pos);
|
||||
|
||||
return _out;
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ struct ring_buffer
|
||||
void destroy();
|
||||
|
||||
/// Request a pointer for writing at least \param n bytes.
|
||||
void* request(size_t n, bool wrap = true);
|
||||
void* request(size_t n, size_t align, bool wrap = true);
|
||||
|
||||
/// Retrieve a pointer for reading at least \param n bytes.
|
||||
void* retrieve(size_t n) const;
|
||||
@@ -174,8 +174,9 @@ ring_buffer::write(Tp* in, std::enable_if_t<std::is_class<Tp>::value, int>)
|
||||
{
|
||||
if(in == nullptr || m_ptr == nullptr) return {0, nullptr};
|
||||
|
||||
auto _length = sizeof(Tp);
|
||||
void* _out_p = request(_length);
|
||||
constexpr auto _length = sizeof(Tp);
|
||||
constexpr auto _align = alignof(Tp);
|
||||
void* _out_p = request(_length, _align, true);
|
||||
|
||||
if(_out_p == nullptr) return {0, nullptr};
|
||||
|
||||
@@ -194,8 +195,9 @@ ring_buffer::write(Tp* in, std::enable_if_t<!std::is_class<Tp>::value, int>)
|
||||
{
|
||||
if(in == nullptr || m_ptr == nullptr) return {0, nullptr};
|
||||
|
||||
auto _length = sizeof(Tp);
|
||||
void* _out_p = request(_length);
|
||||
constexpr auto _length = sizeof(Tp);
|
||||
constexpr auto _align = alignof(Tp);
|
||||
void* _out_p = request(_length, _align, true);
|
||||
|
||||
if(_out_p == nullptr) return {0, nullptr};
|
||||
|
||||
@@ -214,7 +216,7 @@ ring_buffer::request(bool wrap)
|
||||
{
|
||||
if(m_ptr == nullptr) return nullptr;
|
||||
|
||||
return reinterpret_cast<Tp*>(request(sizeof(Tp), wrap));
|
||||
return reinterpret_cast<Tp*>(request(sizeof(Tp), alignof(Tp), wrap));
|
||||
}
|
||||
//
|
||||
template <typename Tp>
|
||||
|
||||
@@ -156,7 +156,8 @@ validate(const std::vector<rocprofiler_record_header_t*>& _headers)
|
||||
if(itr->hash == typeid(data_type).hash_code())
|
||||
{
|
||||
auto* _data = static_cast<data_type*>(itr->payload);
|
||||
EXPECT_EQ(_ref_data, *_data);
|
||||
EXPECT_EQ(_ref_data, *_data)
|
||||
<< "\nREF: " << _ref_data.to_string() << "\nINP: " << _data->to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,7 +218,7 @@ TEST(buffering, save_load)
|
||||
|
||||
// verify the data, at a high-level is correct
|
||||
EXPECT_EQ(_buffer.size(), num_variants);
|
||||
EXPECT_EQ(_buffer.count(), data_size);
|
||||
EXPECT_GE(_buffer.count(), data_size);
|
||||
EXPECT_GE(_buffer.free(), 0);
|
||||
EXPECT_GE(_buffer.capacity(), data_size);
|
||||
EXPECT_FALSE(_buffer.is_empty());
|
||||
|
||||
@@ -117,6 +117,17 @@ def generate_custom(args, cmake_args, ctest_args):
|
||||
os.environ.get("TSAN_OPTIONS", ""),
|
||||
]
|
||||
)
|
||||
elif MEMCHECK_TYPE == "UndefinedBehaviorSanitizer":
|
||||
MEMCHECK_SUPPRESSION_FILE = (
|
||||
f"{SOURCE_DIR}/source/scripts/undef-behavior-sanitizer-suppr.txt"
|
||||
)
|
||||
os.environ["UBSAN_OPTIONS"] = " ".join(
|
||||
[
|
||||
"print_stacktrace=1",
|
||||
f"suppressions={SOURCE_DIR}/source/scripts/undef-behavior-sanitizer-suppr.txt",
|
||||
os.environ.get("UBSAN_OPTIONS", ""),
|
||||
]
|
||||
)
|
||||
|
||||
codecov_exclude = [
|
||||
"/usr/.*",
|
||||
@@ -235,12 +246,12 @@ def generate_dashboard_script(args):
|
||||
RETRY_DELAY 10
|
||||
CAPTURE_CMAKE_ERROR _cdash_submit_err)
|
||||
|
||||
if("{STRICT_SUBMIT}" GREATER 0)
|
||||
if(NOT _cdash_submit_err EQUAL 0)
|
||||
if(NOT _cdash_submit_err EQUAL 0)
|
||||
if("{STRICT_SUBMIT}" GREATER 0)
|
||||
message(FATAL_ERROR "CDash submission failed: {SUBMIT_ERR}")
|
||||
else()
|
||||
message(AUTHOR_WARNING "CDash submission failure ignored due to absence of --require-cdash-submission")
|
||||
endif()
|
||||
else()
|
||||
message(AUTHOR_WARNING "CDash submission failure ignored due to absence of --require-cdash-submission")
|
||||
endif()
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
@@ -185,7 +185,7 @@ def test_summary_data(json_data):
|
||||
)
|
||||
assert oitr.value.count == 2
|
||||
elif itr.domain == "HIP_API":
|
||||
assert itr.stats.count == 2135
|
||||
assert itr.stats.count >= 2130 and itr.stats.count <= 2140
|
||||
elif itr.domain == "MEMORY_COPY":
|
||||
assert itr.stats.count == 12
|
||||
elif itr.domain == "MARKER_API":
|
||||
|
||||
新增問題並參考
封鎖使用者