Fix misaligned stores in buffer (#1063)

* Fix misaligned read/write to buffer

- causes undefined behavior

* Update run-ci.py

- fix spurious CDash submission failure warning

* Improve run-ci.py support for UBSan

* Relax rocprofv3 summary stats count expectation

* Update CHANGELOG

[ROCm/rocprofiler-sdk commit: 37e0d7efce]
此提交包含在:
Jonathan R. Madsen
2024-09-10 17:08:57 -05:00
提交者 GitHub
父節點 a2c173ea40
當前提交 34c35c26ba
共有 9 個檔案被更改,包括 60 行新增29 行删除
+20 -10
查看文件
@@ -4,7 +4,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
## ROCprofiler-SDK for AFAR I
## Additions
### Additions
- HSA API Tracing
- Kernel Dispatch Tracing
@@ -14,7 +14,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
## ROCprofiler-SDK for AFAR II
## Additions
### Additions
- HIP API Tracing
- ROCTx Tracing
@@ -25,7 +25,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
## ROCprofiler-SDK for AFAR III
## Additions
### Additions
- Kernel Dispatch Counter Collection (includes serialization and multidimensional instances)
- Kernel serialization
@@ -43,7 +43,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
## ROCprofiler-SDK for AFAR IV
## Additions
### Additions
- Page Migration Reporting (API)
- Scratch Memory Reporting (API)
@@ -55,7 +55,7 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
## ROCprofiler-SDK for AFAR V
## Additions
### Additions
- Agent/Device Counter Collection (API)
- Single JSON output format support (tool)
@@ -67,17 +67,17 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
- ROCProf V3 Multi-GPU Support:
- Multi-process (multiple files)
## Fixes
### Fixes
- SQ_ACCUM_PREV and SQ_ACCUM_PREV_HIRE overwriting issue
## Changes
### Changes
- rocprofv3 tool now needs `--` in front of application. For detailed uses, please [Click Here](source/docs/rocprofv3.md)
## ROCprofiler-SDK for AFAR VI
## Additions
### Additions
- OTF2 Tool Support
- Kernel and Range Filtering
@@ -90,6 +90,16 @@ Full documentation for ROCprofiler-SDK is available at [Click Here](source/docs/
- Dispatch_Id
- Added CSV column for counter_collection
## Fixes
### Fixes
- Miscellaneous bug fixes
## ROCprofiler-SDK for AFAR VII
### Additions
### Fixes
### Changes
- Fix misaligned stores (undefined behavior) for buffer records
+4 -2
查看文件
@@ -235,6 +235,7 @@ record_header_buffer::emplace(uint64_t _hash, Tp& _v)
if(m_headers.empty()) return false;
constexpr auto request_size = sizeof(Tp);
constexpr auto align_size = alignof(Tp);
// notify there was a request
m_requested.fetch_add(1);
@@ -242,7 +243,7 @@ record_header_buffer::emplace(uint64_t _hash, Tp& _v)
// in theory, we shouldn't need to lock here but the thread sanitizer says there is a race.
// the lock will be short-lived so hopefully, it will scale fine
write_lock();
auto* _addr = m_buffer.request(request_size, false);
auto* _addr = m_buffer.request(request_size, align_size, false);
write_unlock();
read_lock();
@@ -277,6 +278,7 @@ record_header_buffer::emplace(uint32_t _category, uint32_t _kind, Tp& _v)
if(m_headers.empty()) return false;
constexpr auto request_size = sizeof(Tp);
constexpr auto align_size = alignof(Tp);
// notify there was a request
m_requested.fetch_add(1);
@@ -284,7 +286,7 @@ record_header_buffer::emplace(uint32_t _category, uint32_t _kind, Tp& _v)
// in theory, we shouldn't need to lock here but the thread sanitizer says there is a race.
// the lock will be short-lived so hopefully, it will scale fine
write_lock();
auto* _addr = m_buffer.request(request_size, false);
auto* _addr = m_buffer.request(request_size, align_size, false);
write_unlock();
read_lock();
+9 -4
查看文件
@@ -134,30 +134,35 @@ ring_buffer::as_string() const
//
void*
ring_buffer::request(size_t _length, bool _wrap)
ring_buffer::request(size_t _length, size_t _align, bool _wrap)
{
if(m_ptr == nullptr || m_size == 0) return nullptr;
if(is_full()) return (_wrap) ? retrieve(_length) : nullptr;
LOG_IF(FATAL, _align == 0) << "alignment must be non-zero";
// if write count is at the tail of buffer, bump to the end of buffer
size_t _write_count = 0;
size_t _offset = 0;
size_t _write_pos = 0;
do
{
// Make sure we don't put in more than there's room for, by writing no
// more than there is free.
if(_length > free()) return nullptr;
_offset = 0;
_write_count = m_write_count.load(std::memory_order_acquire);
auto _modulo = m_size - (_write_count % m_size);
if(_modulo < _length) _offset = _modulo;
auto _align_modulo = (_write_count % _align);
auto _align_offset = (_align_modulo > 0) ? (_align - _align_modulo) : 0;
_write_pos = _write_count + _align_offset;
} while(!m_write_count.compare_exchange_strong(
_write_count, _write_count + _length + _offset, std::memory_order_seq_cst));
_write_count, _write_pos + _length + _offset, std::memory_order_seq_cst));
// pointer in buffer
void* _out = write_ptr(_write_count);
void* _out = write_ptr(_write_pos);
return _out;
}
+8 -6
查看文件
@@ -75,7 +75,7 @@ struct ring_buffer
void destroy();
/// Request a pointer for writing at least \param n bytes.
void* request(size_t n, bool wrap = true);
void* request(size_t n, size_t align, bool wrap = true);
/// Retrieve a pointer for reading at least \param n bytes.
void* retrieve(size_t n) const;
@@ -174,8 +174,9 @@ ring_buffer::write(Tp* in, std::enable_if_t<std::is_class<Tp>::value, int>)
{
if(in == nullptr || m_ptr == nullptr) return {0, nullptr};
auto _length = sizeof(Tp);
void* _out_p = request(_length);
constexpr auto _length = sizeof(Tp);
constexpr auto _align = alignof(Tp);
void* _out_p = request(_length, _align, true);
if(_out_p == nullptr) return {0, nullptr};
@@ -194,8 +195,9 @@ ring_buffer::write(Tp* in, std::enable_if_t<!std::is_class<Tp>::value, int>)
{
if(in == nullptr || m_ptr == nullptr) return {0, nullptr};
auto _length = sizeof(Tp);
void* _out_p = request(_length);
constexpr auto _length = sizeof(Tp);
constexpr auto _align = alignof(Tp);
void* _out_p = request(_length, _align, true);
if(_out_p == nullptr) return {0, nullptr};
@@ -214,7 +216,7 @@ ring_buffer::request(bool wrap)
{
if(m_ptr == nullptr) return nullptr;
return reinterpret_cast<Tp*>(request(sizeof(Tp), wrap));
return reinterpret_cast<Tp*>(request(sizeof(Tp), alignof(Tp), wrap));
}
//
template <typename Tp>
+2 -1
查看文件
@@ -156,7 +156,8 @@ validate(const std::vector<rocprofiler_record_header_t*>& _headers)
if(itr->hash == typeid(data_type).hash_code())
{
auto* _data = static_cast<data_type*>(itr->payload);
EXPECT_EQ(_ref_data, *_data);
EXPECT_EQ(_ref_data, *_data)
<< "\nREF: " << _ref_data.to_string() << "\nINP: " << _data->to_string();
}
}
}
+1 -1
查看文件
@@ -218,7 +218,7 @@ TEST(buffering, save_load)
// verify the data, at a high-level is correct
EXPECT_EQ(_buffer.size(), num_variants);
EXPECT_EQ(_buffer.count(), data_size);
EXPECT_GE(_buffer.count(), data_size);
EXPECT_GE(_buffer.free(), 0);
EXPECT_GE(_buffer.capacity(), data_size);
EXPECT_FALSE(_buffer.is_empty());
+15 -4
查看文件
@@ -117,6 +117,17 @@ def generate_custom(args, cmake_args, ctest_args):
os.environ.get("TSAN_OPTIONS", ""),
]
)
elif MEMCHECK_TYPE == "UndefinedBehaviorSanitizer":
MEMCHECK_SUPPRESSION_FILE = (
f"{SOURCE_DIR}/source/scripts/undef-behavior-sanitizer-suppr.txt"
)
os.environ["UBSAN_OPTIONS"] = " ".join(
[
"print_stacktrace=1",
f"suppressions={SOURCE_DIR}/source/scripts/undef-behavior-sanitizer-suppr.txt",
os.environ.get("UBSAN_OPTIONS", ""),
]
)
codecov_exclude = [
"/usr/.*",
@@ -235,12 +246,12 @@ def generate_dashboard_script(args):
RETRY_DELAY 10
CAPTURE_CMAKE_ERROR _cdash_submit_err)
if("{STRICT_SUBMIT}" GREATER 0)
if(NOT _cdash_submit_err EQUAL 0)
if(NOT _cdash_submit_err EQUAL 0)
if("{STRICT_SUBMIT}" GREATER 0)
message(FATAL_ERROR "CDash submission failed: {SUBMIT_ERR}")
else()
message(AUTHOR_WARNING "CDash submission failure ignored due to absence of --require-cdash-submission")
endif()
else()
message(AUTHOR_WARNING "CDash submission failure ignored due to absence of --require-cdash-submission")
endif()
endif()
endmacro()
查看文件
+1 -1
查看文件
@@ -185,7 +185,7 @@ def test_summary_data(json_data):
)
assert oitr.value.count == 2
elif itr.domain == "HIP_API":
assert itr.stats.count == 2135
assert itr.stats.count >= 2130 and itr.stats.count <= 2140
elif itr.domain == "MEMORY_COPY":
assert itr.stats.count == 12
elif itr.domain == "MARKER_API":