diff --git a/.github/workflows/rocprofiler-sdk-restrictions.yml b/.github/workflows/rocprofiler-sdk-restrictions.yml index af1bbac816..fedbc94d94 100644 --- a/.github/workflows/rocprofiler-sdk-restrictions.yml +++ b/.github/workflows/rocprofiler-sdk-restrictions.yml @@ -30,7 +30,7 @@ jobs: regex: runs-on: ubuntu-22.04 env: - FOLDERS: "projects/rocprofiler-sdk/source/lib/common projects/rocprofiler-sdk/source/lib/rocprofiler-sdk projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-roctx" + FOLDERS: "source/lib/common source/lib/rocprofiler-sdk source/lib/rocprofiler-sdk-roctx source/lib/output source/lib/rocprofiler-sdk-tool" steps: - uses: actions/checkout@v4 @@ -42,10 +42,10 @@ jobs: python3 -m pip install -U cmake-format - name: Apply restriction + working-directory: projects/rocprofiler-sdk run: | - cd projects/rocprofiler-sdk set +e - FILES="$(find ${FOLDERS} -type f)" + FILES="$(find ${FOLDERS} -type f -not -name "*.md" -not -name "*.txt")" GREP="$(grep -E -n 'std::regex|' ${FILES})" if [ "${GREP}" != "" ]; then echo -e "\nError! 
std::regex is not allowed in ${FOLDERS}...\n" diff --git a/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt index ab4aae3da3..417c4d2bc6 100644 --- a/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt @@ -9,6 +9,7 @@ set(common_sources environment.cpp logging.cpp md5sum.cpp + regex.cpp sha256.cpp simple_timer.cpp static_object.cpp @@ -27,6 +28,7 @@ set(common_headers logging.hpp md5sum.hpp mpl.hpp + regex.hpp scope_destructor.hpp sha256.hpp simple_timer.hpp diff --git a/projects/rocprofiler-sdk/source/lib/common/README.md b/projects/rocprofiler-sdk/source/lib/common/README.md new file mode 100644 index 0000000000..c29cfa80ad --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/README.md @@ -0,0 +1,182 @@ +# ROCProfiler SDK Common API Library + +## Custom Regex Engine + +### Why We Have Our Own Regex Implementation + +This directory contains a custom regex engine implementation designed explicitly for ROCm profiling tools. The primary motivation for implementing our own regex engine instead of using `std::regex` is to avoid the **dual ABI compatibility issues** that plague `std::regex` in the GNU libstdc++ library. + +#### The Dual ABI Problem + +The GNU libstdc++ library introduced a dual ABI (Application Binary Interface) system starting with GCC 5.1 to maintain backward compatibility while introducing C++11 improvements. This dual ABI system affects `std::string` and other standard library components, including `std::regex`. 
+ +##### Technical Background + +The dual ABI allows two different implementations to coexist: +- **Old ABI (pre-C++11)**: Uses Copy-on-Write (COW) strings +- **New ABI (C++11+)**: Uses Short String Optimization (SSO) + +The ABI is controlled by the `_GLIBCXX_USE_CXX11_ABI` macro: +- `_GLIBCXX_USE_CXX11_ABI=0`: Old ABI (default for GCC < 5.1) +- `_GLIBCXX_USE_CXX11_ABI=1`: New ABI (default for GCC >= 5.1) + +##### The std::regex Problem + +`std::regex` is particularly problematic because: + +1. **ABI Sensitivity**: The `std::regex` implementation is tightly coupled to the string ABI being used +2. **Symbol Conflicts**: Different ABI versions create incompatible symbols that cannot be mixed +3. **Runtime Failures**: Applications linking against libraries compiled with different ABI settings experience runtime failures +4. **Distribution Issues**: Different Linux distributions and package managers may use different ABI settings + +##### Real-World Impact + +As explained in the [Stack Overflow discussion](https://stackoverflow.com/questions/51382355/stdregex-and-dual-abi), this creates several problematic scenarios: + +- Applications compiled with GCC 4.x linking against libraries compiled with GCC 5+ +- Mixing libraries compiled with different `_GLIBCXX_USE_CXX11_ABI` settings +- Distribution packages that assume different ABI defaults +- Cross-compilation scenarios where ABI settings don't match + +Example error scenarios: +```cpp +// Library A compiled with _GLIBCXX_USE_CXX11_ABI=0 +// Library B compiled with _GLIBCXX_USE_CXX11_ABI=1 +// Both use std::regex -> Runtime failures or linking errors +``` + +### Our Solution + +To avoid these compatibility issues entirely, we implemented a custom regex engine with the following benefits: + +#### 1. **ABI Independence** +- No dependency on `std::regex` or dual ABI settings +- Consistent behavior across all GCC versions and distributions +- Eliminates linking and runtime compatibility issues + +#### 2. 
**Controlled Dependencies** +- Uses only basic standard library components (`std::string_view`, `std::vector`, etc.) +- Minimizes external dependencies that could introduce ABI conflicts +- Self-contained implementation + +#### 3. **Targeted Feature Set** +Our implementation focuses on the regex features actually needed by ROCm profiling tools: + +##### Supported Features +- **Literals and Escapes**: `\n`, `\t`, `\\`, etc. +- **Anchors**: `^` (beginning), `$` (end) +- **Character Classes**: `[abc]`, `[a-z]`, `[^0-9]` +- **Shortcuts**: `\d`, `\D`, `\w`, `\W`, `\s`, `\S` +- **Quantifiers**: `*`, `+`, `?`, `{m}`, `{m,}`, `{m,n}` +- **Lazy Quantifiers**: `*?`, `+?`, `??`, `{m,n}?` +- **Groups and Alternation**: `()`, `|` +- **Dot Metacharacter**: `.` + +##### API Compatibility +The API is designed to be familiar to users of `std::regex`: + +```cpp +namespace rocprofiler::common::regex { + bool regex_match(std::string_view text, std::string_view pattern); + bool regex_search(std::string_view text, std::string_view pattern); + bool regex_search(std::string_view text, std::string_view pattern, + size_t& begin, size_t& end); + std::string regex_replace(std::string_view text, std::string_view pattern, + std::string_view replacement); +} +``` + +#### 4. **Replacement Token Support** +Full support for replacement tokens in `regex_replace`: +- `$0` or `$&`: Whole match +- `$1` to `$99`: Capture groups +- `` $` ``: Prefix (text before match) +- `$'`: Suffix (text after match) + +### Implementation Architecture + +#### 1. **Parser** (`struct Parser`) +- Converts regex pattern strings into an Abstract Syntax Tree (AST) +- Handles escape sequences, character classes, and quantifiers +- Validates pattern syntax and reports errors by throwing `std::runtime_error` + +#### 2. **AST Nodes** (`struct Node`) +- Represents different regex components (literals, classes, quantifiers, etc.) +- Supports recursive structure for complex patterns +- Memory-efficient representation + +#### 3. 
**Matchers** +- **FastMatcher**: Optimized for simple matching without capture groups +- **CaptureMatcher**: Full-featured matcher with capture group support +- Memoization of `(node, position)` match results in FastMatcher for performance optimization + +#### 4. **Algorithm Features** +- **Backtracking**: Supports complex patterns with alternatives +- **Greedy/Lazy Quantifiers**: Proper implementation of both modes +- **Zero-length Guards**: Prevents infinite loops in edge cases +- **Capture Group Tracking**: Maintains group boundaries during matching + +### Usage Examples + +```cpp +#include "lib/common/regex.hpp" + +using namespace rocprofiler::common::regex; + +// Basic matching +bool matches = regex_match("hello123", "hello\\d+"); + +// Search with position +size_t begin, end; +if (regex_search("prefix_hello123_suffix", "hello\\d+", begin, end)) { + // Found match at positions [begin, end) +} + +// Replace with captures +std::string result = regex_replace( + "file_v1.2.3.txt", + "v(\\d+)\\.(\\d+)\\.(\\d+)", + "version_$1_$2_$3" +); +// result: "file_version_1_2_3.txt" +``` + +### Testing and Validation + +The implementation includes comprehensive tests that verify compatibility with ECMAScript regex semantics: + +- **Parity Tests**: Compare behavior against `std::regex` where possible +- **Edge Cases**: Handle corner cases like zero-length matches, nested captures +- **Compatibility Tests**: Verify consistent behavior across different string types and usage patterns + +### Maintenance Notes + +- The implementation prioritizes correctness and ABI independence over maximum performance +- Features are added based on actual requirements from ROCm profiling tools +- Regular testing ensures compatibility with target environments +- Documentation is maintained to explain design decisions and limitations + +This custom implementation provides a robust, ABI-independent regex solution that eliminates the compatibility issues that would otherwise plague ROCm profiling tools when deployed across diverse environments. 
+ +### Notes on ABI Independence Testing + +The current test suite includes "compatibility tests" that verify consistent behavior across different string types and usage patterns. However, **true ABI independence testing** would require: + +1. **Mixed-ABI builds**: Building test applications with different `_GLIBCXX_USE_CXX11_ABI` settings (0 and 1) +2. **Binary compatibility verification**: Ensuring object files compiled with different ABI settings can link together +3. **Runtime validation**: Testing that regex functionality works consistently regardless of how dependent libraries were compiled + +Such a test could be built along these lines: + +```bash +# Build with old ABI +g++ -D_GLIBCXX_USE_CXX11_ABI=0 -c test_old_abi.cpp + +# Build with new ABI +g++ -D_GLIBCXX_USE_CXX11_ABI=1 -c test_new_abi.cpp + +# Link together and verify functionality +g++ test_old_abi.o test_new_abi.o -o cross_abi_test +``` + +The current implementation achieves ABI independence by avoiding `std::regex` entirely, relying instead on minimal standard library components and custom string processing that remains stable across ABI versions. diff --git a/projects/rocprofiler-sdk/source/lib/common/regex.cpp b/projects/rocprofiler-sdk/source/lib/common/regex.cpp new file mode 100644 index 0000000000..a8604fb9b4 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/regex.cpp @@ -0,0 +1,1161 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "lib/common/regex.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace common +{ +namespace regex +{ +// =============================== AST =============================== + +struct Node +{ + enum Kind + { + LITERAL, + DOT, + CLASS, + ANCHOR_BOL, + ANCHOR_EOL, + SEQ, + ALT, + QUANT, + CAP + } kind; + char ch = 0; + + struct Class + { + std::function pred; + }; + std::optional cls; // for CLASS + + std::vector children; // for SEQ/ALT + + struct Quant + { + std::unique_ptr sub; + size_t min = 0; + size_t max = std::numeric_limits::max(); + bool greedy = true; + }; + std::unique_ptr quant; // for QUANT + + int cap_index = -1; // for CAP (1..N) + std::unique_ptr cap_sub; // for CAP + + // Ctors / simple factories + explicit Node(Kind k) + : kind(k) + {} + explicit Node(char c) + : kind(LITERAL) + , ch(c) + {} + + static Node dot() { return Node(DOT); } + static Node bol() { return Node(ANCHOR_BOL); } + static Node eol() { return Node(ANCHOR_EOL); } + + static Node seq(std::vector v) + { + Node n(SEQ); + n.children = std::move(v); + return n; + } + static Node alt(std::vector v) + { + Node n(ALT); + n.children = std::move(v); + return n; + } + + static Node make_class(std::function p) + { + Node n(CLASS); + n.cls = Class{std::move(p)}; + return n; + } + static Node make_quant(Node sub, size_t mi, size_t ma, bool greedy) + { + Node n(QUANT); + n.quant = std::make_unique(); + n.quant->sub = std::make_unique(std::move(sub)); + n.quant->min = mi; + n.quant->max = ma; + n.quant->greedy = greedy; + return n; + } + static Node make_cap(int idx, Node sub) + { + Node n(CAP); + n.cap_index = idx; + n.cap_sub = std::make_unique(std::move(sub)); + return n; + } +}; + +// ============================= Parser ============================== + +struct Parser +{ + std::string_view pat; + size_t i = 0; + int next_cap_index = 1; + + explicit 
Parser(std::string_view p) + : pat(p) + {} + + bool end() const { return i >= pat.size(); } + char peek() const { return end() ? '\0' : pat[i]; } + char get() { return end() ? '\0' : pat[i++]; } + bool eat(char c) + { + if(!end() && pat[i] == c) + { + ++i; + return true; + } + return false; + } + static bool is_digit(char c) { return c >= '0' && c <= '9'; } + + std::vector> special_preds; + + Node parse_escape_in_atom() + { + get(); + char e = get(); + if(e == '\0') return Node('\\'); + auto make_cls = [&](auto p) { return Node::make_class(std::move(p)); }; + switch(e) + { + case 'd': return make_cls([](unsigned char x) { return std::isdigit(x) != 0; }); + case 'D': return make_cls([](unsigned char x) { return std::isdigit(x) == 0; }); + case 'w': return make_cls([](unsigned char x) { return std::isalnum(x) || x == '_'; }); + case 'W': + return make_cls([](unsigned char x) { return !(std::isalnum(x) || x == '_'); }); + case 's': return make_cls([](unsigned char x) { return std::isspace(x) != 0; }); + case 'S': return make_cls([](unsigned char x) { return std::isspace(x) == 0; }); + case 'n': return Node('\n'); + case 't': return Node('\t'); + case 'r': return Node('\r'); + case 'f': return Node('\f'); + case 'v': return Node('\v'); + default: return Node(e); + } + } + + Node parse_class() + { + bool negate = false; + if(eat('^')) negate = true; + struct Range + { + unsigned char a, b; + }; + std::vector ranges; + std::vector singles; + auto add_char = [&](unsigned char c) { singles.push_back(c); }; + bool first = true; + unsigned char prev = 0; + bool has_prev = false; + while(!end() && peek() != ']') + { + unsigned char c; + if(eat('\\')) + { + char e = get(); + if(e == 'd' || e == 'D' || e == 'w' || e == 'W' || e == 's' || e == 'S') + { + special_preds.push_back( + e == 'd' ? [](unsigned char x) { return std::isdigit(x) != 0; } + : e == 'D' ? [](unsigned char x) { return std::isdigit(x) == 0; } + : e == 'w' ? 
[](unsigned char x) { return std::isalnum(x) || x == '_'; } + : e == 'W' ? [](unsigned char x) { return !(std::isalnum(x) || x == '_'); } + : e == 's' ? [](unsigned char x) { return std::isspace(x) != 0; } + : [](unsigned char x) { return std::isspace(x) == 0; }); + continue; + } + else + c = static_cast(e); + } + else + c = static_cast(get()); + + if(!first && c == '-' && peek() != ']' && has_prev) + { + unsigned char nxt; + if(eat('\\')) + nxt = static_cast(get()); + else + nxt = static_cast(get()); + if(prev <= nxt) + ranges.push_back({prev, nxt}); + else + ranges.push_back({nxt, prev}); + has_prev = false; + first = false; + continue; + } + else + { + if(has_prev) add_char(prev); + prev = c; + has_prev = true; + } + first = false; + } + if(has_prev) add_char(prev); + if(!eat(']')) throw std::runtime_error("Unterminated character class"); + + auto rs = std::move(ranges); + auto ss = std::move(singles); + auto specials = std::move(special_preds); + auto pred = [rs, ss, specials, negate](unsigned char x) { + bool in = false; + for(auto& r : rs) + { + if(r.a <= x && x <= r.b) + { + in = true; + break; + } + } + if(!in) + { + for(auto c : ss) + { + if(c == x) + { + in = true; + break; + } + } + } + if(!in) + { + for(auto& sp : specials) + { + if(sp(x)) + { + in = true; + break; + } + } + } + return negate ? 
!in : in; + }; + return Node::make_class(pred); + } + + struct RangeQ + { + size_t min, max; + bool ok; + }; + + RangeQ parse_brace_quant() + { + size_t save = i; + if(!eat('{')) return {0, 0, false}; + auto read_num = [&]() -> std::optional { + if(end() || !is_digit(peek())) return std::nullopt; + size_t v = 0; + while(!end() && is_digit(peek())) + v = v * 10 + (get() - '0'); + return v; + }; + auto m = read_num(); + if(!m) + { + i = save; + return {0, 0, false}; + } + size_t mn = *m; + size_t mx = mn; + if(eat('}')) return {mn, mx, true}; + if(!eat(',')) + { + i = save; + return {0, 0, false}; + } + if(peek() == '}') + { + get(); + return {mn, std::numeric_limits::max(), true}; + } + auto n = read_num(); + if(!n || !eat('}')) + { + i = save; + return {0, 0, false}; + } + if(*n < mn) + std::swap(mn, *n); + else + mx = *n; + return {mn, mx, true}; + } + + Node parse_atom() + { + if(end()) throw std::runtime_error("Unexpected end in atom"); + char c = peek(); + if(c == '.') + { + get(); + return Node::dot(); + } + if(c == '^') + { + get(); + return Node::bol(); + } + if(c == '$') + { + get(); + return Node::eol(); + } + if(c == '[') + { + get(); + return parse_class(); + } + if(c == '(') + { + get(); + int idx = next_cap_index++; // assign at '(' (left-to-right) + Node inner = parse_alt(); + if(!eat(')')) throw std::runtime_error("Unmatched '('"); + return Node::make_cap(idx, std::move(inner)); + } + if(c == '\\') return parse_escape_in_atom(); + get(); + return Node(c); + } + + Node parse_atom_with_quant() + { + Node atom = parse_atom(); + auto apply_lazy = [&](Node& q) { + if(eat('?')) + if(q.kind == Node::QUANT && q.quant) q.quant->greedy = false; + }; + if(!end()) + { + if(eat('*')) + { + Node q = + Node::make_quant(std::move(atom), 0, std::numeric_limits::max(), true); + apply_lazy(q); + return q; + } + if(eat('+')) + { + Node q = + Node::make_quant(std::move(atom), 1, std::numeric_limits::max(), true); + apply_lazy(q); + return q; + } + if(eat('?')) + { + Node 
q = Node::make_quant(std::move(atom), 0, 1, true); + apply_lazy(q); + return q; + } + auto br = parse_brace_quant(); + if(br.ok) + { + Node q = Node::make_quant(std::move(atom), br.min, br.max, true); + apply_lazy(q); + return q; + } + } + return atom; + } + + Node parse_seq() + { + std::vector v; + while(!end()) + { + char c = peek(); + if(c == ')' || c == '|') break; + v.push_back(parse_atom_with_quant()); + } + if(v.empty()) return Node::seq(std::move(v)); + if(v.size() == 1) return std::move(v[0]); + return Node::seq(std::move(v)); + } + + Node parse_alt() + { + std::vector branches; + branches.push_back(parse_seq()); + while(eat('|')) + branches.push_back(parse_seq()); + if(branches.size() == 1) return std::move(branches[0]); + return Node::alt(std::move(branches)); + } + + Node parse_all() + { + Node n = parse_alt(); + if(!end()) throw std::runtime_error("Trailing pattern content"); + return n; + } +}; + +// ============================= Matchers ============================ + +struct FastMatcher +{ + const Node& root; + std::string_view s; + + struct Key + { + const Node* node; + size_t idx; + bool operator==(const Key& o) const { return node == o.node && idx == o.idx; } + }; + struct KeyHash + { + size_t operator()(const Key& k) const noexcept + { + return std::hash()(k.node) ^ (std::hash()(k.idx) << 1); + } + }; + std::unordered_map, KeyHash> memo; + + FastMatcher(const Node& r, std::string_view sv) + : root(r) + , s(sv) + {} + + std::optional match(const Node* n, size_t i) + { + Key k{n, i}; + if(auto it = memo.find(k); it != memo.end()) return it->second; + auto r = match_impl(n, i); + memo.emplace(k, r); + return r; + } + + std::optional match_seq_from(const std::vector& children, size_t k, size_t pos) + { + if(k == children.size()) return pos; + + const Node& ch = children[k]; + + if(ch.kind != Node::QUANT) + { + if(ch.kind == Node::CAP && ch.cap_sub && ch.cap_sub->kind == Node::QUANT) + { + const auto& q = *ch.cap_sub->quant; + + std::vector ends; + 
ends.push_back(pos); // 0 reps -> pos + size_t cur = pos; + size_t count = 0; + while(count < q.max) + { + auto r = match(q.sub.get(), cur); + if(!r) break; + if(*r == cur) break; // zero-length guard + cur = *r; + ++count; + ends.push_back(cur); + if(cur > s.size()) break; + } + + if(q.greedy) + { + for(size_t used = ends.size(); used-- > 0;) + { + if(used < q.min) continue; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + else + { + for(size_t used = 0; used < ends.size(); ++used) + { + if(used < q.min) continue; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + return std::nullopt; + } + + auto r = match(&ch, pos); + if(!r) return std::nullopt; + return match_seq_from(children, k + 1, *r); + } + + const auto& q = *ch.quant; + + std::vector ends; + ends.push_back(pos); // 0 reps -> pos + size_t cur = pos; + size_t count = 0; + while(count < q.max) + { + auto r = match(q.sub.get(), cur); + if(!r) break; + if(*r == cur) break; // zero-length guard + cur = *r; + ++count; + ends.push_back(cur); + if(cur > s.size()) break; + } + + if(q.greedy) + { + for(size_t used = ends.size(); used-- > 0;) + { + if(used < q.min) continue; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + else + { + for(size_t used = 0; used < ends.size(); ++used) + { + if(used < q.min) continue; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + return std::nullopt; + } + + std::optional match_impl(const Node* n, size_t i) + { + switch(n->kind) + { + case Node::LITERAL: + { + if(i < s.size() && (unsigned char) s[i] == (unsigned char) n->ch) return i + 1; + return std::nullopt; + } + case Node::DOT: + { + if(i < s.size()) return i + 1; + return std::nullopt; + } + case Node::CLASS: + { + if(i < s.size() && n->cls && n->cls->pred((unsigned char) s[i])) return i + 1; + return std::nullopt; + } + case Node::ANCHOR_BOL: + { + if(i == 
0) return i; + return std::nullopt; + } + case Node::ANCHOR_EOL: + { + if(i == s.size()) return i; + return std::nullopt; + } + case Node::SEQ: + { + return match_seq_from(n->children, 0, i); + } + case Node::ALT: + { + for(const auto& br : n->children) + { + auto r = match(&br, i); + if(r) return r; + } + return std::nullopt; + } + case Node::QUANT: + { + const auto& q = *n->quant; + std::vector ends; + ends.push_back(i); // 0 reps + size_t pos = i; + size_t count = 0; + while(count < q.max) + { + auto r = match(q.sub.get(), pos); + if(!r) break; + if(*r == pos) break; // zero-length guard + pos = *r; + ++count; + ends.push_back(pos); + if(pos > s.size()) break; + } + + if(ends.size() - 1 < q.min) return std::nullopt; + + if(q.greedy) + return ends.back(); + else + return ends[q.min]; + } + case Node::CAP: + { + return match(n->cap_sub.get(), i); // fast path ignores recording + } + } + return std::nullopt; + } + + bool full_match() + { + auto r = match(&root, 0); + return r && *r == s.size(); + } + + std::optional> find_first() + { + for(size_t pos = 0; pos <= s.size(); ++pos) + { + auto end = match(&root, pos); + if(end) return std::make_pair(pos, *end); + } + return std::nullopt; + } +}; + +struct CaptureMatcher +{ + const Node& root; + std::string_view s; + std::vector> groups; // [0]=whole + + CaptureMatcher(const Node& r, std::string_view sv, int num_caps) + : root(r) + , s(sv) + , groups(static_cast(num_caps) + 1, {std::string::npos, std::string::npos}) + {} + + bool run_from(size_t start) + { + auto end = match_node(&root, start); + if(!end) return false; + groups[0] = {start, *end}; + return true; + } + + std::optional match_seq_from(const std::vector& children, size_t k, size_t pos) + { + if(k == children.size()) return pos; + + const Node& ch = children[k]; + + if(ch.kind != Node::QUANT) + { + if(ch.kind == Node::CAP && ch.cap_sub && ch.cap_sub->kind == Node::QUANT) + { + const auto& q = *ch.cap_sub->quant; + + std::vector ends; + ends.push_back(pos); + 
std::vector>> snaps; + snaps.push_back(groups); + + size_t cur = pos; + size_t count = 0; + while(count < q.max) + { + auto saved = groups; + auto r = match_node(q.sub.get(), cur); + if(!r) + { + groups = std::move(saved); + break; + } + if(*r == cur) + { + groups = std::move(saved); + break; + } // zero-length guard + cur = *r; + ++count; + ends.push_back(cur); + snaps.push_back(groups); + if(cur > s.size()) break; + } + + if(q.greedy) + { + for(size_t used = ends.size(); used-- > 0;) + { + if(used < q.min) continue; + groups = snaps[used]; + groups[ch.cap_index] = {pos, ends[used]}; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + else + { + for(size_t used = 0; used < ends.size(); ++used) + { + if(used < q.min) continue; + groups = snaps[used]; + groups[ch.cap_index] = {pos, ends[used]}; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + return std::nullopt; + } + + auto r = match_node(&ch, pos); + if(!r) return std::nullopt; + return match_seq_from(children, k + 1, *r); + } + + const auto& q = *ch.quant; + + std::vector ends; + ends.push_back(pos); + std::vector>> snaps; + snaps.push_back(groups); + + size_t cur = pos; + size_t count = 0; + while(count < q.max) + { + auto saved = groups; + auto r = match_node(q.sub.get(), cur); + if(!r) + { + groups = std::move(saved); + break; + } + if(*r == cur) + { + groups = std::move(saved); + break; + } // zero-length guard + cur = *r; + ++count; + ends.push_back(cur); + snaps.push_back(groups); + if(cur > s.size()) break; + } + + if(q.greedy) + { + for(size_t used = ends.size(); used-- > 0;) + { + if(used < q.min) continue; + groups = snaps[used]; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + } + } + else + { + for(size_t used = 0; used < ends.size(); ++used) + { + if(used < q.min) continue; + groups = snaps[used]; + auto tail = match_seq_from(children, k + 1, ends[used]); + if(tail) return tail; + 
} + } + return std::nullopt; + } + + std::optional match_node(const Node* n, size_t i) + { + switch(n->kind) + { + case Node::LITERAL: + { + if(i < s.size() && (unsigned char) s[i] == (unsigned char) n->ch) return i + 1; + return std::nullopt; + } + case Node::DOT: + { + if(i < s.size()) return i + 1; + return std::nullopt; + } + case Node::CLASS: + { + if(i < s.size() && n->cls && n->cls->pred((unsigned char) s[i])) return i + 1; + return std::nullopt; + } + case Node::ANCHOR_BOL: + { + if(i == 0) return i; + return std::nullopt; + } + case Node::ANCHOR_EOL: + { + if(i == s.size()) return i; + return std::nullopt; + } + case Node::SEQ: + { + return match_seq_from(n->children, 0, i); + } + case Node::ALT: + { + for(const auto& br : n->children) + { + auto saved = groups; + auto r = match_node(&br, i); + if(r) return r; + groups = std::move(saved); + } + return std::nullopt; + } + case Node::QUANT: + { + const auto& q = *n->quant; + std::vector ends; + ends.push_back(i); + std::vector>> snaps; + snaps.push_back(groups); + + size_t pos = i; + size_t count = 0; + while(count < q.max) + { + auto saved = groups; + auto r = match_node(q.sub.get(), pos); + if(!r) + { + groups = std::move(saved); + break; + } + if(*r == pos) + { + groups = std::move(saved); + break; + } + pos = *r; + ++count; + ends.push_back(pos); + snaps.push_back(groups); + if(pos > s.size()) break; + } + + if(q.greedy) + { + for(size_t k = ends.size(); k-- > 0;) + { + if(k < q.min) continue; + groups = snaps[k]; + return ends[k]; + } + } + else + { + for(size_t used = 0; used < ends.size(); ++used) + { + if(used < q.min) continue; + groups = snaps[used]; + return ends[used]; + } + } + return std::nullopt; + } + case Node::CAP: + { + size_t start_i = i; + auto saved = groups; + auto r = match_node(n->cap_sub.get(), i); + if(!r) + { + groups = std::move(saved); + return std::nullopt; + } + groups[n->cap_index] = {start_i, *r}; + return r; + } + } + return std::nullopt; + } +}; + +static int 
+count_captures(const Node& n) +{ + switch(n.kind) + { + case Node::CAP: return std::max(n.cap_index, count_captures(*n.cap_sub)); + case Node::SEQ: + case Node::ALT: + { + int m = 0; + for(const auto& c : n.children) + m = std::max(m, count_captures(c)); + return m; + } + case Node::QUANT: return count_captures(*n.quant->sub); + default: return 0; + } +} + +// Expand replacement with captures for a single match span [b,e) +static std::string +expand_replacement(std::string_view text, + const std::vector>& groups, + size_t b, + size_t e, + std::string_view repl) +{ + std::string out; + const int max_group = static_cast(groups.size()) - 1; // groups[0] = whole match + + for(size_t i = 0; i < repl.size(); ++i) + { + char c = repl[i]; + + if(c != '$' || i + 1 >= repl.size()) + { + out.push_back(c); + continue; + } + + char n1 = repl[i + 1]; + + // $` and $' + if(n1 == '`') + { + out.append(text.substr(0, b)); + ++i; + continue; + } + if(n1 == '\'') + { + out.append(text.substr(e)); + ++i; + continue; + } + + // $& or $0 => whole match + if(n1 == '&' || n1 == '0') + { + out.append(text.substr(b, e - b)); + ++i; + continue; + } + + // $1..$99 (ECMAScript semantics: if two digits are present, always consume both) + if(std::isdigit(static_cast(n1))) + { + int idx = n1 - '0'; + size_t j = i + 2; + + if(j < repl.size() && std::isdigit(static_cast(repl[j]))) + { + int d2 = repl[j] - '0'; + idx = idx * 10 + d2; // ALWAYS consume the second digit if present + ++j; + } + + if(idx >= 0 && idx <= max_group) + { + auto [gb, ge] = groups[static_cast(idx)]; + if(gb != std::string::npos && ge != std::string::npos && ge >= gb) + out.append(text.substr(gb, ge - gb)); + } + + i = j - 1; // advance past digits + continue; + } + + // Otherwise: treat as literal + out.push_back('$'); + out.push_back(n1); + ++i; + } + + return out; +} + +// ============================ Public API =========================== + +bool +regex_match(std::string_view text, std::string_view pattern) +{ + Parser 
P(pattern); + Node ast = P.parse_all(); + + // Build ^ (ast) $ + std::vector seq_nodes; + seq_nodes.emplace_back(Node::bol()); + seq_nodes.emplace_back(std::move(ast)); + seq_nodes.emplace_back(Node::eol()); + Node wrapped = Node::seq(std::move(seq_nodes)); + + FastMatcher M(wrapped, text); + return M.full_match(); +} + +bool +regex_search(std::string_view text, std::string_view pattern) +{ + Parser P(pattern); + Node ast = P.parse_all(); + FastMatcher M(ast, text); + return M.find_first().has_value(); +} + +bool +regex_search(std::string_view text, + std::string_view pattern, + size_t& match_begin, + size_t& match_end) +{ + Parser P(pattern); + Node ast = P.parse_all(); + FastMatcher M(ast, text); + if(auto r = M.find_first()) + { + match_begin = r->first; + match_end = r->second; + return true; + } + return false; +} + +inline std::string +regex_replace(std::string_view text, std::string_view pattern, std::string_view replacement) +{ + Parser P(pattern); + Node ast = P.parse_all(); + const int num_caps = count_captures(ast); + + std::string result; + size_t cur = 0; + const size_t n = text.size(); + + while(cur <= n) + { + // Find first match at or after 'cur' using CaptureMatcher only + bool found = false; + size_t mb = std::string::npos; + size_t me = std::string::npos; + std::vector> groups; + + for(size_t pos = cur; pos <= n; ++pos) + { + CaptureMatcher cap(ast, text, num_caps); + if(cap.run_from(pos)) + { + auto [b0, e0] = cap.groups[0]; + if(b0 != std::string::npos && e0 != std::string::npos && e0 >= b0) + { + found = true; + mb = b0; + me = e0; + groups = std::move(cap.groups); + break; + } + } + } + + if(!found) + { + // No more matches; append the remainder and finish + result.append(text.substr(cur)); + break; + } + + // Append text before the match + result.append(text.substr(cur, mb - cur)); + + // Expand replacement using these exact groups + result += expand_replacement(text, groups, mb, me, replacement); + + // Zero-length guard like standard 
regex_replace + if(me == mb) + { + if(me < n) + { + // copy one char and advance, to ensure progress + result.push_back(text[me]); + cur = me + 1; + } + else + { + // at end: done + break; + } + } + else + { + cur = me; + } + } + + return result; +} + +} // namespace regex +} // namespace common +} // namespace rocprofiler + +// Global forwards for convenience +bool +regex_match(std::string_view s, std::string_view p) +{ + return rocprofiler::common::regex::regex_match(s, p); +} +bool +regex_search(std::string_view s, std::string_view p) +{ + return rocprofiler::common::regex::regex_search(s, p); +} +bool +regex_search(std::string_view s, std::string_view p, size_t& b, size_t& e) +{ + return rocprofiler::common::regex::regex_search(s, p, b, e); +} +std::string +regex_replace(std::string_view s, std::string_view p, std::string_view r) +{ + return rocprofiler::common::regex::regex_replace(s, p, r); +} diff --git a/projects/rocprofiler-sdk/source/lib/common/regex.hpp b/projects/rocprofiler-sdk/source/lib/common/regex.hpp new file mode 100644 index 0000000000..a56476830d --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/regex.hpp @@ -0,0 +1,73 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/** + * @file regex.hpp + * @brief A lightweight regex engine for ROCm profiling tools. + * + * @note This file was generated by an AI. + * + * Namespace: rocprofiler::common::regex + * + * Implemented APIs: + * bool regex_match(std::string_view text, std::string_view pattern) + * bool regex_search(std::string_view text, std::string_view pattern) + * bool regex_search(std::string_view text, std::string_view pattern, + * size_t& begin, size_t& end) + * std::string regex_replace(std::string_view text, std::string_view pattern, + * std::string_view replacement) + * + * Supported regex syntax: + * Literals/escapes (\n \t \\), ., ^, $, (), |, + * character classes [..], ranges, [^..], \d \D \w \W \s \S, + * quantifiers *, +, ?, {m}, {m,}, {m,n} with lazy forms (*? +? ?? {m,n}?). 
+ * + * Replacement tokens in regex_replace: + * $0 or $& : whole match + * $1..$99 : capture groups + * $` : prefix (text before the match) + * $' : suffix (text after the match) + */ + +#pragma once +#include +#include + +namespace rocprofiler +{ +namespace common +{ +namespace regex +{ +// Public API +bool +regex_match(std::string_view text, std::string_view pattern); +bool +regex_search(std::string_view text, std::string_view pattern); +bool +regex_search(std::string_view text, std::string_view pattern, size_t& begin, size_t& end); +std::string +regex_replace(std::string_view text, std::string_view pattern, std::string_view replacement); + +} // namespace regex +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/format_path.cpp b/projects/rocprofiler-sdk/source/lib/output/format_path.cpp index 472f483e2f..19d6f14737 100644 --- a/projects/rocprofiler-sdk/source/lib/output/format_path.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/format_path.cpp @@ -28,6 +28,7 @@ #include "lib/common/environment.hpp" #include "lib/common/filesystem.hpp" #include "lib/common/logging.hpp" +#include "lib/common/regex.hpp" #include "lib/common/units.hpp" #include "lib/common/utility.hpp" #include "lib/output/output_key.hpp" @@ -45,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -59,9 +59,9 @@ namespace tool namespace { const auto env_regexes = - new std::array{std::regex{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"}, - std::regex{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}, - std::regex{"(.*)%q\\{([A-Z0-9_]+)\\}(.*)"}}; + new std::array{std::string{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"}, + std::string{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}, + std::string{"(.*)%q\\{([A-Z0-9_]+)\\}(.*)"}}; // env regex examples: // - %env{USER}% Consistent with other output key formats (start+end with %) // - $ENV{USER} Similar to CMake @@ -115,13 +115,13 @@ format_path_impl(std::string _fpath, const std::vector& 
_keys) for(const auto& _re : *env_regexes) { - while(std::regex_search(_fpath, _re)) + while(rocprofiler::common::regex::regex_search(_fpath, _re)) { - auto _var = std::regex_replace(_fpath, _re, "$3"); + auto _var = rocprofiler::common::regex::regex_replace(_fpath, _re, "$3"); std::string _val = common::get_env(_var, ""); _val = strip_leading_and_replace(_val, {'\t', ' ', '/'}, "_"); - auto _beg = std::regex_replace(_fpath, _re, "$1"); - auto _end = std::regex_replace(_fpath, _re, "$4"); + auto _beg = rocprofiler::common::regex::regex_replace(_fpath, _re, "$1"); + auto _end = rocprofiler::common::regex::regex_replace(_fpath, _re, "$4"); _fpath = fmt::format("{}{}{}", _beg, _val, _end); } } @@ -134,9 +134,9 @@ format_path_impl(std::string _fpath, const std::vector& _keys) // remove %arg% where N >= argc try { - auto _re = std::regex{"(.*)(%|\\{)(arg[0-9]+)(%|\\})([-/_]*)(.*)"}; - while(std::regex_search(_fpath, _re)) - _fpath = std::regex_replace(_fpath, _re, "$1$6"); + auto _re = std::string{"(.*)(%|\\{)(arg[0-9]+)(%|\\})([-/_]*)(.*)"}; + while(rocprofiler::common::regex::regex_search(_fpath, _re)) + _fpath = rocprofiler::common::regex::regex_replace(_fpath, _re, "$1$6"); } catch(std::exception& _e) { ROCP_WARNING << "[rocprofiler] " << __FUNCTION__ << " threw an exception :: " << _e.what() diff --git a/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp b/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp index b1cf68c993..3e4406b35e 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp @@ -67,7 +67,6 @@ #include #include #include -#include #include #include #include @@ -200,10 +199,9 @@ get_uuid() auto replace_uuid(std::string_view inp) { - const auto& _repl = get_uuid(); - return std::regex_replace(std::string{inp}, - std::regex{"\\{\\{uuid\\}\\}"}, - (_repl.empty()) ? 
std::string{} : fmt::format("_{}", _repl)); + const auto& _repl = get_uuid(); + const auto replacement = (_repl.empty()) ? std::string{} : fmt::format("_{}", _repl); + return replace_all(std::string{inp}, std::string_view{"{{uuid}}"}, replacement); } auto diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp index bc6bd52955..6bd72e271d 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp @@ -27,6 +27,7 @@ #include "timestamps.hpp" #include "lib/common/logging.hpp" +#include "lib/common/regex.hpp" #include #include @@ -36,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -429,7 +429,7 @@ generate_stats(const output_config& cfg, { auto _col_name = get_domain_column_name(ditr.first); - if(std::regex_match(_col_name.data(), std::regex{itr})) + if(rocprofiler::common::regex::regex_match(_col_name.data(), itr)) { if(!ditr.second) continue; _names.emplace_back(_col_name); diff --git a/projects/rocprofiler-sdk/source/lib/output/output_key.cpp b/projects/rocprofiler-sdk/source/lib/output/output_key.cpp index 7c0f6007ec..fcd200f3f2 100644 --- a/projects/rocprofiler-sdk/source/lib/output/output_key.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/output_key.cpp @@ -31,7 +31,6 @@ #include #include #include -#include namespace rocprofiler { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index e42d5d6ccc..f172d74c1b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -56,16 +55,6 @@ namespace tool { namespace { -const auto env_regexes = - new 
std::array{std::regex{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"}, - std::regex{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}, - std::regex{"(.*)%q\\{([A-Z0-9_]+)\\}(.*)"}}; -// env regex examples: -// - %env{USER}% Consistent with other output key formats (start+end with %) -// - $ENV{USER} Similar to CMake -// - %q{USER} Compatibility with NVIDIA -// - inline bool not_is_space(int ch) { @@ -344,8 +333,9 @@ format_name(std::string_view _name, const config& _cfg) if(!_cfg.demangle && !_cfg.truncate) return std::string{_name}; // truncating requires demangling first so always demangle - auto _demangled_name = - common::cxx_demangle(std::regex_replace(_name.data(), std::regex{"(\\.kd)$"}, "")); + if(auto kpos = _name.rfind(".kd"); kpos < _name.length() && kpos + 3 == _name.length()) + _name = _name.substr(0, kpos); + auto _demangled_name = common::cxx_demangle(_name); if(_cfg.truncate) return common::truncate_name(_demangled_name); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp index df99f7a950..a2996fe8b2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index 0290dbebf4..14a0ec8ae8 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -32,6 +32,7 @@ #include "lib/common/environment.hpp" #include "lib/common/filesystem.hpp" #include "lib/common/logging.hpp" +#include "lib/common/regex.hpp" #include "lib/common/scope_destructor.hpp" #include "lib/common/simple_timer.hpp" #include "lib/common/static_object.hpp" @@ -940,12 +941,14 @@ 
code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, auto kernel_filter_exclude = tool::get_config().kernel_filter_exclude; auto kernel_filter_range = tool::get_config().kernel_filter_range; - std::regex include_regex(kernel_filter_include); - std::regex exclude_regex(kernel_filter_exclude); - if(std::regex_search(kernel_info->formatted_kernel_name, include_regex)) + std::string_view include_regex(kernel_filter_include); + std::string_view exclude_regex(kernel_filter_exclude); + if(rocprofiler::common::regex::regex_search(kernel_info->formatted_kernel_name, + include_regex)) { if(kernel_filter_exclude.empty() || - !std::regex_search(kernel_info->formatted_kernel_name, exclude_regex)) + !rocprofiler::common::regex::regex_search(kernel_info->formatted_kernel_name, + exclude_regex)) add_kernel_target(sym_data->kernel_id, kernel_filter_range); } } @@ -2197,8 +2200,10 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) } // Handle kernel id of zero - bool include = std::regex_search("0", std::regex(tool::get_config().kernel_filter_include)); - bool exclude = std::regex_search("0", std::regex(tool::get_config().kernel_filter_exclude)); + bool include = + rocprofiler::common::regex::regex_search("0", tool::get_config().kernel_filter_include); + bool exclude = + rocprofiler::common::regex::regex_search("0", tool::get_config().kernel_filter_exclude); if(include && (!exclude || tool::get_config().kernel_filter_exclude.empty())) add_kernel_target(0, tool::get_config().kernel_filter_range); diff --git a/projects/rocprofiler-sdk/source/lib/tests/common/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/tests/common/CMakeLists.txt index 63008a9875..0883b83bd4 100644 --- a/projects/rocprofiler-sdk/source/lib/tests/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/tests/common/CMakeLists.txt @@ -5,8 +5,16 @@ project(rocprofiler-sdk-tests-common LANGUAGES C CXX) include(GoogleTest) -set(common_sources c_array.cpp 
demangling.cpp environment.cpp md5sum.cpp mpl.cpp - parse.cpp sha256.cpp uuid_v7.cpp) +set(common_sources + c_array.cpp + demangling.cpp + environment.cpp + md5sum.cpp + mpl.cpp + parse.cpp + regex.cpp + sha256.cpp + uuid_v7.cpp) add_executable(common-tests) target_sources(common-tests PRIVATE ${common_sources}) diff --git a/projects/rocprofiler-sdk/source/lib/tests/common/regex.cpp b/projects/rocprofiler-sdk/source/lib/tests/common/regex.cpp new file mode 100644 index 0000000000..7e9101229f --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/tests/common/regex.cpp @@ -0,0 +1,666 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/common/regex.hpp" // rocprofiler::common::regex::... + +namespace R = rocprofiler::common::regex; + +// ----------------------------- Helpers ----------------------------- + +struct StdRes +{ + bool ok = false; + size_t b = 0; + size_t e = 0; +}; + +static std::optional +TryStdMatch(std::string_view text, std::string_view pat) +{ + try + { + std::regex re(std::string(pat), std::regex::ECMAScript); + return std::regex_match(std::string(text), re); + } catch(const std::regex_error&) + { + return std::nullopt; // invalid pattern for std::regex + } +} + +static std::optional +TryStdSearch(std::string_view text, std::string_view pat) +{ + try + { + std::regex re(std::string(pat), std::regex::ECMAScript); + std::cmatch m; + std::string s(text); + if(std::regex_search(s.c_str(), m, re)) + { + return StdRes{true, + static_cast(m.position()), + static_cast(m.position() + m.length())}; + } + return StdRes{false, 0, 0}; + } catch(const std::regex_error&) + { + return std::nullopt; // invalid pattern + } +} + +static std::optional +TryStdReplace(std::string_view text, std::string_view pat, std::string_view repl) +{ + try + { + std::regex re(std::string(pat), std::regex::ECMAScript); + return std::regex_replace(std::string(text), re, std::string(repl)); + } catch(const std::regex_error&) + 
{ + return std::nullopt; + } +} + +// ----------------------------- Tests ------------------------------- + +TEST(regex_parity, literals_and_escapes) +{ + // Avoid invalid ECMAScript escapes that std::regex rejects (e.g., "\c"). + struct C + { + const char* s; + const char* p; + }; + std::vector cases = { + {"abc", "abc"}, + {"a\nb", "a\\nb"}, + {"a\tb", "a\\tb"}, + {"\\", "\\\\"}, + {"a", "a\\n"}, // literal 'n' after backslash for our engine & ECMAScript + }; + for(auto& c : cases) + { + auto sm = TryStdMatch(c.s, c.p); + if(!sm.has_value()) continue; // skip invalid for std::regex + EXPECT_EQ(R::regex_match(c.s, c.p), sm.value()); + + auto ss = TryStdSearch(c.s, c.p); + if(!ss.has_value()) continue; + EXPECT_EQ(R::regex_search(c.s, c.p), ss->ok); + if(ss->ok) + { + size_t b = 0; + size_t e = 0; + ASSERT_TRUE(R::regex_search(c.s, c.p, b, e)); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); + } + } +} + +TEST(regex_parity, dot_and_anchors) +{ + auto cmp = [&](std::string s, std::string p) { + auto sm = TryStdMatch(s, p); + if(!sm) return; + EXPECT_EQ(R::regex_match(s, p), *sm); + + auto ss = TryStdSearch(s, p); + if(!ss) return; + EXPECT_EQ(R::regex_search(s, p), ss->ok); + if(ss->ok) + { + size_t b = 0; + size_t e = 0; + ASSERT_TRUE(R::regex_search(s, p, b, e)); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); + } + }; + cmp("abc", "a.c"); + cmp("abc", "^abc$"); + cmp("zzzHello", "^Hello"); + cmp("Hello world", "^Hello"); + cmp("world!", "world!$"); +} + +TEST(regex_parity, char_classes_and_shortcuts) +{ + std::vector> pats = {{"abc", "[a-c][a-c][a-c]"}, + {"abc", "[^0-9]+"}, + {"A_", "\\w\\w"}, + {"9 ", "\\d\\s"}, + {"__", "\\W\\W"}, + {"Z5z", "[A-Z]\\d[a-z]"}}; + for(auto& [s, p] : pats) + { + auto sm = TryStdMatch(s, p); + if(!sm) continue; + EXPECT_EQ(R::regex_match(s, p), *sm); + + auto ss = TryStdSearch(s, p); + if(!ss) continue; + EXPECT_EQ(R::regex_search(s, p), ss->ok); + if(ss->ok) + { + size_t b = 0; + size_t e = 0; + ASSERT_TRUE(R::regex_search(s, p, b, 
e)); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); + } + } +} + +TEST(regex_parity, alternation_and_grouping) +{ + std::string s = "abc123xyz"; + std::string p = "(abc|def)\\d{3}xyz"; + auto sm = TryStdMatch(s, p); + ASSERT_TRUE(sm.has_value()); + EXPECT_EQ(R::regex_match(s, p), *sm); + + auto ss = TryStdSearch(s, p); + ASSERT_TRUE(ss.has_value()); + EXPECT_EQ(R::regex_search(s, p), ss->ok); + if(ss->ok) + { + size_t b = 0; + size_t e = 0; + ASSERT_TRUE(R::regex_search(s, p, b, e)); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); + } + + EXPECT_TRUE(R::regex_search("foo bar", "(foo|bar)")); + EXPECT_FALSE(R::regex_search("zzz", "(foo|bar)")); +} + +TEST(regex_parity, quantifiers_greedy) +{ + struct C + { + const char* s; + const char* p; + }; + std::vector cases = { + {"", "a*"}, + {"aaa", "a+"}, + {"aaab", "a+b"}, + {"abbb", "ab{3}"}, + {"abbbbb", "ab{3,}"}, + {"acccb", "ac{2,3}b"}, + }; + for(auto& c : cases) + { + auto sm = TryStdMatch(c.s, c.p); + ASSERT_TRUE(sm.has_value()); + EXPECT_EQ(R::regex_match(c.s, c.p), *sm); + } +} + +TEST(regex_parity, backtracking_across_tail) +{ + const std::string s = "/prefix/%env{ARBITRARY_ENV_VARIABLE}%/suffix.txt"; + const std::string p = "(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"; + auto ss = TryStdSearch(s, p); + ASSERT_TRUE(ss.has_value()); + size_t b = 0; + size_t e = 0; + ASSERT_TRUE(R::regex_search(s, p, b, e)); + ASSERT_TRUE(ss->ok); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); +} + +TEST(regex_parity, search_span) +{ + const std::string s = "xx abcd123 yy"; + const std::string p = "[a-z]+\\d+"; + auto ss = TryStdSearch(s, p); + ASSERT_TRUE(ss.has_value()); + size_t b = 999; + size_t e = 999; + ASSERT_TRUE(R::regex_search(s, p, b, e)); + ASSERT_TRUE(ss->ok); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); +} + +TEST(regex_parity, replace_whole_and_groups) +{ + const std::string s = "abc123def"; + const std::string p = "(\\w+?)(\\d+)(\\w+)"; + auto r1 = TryStdReplace(s, p, "[$&]"); + ASSERT_TRUE(r1.has_value()); + auto r2 = 
TryStdReplace(s, p, "($1)-{$2}-<$3>"); + ASSERT_TRUE(r2.has_value()); + auto r3 = TryStdReplace(s, "(\\d+)", "pre($`) mid($&) post($')"); + ASSERT_TRUE(r3.has_value()); + + EXPECT_EQ(R::regex_replace(s, p, "[$&]"), *r1); + EXPECT_EQ(R::regex_replace(s, p, "($1)-{$2}-<$3>"), *r2); + EXPECT_EQ(R::regex_replace(s, "(\\d+)", "pre($`) mid($&) post($')"), *r3); +} + +TEST(regex_parity, replace_global_multiple_hits) +{ + const std::string s = "a1 b22 c333"; + const std::string p = "(\\d+)"; + auto sr = TryStdReplace(s, p, "[$1]"); + ASSERT_TRUE(sr.has_value()); + EXPECT_EQ(R::regex_replace(s, p, "[$1]"), *sr); +} + +TEST(regex_parity, replace_two_digit_capture_index) +{ + // 11 groups: 1=outer, 2=a, ..., 10=i, 11=j + const std::string s = "abcdefghij"; + const std::string p = "((a)(b)(c)(d)(e)(f)(g)(h)(i)(j))"; + auto sr = TryStdReplace(s, p, "$10"); + ASSERT_TRUE(sr.has_value()); + EXPECT_EQ(R::regex_replace(s, p, "$10"), *sr); // should be "i" +} + +TEST(regex_parity, env_patterns_from_issue) +{ + const std::string fpath = "/home/user/summary/%env{ARBITRARY_ENV_VARIABLE}%/out_summary.txt"; + + const std::vector pats = { + "(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)", // should match + "(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)", // should NOT match + "(.*)%q\\{([A-Z0-9_]+)\\}(.*)" // should NOT match here + }; + + for(auto& p : pats) + { + auto ss = TryStdSearch(fpath, p); + ASSERT_TRUE(ss.has_value()); + size_t b = 0; + size_t e = 0; + bool got = R::regex_search(fpath, p, b, e); + EXPECT_EQ(got, ss->ok) << "pattern: " << p; + if(ss->ok) + { + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); + // Check common replacements + auto r1 = TryStdReplace(fpath, p, "$1"); + ASSERT_TRUE(r1.has_value()); + auto r3 = TryStdReplace(fpath, p, "$3"); + ASSERT_TRUE(r3.has_value()); + auto r4 = TryStdReplace(fpath, p, "$4"); + ASSERT_TRUE(r4.has_value()); + EXPECT_EQ(R::regex_replace(fpath, p, "$1"), *r1); + EXPECT_EQ(R::regex_replace(fpath, p, "$3"), *r3); + EXPECT_EQ(R::regex_replace(fpath, p, 
"$4"), *r4); + } + } +} + +TEST(regex_parity, zero_length_and_empty) +{ + auto sm = TryStdMatch("", "a*"); + ASSERT_TRUE(sm.has_value()); + EXPECT_EQ(R::regex_match("", "a*"), *sm); + + auto ss = TryStdSearch("", ""); + ASSERT_TRUE(ss.has_value()); + EXPECT_EQ(R::regex_search("", ""), ss->ok); + if(ss->ok) + { + size_t b = 0; + size_t e = 0; + ASSERT_TRUE(R::regex_search("", "", b, e)); + EXPECT_EQ(b, ss->b); + EXPECT_EQ(e, ss->e); + } +} + +TEST(regex_parity, bad_patterns_throw) +{ + // Both should throw on bad syntax we recognize (unterminated brackets/parens) + EXPECT_THROW({ R::regex_search("abc", "[a-z"); }, std::runtime_error); + EXPECT_THROW({ R::regex_match("abc", "(abc"); }, std::runtime_error); + EXPECT_THROW({ R::regex_replace("abc", "[", "x"); }, std::runtime_error); + + EXPECT_THROW( + { + std::regex re("[a-z", std::regex::ECMAScript); + (void) re; + }, + std::regex_error); + EXPECT_THROW( + { + std::regex re("(abc", std::regex::ECMAScript); + (void) re; + }, + std::regex_error); + EXPECT_THROW( + { + std::regex re("[", std::regex::ECMAScript); + (void) re; + }, + std::regex_error); +} + +// --- LAZY QUANTIFIERS ------------------------------------------------- +TEST(regex_parity, lazy_quantifiers) +{ + const std::string s = "a---b---c"; + const std::string p = "a.*?b"; + size_t b1 = 0; + size_t e1 = 0; + size_t b2 = 0; + size_t e2 = 0; + // search span parity + ASSERT_TRUE(R::regex_search(s, p, b1, e1)); + std::regex re(p, std::regex::ECMAScript); + std::cmatch m; + ASSERT_TRUE(std::regex_search(s.c_str(), m, re)); + b2 = static_cast(m.position()); + e2 = b2 + static_cast(m.length()); + EXPECT_EQ(b1, b2); + EXPECT_EQ(e1, e2); + // replace should touch minimal range + std::string r1 = R::regex_replace(s, p, "X"); + std::string r2 = std::regex_replace(s, re, "X"); + EXPECT_EQ(r1, r2); +} + +// --- CAPTURE VALUE IS LAST ITERATION OF A QUANTIFIED GROUP ----------- +TEST(regex_parity, capture_is_last_iteration) +{ + const std::string s = "ababab"; + const 
std::string p = "(ab)*"; + // Replacing the match with $1 should be "ab" (last repetition) + EXPECT_EQ(R::regex_replace(s, p, "$1"), std::regex_replace(s, std::regex(p), "$1")); +} + +// --- ALTERNATION CHOICE (LEFT-TO-RIGHT) ------------------------------- +TEST(regex_parity, alternation_preference) +{ + // (a|ab)b on "abb" prefers 'a' alternative (leftmost that leads to a match) + const std::string s = "abb"; + const std::string p = "(a|ab)b"; + std::string r1 = R::regex_replace(s, p, "($1)"); + std::string r2 = std::regex_replace(s, std::regex(p), "($1)"); + EXPECT_EQ(r1, r2); // should be "(a)" +} + +// --- CHARACTER CLASS CORNER CASES ------------------------------------- +TEST(regex_parity, class_hyphen_literal_edges) +{ + // '-' first/last is literal + EXPECT_TRUE(R::regex_match("-", "[-a]")); + EXPECT_TRUE(std::regex_match(std::string("-"), std::regex("[-a]"))); + EXPECT_TRUE(R::regex_match("a", "[-a]")); + EXPECT_TRUE(std::regex_match(std::string("a"), std::regex("[-a]"))); +} + +TEST(regex_parity, class_escaped_bracket_and_negated_shorthand) +{ + // Escaped ']' inside class + EXPECT_TRUE(R::regex_match("]", "[\\]]")); + EXPECT_TRUE(std::regex_match(std::string("]"), std::regex("[\\]]"))); + // Negated digit class allows letters + EXPECT_TRUE(R::regex_match("g", "[^\\d]")); + EXPECT_TRUE(std::regex_match(std::string("g"), std::regex("[^\\d]"))); +} + +// --- ANCHORS WITH EMPTY STRING ---------------------------------------- +TEST(regex_parity, anchors_empty_string) +{ + EXPECT_EQ(R::regex_match("", "^$"), std::regex_match(std::string(""), std::regex("^$"))); +} + +// --- QUANTIFIER {0} ZERO REPS ----------------------------------------- +TEST(regex_parity, quantifier_zero_reps_matches_empty) +{ + // {0} should match empty; compare match result + const std::string p = "a{0}"; + EXPECT_EQ(R::regex_search("xyz", p), + (bool) std::regex_search(std::string("xyz"), std::regex(p))); + EXPECT_EQ(R::regex_match("", p), std::regex_match(std::string(""), 
std::regex(p))); +} + +// --- REPLACEMENT TOKEN CORNER CASES ----------------------------------- +TEST(regex_parity, replacement_one_digit_then_literal) +{ + // When only one group exists, "$10" == "$1" + "0" + const std::string s = "a"; + const std::string p = "(a)"; + EXPECT_EQ(R::regex_replace(s, p, "$10"), std::regex_replace(s, std::regex(p), "$10")); +} + +TEST(regex_parity, replacement_unknown_two_digit_group_falls_back) +{ + // With only 1 capture, "$99" -> ($9 empty) + "9" => "9" + const std::string s = "a"; + const std::string p = "(a)"; + EXPECT_EQ(R::regex_replace(s, p, "$99"), std::regex_replace(s, std::regex(p), "$99")); +} + +TEST(regex_parity, replacement_dollar_at_end_is_literal) +{ + const std::string s = "abc123"; + const std::string p = "\\d+"; + EXPECT_EQ(R::regex_replace(s, p, "x$"), std::regex_replace(s, std::regex(p), "x$")); +} + +TEST(regex_parity, replacement_whole_match_aliases) +{ + const std::string s = "abc123def"; + const std::string p = "(\\w+?)(\\d+)(\\w+)"; + EXPECT_EQ(R::regex_replace(s, p, "<$0>"), std::regex_replace(s, std::regex(p), "<$0>")); + EXPECT_EQ(R::regex_replace(s, p, "<$&>"), std::regex_replace(s, std::regex(p), "<$&>")); +} + +// --- CAPTURE INDEXING STABILITY WITH NESTED GROUPS --------------------- +TEST(regex_parity, nested_capture_indices_left_to_right) +{ + // Ensure left-to-right numbering at '(' is honored + const std::string s = "xyz"; + const std::string p = "((x)(y))(z)"; + // Expect $1="xy", $2="x", $3="y", $4="z" + EXPECT_EQ(R::regex_replace(s, p, "$1|$2|$3|$4"), + std::regex_replace(s, std::regex(p), "$1|$2|$3|$4")); +} + +// --- COMPATIBILITY AND INTERFACE TESTS ------------------------------------------- +TEST(regex_compatibility, deterministic_behavior) +{ + // This test verifies that our implementation doesn't depend on std::regex + // across different usage patterns and string types + + std::vector> test_cases = { + {"hello world", "hello.*world"}, + {"file_v1.2.3.txt", "v(\\d+)\\.(\\d+)\\.(\\d+)"}, 
+ {"path/to/file", "path/to/.*"}, + {"123-456-7890", "\\d{3}-\\d{3}-\\d{4}"}, + {"", "a*"}, // Empty string edge case + {"abcdef", "[a-f]+"}}; + + for(const auto& [text, pattern] : test_cases) + { + // Operations should be deterministic and repeatable + bool match_result = R::regex_match(text, pattern); + bool search_result = R::regex_search(text, pattern); + + // Results should be consistent across multiple calls + EXPECT_EQ(match_result, R::regex_match(text, pattern)) + << "Match result inconsistent for: " << text << " with pattern: " << pattern; + EXPECT_EQ(search_result, R::regex_search(text, pattern)) + << "Search result inconsistent for: " << text << " with pattern: " << pattern; + } +} + +TEST(regex_compatibility, string_interface_types) +{ + // Test that our implementation works correctly with different string interface types + // (const char*, std::string, std::string_view) + + const char* c_str = "test string with numbers 123"; + std::string std_str = "test string with numbers 123"; + std::string_view sv = std_str; + + const std::string pattern = "\\d+"; + + // All these should produce the same results + bool c_str_result = R::regex_search(c_str, pattern); + bool std_str_result = R::regex_search(std_str, pattern); + bool sv_result = R::regex_search(sv, pattern); + + EXPECT_EQ(c_str_result, std_str_result); + EXPECT_EQ(std_str_result, sv_result); + EXPECT_TRUE(c_str_result); // Should find "123" + + // Test position results are consistent + size_t c_begin, c_end, s_begin, s_end, sv_begin, sv_end; + EXPECT_TRUE(R::regex_search(c_str, pattern, c_begin, c_end)); + EXPECT_TRUE(R::regex_search(std_str, pattern, s_begin, s_end)); + EXPECT_TRUE(R::regex_search(sv, pattern, sv_begin, sv_end)); + + EXPECT_EQ(c_begin, s_begin); + EXPECT_EQ(s_begin, sv_begin); + EXPECT_EQ(c_end, s_end); + EXPECT_EQ(s_end, sv_end); +} + +TEST(regex_compatibility, build_system_patterns) +{ + // Test patterns commonly used in build systems and deployment scenarios + + struct TestCase + 
{ + std::string text; + std::string pattern; + std::string replacement; + std::string expected; + }; + + std::vector cases = { + // Environment variable patterns (common in build systems) + {"/path/%env{HOME}%/file", "%(env|ENV)\\{([A-Z_]+)\\}%", "${$2}", "/path/${HOME}/file"}, + + // Version patterns (common in package management) + {"package-1.2.3", "(\\w+)-(\\d+)\\.(\\d+)\\.(\\d+)", "$1_v$2_$3_$4", "package_v1_2_3"}, + + // Path patterns (common in file systems) + {"/usr/lib64/libfoo.so.1", + "/usr/lib(\\d*)/([^/]+)\\.so\\.(\\d+)", + "lib$1/$2.so.$3", + "lib64/libfoo.so.1"}, + + // Architecture patterns (common in cross-compilation) + {"x86_64-linux-gnu-gcc", + "(\\w+)-(\\w+)-(\\w+)-(\\w+)", + "$1/$2/$3/$4", + "x86_64/linux/gnu/gcc"}}; + + for(const auto& test_case : cases) + { + std::string result = + R::regex_replace(test_case.text, test_case.pattern, test_case.replacement); + EXPECT_EQ(result, test_case.expected) + << "Failed for text: " << test_case.text << " pattern: " << test_case.pattern + << " replacement: " << test_case.replacement; + } +} + +TEST(regex_compatibility, memory_safety) +{ + // Test that our implementation doesn't have memory issues that could be + + std::vector large_texts; + const std::string base_text = "This is a test string with numbers 123 and more text "; + + // Create larger strings to test memory handling + for(int i = 0; i < 10; ++i) + { + std::string large_text; + for(int j = 0; j < 100; ++j) + { + large_text += base_text + std::to_string(j) + " "; + } + large_texts.push_back(large_text); + } + + const std::string pattern = "\\d+"; + + for(const auto& text : large_texts) + { + // These operations should not cause memory corruption or leaks + size_t matches = 0; + size_t pos = 0; + + while(pos < text.length()) + { + size_t begin, end; + if(R::regex_search(text.substr(pos), pattern, begin, end)) + { + matches++; + pos += begin + (end - begin); + } + else + { + break; + } + } + + EXPECT_GT(matches, 0) << "Should find numbers in the 
large text"; + EXPECT_LT(matches, 2000) << "Shouldn't find unreasonable number of matches"; + } +} + +TEST(regex_compatibility, thread_safety) +{ + // Test that our implementation is thread-safe and doesn't have + + const std::string text = "concurrent test 123 with multiple threads 456"; + const std::string pattern = "\\d+"; + std::atomic success_count{0}; + std::atomic failure_count{0}; + + auto worker = [&]() { + for(int i = 0; i < 100; ++i) + { + try + { + size_t begin, end; + if(R::regex_search(text, pattern, begin, end)) + { + success_count++; + } + else + { + failure_count++; + } + } catch(...) + { + failure_count++; + } + } + }; + + std::vector threads; + for(int i = 0; i < 4; ++i) + { + threads.emplace_back(worker); + } + + for(auto& thread : threads) + { + thread.join(); + } + + EXPECT_EQ(success_count.load(), 400) << "All regex operations should succeed"; + EXPECT_EQ(failure_count.load(), 0) << "No regex operations should fail"; +}