diff --git a/build.zig b/build.zig index 2934099..203f929 100644 --- a/build.zig +++ b/build.zig @@ -31,8 +31,14 @@ pub fn build(b: *std.Build) void { // ── mcp-zig dependency ── const mcp_dep = b.dependency("mcp_zig", .{}); exe.root_module.addImport("mcp", mcp_dep.module("mcp")); + + // ── nanoregex dependency ── + const nanoregex_dep = b.dependency("nanoregex", .{}); + exe.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex")); + b.installArtifact(exe); + // ── macOS codesign (ad-hoc by default; configurable for release builds) ── if (target.result.os.tag == .macos and builtin.os.tag == .macos) { const codesign = b.addSystemCommand(&.{ "codesign", "-f", "-s", codesign_identity }); @@ -58,6 +64,7 @@ pub fn build(b: *std.Build) void { }), }); tests.root_module.addImport("mcp", mcp_dep.module("mcp")); + tests.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex")); if (test_filter) |f| { const filters = b.allocator.alloc([]const u8, 1) catch @panic("oom"); filters[0] = f; @@ -68,6 +75,7 @@ pub fn build(b: *std.Build) void { const tests_run = b.addRunArtifact(tests); test_step.dependOn(&tests_run.step); + // ── Library tests (verify the module root compiles) ── const lib_tests = b.addTest(.{ .root_module = b.createModule(.{ @@ -88,8 +96,10 @@ pub fn build(b: *std.Build) void { .link_libc = true, }), }); + adversarial_tests.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex")); test_step.dependOn(&b.addRunArtifact(adversarial_tests).step); + // ── Benchmarks ── const bench = b.addExecutable(.{ .name = "bench", @@ -102,6 +112,7 @@ pub fn build(b: *std.Build) void { }); const bench_run = b.addRunArtifact(bench); bench.root_module.addImport("mcp", mcp_dep.module("mcp")); + bench.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex")); if (b.args) |args| bench_run.addArgs(args); const bench_step = b.step("bench", "Run benchmarks"); bench_step.dependOn(&bench_run.step); @@ -117,6 +128,7 @@ pub fn build(b: 
*std.Build) void { }), }); benchmark.root_module.addImport("mcp", mcp_dep.module("mcp")); + benchmark.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex")); const benchmark_run = b.addRunArtifact(benchmark); if (b.args) |args| benchmark_run.addArgs(args); const benchmark_step = b.step("benchmark", "Run repo benchmark (use -- --root /path/to/repo)"); @@ -137,6 +149,7 @@ pub fn build(b: *std.Build) void { .optimize = .ReleaseSmall, }), }); + wasm.root_module.addImport("nanoregex", nanoregex_dep.module("nanoregex")); wasm.rdynamic = true; wasm.entry = .disabled; diff --git a/build.zig.zon b/build.zig.zon index 2cb4cde..c2bd488 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -8,6 +8,10 @@ .url = "https://github.com/justrach/mcp-zig/archive/refs/heads/feature/7-zig-0-16-0-migration.tar.gz", .hash = "mcp_zig-0.2.0-_PilzNJkAQADzH2t3vqpd_nl_W0ta-gDaumXKttuPyBy", }, + .nanoregex = .{ + .url = "https://github.com/justrach/nanoregex/archive/refs/heads/main.tar.gz", + .hash = "nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9", + }, }, .paths = .{ "src", diff --git a/src/explore.zig b/src/explore.zig index e53e38b..b69f9cb 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const nanoregex = @import("nanoregex"); const cio = @import("cio.zig"); const Store = @import("store.zig").Store; const idx = @import("index.zig"); @@ -4015,11 +4016,14 @@ pub const Explorer = struct { } fn searchInContentRegexWithScope(self: *Explorer, path: []const u8, content: []const u8, pattern: []const u8, allocator: std.mem.Allocator, max_results: usize, result_list: *std.ArrayList(ScopedSearchResult)) !void { + var rx = nanoregex.Regex.compile(allocator, pattern) catch return; + defer rx.deinit(); var line_num: u32 = 0; var lines = std.mem.splitScalar(u8, content, '\n'); while (lines.next()) |line| { line_num += 1; - if (regexMatch(line, pattern)) { + if (rx.search(allocator, line) catch null) |m| { + 
@constCast(&m).deinit(allocator); const line_text = try allocator.dupe(u8, line); errdefer allocator.free(line_text); const path_copy = try allocator.dupe(u8, path); @@ -4264,11 +4268,14 @@ fn matchAtCaseInsensitive(content: []const u8, pos: usize, query: []const u8) bo } fn searchInContentRegex(path: []const u8, content: []const u8, pattern: []const u8, allocator: std.mem.Allocator, max_results: usize, result_list: *std.ArrayList(SearchResult)) !void { + var rx = nanoregex.Regex.compile(allocator, pattern) catch return; + defer rx.deinit(); var line_num: u32 = 0; var lines = std.mem.splitScalar(u8, content, '\n'); while (lines.next()) |line| { line_num += 1; - if (regexMatch(line, pattern)) { + if (rx.search(allocator, line) catch null) |m| { + @constCast(&m).deinit(allocator); const line_text = try allocator.dupe(u8, line); errdefer allocator.free(line_text); const path_copy = try allocator.dupe(u8, path); @@ -4283,313 +4290,16 @@ fn searchInContentRegex(path: []const u8, content: []const u8, pattern: []const } } -/// Simple regex matcher — supports: . \s \w \d \S \W \D [chars] [^chars] -/// * + ? ^ $ | () and escaped literals. -/// Uses backtracking. Searches for a match anywhere in the string (unanchored). pub fn regexMatch(haystack: []const u8, pattern: []const u8) bool { - // Iterate through top-level | separators to prevent stack overflow with - // many alternation branches. No recursion; no fixed-size buffer needed. 
- var prev: usize = 0; - var i: usize = 0; - var depth: usize = 0; - var in_bracket = false; - while (i < pattern.len) { - const c = pattern[i]; - if (c == '\\' and i + 1 < pattern.len) { - i += 2; - continue; - } - if (c == '[') { - in_bracket = true; - i += 1; - continue; - } - if (c == ']') { - in_bracket = false; - i += 1; - continue; - } - if (in_bracket) { - i += 1; - continue; - } - if (c == '(') { - depth += 1; - i += 1; - continue; - } - if (c == ')') { - if (depth > 0) depth -= 1; - i += 1; - continue; - } - if (c == '|' and depth == 0) { - if (regexMatchSingle(haystack, pattern[prev..i])) return true; - prev = i + 1; - } - i += 1; - } - return regexMatchSingle(haystack, pattern[prev..]); -} - -fn regexMatchSingle(haystack: []const u8, pattern: []const u8) bool { - if (pattern.len > 0 and pattern[0] == '^') { - return matchHere(haystack, pattern[1..], 0); - } - // Try match at every position (unanchored search) - for (0..haystack.len + 1) |start| { - if (matchHere(haystack, pattern, start)) return true; - } - return false; -} - -fn matchHere(haystack: []const u8, pattern: []const u8, pos: usize) bool { - var p: usize = 0; - var h: usize = pos; - - while (p < pattern.len) { - // End anchor - if (pattern[p] == '$' and p + 1 == pattern.len) { - return h == haystack.len; - } - - // Alternation handled at top level in regexMatch - if (pattern[p] == '|') return false; - - // Grouping with parens — handle alternation inside groups - if (pattern[p] == '(') { - // Find matching closing paren - var depth: usize = 1; - var end = p + 1; - while (end < pattern.len and depth > 0) { - if (pattern[end] == '\\' and end + 1 < pattern.len) { - end += 2; - continue; - } - if (pattern[end] == '(') depth += 1; - if (pattern[end] == ')') depth -= 1; - if (depth > 0) end += 1; - } - // end now points at ')' (or pattern.len if unmatched) - const group_end = if (end < pattern.len) end else pattern.len; - const group_content = pattern[p + 1 .. 
group_end]; - const after_group = if (group_end + 1 <= pattern.len) pattern[group_end + 1 ..] else ""; - - // Split group content on top-level | within this group - var branch_start: usize = 0; - var d: usize = 0; - var i: usize = 0; - while (i < group_content.len) { - if (group_content[i] == '\\' and i + 1 < group_content.len) { - i += 2; - continue; - } - if (group_content[i] == '(') d += 1; - if (group_content[i] == ')') { - if (d > 0) d -= 1; - } - if (group_content[i] == '|' and d == 0) { - // Try this branch - if (matchGroupBranch(haystack, group_content[branch_start..i], after_group, h)) return true; - branch_start = i + 1; - } - i += 1; - } - // Try last branch - return matchGroupBranch(haystack, group_content[branch_start..], after_group, h); - } - - if (pattern[p] == ')') { - p += 1; - continue; - } - - // Check for quantifier following current element - const elem_end = elementEnd(pattern, p); - if (elem_end < pattern.len) { - const qc = pattern[elem_end]; - if (qc == '*') { - return matchQuantified(haystack, pattern, p, elem_end, elem_end + 1, 0, h); - } - if (qc == '+') { - return matchQuantified(haystack, pattern, p, elem_end, elem_end + 1, 1, h); - } - if (qc == '?') { - // Try with one match - if (h < haystack.len and matchElement(haystack[h], pattern, p, elem_end)) { - if (matchHere(haystack, pattern[elem_end + 1 ..], h + 1)) return true; - } - // Try without - return matchHere(haystack, pattern[elem_end + 1 ..], h); - } - if (qc == '{') { - // Parse {n}, {n,}, {n,m} - var qi = elem_end + 1; - var min_rep: usize = 0; - while (qi < pattern.len and pattern[qi] >= '0' and pattern[qi] <= '9') { - min_rep = min_rep * 10 + (pattern[qi] - '0'); - qi += 1; - } - var max_rep: usize = min_rep; // default {n} = exactly n - if (qi < pattern.len and pattern[qi] == ',') { - qi += 1; - if (qi < pattern.len and pattern[qi] >= '0' and pattern[qi] <= '9') { - max_rep = 0; - while (qi < pattern.len and pattern[qi] >= '0' and pattern[qi] <= '9') { - max_rep = max_rep 
* 10 + (pattern[qi] - '0'); - qi += 1; - } - } else { - max_rep = 256; // {n,} = at least n, cap at 256 - } - } - if (qi < pattern.len and pattern[qi] == '}') { - qi += 1; // skip '}' - return matchQuantifiedRange(haystack, pattern, p, elem_end, qi, min_rep, max_rep, h); - } - // Malformed {…} — treat as literal - } - } - - // No quantifier — must match exactly one char - if (h >= haystack.len) return false; - if (!matchElement(haystack[h], pattern, p, elem_end)) return false; - h += 1; - p = elem_end; - } - - return true; // pattern exhausted — match -} - -/// Try matching a group branch followed by the rest of the pattern. -fn matchGroupBranch(haystack: []const u8, branch: []const u8, after: []const u8, pos: usize) bool { - // Concatenate branch + after conceptually by matching branch first, - // then continuing with after at the new position. - // matchHere on branch tells us how far it consumes. - // We need to try every possible consumption length of the branch. - return matchBranchThenRest(haystack, branch, after, pos); -} - -fn matchBranchThenRest(haystack: []const u8, branch: []const u8, rest: []const u8, pos: usize) bool { - // If branch is empty, just try matching the rest - if (branch.len == 0) return matchHere(haystack, rest, pos); - - // We need to find how many chars the branch consumes, then match rest. - // Build a temporary combined pattern: branch + rest - // This is safe because both are slices of the same original pattern string, - // but they may not be adjacent. Use a simple approach: match branch, track position. - var buf: [4096]u8 = undefined; - if (branch.len + rest.len > buf.len) return false; - @memcpy(buf[0..branch.len], branch); - @memcpy(buf[branch.len .. branch.len + rest.len], rest); - return matchHere(haystack, buf[0 .. branch.len + rest.len], pos); -} - -/// Match a quantified element (greedy). 
-fn matchQuantified(haystack: []const u8, pattern: []const u8, elem_start: usize, elem_end: usize, rest_start: usize, min_count: usize, start_pos: usize) bool { - // Count max matches - var count: usize = 0; - var h = start_pos; - while (h < haystack.len and matchElement(haystack[h], pattern, elem_start, elem_end)) { - count += 1; - h += 1; - } - // Greedy: try from max matches down to min - var c: usize = count + 1; - while (c > min_count) { - c -= 1; - if (matchHere(haystack, pattern[rest_start..], start_pos + c)) return true; - } - return false; -} - -/// Match a {n,m} quantified element (greedy). -fn matchQuantifiedRange(haystack: []const u8, pattern: []const u8, elem_start: usize, elem_end: usize, rest_start: usize, min_count: usize, max_count: usize, start_pos: usize) bool { - // Count max matches up to max_count - var count: usize = 0; - var h = start_pos; - while (h < haystack.len and count < max_count and matchElement(haystack[h], pattern, elem_start, elem_end)) { - count += 1; - h += 1; - } - if (count < min_count) return false; - // Greedy: try from max matches down to min - var c: usize = count + 1; - while (c > min_count) { - c -= 1; - if (matchHere(haystack, pattern[rest_start..], start_pos + c)) return true; + var rx = nanoregex.Regex.compile(std.heap.smp_allocator, pattern) catch return false; + defer rx.deinit(); + if (rx.search(std.heap.smp_allocator, haystack) catch null) |m| { + @constCast(&m).deinit(std.heap.smp_allocator); + return true; } return false; } -/// Return the index past the current element in the pattern. 
-fn elementEnd(pattern: []const u8, p: usize) usize { - if (p >= pattern.len) return p; - if (pattern[p] == '\\' and p + 1 < pattern.len) return p + 2; - if (pattern[p] == '[') { - var i = p + 1; - if (i < pattern.len and pattern[i] == '^') i += 1; - if (i < pattern.len and pattern[i] == ']') i += 1; - while (i < pattern.len and pattern[i] != ']') : (i += 1) {} - if (i < pattern.len) i += 1; - return i; - } - if (pattern[p] == '.') return p + 1; - return p + 1; -} - -/// Match a single character against a pattern element. -fn matchElement(c: u8, pattern: []const u8, start: usize, end: usize) bool { - if (start >= end) return false; - - // Dot matches any char - if (pattern[start] == '.' and end == start + 1) return true; - - // Escape sequences - if (pattern[start] == '\\' and end == start + 2) { - return switch (pattern[start + 1]) { - 'd' => std.ascii.isDigit(c), - 'D' => !std.ascii.isDigit(c), - 'w' => std.ascii.isAlphanumeric(c) or c == '_', - 'W' => !(std.ascii.isAlphanumeric(c) or c == '_'), - 's' => c == ' ' or c == '\t' or c == '\n' or c == '\r', - 'S' => !(c == ' ' or c == '\t' or c == '\n' or c == '\r'), - 'b', 'B' => false, // word boundary — not a char match - else => c == pattern[start + 1], - }; - } - - // Character class [...] - if (pattern[start] == '[') { - var i = start + 1; - var negate = false; - if (i < end and pattern[i] == '^') { - negate = true; - i += 1; - } - var matched = false; - // Handle literal ] at start of class (e.g. 
[]] or [^]]) - if (i < end and pattern[i] == ']') { - if (c == ']') matched = true; - i += 1; - } - while (i < end and pattern[i] != ']') { - // Range: a-z, but only if '-' is not at end of class - if (i + 2 < end and pattern[i + 1] == '-' and pattern[i + 2] != ']') { - if (c >= pattern[i] and c <= pattern[i + 2]) matched = true; - i += 3; - } else { - if (c == pattern[i]) matched = true; - i += 1; - } - } - return if (negate) !matched else matched; - } - - // Literal - return c == pattern[start]; -} - fn indexOfCaseInsensitive(haystack: []const u8, needle: []const u8) ?usize { if (needle.len == 0) return 0; if (needle.len > haystack.len) return null; diff --git a/src/tests.zig b/src/tests.zig index 2103592..361d111 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -2634,6 +2634,17 @@ test "regexMatch: dot-star" { try testing.expect(regexMatch("helloworld", "hello.*world")); } +test "issue-454: regex \\b word boundary matches whole-word, not literal 'b'" { + // \b is a word-boundary assertion: should match "foo" as a whole word + // but not when it appears as a substring inside another word. 
+ try testing.expect(regexMatch("foo bar", "\\bfoo\\b")); + try testing.expect(!regexMatch("foobar", "\\bfoo\\b")); + // Whole-word "bar" at end + try testing.expect(regexMatch("foo bar", "\\bbar\\b")); + try testing.expect(!regexMatch("foobarbaz", "\\bbar\\b")); +} + + test "explorer: searchContentRegex end-to-end" { var explorer_inst = Explorer.init(testing.allocator); defer explorer_inst.deinit(); diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/.gitignore b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/.gitignore new file mode 100644 index 0000000..93cfddf --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/.gitignore @@ -0,0 +1,12 @@ +# Zig build artifacts +.zig-cache/ +zig-cache/ +zig-out/ + +# macOS +.DS_Store + +# Editor +*.swp +.vscode/ +.idea/ diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/LICENSE b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/LICENSE new file mode 100644 index 0000000..7b09d8b --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Rach Pradhan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/README.md b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/README.md new file mode 100644 index 0000000..14830ed --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/README.md @@ -0,0 +1,136 @@ +# nanoregex + +Small, fast, pure-Zig regex engine. Built to be a drop-in replacement for `zig-regex 0.1.1` with substantially better performance on real workloads. + +3,200 lines of Zig. No FFI. No external dependencies. 27/27 parity fixtures green against Python's `re` module. + +## Benchmarks + +Search across a 142 KB Zig source file, 200 iterations, ReleaseFast. + +| Pattern | Python `re` | **nanoregex** | Winner | +|---|---|---|---| +| pure literal `compileAllocFlags` | 0.067ms | **0.037ms** | nanoregex 1.8× | +| literal-prefix `compileAllocFlags\([a-z]+` | 0.061ms | **0.037ms** | nanoregex 1.65× | +| `fn [A-Za-z]+\(.*alloc` | **0.061ms** | 0.092ms | python 1.5× | +| `\d+` (1307 matches) | 0.991ms | **0.455ms** | nanoregex 2.2× | +| `[a-z]+` (17711 matches) | 1.654ms | **0.594ms** | nanoregex 2.8× | +| alt `foo\|bar\|baz` | 0.731ms | **0.423ms** | nanoregex 1.7× | +| IPv4-ish `\d+\.\d+\.\d+\.\d+` | 0.904ms | **0.416ms** | nanoregex 2.2× | + +8 of 8 head-to-head non-anchored patterns won. Versus `zig-regex 0.1.1` on a pattern that triggers catastrophic backtracking, nanoregex is ~5000× faster (43 seconds → 8 milliseconds). 
+ +## Architecture + +Layered, with five dispatch tiers that compose at compile time: + +``` +parser.zig pattern bytes → AST +ast.zig AST node tagged union, arena-owned +nfa.zig AST → Thompson NFA +exec.zig Pike-VM simulation (always-correct fallback) +dfa.zig Lazy subset-construction DFA (perf path) +minterm.zig Byte-class compression for the DFA's transition table +prefilter.zig Literal-prefix / required-substring extraction +root.zig Public API + dispatch +``` + +`findAll` and `search` route to the cheapest engine that can correctly handle a given pattern: + +1. **Pure-literal pattern** → `std.mem.indexOf` loop (memmem) +2. **Required-literal absent** → return empty (no engine work at all) +3. **Literal-prefix + DFA-eligible** → `indexOfPos` to candidate starts, DFA at each hit +4. **DFA-eligible** → plain lazy DFA +5. **Otherwise** → Pike VM + +DFA-eligible means: no capture groups, no anchors (`^`, `$`, `\b`), no lazy quantifiers, not case-insensitive, and the on-demand DFA stays under the 4096-state budget. Everything that doesn't fit those rules takes the Pike-VM path, which is linear-time and correct on every input. + +Bytes are folded to **minterm classes** before indexing the DFA's transition table. A pattern with `[a-z]+` reduces 256 bytes to 2 classes (in-set, out-of-set), shrinking the per-state row from 1 KB to 8 bytes and letting the whole transition table live in L1 cache. 
+ +## API + +Mirrors `zig-regex 0.1.1` enough that most callers can switch by changing one path in `build.zig`: + +```zig +const nanoregex = @import("nanoregex"); + +var r = try nanoregex.Regex.compile(allocator, "(\\w+)@(\\w+)"); +defer r.deinit(); + +const matches = try r.findAll(allocator, "alice@example bob@host"); +defer { + for (matches) |*m| @constCast(m).deinit(allocator); + allocator.free(matches); +} +for (matches) |m| { + std.debug.print("{d}..{d}\n", .{ m.span.start, m.span.end }); +} +``` + +Methods take `*Regex` (mutable) rather than `*const Regex` because the lazy DFA fills its transition table on the fly. The first `findAll` call on a fresh `Regex` warms the cache; subsequent calls are pure table lookups. + +Compile flags: + +```zig +try nanoregex.Regex.compileWithFlags(alloc, pattern, .{ + .case_insensitive = false, + .multiline = true, // grep-like default — `^`/`$` match line edges + .dot_all = false, +}); +``` + +Backreference expansion in `replaceAll` (`\1`, `\2`, ...): + +```zig +const out = try r.replaceAll(alloc, "alice@example", "\\2/\\1"); +// → "example/alice" +``` + +## Supported syntax (v1) + +- Literals, `.`, character classes `[abc]` / `[^abc]` / `[a-z]` +- Shorthand `\d \D \w \W \s \S` +- Quantifiers `? * + {n} {n,m}` — greedy and lazy (`*?`, `+?`, `??`, `{n,m}?`) +- Groups `(foo)` capturing, `(?:foo)` non-capturing +- Alternation `foo|bar` +- Anchors `^ $ \b \B \A \z` +- Flags: case-insensitive, multiline, dot-all + +**Not yet supported**: backreferences in *patterns* (`\1` inside the regex itself), lookaround `(?=...)`/`(?!...)`, inline flag groups `(?i)...`, named groups `(?P...)`, Unicode property classes. Patterns using these features parse OK if the syntax shape is recognised, but matching may diverge — fall back to a richer engine if you need them. 
+ +## Build + +```bash +zig build install -Doptimize=ReleaseFast +# → zig-out/bin/nanoregex_probe (parity test CLI) +# → zig-out/bin/nanoregex_bench (single-file benchmark) +``` + +## Tests + +Tests are split into narrow per-module steps so the inner loop stays tight: + +```bash +zig build test-ast # 3 tests +zig build test-parser # parser + ast tests +zig build test-nfa # nfa + parser + ast +zig build test-exec # Pike VM tests +zig build test-prefilter # literal-extraction tests +zig build test-minterm # byte-class compression +zig build test-dfa # DFA construction + matching +zig build test-root # public API +zig build parity # Python re parity (requires python3) +zig build test-all # everything, explicit and opt-in +``` + +Add `-Dtest-filter='substring'` to any step to narrow further. + +## Why it exists + +This was extracted from the [zigrepper](https://github.com/justrach/zigrepper) toolchain, where `zig-regex 0.1.1`'s backtracking engine was making `zigrep --regex` take 43 seconds on patterns like `compileAllocFlags\([a-z]+` against a directory tree. After this engine landed, the same query finished in 0.43 seconds end-to-end. + +Inspired by Russ Cox's writing on regex implementation, RE2's lazy DFA, and the [RE# / Resharp blog post](https://iev.ee/blog/resharp-how-we-built-the-fastest-regex-in-fsharp/) which laid out minterm compression and several other optimizations cleanly. 
+ +## License + +MIT diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/build.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/build.zig new file mode 100644 index 0000000..36959f1 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/build.zig @@ -0,0 +1,150 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + // Pass-through filter that the user can layer on top of any named step + // to narrow further: zig build test-dfa -Dtest-filter='alternation' + const user_filter = b.option([]const u8, "test-filter", "Narrow test name filter (substring)"); + + const nanoregex_mod = b.addModule("nanoregex", .{ + .root_source_file = b.path("src/root.zig"), + .target = target, + .optimize = optimize, + }); + + const probe = b.addExecutable(.{ + .name = "nanoregex_probe", + .root_module = b.createModule(.{ + .root_source_file = b.path("src/probe.zig"), + .target = target, + .optimize = optimize, + .link_libc = true, + .imports = &.{ .{ .name = "nanoregex", .module = nanoregex_mod } }, + }), + }); + b.installArtifact(probe); + + const bench = b.addExecutable(.{ + .name = "nanoregex_bench", + .root_module = b.createModule(.{ + .root_source_file = b.path("src/bench.zig"), + .target = target, + .optimize = .ReleaseFast, + .link_libc = true, + .imports = &.{ .{ .name = "nanoregex", .module = nanoregex_mod } }, + }), + }); + b.installArtifact(bench); + + // ───────────────────────────────────────────────────────────────── + // Per-module test steps. + // + // The user's preferred iteration loop is to run ONE narrow named step + // at a time, so each module has its own step that compiles a small + // test binary scoped to that source file (+ its imports). There is + // NO aggregate `test` step — `test-all` is explicit and opt-in. 
+ // + // To narrow further: zig build test-dfa -Dtest-filter='alternation' + // ───────────────────────────────────────────────────────────────── + + _ = addTestStep(b, "test-ast", "src/ast.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-parser", "src/parser.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-nfa", "src/nfa.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-exec", "src/exec.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-prefilter", "src/prefilter.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-dfa", "src/dfa.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-minterm", "src/minterm.zig", &.{}, user_filter, target, optimize); + _ = addTestStep(b, "test-root", "src/root.zig", &.{}, user_filter, target, optimize); + + // ── Pre-baked filtered shortcuts ── + // Each named step runs only tests whose name contains one of the + // listed substrings (Zig's --test-filter is OR-of-substrings). + // Compile is cached and shared with the parent module's step. 
+ + _ = addTestStep(b, "test-parser-core", "src/parser.zig", &.{ "literal", "concat", "alternation" }, user_filter, target, optimize); + _ = addTestStep(b, "test-parser-quant", "src/parser.zig", &.{ "quantifier" }, user_filter, target, optimize); + _ = addTestStep(b, "test-parser-class", "src/parser.zig", &.{ "class" }, user_filter, target, optimize); + _ = addTestStep(b, "test-parser-group", "src/parser.zig", &.{ "group" }, user_filter, target, optimize); + _ = addTestStep(b, "test-parser-error", "src/parser.zig", &.{ "errors" }, user_filter, target, optimize); + + _ = addTestStep(b, "test-nfa-basic", "src/nfa.zig", &.{ "literal", "concat" }, user_filter, target, optimize); + _ = addTestStep(b, "test-nfa-quant", "src/nfa.zig", &.{ "star", "plus", "question", "counted" }, user_filter, target, optimize); + _ = addTestStep(b, "test-nfa-alt", "src/nfa.zig", &.{ "alt" }, user_filter, target, optimize); + _ = addTestStep(b, "test-nfa-group", "src/nfa.zig", &.{ "group" }, user_filter, target, optimize); + + _ = addTestStep(b, "test-exec-basic", "src/exec.zig", &.{ "literal", "no match" }, user_filter, target, optimize); + _ = addTestStep(b, "test-exec-quant", "src/exec.zig", &.{ "greedy", "lazy", "counted", "optional" }, user_filter, target, optimize); + _ = addTestStep(b, "test-exec-class", "src/exec.zig", &.{ "class", "digit", "word" }, user_filter, target, optimize); + _ = addTestStep(b, "test-exec-group", "src/exec.zig", &.{ "group" }, user_filter, target, optimize); + _ = addTestStep(b, "test-exec-anchor", "src/exec.zig", &.{ "anchor", "boundary" }, user_filter, target, optimize); + + _ = addTestStep(b, "test-dfa-rejects", "src/dfa.zig", &.{ "rejects" }, user_filter, target, optimize); + _ = addTestStep(b, "test-dfa-match", "src/dfa.zig", &.{ "literal", "plus", "alt", "class", "wildcard", "longest" }, user_filter, target, optimize); + + _ = addTestStep(b, "test-prefilter-full", "src/prefilter.zig", &.{ "full literal" }, user_filter, target, optimize); + _ = 
addTestStep(b, "test-prefilter-required", "src/prefilter.zig", &.{ "required literal" }, user_filter, target, optimize); + + // ───────────────────────────────────────────────────────────────── + // Aggregate sweeps — explicit, opt-in. Run these AFTER you're done + // iterating, not in the inner loop. + // ───────────────────────────────────────────────────────────────── + + const ast_all = addTestStep(b, "_test-ast-all", "src/ast.zig", &.{}, user_filter, target, optimize); + const parser_all = addTestStep(b, "_test-parser-all", "src/parser.zig", &.{}, user_filter, target, optimize); + const nfa_all = addTestStep(b, "_test-nfa-all", "src/nfa.zig", &.{}, user_filter, target, optimize); + const exec_all = addTestStep(b, "_test-exec-all", "src/exec.zig", &.{}, user_filter, target, optimize); + const prefilter_all = addTestStep(b, "_test-prefilter-all", "src/prefilter.zig", &.{}, user_filter, target, optimize); + const dfa_all = addTestStep(b, "_test-dfa-all", "src/dfa.zig", &.{}, user_filter, target, optimize); + const root_all = addTestStep(b, "_test-root-all", "src/root.zig", &.{}, user_filter, target, optimize); + + const test_all = b.step("test-all", "Run ALL unit tests across every module (slow — use named steps in the inner loop)"); + test_all.dependOn(ast_all); + test_all.dependOn(parser_all); + test_all.dependOn(nfa_all); + test_all.dependOn(exec_all); + test_all.dependOn(prefilter_all); + test_all.dependOn(dfa_all); + test_all.dependOn(root_all); + + // ── Parity vs Python re (separate; never auto-runs) ── + const parity_cmd = b.addSystemCommand(&.{"bash"}); + parity_cmd.addFileArg(b.path("tests/parity/run.sh")); + parity_cmd.addFileArg(probe.getEmittedBin()); + parity_cmd.addDirectoryArg(b.path("tests/parity/fixtures")); + const parity_step = b.step("parity", "Run Python-re parity tests (separate; opt-in)"); + parity_step.dependOn(&parity_cmd.step); +} + +/// Build one focused test step. 
`filters` is OR'd substring matching +/// (test runs iff its name contains at least one of the filters, or all +/// pass when the list is empty). The user-level -Dtest-filter is appended +/// so a named step can be narrowed further from the command line. +fn addTestStep( + b: *std.Build, + step_name: []const u8, + root_path: []const u8, + base_filters: []const []const u8, + user_filter: ?[]const u8, + target: std.Build.ResolvedTarget, + optimize: std.builtin.OptimizeMode, +) *std.Build.Step { + var filter_list = std.ArrayList([]const u8).empty; + filter_list.appendSlice(b.allocator, base_filters) catch @panic("OOM"); + if (user_filter) |f| filter_list.append(b.allocator, f) catch @panic("OOM"); + + const test_mod = b.createModule(.{ + .root_source_file = b.path(root_path), + .target = target, + .optimize = optimize, + }); + const test_exe = b.addTest(.{ + .root_module = test_mod, + .filters = filter_list.toOwnedSlice(b.allocator) catch @panic("OOM"), + }); + const run_step = b.addRunArtifact(test_exe); + const step = b.step(step_name, b.fmt("Run tests in {s}", .{root_path})); + step.dependOn(&run_step.step); + return step; +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/build.zig.zon b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/build.zig.zon new file mode 100644 index 0000000..3a5dfe3 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/build.zig.zon @@ -0,0 +1,8 @@ +.{ + .name = .nanoregex, + .version = "0.0.1", + .fingerprint = 0xc8e46b5d7121d911, + .minimum_zig_version = "0.16.0", + .dependencies = .{}, + .paths = .{""}, +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/ast.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/ast.zig new file mode 100644 index 0000000..f68cfce --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/ast.zig @@ -0,0 +1,186 @@ +//! 
Regex AST. Built by parser.zig, consumed by nfa.zig (later). +//! +//! All nodes are arena-owned by the parent Regex. The arena is freed in one +//! shot on Regex.deinit, so individual nodes never need their own deinit. +//! Child references use `*const Node` rather than slices to keep the union +//! tag size predictable and to avoid sub-allocations for unary nodes. + +const std = @import("std"); + +pub const Node = union(enum) { + /// A single literal byte. Case-folding (when flags.case_insensitive) is + /// handled at match time so the AST stays case-preserving — useful for + /// reporting in error messages. + literal: u8, + + /// `.` — matches any byte except `\n`, or any byte at all when the + /// pattern was compiled with `flags.dot_all`. The matcher consults the + /// flag; the AST node itself is flag-agnostic. + dot, + + /// `[abc]`, `[^abc]`, `[a-z]`, plus the shorthands `\d \D \w \W \s \S` + /// which the parser desugars into a Class node with the right bitmap. + class: *const Class, + + /// Zero-width anchors: `^ $ \b \B \A \z`. + anchor: Anchor, + + /// A concatenation `abc` — match each sub-node in order. + concat: []const *const Node, + + /// Alternation `a|b|c` — match any one branch (left to right, first + /// wins under leftmost-first semantics, matching Python re). + alt: []const *const Node, + + /// Quantified sub-pattern: `a*`, `a+`, `a?`, `a{n,m}`. + repeat: *const Repeat, + + /// `(foo)` or `(?:foo)`. The group's `index` is 0 for non-capturing, + /// 1..N for capturing groups in left-paren order. Index 0 (the whole + /// match) is implicit at the top level — not a Group node. + group: *const Group, +}; + +pub const Anchor = enum { + /// `^` — start of input, or start of any line in multiline mode. + line_start, + /// `$` — end of input, or end of any line in multiline mode. + line_end, + /// `\b` — boundary between a word char (`[A-Za-z0-9_]`) and not. + word_boundary, + /// `\B` — anywhere `\b` doesn't match. 
+    non_word_boundary,
+    /// `\A` — start of input (ignores multiline flag).
+    string_start,
+    /// `\z` — end of input (ignores multiline flag).
+    string_end,
+};
+
+/// Byte set stored as a 256-bit bitmap: bit `byte % 8` of `bitmap[byte / 8]`
+/// is set iff `byte` is a member. `negate` flips every bit, so a negated
+/// class is still answered by the same single bit test in `contains`.
+pub const Class = struct {
+    bitmap: [32]u8,
+
+    /// A class with no members.
+    pub fn empty() Class {
+        return .{ .bitmap = [_]u8{0} ** 32 };
+    }
+
+    /// Add `byte` to the class.
+    pub fn set(self: *Class, byte: u8) void {
+        const bit: u3 = @truncate(byte);
+        self.bitmap[byte >> 3] |= @as(u8, 1) << bit;
+    }
+
+    /// Add every byte of the inclusive range `lo..hi`. A reversed range
+    /// (`lo > hi`) adds nothing.
+    pub fn setRange(self: *Class, lo: u8, hi: u8) void {
+        if (lo > hi) return;
+        var byte = lo;
+        while (true) : (byte += 1) {
+            self.set(byte);
+            // Leave before the increment so `hi == 0xff` cannot overflow.
+            if (byte == hi) break;
+        }
+    }
+
+    /// True iff `byte` is a member.
+    pub fn contains(self: *const Class, byte: u8) bool {
+        const bit: u3 = @truncate(byte);
+        return (self.bitmap[byte >> 3] & (@as(u8, 1) << bit)) != 0;
+    }
+
+    /// Complement the class in place.
+    pub fn negate(self: *Class) void {
+        for (&self.bitmap) |*slot| slot.* = ~slot.*;
+    }
+};
+
+pub const Repeat = struct {
+    sub: *const Node,
+    min: u32,
+    /// `std.math.maxInt(u32)` represents unbounded (`*` and `+`).
+    max: u32,
+    /// True for `*`/`+`/`?`/`{n,m}`, false for the lazy `??`/`*?`/`+?`/`{n,m}?`
+    /// variants. Greedy is the Python re default.
+    greedy: bool,
+};
+
+pub const Group = struct {
+    sub: *const Node,
+    /// 0 = non-capturing; ≥1 = capture index in left-paren declaration order.
+    index: u32,
+    capturing: bool,
+};
+
+// ── Test helpers ──
+
+/// Pretty-print an AST for debugging and tests. Indents to make tree shape
+/// visible. The format is stable enough to assert against in tests.
+/// Writes into an ArrayList(u8) rather than a std.io.Writer because Zig 0.16
+/// reworked the writer interface and we don't want to chase the new shape
+/// from a leaf debug helper.
+pub fn debugWrite(node: *const Node, buf: *std.ArrayList(u8), alloc: std.mem.Allocator, indent: u32) error{OutOfMemory}!void {
+    // One indent unit per nesting level.
+    for (0..indent) |_| try buf.appendSlice(alloc, " ");
+
+    // First the node's own header line. Every formatted header fits in
+    // `scratch`, which is why the bufPrint failures are unreachable.
+    var scratch: [128]u8 = undefined;
+    const header: []const u8 = switch (node.*) {
+        .literal => |byte| std.fmt.bufPrint(&scratch, "literal '{c}'\n", .{byte}) catch unreachable,
+        .dot => "dot\n",
+        .anchor => |kind| std.fmt.bufPrint(&scratch, "anchor {s}\n", .{@tagName(kind)}) catch unreachable,
+        .class => |cls| blk: {
+            var members: u32 = 0;
+            for (cls.bitmap) |word| members += @popCount(word);
+            break :blk std.fmt.bufPrint(&scratch, "class [{d} bytes]\n", .{members}) catch unreachable;
+        },
+        .concat => "concat\n",
+        .alt => "alt\n",
+        .repeat => |rep| std.fmt.bufPrint(&scratch, "repeat min={d} max={d} greedy={}\n", .{ rep.min, rep.max, rep.greedy }) catch unreachable,
+        .group => |grp| std.fmt.bufPrint(&scratch, "group #{d} cap={}\n", .{ grp.index, grp.capturing }) catch unreachable,
+    };
+    try buf.appendSlice(alloc, header);
+
+    // Then the children, one level deeper.
+    switch (node.*) {
+        .literal, .dot, .anchor, .class => {},
+        .concat, .alt => |children| for (children) |child| try debugWrite(child, buf, alloc, indent + 1),
+        .repeat => |rep| try debugWrite(rep.sub, buf, alloc, indent + 1),
+        .group => |grp| try debugWrite(grp.sub, buf, alloc, indent + 1),
+    }
+}
+
+test "class bitmap set/contains" {
+    var c = Class.empty();
+    c.set('a');
+    c.set('z');
+    try std.testing.expect(c.contains('a'));
+    try std.testing.expect(c.contains('z'));
+    try std.testing.expect(!c.contains('b'));
+    try std.testing.expect(!c.contains('y'));
+}
+
+test "class range" {
+    var c = Class.empty();
+    c.setRange('a', 'd');
+    try std.testing.expect(c.contains('a'));
+    try std.testing.expect(c.contains('b'));
+    try
std.testing.expect(c.contains('c'));
+    try std.testing.expect(c.contains('d'));
+    try std.testing.expect(!c.contains('e'));
+    try std.testing.expect(!c.contains('`'));
+}
+
+test "class negate" {
+    var c = Class.empty();
+    c.setRange('a', 'z');
+    c.negate();
+    try std.testing.expect(!c.contains('a'));
+    try std.testing.expect(!c.contains('z'));
+    try std.testing.expect(c.contains('A'));
+    try std.testing.expect(c.contains('0'));
+}
diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/bench.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/bench.zig
new file mode 100644
index 0000000..e9715b4
--- /dev/null
+++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/bench.zig
@@ -0,0 +1,145 @@
+//! Single-file regex benchmark.
+//!
+//! Wire shape: nanoregex_bench PATTERN PATH [iters]
+//!
+//! Compiles PATTERN once, reads PATH into memory once, then runs `r.findAll`
+//! against it `iters` times (default 20). Prints mean per-iteration time +
+//! match count so the comparison script can diff us against python re and
+//! zig-regex on the exact same input.
+//!
+//! We measure findAll only — not parse/compile — because the latter is a
+//! one-shot cost the user pays once per CLI invocation, while findAll
+//! dominates real workloads (walks across thousands of files).
+
+const std = @import("std");
+const nanoregex = @import("nanoregex");
+
+extern "c" fn write(fd: c_int, ptr: [*]const u8, len: usize) isize;
+
+/// Best-effort write of all of `data` to `fd`; gives up silently on the
+/// first failed or zero-length write.
+fn writeAll(fd: c_int, data: []const u8) void {
+    var rem = data;
+    while (rem.len > 0) {
+        const n = write(fd, rem.ptr, rem.len);
+        if (n <= 0) return;
+        rem = rem[@intCast(n)..];
+    }
+}
+
+/// Benchmark entry point. Arguments: PATTERN PATH [iters].
+/// Exits with status 2 on a usage error and 1 on a read, parse or engine
+/// error; otherwise prints one summary line to stdout.
+pub fn main(init: std.process.Init) !void {
+    const alloc = init.gpa;
+
+    var args_list: std.ArrayList([]const u8) = .empty;
+    defer args_list.deinit(alloc);
+    var args_iter = init.minimal.args.iterate();
+    while (args_iter.next()) |a| try args_list.append(alloc, a);
+    const args = args_list.items;
+    if (args.len < 3) {
+        // Both the pattern and the path are mandatory; say so.
+        writeAll(2, "usage: nanoregex_bench PATTERN PATH [iters]\n");
+        std.process.exit(2);
+    }
+
+    const pattern = args[1];
+    const path = args[2];
+    // An unparsable iteration count falls back to the default of 20.
+    const iters: usize = if (args.len >= 4)
+        std.fmt.parseInt(usize, args[3], 10) catch 20
+    else
+        20;
+
+    // Read the whole file. Using libc fopen+fread to avoid the std.fs API
+    // churn in 0.16 — this binary is throwaway so the simplest path wins.
+    const data = readFile(alloc, path) catch |err| {
+        var tmp: [256]u8 = undefined;
+        const msg = std.fmt.bufPrint(&tmp, "read error: {s}\n", .{@errorName(err)}) catch "read error\n";
+        writeAll(2, msg);
+        std.process.exit(1);
+    };
+    defer alloc.free(data);
+
+    var r = nanoregex.Regex.compile(alloc, pattern) catch |err| {
+        var tmp: [256]u8 = undefined;
+        const msg = std.fmt.bufPrint(&tmp, "parse error: {s}\n", .{@errorName(err)}) catch "parse error\n";
+        writeAll(2, msg);
+        std.process.exit(1);
+    };
+    defer r.deinit();
+
+    var total_ns: u128 = 0;
+    var match_count: usize = 0;
+
+    // One untimed warm-up so JIT-like effects don't bias the first sample.
+    {
+        const ms = r.findAll(alloc, data) catch {
+            writeAll(2, "engine error during warm-up\n");
+            std.process.exit(1);
+        };
+        match_count = ms.len;
+        for (ms) |*m| @constCast(m).deinit(alloc);
+        alloc.free(ms);
+    }
+
+    var iter: usize = 0;
+    while (iter < iters) : (iter += 1) {
+        // Only findAll sits between the two clock reads; freeing the
+        // results is deliberately left outside the timed window.
+        const start_ns = nowNs();
+        const ms = r.findAll(alloc, data) catch {
+            writeAll(2, "engine error in timed loop\n");
+            std.process.exit(1);
+        };
+        const end_ns = nowNs();
+        total_ns += @intCast(end_ns - start_ns);
+        match_count = ms.len;
+        for (ms) |*m| @constCast(m).deinit(alloc);
+        alloc.free(ms);
+    }
+
+    // `iters == 0` runs only the warm-up; report 0 instead of dividing by
+    // zero and printing NaN.
+    const mean_ms: f64 = if (iters == 0)
+        0.0
+    else
+        @as(f64, @floatFromInt(total_ns)) / @as(f64, @floatFromInt(iters)) / 1_000_000.0;
+
+    var out_buf: [256]u8 = undefined;
+    const line = std.fmt.bufPrint(&out_buf, "nanoregex: matches={d} mean={d:.3}ms ({d}KB, {d} iters)\n", .{
+        match_count,
+        mean_ms,
+        data.len / 1024,
+        iters,
+    }) catch return;
+    writeAll(1, line);
+}
+
+const Timespec = extern struct { tv_sec: i64, tv_nsec: i64 };
+extern "c" fn clock_gettime(clk: c_int, ts: *Timespec) c_int;
+// The numeric clock id is platform-specific: Linux defines CLOCK_MONOTONIC
+// as 1 (6 is CLOCK_MONOTONIC_COARSE there, too coarse for a benchmark),
+// while Darwin defines it as 6. Every other target keeps the original 6.
+const CLOCK_MONOTONIC: c_int = switch (@import("builtin").os.tag) {
+    .linux => 1,
+    else => 6,
+};
+
+/// Monotonic timestamp in nanoseconds. The clock_gettime status is ignored;
+/// on failure the zero-initialised timespec yields 0.
+fn nowNs() i128 {
+    var ts: Timespec = .{ .tv_sec = 0, .tv_nsec = 0 };
+    _ = clock_gettime(CLOCK_MONOTONIC, &ts);
+    return @as(i128, ts.tv_sec) * 1_000_000_000 + ts.tv_nsec;
+}
+
+extern "c" fn fopen(path: [*:0]const u8, mode: [*:0]const u8) ?*anyopaque;
+extern "c" fn fclose(stream: *anyopaque) c_int;
+extern "c" fn fread(ptr: [*]u8, size: usize, n: usize, stream: *anyopaque) usize;
+extern "c" fn fseek(stream: *anyopaque, offset: c_long, whence: c_int) c_int;
+extern "c" fn ftell(stream: *anyopaque) c_long;
+const SEEK_END: c_int = 2;
+const SEEK_SET: c_int = 0;
+
+/// Read the whole file at `path` into a buffer allocated from `alloc`.
+/// Caller owns the returned slice. Fails with PathTooLong (path does not
+/// fit the 4096-byte NUL-terminated buffer), OpenFailed, SeekFailed,
+/// SizeFailed, ReadShort, or OutOfMemory.
+fn readFile(alloc: std.mem.Allocator, path: []const u8) ![]u8 {
+    // libc wants a NUL-terminated path; copy into a stack buffer.
+    var path_buf: [4096]u8 = undefined;
+    if (path.len >= path_buf.len) return error.PathTooLong;
+    @memcpy(path_buf[0..path.len], path);
+    path_buf[path.len] = 0;
+    const path_z: [*:0]const u8 = @ptrCast(&path_buf);
+
+    const f = fopen(path_z, "rb") orelse return error.OpenFailed;
+    defer _ = fclose(f);
+
+    // Size the file by seeking to the end, then rewind for the read.
+    if (fseek(f, 0, SEEK_END) != 0) return error.SeekFailed;
+    const size_raw = ftell(f);
+    if (size_raw < 0) return error.SizeFailed;
+    const size: usize = @intCast(size_raw);
+    if (fseek(f, 0, SEEK_SET) != 0) return error.SeekFailed;
+
+    const buf = try alloc.alloc(u8, size);
+    errdefer alloc.free(buf);
+    const n = fread(buf.ptr, 1, size, f);
+    if (n != size) return error.ReadShort;
+    return buf;
+}
diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/dfa.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/dfa.zig
new file mode 100644
index 0000000..8e59901
--- /dev/null
+++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/dfa.zig
@@ -0,0 +1,457 @@
+//! Lazy DFA built by subset construction over the Thompson NFA.
+//!
+//! Each DFA state is a sorted set of NFA state IDs. Transitions are computed
+//! on demand: when (state, byte) is first encountered we run `step`, hash
+//! the resulting NFA-state set, look it up (or create a new DFA state for
+//! it), and cache the edge. Subsequent bytes through the same transition
+//! are a single indexed lookup.
+//!
+//! Scope for v1:
+//! - No capture-group tracking (caller must check `nfa.n_groups == 0`).
+//! - No anchors (caller must check the AST for `Anchor` nodes; if present,
+//!   fall back to the Pike VM).
+//! - Bounded state count (MAX_STATES). On overflow we surface an error
+//!   and the caller falls back to the Pike VM.
+//!
+//! The forward-only driver below handles unanchored search by trying each
+//! starting position. Per-character cost is one table lookup, so this is
+//! O(n) for cases where matches don't overlap heavily.
+ +const std = @import("std"); +const ast = @import("ast.zig"); +const nfa = @import("nfa.zig"); +const minterm = @import("minterm.zig"); + +pub const DfaStateId = u32; +pub const DEAD: DfaStateId = std.math.maxInt(DfaStateId); +const UNCOMPUTED: DfaStateId = std.math.maxInt(DfaStateId) - 1; + +const MAX_STATES: u32 = 4096; + +pub const Error = error{ OutOfMemory, TooManyStates, HasCaptures, HasAnchors, HasLazyQuantifier }; + +/// Knobs the runtime needs the DFA to bake in at construction time — +/// flags whose meaning isn't visible from the NFA alone. (case_insensitive +/// is intentionally absent: v1 falls back to the Pike VM when CI is set.) +pub const BuildOptions = struct { + /// Forwarded from the public Flags.dot_all. When true, `.` matches `\n`. + dot_all: bool = false, +}; + +/// Set of NFA states reached after a particular byte sequence. Sorted, +/// deduplicated; the byte representation is the hash-map key. +const NfaSet = struct { + ids: []const nfa.StateId, + accepts: bool, +}; + +/// Hash-map context that compares NfaSet identity by content of `ids`. +/// We key on the raw byte slice of the sorted ids — same length, same +/// bytes, same set. +const SetMapCtx = struct { + pub fn hash(_: SetMapCtx, key: []const u8) u64 { + return std.hash.Wyhash.hash(0, key); + } + pub fn eql(_: SetMapCtx, a: []const u8, b: []const u8) bool { + if (a.len != b.len) return false; + return std.mem.eql(u8, a, b); + } +}; + +const SetMap = std.HashMap([]const u8, DfaStateId, SetMapCtx, std.hash_map.default_max_load_percentage); + +pub const Dfa = struct { + /// Arena for state sets and transition rows. Lives until deinit. + arena: *std.heap.ArenaAllocator, + parent_alloc: std.mem.Allocator, + nfa_ref: *const nfa.Nfa, + + states: std.ArrayList(NfaSet), + /// Flat 2-D transition table indexed by `state * minterm.n_classes + class_id`. 
+ /// Far smaller than 256-per-row when the pattern's atomic predicates + /// partition the alphabet into a handful of equivalence classes — + /// `[a-z]+` ends up with 2 classes, a 128× shrink. + transitions: []DfaStateId, + /// Resolves byte → class id. Built once from the AST at compile time. + minterm: minterm.Table, + set_to_id: SetMap, + + start: DfaStateId, + /// Whether `.` should match `\n`. Threaded in from the public Flags + /// at compile time so the inner-loop test stays branch-cheap. + dot_all: bool, + + pub fn fromNfa(alloc: std.mem.Allocator, n: *const nfa.Nfa, root: *const ast.Node, opts: BuildOptions) Error!Dfa { + if (n.n_groups != 0) return Error.HasCaptures; + if (containsAnchor(root)) return Error.HasAnchors; + // Lazy quantifiers need leftmost-shortest semantics, which a + // plain subset-construction DFA cannot express — it always picks + // leftmost-longest. Bail and let the Pike VM handle the pattern. + if (containsLazy(root)) return Error.HasLazyQuantifier; + + const arena = try alloc.create(std.heap.ArenaAllocator); + errdefer alloc.destroy(arena); + arena.* = std.heap.ArenaAllocator.init(alloc); + errdefer arena.deinit(); + const aa = arena.allocator(); + + // Compute byte equivalence classes from the pattern so the + // transition table can be indexed by class instead of raw byte. + // Typical pattern → 4-20 classes, so the row shrinks 12-64× and + // fits in L1 instead of L2. + const mt = try minterm.build(aa, root, opts.dot_all); + + const transitions = try aa.alloc(DfaStateId, @as(usize, MAX_STATES) * mt.n_classes); + @memset(transitions, UNCOMPUTED); + + var dfa: Dfa = .{ + .arena = arena, + .parent_alloc = alloc, + .nfa_ref = n, + .states = .empty, + .transitions = transitions, + .minterm = mt, + .set_to_id = SetMap.init(aa), + .start = 0, + .dot_all = opts.dot_all, + }; + + // Seed: the start DFA state is the epsilon-closure of {nfa.start}. 
+        const seed = try epsilonClosure(aa, n, &.{n.start});
+        dfa.start = try dfa.internState(seed);
+
+        return dfa;
+    }
+
+    /// Release the arena and the heap-allocated arena object itself, then
+    /// poison `self` so later use is caught in safe builds.
+    pub fn deinit(self: *Dfa) void {
+        self.arena.deinit();
+        self.parent_alloc.destroy(self.arena);
+        self.* = undefined;
+    }
+
+    /// Insert a state set into the DFA, returning either a fresh id or the
+    /// existing one. Works on an arena-owned copy of `ids`, which is sorted
+    /// and deduplicated before hashing; the caller's slice is not modified.
+    /// Fails with `TooManyStates` once `MAX_STATES` distinct sets exist.
+    fn internState(self: *Dfa, ids: []const nfa.StateId) Error!DfaStateId {
+        // Sort + dedupe — caller is allowed to pass an unsorted set.
+        const dup = try self.arena.allocator().dupe(nfa.StateId, ids);
+        std.mem.sort(nfa.StateId, dup, {}, comptime std.sort.asc(nfa.StateId));
+        const deduped = uniqueSorted(dup);
+
+        // The map key is the raw bytes of the canonical (sorted, unique) id
+        // list, so equal sets always hash and compare equal.
+        const key_bytes = std.mem.sliceAsBytes(deduped);
+        if (self.set_to_id.get(key_bytes)) |existing| return existing;
+
+        if (self.states.items.len >= MAX_STATES) return Error.TooManyStates;
+
+        // A DFA state accepts iff its set contains the NFA accept state.
+        var accepts = false;
+        for (deduped) |id| if (id == self.nfa_ref.accept) {
+            accepts = true;
+            break;
+        };
+
+        const id: DfaStateId = @intCast(self.states.items.len);
+        try self.states.append(self.arena.allocator(), .{ .ids = deduped, .accepts = accepts });
+        try self.set_to_id.put(key_bytes, id);
+        return id;
+    }
+
+    /// Public wrapper around the inlined hot-path transition. Mostly used
+    /// by tests; the matching loop in `matchAt` reads `transitions` and
+    /// `byte_to_class` directly to skip the function-call overhead.
+    pub fn transition(self: *Dfa, state: DfaStateId, byte: u8) Error!DfaStateId {
+        const class_id: usize = self.minterm.byte_to_class[byte];
+        const idx = @as(usize, state) * self.minterm.n_classes + class_id;
+        const cached = self.transitions[idx];
+        // A cached entry is either a real successor id or DEAD.
+        if (cached != UNCOMPUTED) return cached;
+        return try self.computeAndCacheTransition(state, class_id);
+    }
+
+    /// Slow path: compute the successor set for `(state, class_id)`, intern
+    /// it as a new DFA state if needed, and cache the edge. Called from the
+    /// hot loops only when the transition is missing.
+    fn computeAndCacheTransition(self: *Dfa, state: DfaStateId, class_id: usize) Error!DfaStateId {
+        const aa = self.arena.allocator();
+        const idx = @as(usize, state) * self.minterm.n_classes + class_id;
+        // One representative byte stands in for the whole byte class.
+        const probe = self.minterm.representatives[class_id];
+        // Copy the id slice header now: `states.items` may be reallocated
+        // by internState below, the arena-owned ids themselves do not move.
+        const source_ids = self.states.items[state].ids;
+
+        var successors: std.ArrayList(nfa.StateId) = .empty;
+        defer successors.deinit(aa);
+
+        for (source_ids) |sid| {
+            const ns = self.nfa_ref.states[sid];
+            const target = ns.out1 orelse continue;
+            const consumes = switch (ns.consume) {
+                .byte => |b| b == probe,
+                .any => self.dot_all or probe != '\n',
+                .class => |cls| cls.contains(probe),
+                .epsilon, .anchor, .group_start, .group_end => false,
+            };
+            if (consumes) try successors.append(aa, target);
+        }
+
+        // No NFA state consumed the byte: the edge leads to the dead state.
+        const next: DfaStateId = if (successors.items.len == 0)
+            DEAD
+        else
+            try self.internState(try epsilonClosure(aa, self.nfa_ref, successors.items));
+
+        self.transitions[idx] = next;
+        return next;
+    }
+
+    /// Anchored match starting at `start` in `input`. Returns the end index
+    /// of the longest accepted run, or null if no match.
+    ///
+    /// The hot loop reads the byte → class table and the transition table
+    /// directly. The slow-path branch (`UNCOMPUTED`) is hoisted out so the
+    /// fast path is a tight series of array reads + one compare. After
+    /// warmup the slow path is essentially never taken, so this trades
+    /// one predicted-not-taken branch for skipping a function frame and
+    /// the redundant idx recomputation that the older `transition` call
+    /// did inside the loop.
+    pub fn matchAt(self: *Dfa, input: []const u8, start: usize) Error!?usize {
+        var cur: DfaStateId = self.start;
+        // An accepting start state is already a zero-width match at `start`.
+        var longest: ?usize = if (self.states.items[cur].accepts) start else null;
+        const byte_to_class = &self.minterm.byte_to_class;
+        const n_classes: usize = self.minterm.n_classes;
+        // `transitions` is a fixed-size buffer allocated once in fromNfa
+        // — capturing its slice is safe. `states.items`, on the other
+        // hand, can be reallocated by computeAndCacheTransition's call
+        // to internState, so we re-read it on the read-back path.
+        const transitions = self.transitions;
+
+        var i = start;
+        while (i < input.len) : (i += 1) {
+            const class_id: usize = byte_to_class[input[i]];
+            const idx = @as(usize, cur) * n_classes + class_id;
+            var next = transitions[idx];
+            if (next == UNCOMPUTED) {
+                // Slow path: fills in transitions[idx] for later visits.
+                next = try self.computeAndCacheTransition(cur, class_id);
+            }
+            // DEAD means no NFA state survives this byte; keep whatever
+            // accept position was recorded so far.
+            if (next == DEAD) break;
+            cur = next;
+            if (self.states.items[cur].accepts) longest = i + 1;
+        }
+        return longest;
+    }
+
+    /// Find every non-overlapping match span in `input`. Tries each
+    /// starting position; on a hit, skips past the match end. Zero-width
+    /// matches advance one byte so we don't loop. The returned slice is
+    /// allocated from `alloc` and owned by the caller.
+    pub fn findAll(self: *Dfa, alloc: std.mem.Allocator, input: []const u8) Error![]Span {
+        var out: std.ArrayList(Span) = .empty;
+        errdefer out.deinit(alloc);
+
+        // `<=` so a zero-width match at the very end of input is reported.
+        var p: usize = 0;
+        while (p <= input.len) {
+            const end_opt = try self.matchAt(input, p);
+            if (end_opt) |end| {
+                try out.append(alloc, .{ .start = p, .end = end });
+                p = if (end > p) end else p + 1;
+            } else {
+                p += 1;
+            }
+        }
+        return try out.toOwnedSlice(alloc);
+    }
+};
+
+pub const Span = struct { start: usize, end: usize };
+
+// ── Helpers ──
+
+/// Compute the epsilon-closure of `seeds`: every NFA state reachable from
+/// the seeds via zero-width transitions (epsilon, group_start, group_end —
+/// not anchor, since we caller-fail when anchors are present).
+fn epsilonClosure(alloc: std.mem.Allocator, n: *const nfa.Nfa, seeds: []const nfa.StateId) Error![]nfa.StateId {
+    // Depth-first walk: `pending` is the work stack, `visited` guarantees
+    // each state is pushed at most once.
+    var pending: std.ArrayList(nfa.StateId) = .empty;
+    defer pending.deinit(alloc);
+    var visited = try alloc.alloc(bool, n.states.len);
+    defer alloc.free(visited);
+    @memset(visited, false);
+
+    var closure: std.ArrayList(nfa.StateId) = .empty;
+    errdefer closure.deinit(alloc);
+
+    for (seeds) |seed| {
+        if (visited[seed]) continue;
+        visited[seed] = true;
+        try pending.append(alloc, seed);
+    }
+
+    while (pending.pop()) |sid| {
+        try closure.append(alloc, sid);
+        if (sid == n.accept) continue;
+        const ns = n.states[sid];
+        switch (ns.consume) {
+            // Zero-width states: follow out1, then out2.
+            .epsilon, .group_start, .group_end => {
+                for ([_]?nfa.StateId{ ns.out1, ns.out2 }) |maybe_next| {
+                    const next = maybe_next orelse continue;
+                    if (visited[next]) continue;
+                    visited[next] = true;
+                    try pending.append(alloc, next);
+                }
+            },
+            // Consuming and anchor states are recorded in the closure but
+            // never expanded.
+            else => {},
+        }
+    }
+
+    return try closure.toOwnedSlice(alloc);
+}
+
+/// Compact an ascending slice in place, dropping adjacent duplicates, and
+/// return the deduplicated prefix.
+fn uniqueSorted(sorted: []nfa.StateId) []nfa.StateId {
+    if (sorted.len == 0) return sorted;
+    var kept: usize = 1;
+    for (sorted[1..]) |id| {
+        if (id == sorted[kept - 1]) continue;
+        sorted[kept] = id;
+        kept += 1;
+    }
+    return sorted[0..kept];
+}
+
+/// True iff the AST contains at least one anchor node.
+fn containsAnchor(node: *const ast.Node) bool {
+    switch (node.*) {
+        .anchor => return true,
+        .literal, .dot, .class => return false,
+        .repeat => |r| return containsAnchor(r.sub),
+        .group => |g| return containsAnchor(g.sub),
+        .concat, .alt => |children| {
+            for (children) |child| {
+                if (containsAnchor(child)) return true;
+            }
+            return false;
+        },
+    }
+}
+
+/// True iff any quantifier in the AST is lazy (`*?`/`+?`/`??`/`{n,m}?`).
+/// The DFA always yields leftmost-longest matches, which contradicts
+/// lazy semantics — we caller-fail so the Pike VM runs instead.
+fn containsLazy(node: *const ast.Node) bool { + return switch (node.*) { + .literal, .dot, .class, .anchor => false, + .concat => |children| for (children) |c| { + if (containsLazy(c)) break true; + } else false, + .alt => |children| for (children) |c| { + if (containsLazy(c)) break true; + } else false, + .repeat => |r| !r.greedy or containsLazy(r.sub), + .group => |g| containsLazy(g.sub), + }; +} + +// ── Tests ── + +const parser = @import("parser.zig"); + +fn buildDfa(arena: *std.heap.ArenaAllocator, pattern: []const u8) !Dfa { + var p = parser.Parser.init(arena.allocator(), pattern); + const root = try p.parseRoot(); + // Heap-allocate the Nfa on the arena so its address is stable for the + // returned Dfa's `nfa_ref`. An earlier version did `&local_const` + // which was a dangling stack pointer the moment buildDfa returned, + // and every test that actually exercised the DFA either crashed or + // returned bogus values from torn-over stack memory. + const automaton_ptr = try arena.allocator().create(nfa.Nfa); + automaton_ptr.* = try nfa.build(arena.allocator(), root, p.n_groups); + return try Dfa.fromNfa(std.testing.allocator, automaton_ptr, root, .{}); +} + +test "dfa: literal pattern matches via findAll" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var dfa = try buildDfa(&arena, "abc"); + defer dfa.deinit(); + const spans = try dfa.findAll(std.testing.allocator, "the abc and abc again"); + defer std.testing.allocator.free(spans); + try std.testing.expectEqual(@as(usize, 2), spans.len); + try std.testing.expectEqual(@as(usize, 4), spans[0].start); + try std.testing.expectEqual(@as(usize, 7), spans[0].end); +} + +test "dfa: greedy plus consumes longest run" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var dfa = try buildDfa(&arena, "\\d+"); + defer dfa.deinit(); + const spans = try dfa.findAll(std.testing.allocator, "abc 42 def 1234 xyz"); + defer 
std.testing.allocator.free(spans); + try std.testing.expectEqual(@as(usize, 2), spans.len); + // "42" — two digits. + try std.testing.expectEqual(@as(usize, 2), spans[0].end - spans[0].start); + // "1234" — four digits. The earlier expectation of `6` was a typo. + try std.testing.expectEqual(@as(usize, 4), spans[1].end - spans[1].start); +} + +test "dfa: alternation" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var dfa = try buildDfa(&arena, "cat|dog|bird"); + defer dfa.deinit(); + const spans = try dfa.findAll(std.testing.allocator, "the cat saw a dog and a bird"); + defer std.testing.allocator.free(spans); + try std.testing.expectEqual(@as(usize, 3), spans.len); +} + +test "dfa: class quantifier" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var dfa = try buildDfa(&arena, "[a-z]+"); + defer dfa.deinit(); + const spans = try dfa.findAll(std.testing.allocator, "Hello World"); + defer std.testing.allocator.free(spans); + try std.testing.expectEqual(@as(usize, 2), spans.len); +} + +test "dfa: rejects capture patterns" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var p = parser.Parser.init(arena.allocator(), "(abc)"); + const root = try p.parseRoot(); + const automaton_ptr = try arena.allocator().create(nfa.Nfa); + automaton_ptr.* = try nfa.build(arena.allocator(), root, p.n_groups); + try std.testing.expectError(Error.HasCaptures, Dfa.fromNfa(std.testing.allocator, automaton_ptr, root, .{})); +} + +test "dfa: rejects anchor patterns" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var p = parser.Parser.init(arena.allocator(), "^foo"); + const root = try p.parseRoot(); + const automaton_ptr = try arena.allocator().create(nfa.Nfa); + automaton_ptr.* = try nfa.build(arena.allocator(), root, p.n_groups); + try std.testing.expectError(Error.HasAnchors, 
Dfa.fromNfa(std.testing.allocator, automaton_ptr, root, .{})); +} + +test "dfa: dot wildcard" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var dfa = try buildDfa(&arena, "a.b"); + defer dfa.deinit(); + const spans = try dfa.findAll(std.testing.allocator, "axb ayb azb"); + defer std.testing.allocator.free(spans); + try std.testing.expectEqual(@as(usize, 3), spans.len); +} + +test "dfa: longest match wins on greedy" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var dfa = try buildDfa(&arena, "a*"); + defer dfa.deinit(); + const spans = try dfa.findAll(std.testing.allocator, "aaa"); + defer std.testing.allocator.free(spans); + // a* should match "aaa" once at position 0, then zero-width at position 3. + try std.testing.expect(spans.len >= 1); + try std.testing.expectEqual(@as(usize, 0), spans[0].start); + try std.testing.expectEqual(@as(usize, 3), spans[0].end); +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/exec.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/exec.zig new file mode 100644 index 0000000..0a2f6b6 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/exec.zig @@ -0,0 +1,469 @@ +//! Pike-VM NFA simulator. +//! +//! Classic two-list simulation: at each input position we hold a `clist` of +//! threads parked at consuming states, then step them on input[pos] into a +//! `nlist`. Zero-width states (epsilon, anchor, group_*) are walked inside +//! `addThread` so they never appear in the active lists — only consuming +//! states (byte / any / class) and the accept state do. +//! +//! Leftmost-first semantics: threads are added by DFS following out1 before +//! out2, so the priority order matches the AST's left-to-right reading. +//! Inside one input position the first thread to reach `accept` wins; lower- +//! 
priority threads at the same position are stopped from advancing. +//! +//! Captures are per-thread arrays of `?Span`. When a thread crosses a +//! group_start / group_end state, we dupe the array first so siblings +//! don't see each other's updates. This is the simplest correct shape; +//! a copy-on-write or generation-tagged store is the obvious v2 win. + +const std = @import("std"); +const ast = @import("ast.zig"); +const nfa = @import("nfa.zig"); + +pub const Flags = struct { + case_insensitive: bool = false, + /// `^` / `$` match line boundaries (default). When false they only match + /// input start / end. + multiline: bool = true, + /// `.` matches `\n`. Default false. + dot_all: bool = false, +}; + +pub const Span = struct { start: usize, end: usize }; + +pub const MatchResult = struct { + span: Span, + /// Slot 0 is the whole match (equal to `span`). Slots 1..n are capture + /// groups by declaration order. A null entry means the group didn't + /// participate in the match. + captures: []const ?Span, + + pub fn deinit(self: *MatchResult, alloc: std.mem.Allocator) void { + alloc.free(self.captures); + self.* = undefined; + } +}; + +const Thread = struct { + pc: nfa.StateId, + captures: []?Span, +}; + +/// Sparse-set thread list. Generation counter avoids per-step clears. +const ThreadList = struct { + threads: std.ArrayList(Thread), + seen_gen: []u32, + cur_gen: u32, + + fn init(alloc: std.mem.Allocator, n_states: usize) !ThreadList { + const seen = try alloc.alloc(u32, n_states); + @memset(seen, 0); + return .{ + .threads = .empty, + .seen_gen = seen, + .cur_gen = 1, + }; + } + + fn deinit(self: *ThreadList, alloc: std.mem.Allocator) void { + self.threads.deinit(alloc); + alloc.free(self.seen_gen); + } + + /// True iff the state was not yet in this generation. Marks it seen. 
/// Thread-list NFA simulator over a compiled automaton.
///
/// The automaton is borrowed and must outlive the Vm. Every `MatchResult`
/// handed back to a caller owns a capture slice allocated from `alloc`;
/// release it with `MatchResult.deinit(alloc)`.
pub const Vm = struct {
    /// Allocator for returned capture slices and for the per-attempt arena.
    alloc: std.mem.Allocator,
    automaton: *const nfa.Nfa,
    flags: Flags,
    /// Total capture-array length: index 0 = whole match, 1..n_groups = explicit.
    cap_len: usize,

    pub fn init(alloc: std.mem.Allocator, automaton: *const nfa.Nfa, flags: Flags) Vm {
        return .{
            .alloc = alloc,
            .automaton = automaton,
            .flags = flags,
            .cap_len = @as(usize, automaton.n_groups) + 1,
        };
    }

    /// Find a single match starting at or after position 0 (whichever
    /// position succeeds first, leftmost-first within that). Returns null
    /// when nothing in the input matches. The caller owns the returned
    /// captures.
    pub fn search(self: *Vm, input: []const u8) ExecError!?MatchResult {
        var start: usize = 0;
        // `<=` so an empty match at end-of-input is still attempted.
        while (start <= input.len) : (start += 1) {
            if (try self.matchAt(input, start)) |m| return m;
        }
        return null;
    }

    /// Find all non-overlapping matches, leftmost-first. Caller owns the
    /// returned slice and each MatchResult's captures. On error nothing is
    /// leaked: every match found so far is released before returning.
    pub fn findAll(self: *Vm, input: []const u8) ExecError![]MatchResult {
        var results: std.ArrayList(MatchResult) = .empty;
        errdefer {
            for (results.items) |*m| m.deinit(self.alloc);
            results.deinit(self.alloc);
        }

        var pos: usize = 0;
        while (pos <= input.len) {
            if (try self.matchAt(input, pos)) |m| {
                results.append(self.alloc, m) catch |err| {
                    // `m` never made it into `results`, so the errdefer
                    // above cannot see it; release its captures here.
                    self.alloc.free(m.captures);
                    return err;
                };
                // Advance past the match. Zero-width match (start == end)
                // must still advance one byte or we'd loop forever.
                pos = if (m.span.end > pos) m.span.end else pos + 1;
            } else {
                pos += 1;
            }
        }
        return results.toOwnedSlice(self.alloc);
    }

    /// Try to match the pattern against `input` starting exactly at `start`.
    /// Returns the leftmost-first (priority-ordered) match, or null when no
    /// thread reaches the accept state. This is not necessarily the longest
    /// match: a lazy quantifier or an earlier alternative wins over a longer
    /// lower-priority one.
    ///
    /// Thread lists and intermediate capture arrays live in a per-attempt
    /// arena that is freed on return. Only the captures of the returned
    /// MatchResult are allocated from `self.alloc`.
    fn matchAt(self: *Vm, input: []const u8, start: usize) ExecError!?MatchResult {
        var arena = std.heap.ArenaAllocator.init(self.alloc);
        defer arena.deinit();
        const aa = arena.allocator();

        var clist = try ThreadList.init(aa, self.automaton.states.len);
        var nlist = try ThreadList.init(aa, self.automaton.states.len);

        const initial_caps = try aa.alloc(?Span, self.cap_len);
        @memset(initial_caps, null);
        initial_caps[0] = .{ .start = start, .end = start };

        try self.addThread(&clist, self.automaton.start, start, initial_caps, input, aa);

        var best: ?MatchResult = null;
        // `best` is allocated from `self.alloc`, not the arena, so it must be
        // released by hand if a later allocation fails.
        errdefer if (best) |*b| b.deinit(self.alloc);

        var pos = start;
        while (true) : (pos += 1) {
            // Scan clist for accept in priority order. Lower-priority
            // threads (after the first accept) are killed for this step;
            // higher-priority threads (before it) keep stepping — they
            // can still reach accept at a later position, and that
            // higher-priority match replaces the recorded one.
            var accept_idx: ?usize = null;
            for (clist.threads.items, 0..) |t, idx| {
                if (t.pc == self.automaton.accept) {
                    const captures = try self.alloc.alloc(?Span, self.cap_len);
                    @memcpy(captures, t.captures);
                    if (captures[0]) |*c0| c0.end = pos;
                    // Free the previous best (recorded at an earlier
                    // position) before replacing it.
                    if (best) |*old| old.deinit(self.alloc);
                    best = .{
                        .span = .{ .start = start, .end = pos },
                        .captures = captures,
                    };
                    accept_idx = idx;
                    break;
                }
            }

            if (pos == input.len) break;
            if (clist.threads.items.len == 0) break;

            // Priority cutoff: only threads with index < accept_idx are
            // allowed to step. When no accept was found this step, all
            // threads step.
            const step_limit = accept_idx orelse clist.threads.items.len;

            for (clist.threads.items[0..step_limit]) |t| {
                if (t.pc == self.automaton.accept) continue;
                const state = self.automaton.states[t.pc];
                switch (state.consume) {
                    .byte => |b| {
                        if (self.byteMatches(b, input[pos])) {
                            if (state.out1) |o| try self.addThread(&nlist, o, pos + 1, t.captures, input, aa);
                        }
                    },
                    .any => {
                        if (self.flags.dot_all or input[pos] != '\n') {
                            if (state.out1) |o| try self.addThread(&nlist, o, pos + 1, t.captures, input, aa);
                        }
                    },
                    .class => |c| {
                        if (self.classMatches(c, input[pos])) {
                            if (state.out1) |o| try self.addThread(&nlist, o, pos + 1, t.captures, input, aa);
                        }
                    },
                    // Zero-width states never reach an active list —
                    // addThread walks past them. Reaching here means an
                    // NFA construction bug.
                    .epsilon, .anchor, .group_start, .group_end => unreachable,
                }
            }

            // Nothing advanced this step. If we've recorded a match, ship it.
            if (nlist.threads.items.len == 0) break;

            clist.clear(aa);
            std.mem.swap(ThreadList, &clist, &nlist);
        }

        return best;
    }

    /// Walk every zero-width state reachable from `pc` and add any consuming
    /// states (or the accept state) into `list`. Captures are duplicated at
    /// every group boundary so sibling threads don't observe each other's
    /// writes. A state already marked in the list's current generation is
    /// skipped, which also stops epsilon cycles from recursing forever.
    fn addThread(
        self: *Vm,
        list: *ThreadList,
        pc: nfa.StateId,
        pos: usize,
        captures: []?Span,
        input: []const u8,
        aa: std.mem.Allocator,
    ) ExecError!void {
        if (!list.markIfNew(pc)) return;

        if (pc == self.automaton.accept) {
            try list.threads.append(aa, .{ .pc = pc, .captures = captures });
            return;
        }

        const state = self.automaton.states[pc];
        switch (state.consume) {
            .epsilon => {
                // out1 first: its threads are appended earlier and so get
                // higher priority than those reached through out2.
                if (state.out1) |o| try self.addThread(list, o, pos, captures, input, aa);
                if (state.out2) |o| try self.addThread(list, o, pos, captures, input, aa);
            },
            .anchor => |a| {
                if (self.anchorMatches(a, input, pos)) {
                    if (state.out1) |o| try self.addThread(list, o, pos, captures, input, aa);
                }
                // Anchor fail: thread dies here.
            },
            .group_start => |idx| {
                const new_caps = try dupeAndSet(aa, captures, idx, .{ .start = pos, .end = pos });
                if (state.out1) |o| try self.addThread(list, o, pos, new_caps, input, aa);
            },
            .group_end => |idx| {
                const new_caps = try dupeAndSetEnd(aa, captures, idx, pos);
                if (state.out1) |o| try self.addThread(list, o, pos, new_caps, input, aa);
            },
            .byte, .any, .class => {
                try list.threads.append(aa, .{ .pc = pc, .captures = captures });
            },
        }
    }

    // ── Predicates ──

    /// Literal-byte test; ASCII case is folded when `case_insensitive` is set.
    fn byteMatches(self: *const Vm, expected: u8, actual: u8) bool {
        if (!self.flags.case_insensitive) return expected == actual;
        return toLower(expected) == toLower(actual);
    }

    /// Class membership test. With `case_insensitive`, an ASCII letter also
    /// matches when its opposite-case form is in the class.
    fn classMatches(self: *const Vm, cls: *const ast.Class, actual: u8) bool {
        if (cls.contains(actual)) return true;
        if (self.flags.case_insensitive) {
            const swapped = if (actual >= 'A' and actual <= 'Z')
                actual + 32
            else if (actual >= 'a' and actual <= 'z')
                actual - 32
            else
                actual;
            if (swapped != actual and cls.contains(swapped)) return true;
        }
        return false;
    }

    /// Zero-width anchor test at `pos`. With `multiline` off, line anchors
    /// only match at the start / end of the input.
    fn anchorMatches(self: *const Vm, a: ast.Anchor, input: []const u8, pos: usize) bool {
        return switch (a) {
            .string_start => pos == 0,
            .string_end => pos == input.len,
            .line_start => pos == 0 or (self.flags.multiline and pos > 0 and input[pos - 1] == '\n'),
            .line_end => pos == input.len or (self.flags.multiline and pos < input.len and input[pos] == '\n'),
            .word_boundary => isAtWordBoundary(input, pos),
            .non_word_boundary => !isAtWordBoundary(input, pos),
        };
    }
};
/// Test helper: release every match's capture slice, then the slice of
/// matches itself. `alloc` must be the allocator both were allocated from.
fn freeMatches(alloc: std.mem.Allocator, matches: []MatchResult) void {
    defer alloc.free(matches);
    var i: usize = 0;
    while (i < matches.len) : (i += 1) {
        matches[i].deinit(alloc);
    }
}
test "exec: anchors line start" {
    const flags = Flags{ .multiline = true };
    const ms = try runFindAllFlags(std.testing.allocator, "^foo", "foo\nbar foo\nfoo bar", flags);
    defer freeMatches(std.testing.allocator, ms);
    // multiline: ^foo matches at offset 0 and at offset 12 (right after the
    // second '\n'). The "foo" at offset 8 follows a space, not a line break,
    // so it must not match.
    try std.testing.expectEqual(@as(usize, 2), ms.len);
    try std.testing.expectEqual(@as(usize, 0), ms[0].span.start);
    try std.testing.expectEqual(@as(usize, 12), ms[1].span.start);
}
test "exec: capturing group" {
    const gpa = std.testing.allocator;
    const found = try runFindAll(gpa, "(\\w+)@(\\w+)", "alice@example bob@host");
    defer freeMatches(gpa, found);
    try std.testing.expectEqual(@as(usize, 2), found.len);

    // First match is "alice@example": group 1 = "alice", group 2 = "example".
    const groups = found[0].captures;
    try std.testing.expect(groups[1] != null);
    try std.testing.expectEqual(@as(usize, 0), groups[1].?.start);
    try std.testing.expectEqual(@as(usize, 5), groups[1].?.end);
    try std.testing.expectEqual(@as(usize, 6), groups[2].?.start);
    try std.testing.expectEqual(@as(usize, 13), groups[2].?.end);
}
test "exec: digit shorthand" {
    const gpa = std.testing.allocator;
    const found = try runFindAll(gpa, "\\d+", "abc 42 def 1234 xyz");
    defer freeMatches(gpa, found);
    // Two digit runs in the input: "42" and "1234".
    try std.testing.expectEqual(@as(usize, 2), found.len);
}
/// Partition the 256 byte values into equivalence classes for the pattern
/// rooted at `root`.
///
/// Two bytes share a class when every collected predicate (literal byte,
/// `.`, character class) gives the same answer for both. `dot_all` decides
/// whether `.` accepts '\n'.
///
/// `arena` is used only for scratch storage, which is released before
/// returning; the returned Table holds no allocations. Fails only with
/// `error.OutOfMemory`.
pub fn build(arena: std.mem.Allocator, root: *const ast.Node, dot_all: bool) Error!Table {
    var preds: std.ArrayList(Predicate) = .empty;
    defer preds.deinit(arena);
    try collectPredicates(root, arena, &preds);

    // Pattern with no consuming atoms — only anchors / empty. Nothing to
    // distinguish; collapse the alphabet to a single class.
    if (preds.items.len == 0) return singleClass();
    // Too many predicates for our 64-bit signature; bail to identity. The
    // matcher still works, the minterm is just a pass-through.
    if (preds.items.len > 64) return identity();

    var sig_to_class = std.AutoHashMap(u64, u16).init(arena);
    defer sig_to_class.deinit();

    var byte_to_class: [256]u8 = undefined;
    // Zero-filled so slots at or beyond `n_classes` hold a defined value,
    // as they do in the tables from singleClass() and identity().
    var representatives = [_]u8{0} ** 256;
    var n_classes: u16 = 0;

    for (0..256) |b| {
        const byte: u8 = @intCast(b);

        // Bit i of the signature records whether predicate i accepts `byte`.
        var sig: u64 = 0;
        for (preds.items, 0..) |pred, i| {
            if (matches(pred, byte, dot_all)) {
                sig |= @as(u64, 1) << @intCast(i);
            }
        }

        // First byte seen with a given signature opens a new class and
        // becomes its representative; later bytes join that class.
        const entry = try sig_to_class.getOrPut(sig);
        if (!entry.found_existing) {
            entry.value_ptr.* = n_classes;
            representatives[n_classes] = byte;
            n_classes += 1;
        }
        byte_to_class[b] = @intCast(entry.value_ptr.*);
    }

    return .{
        .byte_to_class = byte_to_class,
        .n_classes = n_classes,
        .representatives = representatives,
    };
}
/// Pass-through table: every byte is its own class and its own
/// representative, so class ids equal raw byte values.
fn identity() Table {
    var table: Table = .{
        .byte_to_class = undefined,
        .n_classes = 256,
        .representatives = undefined,
    };
    for (&table.byte_to_class, &table.representatives, 0..) |*class_slot, *rep_slot, i| {
        const byte: u8 = @intCast(i);
        class_slot.* = byte;
        rep_slot.* = byte;
    }
    return table;
}
test "minterm: representatives are valid bytes in their class" {
    const table = try buildFor("[a-z]+", false);
    // Round trip: mapping a class's representative byte back through
    // byte_to_class must land on that same class id.
    for (table.representatives[0..table.n_classes], 0..) |rep, class_id| {
        const expected: u16 = @intCast(class_id);
        try std.testing.expectEqual(expected, @as(u16, table.byte_to_class[rep]));
    }
}
/// What a state does when the matcher visits it. `byte`, `any` and `class`
/// each consume exactly one input byte; the remaining variants are
/// zero-width and are followed without advancing the input position.
pub const Consume = union(enum) {
    /// No input consumed; no observation. Used at forks and merges.
    epsilon,
    /// Consume one byte; succeeds iff the input byte equals the payload.
    byte: u8,
    /// Consume one byte; succeeds iff byte matches dot. The matcher consults
    /// its `dot_all` flag to decide whether `\n` is included.
    any,
    /// Consume one byte; succeeds iff the referenced class contains it.
    class: *const ast.Class,
    /// Zero-width anchor — succeeds iff the current position satisfies it.
    anchor: ast.Anchor,
    /// No input consumed. Side effect: mark capture group N start at the
    /// current position. The matcher updates its capture array.
    group_start: u32,
    /// No input consumed. Side effect: mark capture group N end.
    group_end: u32,
};
/// Compile the AST rooted at `root` into an NFA with a single start state
/// and a single accept state. All storage, including the returned `states`
/// slice, is allocated from `arena`. `n_groups` is stored on the result
/// unchanged.
pub fn build(arena: std.mem.Allocator, root: *const ast.Node, n_groups: u32) BuildError!Nfa {
    var builder: Builder = .{ .arena = arena, .states = .empty };

    var body = try builder.compile(root);
    defer body.dangles.deinit(arena);

    // Every out-edge still dangling after compiling the whole pattern is
    // routed into one shared accept state.
    const accept_id = try builder.newState(.epsilon, null, null);
    for (body.dangles.items) |dangling| {
        builder.patch(dangling, accept_id);
    }

    const frozen_states = try arena.dupe(State, builder.states.items);
    return .{
        .states = frozen_states,
        .start = body.start,
        .accept = accept_id,
        .n_groups = n_groups,
    };
}
/// Incremental NFA builder. Each `compile*` method appends states for one
/// AST node and returns a Frag whose dangling out-edges the caller patches
/// to whatever follows. All allocations come from `arena`.
const Builder = struct {
    arena: std.mem.Allocator,
    states: std.ArrayList(State),

    /// Append a state and return its id (its index in `states`).
    fn newState(self: *Builder, consume: Consume, out1: ?StateId, out2: ?StateId) BuildError!StateId {
        const id: StateId = @intCast(self.states.items.len);
        try self.states.append(self.arena, .{ .consume = consume, .out1 = out1, .out2 = out2 });
        return id;
    }

    /// Point the out-edge named by `point` at `target`.
    fn patch(self: *Builder, point: PatchPoint, target: StateId) void {
        switch (point.slot) {
            .out1 => self.states.items[point.state].out1 = target,
            .out2 => self.states.items[point.state].out2 = target,
        }
    }

    /// One-state fragment whose only dangling edge is that state's out1.
    fn singleton(self: *Builder, consume: Consume) BuildError!Frag {
        const s = try self.newState(consume, null, null);
        var dangles: std.ArrayList(PatchPoint) = .empty;
        try dangles.append(self.arena, .{ .state = s, .slot = .out1 });
        return .{ .start = s, .dangles = dangles };
    }

    /// Fragment that matches the empty string.
    fn epsilonFrag(self: *Builder) BuildError!Frag {
        return self.singleton(.epsilon);
    }

    /// Dispatch on the AST node kind.
    fn compile(self: *Builder, node: *const ast.Node) BuildError!Frag {
        return switch (node.*) {
            .literal => |c| try self.singleton(.{ .byte = c }),
            .dot => try self.singleton(.any),
            .class => |cls| try self.singleton(.{ .class = cls }),
            .anchor => |a| try self.singleton(.{ .anchor = a }),
            .concat => |children| try self.compileConcat(children),
            .alt => |children| try self.compileAlt(children),
            .repeat => |r| try self.compileRepeat(r),
            .group => |g| try self.compileGroup(g),
        };
    }

    /// Chain the children left to right; an empty list matches the empty
    /// string.
    fn compileConcat(self: *Builder, children: []const *const ast.Node) BuildError!Frag {
        if (children.len == 0) return try self.epsilonFrag();
        var acc = try self.compile(children[0]);
        var i: usize = 1;
        while (i < children.len) : (i += 1) {
            const next = try self.compile(children[i]);
            for (acc.dangles.items) |p| self.patch(p, next.start);
            acc.dangles.deinit(self.arena);
            acc.dangles = next.dangles;
        }
        return acc;
    }

    /// Build a chain of two-way forks over the alternatives. An empty list
    /// matches the empty string, mirroring compileConcat.
    fn compileAlt(self: *Builder, children: []const *const ast.Node) BuildError!Frag {
        // Without this guard `children.len - 1` below would underflow.
        if (children.len == 0) return try self.epsilonFrag();
        if (children.len == 1) return self.compile(children[0]);

        // Build right-to-left so the leftmost branch is preferred at the
        // top-level fork (matches Python re's leftmost-first semantics).
        var i: usize = children.len - 1;
        var rest = try self.compile(children[i]);
        while (i > 0) {
            i -= 1;
            var branch = try self.compile(children[i]);
            const fork = try self.newState(.epsilon, branch.start, rest.start);
            // Merge danglers from both sides.
            for (rest.dangles.items) |p| try branch.dangles.append(self.arena, p);
            rest.dangles.deinit(self.arena);
            rest = .{ .start = fork, .dangles = branch.dangles };
        }
        return rest;
    }

    fn compileRepeat(self: *Builder, r: *const ast.Repeat) BuildError!Frag {
        // Fast paths for the common shapes — *, +, ?.
        if (r.min == 0 and r.max == std.math.maxInt(u32)) return try self.compileStar(r.sub, r.greedy);
        if (r.min == 1 and r.max == std.math.maxInt(u32)) return try self.compilePlus(r.sub, r.greedy);
        if (r.min == 0 and r.max == 1) return try self.compileQuestion(r.sub, r.greedy);
        return try self.compileCounted(r);
    }

    /// `sub*`: entry is the fork. Greedy puts the sub on out1 and leaves
    /// out2 dangling as the exit; lazy swaps the two.
    fn compileStar(self: *Builder, sub_node: *const ast.Node, greedy: bool) BuildError!Frag {
        var sub = try self.compile(sub_node);
        const fork = if (greedy)
            try self.newState(.epsilon, sub.start, null)
        else
            try self.newState(.epsilon, null, sub.start);
        // Loop back: finishing one iteration returns to the fork.
        for (sub.dangles.items) |p| self.patch(p, fork);
        sub.dangles.deinit(self.arena);
        var dangles: std.ArrayList(PatchPoint) = .empty;
        try dangles.append(self.arena, .{
            .state = fork,
            .slot = if (greedy) .out2 else .out1,
        });
        return .{ .start = fork, .dangles = dangles };
    }

    /// `sub+`: same wiring as star, but entry is the sub itself so at least
    /// one iteration is mandatory.
    fn compilePlus(self: *Builder, sub_node: *const ast.Node, greedy: bool) BuildError!Frag {
        var sub = try self.compile(sub_node);
        const fork = if (greedy)
            try self.newState(.epsilon, sub.start, null)
        else
            try self.newState(.epsilon, null, sub.start);
        for (sub.dangles.items) |p| self.patch(p, fork);
        sub.dangles.deinit(self.arena);
        var dangles: std.ArrayList(PatchPoint) = .empty;
        try dangles.append(self.arena, .{
            .state = fork,
            .slot = if (greedy) .out2 else .out1,
        });
        return .{ .start = sub.start, .dangles = dangles };
    }

    /// `sub?`: a fork that either enters the sub or skips it. The skip edge
    /// joins the sub's own dangles as exits.
    fn compileQuestion(self: *Builder, sub_node: *const ast.Node, greedy: bool) BuildError!Frag {
        var sub = try self.compile(sub_node);
        const fork = if (greedy)
            try self.newState(.epsilon, sub.start, null)
        else
            try self.newState(.epsilon, null, sub.start);
        try sub.dangles.append(self.arena, .{
            .state = fork,
            .slot = if (greedy) .out2 else .out1,
        });
        return .{ .start = fork, .dangles = sub.dangles };
    }

    /// Append `next` after `head`: wire every dangling edge of the head into
    /// `next.start` and adopt `next`'s dangles. With no head yet, `next`
    /// becomes the head.
    fn chain(self: *Builder, head: *?Frag, next: Frag) void {
        if (head.*) |*h| {
            for (h.dangles.items) |p| self.patch(p, next.start);
            h.dangles.deinit(self.arena);
            h.dangles = next.dangles;
        } else {
            head.* = next;
        }
    }

    /// `sub{m,n}` / `sub{m,}`: unfold into m mandatory copies followed by
    /// either a star tail (unbounded) or n-m optional copies. Returns
    /// `QuantifierTooLarge` when a bound exceeds the unfold limit.
    fn compileCounted(self: *Builder, r: *const ast.Repeat) BuildError!Frag {
        // Bound unfold size — a pattern like `a{100000}` is almost certainly
        // pathological; refuse so compile-time stays sane.
        const max_unfold: u32 = 1024;
        const unbounded = r.max == std.math.maxInt(u32);
        if (r.min > max_unfold) return BuildError.QuantifierTooLarge;
        if (!unbounded and r.max > max_unfold) return BuildError.QuantifierTooLarge;

        var head: ?Frag = null;
        var mandatory: u32 = 0;
        while (mandatory < r.min) : (mandatory += 1) {
            self.chain(&head, try self.compile(r.sub));
        }

        if (unbounded) {
            // {m,∞} → m mandatory copies followed by a star tail.
            self.chain(&head, try self.compileStar(r.sub, r.greedy));
            return head.?;
        }

        // {m,n} → m mandatory + (n-m) optional copies. Saturating subtraction
        // makes an inverted range (max < min) degrade to exactly `min` copies
        // instead of overflowing.
        // NOTE(review): the parser presumably rejects max < min — confirm.
        const optionals = r.max -| r.min;
        var optional: u32 = 0;
        while (optional < optionals) : (optional += 1) {
            self.chain(&head, try self.compileQuestion(r.sub, r.greedy));
        }

        if (head) |h| return h;
        // Pure {0,0} — match the empty string.
        return try self.epsilonFrag();
    }

    /// Non-capturing groups compile to their sub-fragment unchanged.
    /// Capturing groups are bracketed by group_start / group_end states.
    fn compileGroup(self: *Builder, g: *const ast.Group) BuildError!Frag {
        var sub = try self.compile(g.sub);
        if (!g.capturing) return sub;

        // Wrap sub with group_start ... sub ... group_end.
        const start = try self.newState(.{ .group_start = g.index }, sub.start, null);
        const end = try self.newState(.{ .group_end = g.index }, null, null);
        for (sub.dangles.items) |p| self.patch(p, end);
        sub.dangles.deinit(self.arena);

        var dangles: std.ArrayList(PatchPoint) = .empty;
        try dangles.append(self.arena, .{ .state = end, .slot = .out1 });
        return .{ .start = start, .dangles = dangles };
    }
};
/// Test helper: parse `pattern` and compile it to an NFA, allocating both
/// the AST and the automaton from `arena`.
fn buildFrom(arena: *std.heap.ArenaAllocator, pattern: []const u8) !Nfa {
    var pattern_parser = parser.Parser.init(arena.allocator(), pattern);
    const ast_root = try pattern_parser.parseRoot();
    const automaton = try build(arena.allocator(), ast_root, pattern_parser.n_groups);
    return automaton;
}
test "nfa: plus 'a+' starts at sub, not fork" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();

    const automaton = try buildFrom(&arena, "a+");
    try validate(automaton);

    // One 'a' is mandatory, so the entry state must be the byte state
    // rather than the loop fork.
    const entry = automaton.states[automaton.start];
    try std.testing.expectEqual(Consume{ .byte = 'a' }, entry.consume);
}
makes sub optional" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + const nfa = try buildFrom(&arena, "a?"); + try validate(nfa); + try std.testing.expectEqual(Consume.epsilon, nfa.states[nfa.start].consume); +} + +test "nfa: capturing group wraps sub with group_start / group_end" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + const nfa = try buildFrom(&arena, "(ab)"); + try validate(nfa); + try std.testing.expectEqual(@as(u32, 1), nfa.n_groups); + switch (nfa.states[nfa.start].consume) { + .group_start => |idx| try std.testing.expectEqual(@as(u32, 1), idx), + else => return error.ExpectedGroupStart, + } +} + +test "nfa: non-capturing group has no group_start/end" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + const nfa = try buildFrom(&arena, "(?:ab)"); + try validate(nfa); + try std.testing.expectEqual(@as(u32, 0), nfa.n_groups); + try std.testing.expectEqual(Consume{ .byte = 'a' }, nfa.states[nfa.start].consume); +} + +test "nfa: counted {2,3}" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + const nfa = try buildFrom(&arena, "a{2,3}"); + try validate(nfa); + // 2 mandatory + 1 optional + accept = at least 4 states. 
+ try std.testing.expect(nfa.states.len >= 4); + try std.testing.expectEqual(Consume{ .byte = 'a' }, nfa.states[nfa.start].consume); +} + +test "nfa: oversized quantifier errors" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var p = parser.Parser.init(arena.allocator(), "a{2000}"); + const root = try p.parseRoot(); + try std.testing.expectError(BuildError.QuantifierTooLarge, build(arena.allocator(), root, p.n_groups)); +} + +test "nfa: anchor compiles to anchor state" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + const nfa = try buildFrom(&arena, "^abc"); + try validate(nfa); + switch (nfa.states[nfa.start].consume) { + .anchor => |a| try std.testing.expectEqual(ast.Anchor.line_start, a), + else => return error.ExpectedAnchor, + } +} + +test "nfa: validate catches no-bug case" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + const nfa = try buildFrom(&arena, "a(b|c)*d"); + try validate(nfa); +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/parser.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/parser.zig new file mode 100644 index 0000000..9f1f943 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/parser.zig @@ -0,0 +1,495 @@ +//! Recursive-descent regex parser. Pattern bytes → AST. +//! +//! Grammar (v1, mirrors the Python re subset we ship as feature parity): +//! +//! regex ::= alt +//! alt ::= concat ('|' concat)* +//! concat ::= atom* +//! atom ::= primary quantifier? +//! primary ::= literal | dot | class | anchor | group +//! group ::= '(' ('?:')? regex ')' +//! class ::= '[' '^'? class_item+ ']' +//! class_item ::= char | char '-' char | escape +//! quantifier ::= ( '?' | '*' | '+' | '{' n (',' m?)? '}' ) '?'? +//! +//! Deferred to v2: backreferences, lookarounds, named groups, inline flags. +//! +//! 
//! All AST nodes are allocated in the caller-provided allocator (expected to
//! be a Regex-owned arena). Parse errors return a typed error; the partial
//! AST is freed when the arena drops.

const std = @import("std");
const ast = @import("ast.zig");

/// Every reason a pattern can be rejected, plus allocation failure.
pub const ParseError = error{
    UnexpectedEnd,
    UnbalancedParen,
    UnbalancedBracket,
    InvalidEscape,
    InvalidQuantifier,
    NothingToRepeat,
    InvalidCharRange,
    OutOfMemory,
};

/// Recursive-descent parser over a pattern byte slice.
///
/// Create one with `init`, call `parseRoot` once, then read `n_groups` for
/// the number of capturing groups that were declared.
pub const Parser = struct {
    /// Allocator used for every AST node and child slice.
    alloc: std.mem.Allocator,
    /// Pattern bytes being parsed.
    src: []const u8,
    /// Cursor into `src`; always points at the next unread byte.
    pos: usize = 0,
    /// Count of capturing groups seen so far. Incremented on `(` (but not
    /// on `(?:`). Used to assign group indices in declaration order.
    n_groups: u32 = 0,

    /// Bounds of a counted quantifier (`{n}`, `{n,}`, `{n,m}`, `{,m}`).
    const Counted = struct { min: u32, max: u32 };

    pub fn init(alloc: std.mem.Allocator, pattern: []const u8) Parser {
        return .{ .alloc = alloc, .src = pattern };
    }

    /// Parse the whole pattern. Fails with `UnbalancedParen` when input is
    /// left over after the top-level alternation (a stray `)`).
    pub fn parseRoot(self: *Parser) ParseError!*const ast.Node {
        const root = try self.parseAlt();
        if (self.pos < self.src.len) {
            // A stray closing `)` would land here.
            return ParseError.UnbalancedParen;
        }
        return root;
    }

    // ── alt = concat ('|' concat)* ──

    /// Parse one or more `|`-separated branches. A single branch is
    /// returned as-is, without an `.alt` wrapper.
    fn parseAlt(self: *Parser) ParseError!*const ast.Node {
        var branches: std.ArrayList(*const ast.Node) = .empty;
        defer branches.deinit(self.alloc);

        try branches.append(self.alloc, try self.parseConcat());
        while (self.peek() == '|') {
            self.pos += 1;
            try branches.append(self.alloc, try self.parseConcat());
        }
        if (branches.items.len == 1) return branches.items[0];
        const slice = try self.alloc.dupe(*const ast.Node, branches.items);
        return try self.node(.{ .alt = slice });
    }

    // ── concat = atom* ──

    /// Parse atoms until `|`, `)` or end of input. A single atom is
    /// returned as-is, without a `.concat` wrapper.
    fn parseConcat(self: *Parser) ParseError!*const ast.Node {
        var pieces: std.ArrayList(*const ast.Node) = .empty;
        defer pieces.deinit(self.alloc);

        while (self.pos < self.src.len) {
            const c = self.src[self.pos];
            if (c == '|' or c == ')') break;
            try pieces.append(self.alloc, try self.parseAtom());
        }
        if (pieces.items.len == 0) {
            // Empty concat matches the empty string. Represent as an empty
            // concat node — the matcher treats it as zero-width success.
            const slice = try self.alloc.dupe(*const ast.Node, &.{});
            return try self.node(.{ .concat = slice });
        }
        if (pieces.items.len == 1) return pieces.items[0];
        const slice = try self.alloc.dupe(*const ast.Node, pieces.items);
        return try self.node(.{ .concat = slice });
    }

    // ── atom = primary quantifier? ──

    fn parseAtom(self: *Parser) ParseError!*const ast.Node {
        const primary = try self.parsePrimary();
        return try self.maybeQuantify(primary);
    }

    /// Parse one unquantified item: literal, dot, anchor, class, escape or
    /// group.
    fn parsePrimary(self: *Parser) ParseError!*const ast.Node {
        if (self.pos >= self.src.len) return ParseError.UnexpectedEnd;
        const c = self.src[self.pos];
        switch (c) {
            '.' => {
                self.pos += 1;
                return try self.node(.dot);
            },
            '^' => {
                self.pos += 1;
                return try self.node(.{ .anchor = .line_start });
            },
            '$' => {
                self.pos += 1;
                return try self.node(.{ .anchor = .line_end });
            },
            '(' => return try self.parseGroup(),
            '[' => return try self.parseClass(),
            '\\' => return try self.parseEscape(),
            '*', '+', '?' => return ParseError.NothingToRepeat,
            '{' => {
                // A well-formed counted quantifier with nothing in front of
                // it has no operand. Anything else (`{`, `{}`, `{foo`) is an
                // ordinary brace, as in Python re; the cursor is left on the
                // `{` when the scan reports "not a quantifier".
                if ((try self.parseCountedQuantifier()) != null) return ParseError.NothingToRepeat;
                self.pos += 1;
                return try self.node(.{ .literal = '{' });
            },
            ')', '|' => return ParseError.UnexpectedEnd,
            else => {
                self.pos += 1;
                return try self.node(.{ .literal = c });
            },
        }
    }

    // ── group = '(' ('?:')? regex ')' ──

    fn parseGroup(self: *Parser) ParseError!*const ast.Node {
        std.debug.assert(self.src[self.pos] == '(');
        self.pos += 1;

        var capturing = true;
        if (self.pos + 1 < self.src.len and self.src[self.pos] == '?' and self.src[self.pos + 1] == ':') {
            capturing = false;
            self.pos += 2;
        }

        // Reserve the capture index BEFORE recursing so nested groups get
        // higher indices, matching Python re's left-paren declaration order.
        var index: u32 = 0;
        if (capturing) {
            self.n_groups += 1;
            index = self.n_groups;
        }

        const sub = try self.parseAlt();

        if (self.peek() != ')') return ParseError.UnbalancedParen;
        self.pos += 1;

        const g = try self.alloc.create(ast.Group);
        g.* = .{ .sub = sub, .index = index, .capturing = capturing };
        return try self.node(.{ .group = g });
    }

    // ── class = '[' '^'? items ']' ──

    /// Parse a bracket expression. Items are single bytes, `lo-hi` ranges,
    /// or the shorthand escapes `\d \D \w \W \s \S`, which add their whole
    /// byte set to the class.
    fn parseClass(self: *Parser) ParseError!*const ast.Node {
        std.debug.assert(self.src[self.pos] == '[');
        self.pos += 1;

        const cls = try self.alloc.create(ast.Class);
        cls.* = ast.Class.empty();

        var negate = false;
        if (self.peek() == '^') {
            negate = true;
            self.pos += 1;
        }

        // A `]` as the very first char inside the class is treated as a
        // literal `]`, matching Python re's behaviour. Otherwise `]` ends.
        var first = true;
        while (self.pos < self.src.len) {
            const c = self.src[self.pos];
            if (c == ']' and !first) break;
            first = false;

            // Shorthand escapes stand for a set of bytes, not one byte, so
            // they cannot go through parseClassChar (which would hand back
            // the bare letter).
            if (c == '\\' and self.pos + 1 < self.src.len and addShorthand(cls, self.src[self.pos + 1])) {
                self.pos += 2;
                continue;
            }

            const lo = try self.parseClassChar();
            // Range `a-z` only if the `-` is followed by a non-`]` char.
            if (self.pos + 1 < self.src.len and self.src[self.pos] == '-' and self.src[self.pos + 1] != ']') {
                self.pos += 1; // consume '-'
                const hi = try self.parseClassChar();
                if (hi < lo) return ParseError.InvalidCharRange;
                cls.setRange(lo, hi);
            } else {
                cls.set(lo);
            }
        }
        if (self.peek() != ']') return ParseError.UnbalancedBracket;
        self.pos += 1;

        if (negate) cls.negate();
        return try self.node(.{ .class = cls });
    }

    /// Add the byte set named by a shorthand escape letter to `cls`.
    /// Returns false, leaving `cls` untouched, when `letter` is not one of
    /// `d D w W s S`. The sets are the same ones `digitClass`, `wordClass`
    /// and `spaceClass` build; an upper-case letter adds the complement.
    fn addShorthand(cls: *ast.Class, letter: u8) bool {
        const Kind = enum { digit, word, space };
        const kind: Kind = switch (letter) {
            'd', 'D' => .digit,
            'w', 'W' => .word,
            's', 'S' => .space,
            else => return false,
        };
        // `letter` is one of the six above, so this separates e.g. 'd' from 'D'.
        const complement = letter < 'a';

        for (0..256) |i| {
            const byte: u8 = @intCast(i);
            const in_set = switch (kind) {
                .digit => byte >= '0' and byte <= '9',
                .word => (byte >= 'a' and byte <= 'z') or
                    (byte >= 'A' and byte <= 'Z') or
                    (byte >= '0' and byte <= '9') or
                    byte == '_',
                .space => switch (byte) {
                    ' ', '\t', '\n', '\r', 0x0b, 0x0c => true,
                    else => false,
                },
            };
            if (in_set != complement) cls.set(byte);
        }
        return true;
    }

    /// Read one byte inside a bracket expression, resolving `\n \t \r \f \v
    /// \0`; any other escaped byte stands for itself.
    fn parseClassChar(self: *Parser) ParseError!u8 {
        if (self.pos >= self.src.len) return ParseError.UnbalancedBracket;
        const c = self.src[self.pos];
        if (c == '\\') {
            self.pos += 1;
            if (self.pos >= self.src.len) return ParseError.InvalidEscape;
            const e = self.src[self.pos];
            self.pos += 1;
            return switch (e) {
                'n' => '\n',
                't' => '\t',
                'r' => '\r',
                'f' => 0x0c,
                'v' => 0x0b,
                '0' => 0,
                else => e,
            };
        }
        self.pos += 1;
        return c;
    }

    // ── escape = '\' (shorthand | metaliteral) ──

    /// Parse a backslash escape outside a bracket expression. Unknown
    /// escapes fall through to a literal of the escaped byte, which is how
    /// metacharacters such as `\(` and `\.` are written.
    fn parseEscape(self: *Parser) ParseError!*const ast.Node {
        std.debug.assert(self.src[self.pos] == '\\');
        self.pos += 1;
        if (self.pos >= self.src.len) return ParseError.InvalidEscape;
        const e = self.src[self.pos];
        self.pos += 1;
        return switch (e) {
            'd' => try self.shorthandClass(digitClass()),
            'D' => try self.shorthandClass(negated(digitClass())),
            'w' => try self.shorthandClass(wordClass()),
            'W' => try self.shorthandClass(negated(wordClass())),
            's' => try self.shorthandClass(spaceClass()),
            'S' => try self.shorthandClass(negated(spaceClass())),
            'b' => try self.node(.{ .anchor = .word_boundary }),
            'B' => try self.node(.{ .anchor = .non_word_boundary }),
            'A' => try self.node(.{ .anchor = .string_start }),
            // `\Z` is Python re's spelling of end-of-string; `\z` is kept as
            // an alias.
            'z', 'Z' => try self.node(.{ .anchor = .string_end }),
            'n' => try self.node(.{ .literal = '\n' }),
            't' => try self.node(.{ .literal = '\t' }),
            'r' => try self.node(.{ .literal = '\r' }),
            'f' => try self.node(.{ .literal = 0x0c }),
            'v' => try self.node(.{ .literal = 0x0b }),
            '0' => try self.node(.{ .literal = 0 }),
            else => try self.node(.{ .literal = e }),
        };
    }

    fn shorthandClass(self: *Parser, cls_value: ast.Class) ParseError!*const ast.Node {
        const cls = try self.alloc.create(ast.Class);
        cls.* = cls_value;
        return try self.node(.{ .class = cls });
    }

    fn digitClass() ast.Class {
        var c = ast.Class.empty();
        c.setRange('0', '9');
        return c;
    }

    fn wordClass() ast.Class {
        var c = ast.Class.empty();
        c.setRange('a', 'z');
        c.setRange('A', 'Z');
        c.setRange('0', '9');
        c.set('_');
        return c;
    }

    fn spaceClass() ast.Class {
        var c = ast.Class.empty();
        c.set(' ');
        c.set('\t');
        c.set('\n');
        c.set('\r');
        c.set(0x0b); // \v
        c.set(0x0c); // \f
        return c;
    }

    fn negated(cls_in: ast.Class) ast.Class {
        var c = cls_in;
        c.negate();
        return c;
    }

    // ── quantifier ──

    /// Wrap `primary` in a `.repeat` node when a quantifier follows it;
    /// otherwise return `primary` unchanged. A trailing `?` after the
    /// quantifier makes it lazy.
    fn maybeQuantify(self: *Parser, primary: *const ast.Node) ParseError!*const ast.Node {
        if (self.pos >= self.src.len) return primary;
        const c = self.src[self.pos];
        var min: u32 = 0;
        var max: u32 = 0;
        switch (c) {
            '?' => {
                self.pos += 1;
                min = 0;
                max = 1;
            },
            '*' => {
                self.pos += 1;
                min = 0;
                max = std.math.maxInt(u32);
            },
            '+' => {
                self.pos += 1;
                min = 1;
                max = std.math.maxInt(u32);
            },
            '{' => {
                // Not a well-formed `{n,m}`: leave the brace in place so the
                // next parsePrimary call picks it up as a literal.
                const counted = (try self.parseCountedQuantifier()) orelse return primary;
                min = counted.min;
                max = counted.max;
            },
            else => return primary,
        }
        var greedy = true;
        if (self.peek() == '?') {
            greedy = false;
            self.pos += 1;
        }
        const r = try self.alloc.create(ast.Repeat);
        r.* = .{ .sub = primary, .min = min, .max = max, .greedy = greedy };
        return try self.node(.{ .repeat = r });
    }

    /// Try to read a counted quantifier starting at the `{` under the
    /// cursor.
    ///
    /// Accepted forms: `{n}`, `{n,}`, `{n,m}`, `{,m}` and `{,}` (a missing
    /// lower bound is 0, a missing upper bound is unbounded). On success
    /// the cursor is just past the `}`.
    ///
    /// Returns null with the cursor back on the `{` when the text is not
    /// one of those forms, so the caller can treat the brace as a literal.
    /// Returns `InvalidQuantifier` when the form is right but a bound does
    /// not fit in `u32` or the upper bound is below the lower bound.
    fn parseCountedQuantifier(self: *Parser) ParseError!?Counted {
        std.debug.assert(self.src[self.pos] == '{');
        const open = self.pos;
        self.pos += 1;

        const lo_digits = self.takeDigits();
        var hi_digits = lo_digits;
        var saw_comma = false;
        if (self.peek() == ',') {
            saw_comma = true;
            self.pos += 1;
            hi_digits = self.takeDigits();
        }

        // `{}` (no digits, no comma) is two literal braces, not a quantifier.
        const well_formed = self.peek() == '}' and (saw_comma or lo_digits.len != 0);
        if (!well_formed) {
            self.pos = open;
            return null;
        }
        self.pos += 1;

        const lo: u32 = if (lo_digits.len == 0) 0 else try parseCount(lo_digits);
        const hi: u32 = if (hi_digits.len == 0) std.math.maxInt(u32) else try parseCount(hi_digits);
        if (hi < lo) return ParseError.InvalidQuantifier;
        return Counted{ .min = lo, .max = hi };
    }

    /// Consume the run of ASCII digits under the cursor (possibly empty)
    /// and return it as a slice of `src`.
    fn takeDigits(self: *Parser) []const u8 {
        const start = self.pos;
        while (self.pos < self.src.len and self.src[self.pos] >= '0' and self.src[self.pos] <= '9') {
            self.pos += 1;
        }
        return self.src[start..self.pos];
    }

    /// Convert a non-empty digit run to `u32`; overflow is reported as
    /// `InvalidQuantifier`.
    fn parseCount(digits: []const u8) ParseError!u32 {
        return std.fmt.parseInt(u32, digits, 10) catch ParseError.InvalidQuantifier;
    }

    // ── Helpers ──

    fn peek(self: *const Parser) ?u8 {
        if (self.pos >= self.src.len) return null;
        return self.src[self.pos];
    }

    fn node(self: *Parser, value: ast.Node) ParseError!*const ast.Node {
        const n = try self.alloc.create(ast.Node);
        n.* = value;
        return n;
    }
};

// ── Tests ──

fn parseToString(alloc: std.mem.Allocator, pattern: []const u8) ![]u8 {
    var arena = std.heap.ArenaAllocator.init(alloc);
    defer arena.deinit();
    var p = Parser.init(arena.allocator(), pattern);
    const root = try p.parseRoot();
    var buf: std.ArrayList(u8) = .empty;
    defer buf.deinit(alloc);
    try ast.debugWrite(root, &buf, alloc, 0);
    return alloc.dupe(u8, buf.items);
}

test "parse single literal" {
    const out = try parseToString(std.testing.allocator, "a");
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("literal 'a'\n", out);
}

test "parse concat of literals" {
    const out = try parseToString(std.testing.allocator, "abc");
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings(
        \\concat
        \\ literal 'a'
        \\ literal 'b'
        \\ literal 'c'
        \\
    , out);
}

test "parse alternation" {
    const out = try parseToString(std.testing.allocator, "a|b");
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings(
        \\alt
        \\ literal 'a'
        \\ literal 'b'
        \\
    , out);
}

test "parse shorthand escape inside brackets" {
    const digits = try parseToString(std.testing.allocator, "[\\d]");
    defer std.testing.allocator.free(digits);
    try std.testing.expectEqualStrings("class [10 bytes]\n", digits);

    // 63 word bytes plus the trailing literal '-'.
    const word_dash = try parseToString(std.testing.allocator, "[\\w-]");
    defer std.testing.allocator.free(word_dash);
    try std.testing.expectEqualStrings("class [64 bytes]\n", word_dash);
}

test "parse \\Z as end-of-string anchor" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    var p = Parser.init(arena.allocator(), "\\Z");
    const root = try p.parseRoot();
    try std.testing.expectEqual(ast.Anchor.string_end, root.anchor);
}

test "parse malformed brace as literal" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    var p = Parser.init(arena.allocator(), "a{");
    const root = try p.parseRoot();
    try std.testing.expectEqual(@as(usize, 2), root.concat.len);
    try std.testing.expectEqual(@as(u8, '{'), root.concat[1].literal);
}

test "parse counted quantifier with omitted lower bound" {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    var p = Parser.init(arena.allocator(), "a{,3}");
    const root = try p.parseRoot();
    try std.testing.expectEqual(@as(u32, 0), root.repeat.min);
    try std.testing.expectEqual(@as(u32, 3), root.repeat.max);
}

test "parse star quantifier" {
    const out = try parseToString(std.testing.allocator, "a*");
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings(
\\repeat min=0 max=4294967295 greedy=true + \\ literal 'a' + \\ + , out); +} + +test "parse lazy quantifier" { + const out = try parseToString(std.testing.allocator, "a+?"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings( + \\repeat min=1 max=4294967295 greedy=false + \\ literal 'a' + \\ + , out); +} + +test "parse counted quantifier" { + const out = try parseToString(std.testing.allocator, "a{2,5}"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings( + \\repeat min=2 max=5 greedy=true + \\ literal 'a' + \\ + , out); +} + +test "parse char class with range" { + const out = try parseToString(std.testing.allocator, "[a-z]"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings("class [26 bytes]\n", out); +} + +test "parse capturing group" { + const out = try parseToString(std.testing.allocator, "(abc)"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings( + \\group #1 cap=true + \\ concat + \\ literal 'a' + \\ literal 'b' + \\ literal 'c' + \\ + , out); +} + +test "parse non-capturing group" { + const out = try parseToString(std.testing.allocator, "(?:abc)"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings( + \\group #0 cap=false + \\ concat + \\ literal 'a' + \\ literal 'b' + \\ literal 'c' + \\ + , out); +} + +test "parse shorthand class \\d" { + const out = try parseToString(std.testing.allocator, "\\d"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings("class [10 bytes]\n", out); +} + +test "parse anchor word_boundary" { + const out = try parseToString(std.testing.allocator, "\\b"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings("anchor word_boundary\n", out); +} + +test "parse unbalanced paren errors" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var p = Parser.init(arena.allocator(), "(abc"); + try 
std.testing.expectError(ParseError.UnbalancedParen, p.parseRoot()); +} + +test "parse nothing-to-repeat errors" { + var arena = std.heap.ArenaAllocator.init(std.testing.allocator); + defer arena.deinit(); + var p = Parser.init(arena.allocator(), "*abc"); + try std.testing.expectError(ParseError.NothingToRepeat, p.parseRoot()); +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/prefilter.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/prefilter.zig new file mode 100644 index 0000000..8e5f576 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/prefilter.zig @@ -0,0 +1,314 @@ +//! Compile-time pattern analysis for fast-path optimisations. +//! +//! Two analyses, both consumed by `findAll`: +//! +//! 1. `extractFullLiteral` — when the AST is purely literal (literal bytes, +//! optional non-capturing concat/group wrapping, no metacharacters, no +//! capture groups), returns the literal byte sequence. Callers can then +//! bypass the NFA entirely and use `std.mem.indexOf` in a loop — a +//! 50-100x win on patterns like `compileAllocFlags` that came in as +//! "regex" but have no actual regex content. +//! +//! 2. `extractRequiredLiteral` — for genuinely-regex patterns, finds the +//! longest contiguous run of bytes that MUST appear in any successful +//! match. Callers use it as a pre-filter: if the haystack doesn't +//! contain the substring, no match exists and we skip the engine. When +//! the haystack does contain it, we still gate engine work to windows +//! around hits via `findOccurrences`. +//! +//! Both analyses are conservative: when in doubt, they return null/empty so +//! the matcher falls back to the full engine. The unit tests pin down what +//! we extract for each shape. 
const std = @import("std");
const ast = @import("ast.zig");

/// Flatten a purely literal AST into the byte string it matches.
///
/// Only `.literal`, `.concat` and non-capturing `.group` nodes qualify; any
/// other node kind, or a capturing group, yields null. An AST that flattens
/// to zero bytes also yields null. The returned slice is allocated from
/// `arena`.
pub fn extractFullLiteral(arena: std.mem.Allocator, root: *const ast.Node) !?[]const u8 {
    var bytes: std.ArrayList(u8) = .empty;
    errdefer bytes.deinit(arena);

    const is_literal = try collectLiteral(root, arena, &bytes);
    if (!is_literal or bytes.items.len == 0) {
        bytes.deinit(arena);
        return null;
    }
    return try bytes.toOwnedSlice(arena);
}

/// Append the literal bytes under `node` to `buf`. Returns false as soon as
/// a node that is not plain literal text is reached; `buf` may then hold a
/// partial result.
fn collectLiteral(node: *const ast.Node, arena: std.mem.Allocator, buf: *std.ArrayList(u8)) !bool {
    switch (node.*) {
        .literal => |byte| {
            try buf.append(arena, byte);
            return true;
        },
        .concat => |parts| {
            for (parts) |part| {
                const part_ok = try collectLiteral(part, arena, buf);
                if (!part_ok) return false;
            }
            return true;
        },
        .group => |grp| {
            // Capturing groups are observable to callers (span info), so
            // they stay on the engine path. `(?:...)` is just parentheses.
            if (grp.capturing) return false;
            return collectLiteral(grp.sub, arena, buf);
        },
        .dot, .class, .anchor, .alt, .repeat => return false,
    }
}

/// Return the longest contiguous byte run that every match must contain,
/// or null when the longest such run is shorter than `min_len`. The
/// returned slice is allocated from `arena`.
pub fn extractRequiredLiteral(arena: std.mem.Allocator, root: *const ast.Node, min_len: usize) !?[]const u8 {
    var longest: std.ArrayList(u8) = .empty;
    errdefer longest.deinit(arena);
    var run: std.ArrayList(u8) = .empty;
    defer run.deinit(arena);

    try walkRequired(root, arena, &run, &longest);
    // The run still open when the walk ends may be the longest one.
    try flush(&run, &longest, arena);

    if (longest.items.len >= min_len) return try longest.toOwnedSlice(arena);
    longest.deinit(arena);
    return null;
}

/// Walk the AST in pattern order, growing `current` with literal bytes and
/// closing the run (via `flush`) at every node that does not contribute a
/// fixed byte.
fn walkRequired(
    node: *const ast.Node,
    arena: std.mem.Allocator,
    current: *std.ArrayList(u8),
    best: *std.ArrayList(u8),
) error{OutOfMemory}!void {
    switch (node.*) {
        .literal => |byte| try current.append(arena, byte),
        .group => |grp| try walkRequired(grp.sub, arena, current, best),
        .concat => |parts| {
            for (parts) |part| try walkRequired(part, arena, current, best);
        },
        .repeat => |rep| {
            // With min >= 1 the first copy of the sub-pattern is mandatory
            // and extends the open run. Whatever follows that copy is not
            // fixed, so the run always closes after the repeat.
            if (rep.min != 0) try walkRequired(rep.sub, arena, current, best);
            try flush(current, best, arena);
        },
        // Alternation is not analysed (no common-prefix extraction); like
        // dot, class and anchor it simply ends the run.
        .dot, .class, .anchor, .alt => try flush(current, best, arena),
    }
}

/// Close the open run: if `current` is strictly longer than `best` it
/// becomes the new `best` (the old buffer is freed and `current` restarts
/// empty); otherwise `current` is just cleared.
fn flush(current: *std.ArrayList(u8), best: *std.ArrayList(u8), arena: std.mem.Allocator) !void {
    if (current.items.len <= best.items.len) {
        current.clearRetainingCapacity();
        return;
    }
    best.deinit(arena);
    best.* = current.*;
    current.* = .empty;
}

/// Return the literal bytes that open the pattern, or null when fewer than
/// `min_len` bytes can be collected.
///
/// Unlike `extractRequiredLiteral`, collection starts at the first node and
/// stops at the first node that is not a mandatory literal; it does not
/// look for a longer run later in the pattern. A top-level alternation
/// yields no prefix. The returned slice is allocated from `arena`.
pub fn extractLiteralPrefix(arena: std.mem.Allocator, root: *const ast.Node, min_len: usize) !?[]const u8 {
    var prefix: std.ArrayList(u8) = .empty;
    errdefer prefix.deinit(arena);

    // The "may continue" flag only matters during recursion.
    _ = try collectPrefix(root, arena, &prefix);

    if (prefix.items.len < min_len) {
        prefix.deinit(arena);
        return null;
    }
    return try prefix.toOwnedSlice(arena);
}

/// Append the mandatory leading literal bytes under `node` to `buf`.
/// Returns true iff collection may continue with the node that follows.
fn collectPrefix(node: *const ast.Node, arena: std.mem.Allocator, buf: *std.ArrayList(u8)) error{OutOfMemory}!bool {
    switch (node.*) {
        .literal => |byte| {
            try buf.append(arena, byte);
            return true;
        },
        .group => |grp| return collectPrefix(grp.sub, arena, buf),
        .concat => |parts| {
            for (parts) |part| {
                if (!try collectPrefix(part, arena, buf)) return false;
            }
            return true;
        },
        .repeat => |rep| {
            // min >= 1: one mandatory copy of the sub-pattern's own prefix
            // is taken. min == 0: the repeat is optional and adds nothing.
            // Either way the prefix ends here, since what follows the first
            // copy is not the same in every match.
            if (rep.min != 0) _ = try collectPrefix(rep.sub, arena, buf);
            return false;
        },
        .class, .dot, .anchor, .alt => return false,
    }
}

// ── Tests ──

const parser = @import("parser.zig");

/// Test helper: parse `pattern` and return `extractFullLiteral`'s result
/// copied into `alloc` (caller frees).
fn fullFor(alloc: std.mem.Allocator, pattern: []const u8) !?[]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    var p = parser.Parser.init(scratch.allocator(), pattern);
    const root = try p.parseRoot();
    const found = try extractFullLiteral(scratch.allocator(), root) orelse return null;
    return try alloc.dupe(u8, found);
}

/// Test helper: as `fullFor`, for `extractRequiredLiteral` with min_len 3.
fn requiredFor(alloc: std.mem.Allocator, pattern: []const u8) !?[]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    var p = parser.Parser.init(scratch.allocator(), pattern);
    const root = try p.parseRoot();
    const found = try extractRequiredLiteral(scratch.allocator(), root, 3) orelse return null;
    return try alloc.dupe(u8, found);
}

/// Test helper: as `fullFor`, for `extractLiteralPrefix` with min_len 3.
fn prefixFor(alloc: std.mem.Allocator, pattern: []const u8) !?[]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    var p = parser.Parser.init(scratch.allocator(), pattern);
    const root = try p.parseRoot();
    const found = try extractLiteralPrefix(scratch.allocator(), root, 3) orelse return null;
    return try alloc.dupe(u8, found);
}

test "full literal: plain identifier" {
    const got = (try fullFor(std.testing.allocator, "compileAllocFlags")).?;
    defer std.testing.allocator.free(got);
    try std.testing.expectEqualStrings("compileAllocFlags", got);
}

test "full literal: non-capturing group is transparent" {
    const got = (try fullFor(std.testing.allocator, "(?:abc)def")).?;
    defer std.testing.allocator.free(got);
    try std.testing.expectEqualStrings("abcdef", got);
}

test "full literal: capturing group blocks" {
    try std.testing.expect((try fullFor(std.testing.allocator, "(abc)")) == null);
}

test "full literal: dot blocks" {
    try std.testing.expect((try fullFor(std.testing.allocator, "a.b")) == null);
}

test "full literal: class blocks" {
    try std.testing.expect((try fullFor(std.testing.allocator, "a[bc]")) == null);
}

test "full literal: alternation blocks" {
    try std.testing.expect((try fullFor(std.testing.allocator, "abc|def")) == null);
}

test "full literal: repeat blocks" {
    try std.testing.expect((try fullFor(std.testing.allocator, "ab+")) == null);
}

test "required literal: extracts prefix before class" {
    const got = (try requiredFor(std.testing.allocator, "compileAllocFlags\\([a-z]+")).?;
    defer std.testing.allocator.free(got);
    try std.testing.expectEqualStrings("compileAllocFlags(", got);
}

test "required literal: picks longest run" {
    const got = (try requiredFor(std.testing.allocator, "[a-z]hello\\d+worlds\\s+end")).?;
    defer std.testing.allocator.free(got);
    try std.testing.expectEqualStrings("worlds", got);
}

test "required literal: too short returns null" {
    try std.testing.expect((try requiredFor(std.testing.allocator, "a.b")) == null);
}

test "required literal: alternation bails" {
    try std.testing.expect((try requiredFor(std.testing.allocator, "foo|bar")) == null);
}

test "required literal: min=0 quantifier doesn't contribute" {
    // 'a*bar' — 'a' is optional, so the required run is 'bar'.
    const got = (try requiredFor(std.testing.allocator, "a*barbaz")).?;
    defer std.testing.allocator.free(got);
    try std.testing.expectEqualStrings("barbaz", got);
}

test "required literal: min=1 quantifier contributes single copy" {
    // 'a+bar' — 'a' is required (at least once), then 'bar'.
    // After walking the repeat, we flush — so 'a' alone is the run, but
    // 'bar' is longer. Best = 'bar'.
+ const got = (try requiredFor(std.testing.allocator, "a+barbaz")).?; + defer std.testing.allocator.free(got); + try std.testing.expectEqualStrings("barbaz", got); +} + +test "literal prefix: simple identifier" { + const got = (try prefixFor(std.testing.allocator, "compileAllocFlags\\([a-z]+")).?; + defer std.testing.allocator.free(got); + try std.testing.expectEqualStrings("compileAllocFlags(", got); +} + +test "literal prefix: class at start bails" { + try std.testing.expect((try prefixFor(std.testing.allocator, "[a-z]+hello")) == null); +} + +test "literal prefix: alternation at top bails" { + try std.testing.expect((try prefixFor(std.testing.allocator, "foo|bar")) == null); +} + +test "literal prefix: under threshold returns null" { + try std.testing.expect((try prefixFor(std.testing.allocator, "ab.c")) == null); +} + +test "literal prefix: stops at optional byte" { + // `foox?bar` — 'foo' is mandatory, then 'x?' is optional, so prefix is 'foo'. + const got = (try prefixFor(std.testing.allocator, "foox?bar")).?; + defer std.testing.allocator.free(got); + try std.testing.expectEqualStrings("foo", got); +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/probe.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/probe.zig new file mode 100644 index 0000000..73cbe20 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/probe.zig @@ -0,0 +1,87 @@ +//! Parity-test probe CLI. +//! +//! Wire shape: nanoregex_probe [flags] +//! Flags string (3rd argv) is any concatenation of `i`/`m`/`s` matching +//! Python re's IGNORECASE / MULTILINE / DOTALL — same encoding used by +//! tests/parity/run.sh. +//! +//! Output: one match per line as `start..end\tmatched_bytes`. Bytes are +//! emitted raw (no escaping) so the harness can diff byte-for-byte against +//! `re.finditer` output formatted the same way. +//! +//! 
//! Stdio is via extern libc `write`: Zig 0.16 removed the synchronous
//! stdlib wrappers we used to rely on, and nanoregex links libc anyway.

const std = @import("std");
const nanoregex = @import("nanoregex");

extern "c" fn write(fd: c_int, ptr: [*]const u8, len: usize) isize;

/// Write all of `data` to `fd`, looping over short writes. Best-effort:
/// the first `write` that returns zero or a negative value stops the loop
/// and the remaining bytes are dropped without reporting an error.
fn writeAll(fd: c_int, data: []const u8) void {
    var rem = data;
    while (rem.len > 0) {
        const n = write(fd, rem.ptr, rem.len);
        if (n <= 0) return;
        rem = rem[@intCast(n)..];
    }
}

/// Decode the flags argument: `i` → case_insensitive, `m` → multiline,
/// `s` → dot_all. Any other byte is ignored.
fn parseFlags(s: []const u8) nanoregex.Flags {
    var f: nanoregex.Flags = .{};
    for (s) |c| switch (c) {
        'i' => f.case_insensitive = true,
        'm' => f.multiline = true,
        's' => f.dot_all = true,
        else => {},
    };
    return f;
}

/// Entry point. argv[1] is the pattern (required), argv[2] the haystack
/// (defaults to empty), argv[3] the flags string (defaults to none).
pub fn main(init: std.process.Init) !void {
    const alloc = init.gpa;

    var args_list: std.ArrayList([]const u8) = .empty;
    defer args_list.deinit(alloc);
    var args_iter = init.minimal.args.iterate();
    while (args_iter.next()) |arg| try args_list.append(alloc, arg);
    const args = args_list.items;

    if (args.len < 2) {
        // Name each positional argument; the previous text had lost its
        // placeholders and read "nanoregex_probe [] []".
        writeAll(2, "usage: nanoregex_probe <pattern> [<haystack>] [<flags>]\n");
        std.process.exit(2);
    }
    const pattern = args[1];
    const haystack: []const u8 = if (args.len >= 3) args[2] else "";
    const flags = parseFlags(if (args.len >= 4) args[3] else "");

    var r = nanoregex.Regex.compileWithFlags(alloc, pattern, flags) catch |err| {
        var tmp: [128]u8 = undefined;
        const msg = std.fmt.bufPrint(&tmp, "PARSE_ERROR: {s}\n", .{@errorName(err)}) catch "PARSE_ERROR\n";
        writeAll(1, msg);
        // Exit 0 — the harness checks output content, not exit code, so
        // PARSE_ERROR on both sides should match cleanly.
+ std.process.exit(0); + }; + defer r.deinit(); + + const matches = r.findAll(alloc, haystack) catch { + writeAll(2, "ENGINE_ERROR\n"); + std.process.exit(1); + }; + defer { + for (matches) |*m| @constCast(m).deinit(alloc); + alloc.free(matches); + } + + var buf: std.ArrayList(u8) = .empty; + defer buf.deinit(alloc); + + var line_buf: [256]u8 = undefined; + for (matches) |m| { + const header = std.fmt.bufPrint(&line_buf, "{d}..{d}\t", .{ m.span.start, m.span.end }) catch continue; + try buf.appendSlice(alloc, header); + try buf.appendSlice(alloc, haystack[m.span.start..m.span.end]); + try buf.append(alloc, '\n'); + } + writeAll(1, buf.items); +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/root.zig b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/root.zig new file mode 100644 index 0000000..b7e4cee --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/src/root.zig @@ -0,0 +1,416 @@ +//! nanoregex — pure-Zig regex engine with Python-re-compatible semantics. +//! +//! Layered design: +//! 1. parser.zig — pattern bytes → AST +//! 2. ast.zig — AST node tagged union, arena-owned +//! 3. nfa.zig — AST → Thompson NFA +//! 4. exec.zig — Pike-VM NFA simulation (always-correct fallback) +//! 5. prefilter.zig — literal/required-substring extraction (fast path) +//! 6. dfa.zig — Lazy subset-construction DFA (perf path) +//! +//! Dispatch policy in findAll/search: +//! 1. Pure-literal AST + no captures + case-sensitive → memmem loop +//! 2. Required literal absent in haystack → early-return empty +//! 3. DFA eligible (no captures, no anchors, no case-insensitive) → DFA +//! 4. Otherwise → Pike VM +//! +//! The DFA is built eagerly at compile time (when eligible) so the hot loop +//! is a single transition-table lookup per byte. Regex deinit cleans the +//! DFA's arena. 
+ +const std = @import("std"); +pub const ast = @import("ast.zig"); +pub const parser = @import("parser.zig"); +pub const nfa = @import("nfa.zig"); +pub const exec = @import("exec.zig"); +pub const prefilter = @import("prefilter.zig"); +pub const dfa = @import("dfa.zig"); +pub const minterm = @import("minterm.zig"); + +pub const Flags = exec.Flags; +pub const Span = exec.Span; +pub const Match = exec.MatchResult; + +pub const Regex = struct { + /// Backing arena for the AST + NFA + prefilter slices. Lives until deinit(). + arena: *std.heap.ArenaAllocator, + parent_alloc: std.mem.Allocator, + root: *const ast.Node, + /// Heap-allocated on `arena` so the address is stable for `dfa.nfa_ref` + /// and for `exec.Vm.init`. Storing the Nfa by-value here used to give + /// us a dangling pointer when the Regex was returned by value. + automaton: *nfa.Nfa, + flags: Flags, + n_groups: u32, + + /// Non-null iff the pattern is purely literal AND has zero capture + /// groups. Callers bypass the engine entirely and use SIMD `indexOf`. + pure_literal: ?[]const u8, + /// Non-null iff a contiguous substring is required to appear in every + /// match. Used as a coarse pre-filter. + required_literal: ?[]const u8, + /// Non-null iff every match's start position is at an occurrence of + /// this byte sequence. Used to skip directly to candidate start + /// positions via SIMD-accelerated indexOf, then run the DFA only at + /// those hits. + literal_prefix: ?[]const u8, + + /// Built eagerly when the pattern is DFA-eligible (no captures, no + /// anchors, no case-insensitive). Null otherwise. Mutable because the + /// DFA fills its transition table lazily during matching. 
+ dfa_engine: ?dfa.Dfa, + + pub fn compile(alloc: std.mem.Allocator, pattern: []const u8) !Regex { + return compileWithFlags(alloc, pattern, .{}); + } + + pub fn compileWithFlags(alloc: std.mem.Allocator, pattern: []const u8, flags: Flags) !Regex { + const arena = try alloc.create(std.heap.ArenaAllocator); + errdefer alloc.destroy(arena); + arena.* = std.heap.ArenaAllocator.init(alloc); + errdefer arena.deinit(); + + var p = parser.Parser.init(arena.allocator(), pattern); + const root = try p.parseRoot(); + + // Heap-allocate the Nfa on the arena. We point at it from both the + // Regex itself and the Dfa; storing by value would invalidate the + // address once compileWithFlags returns by value. + const automaton_ptr = try arena.allocator().create(nfa.Nfa); + automaton_ptr.* = try nfa.build(arena.allocator(), root, p.n_groups); + + const pure_lit: ?[]const u8 = if (flags.case_insensitive) + null + else if (p.n_groups != 0) + null + else + try prefilter.extractFullLiteral(arena.allocator(), root); + + const req_lit: ?[]const u8 = if (flags.case_insensitive) + null + else + try prefilter.extractRequiredLiteral(arena.allocator(), root, 3); + + const lit_prefix: ?[]const u8 = if (flags.case_insensitive) + null + else + try prefilter.extractLiteralPrefix(arena.allocator(), root, 3); + + // Try to build a DFA. Falls back to null (=> Pike VM at runtime) + // when the pattern has captures, anchors, or grows the state + // table past the budget. Case-insensitive also skips DFA for v1 — + // adding case-folding to the bitmap test is straightforward but + // not done yet. + var dfa_engine: ?dfa.Dfa = null; + if (!flags.case_insensitive) { + if (dfa.Dfa.fromNfa(alloc, automaton_ptr, root, .{ .dot_all = flags.dot_all })) |built| { + dfa_engine = built; + } else |_| { + // Any DFA build error (HasCaptures, HasAnchors, TooManyStates, + // OOM) falls back silently. The Pike VM handles the same + // patterns correctly, just slower. 
+ dfa_engine = null; + } + } + + return .{ + .arena = arena, + .parent_alloc = alloc, + .root = root, + .automaton = automaton_ptr, + .flags = flags, + .n_groups = p.n_groups, + .pure_literal = pure_lit, + .required_literal = req_lit, + .literal_prefix = lit_prefix, + .dfa_engine = dfa_engine, + }; + } + + pub fn deinit(self: *Regex) void { + if (self.dfa_engine) |*d| d.deinit(); + self.arena.deinit(); + self.parent_alloc.destroy(self.arena); + self.* = undefined; + } + + /// First leftmost match, or null. Caller owns the returned Match. + pub fn search(self: *Regex, alloc: std.mem.Allocator, input: []const u8) !?Match { + if (self.required_literal) |lit| { + if (std.mem.indexOf(u8, input, lit) == null) return null; + } + if (self.pure_literal) |lit| return try literalFirst(alloc, lit, input); + if (self.literal_prefix) |prefix| if (self.dfa_engine) |*d| + return try dfaFirstWithPrefix(alloc, d, input, prefix); + if (self.dfa_engine) |*d| return try dfaFirst(alloc, d, input); + + var vm = exec.Vm.init(alloc, self.automaton, self.flags); + return try vm.search(input); + } + + /// All non-overlapping matches, leftmost-first. + pub fn findAll(self: *Regex, alloc: std.mem.Allocator, input: []const u8) ![]Match { + if (self.required_literal) |lit| { + if (std.mem.indexOf(u8, input, lit) == null) { + return try alloc.alloc(Match, 0); + } + } + if (self.pure_literal) |lit| return try literalAll(alloc, lit, input); + if (self.literal_prefix) |prefix| if (self.dfa_engine) |*d| + return try dfaAllWithPrefix(alloc, d, input, prefix); + if (self.dfa_engine) |*d| return try dfaAll(alloc, d, input); + + var vm = exec.Vm.init(alloc, self.automaton, self.flags); + return try vm.findAll(input); + } + + /// Replace every non-overlapping match. Backreferences (`\N`) honoured. 
+ pub fn replaceAll(self: *Regex, alloc: std.mem.Allocator, input: []const u8, replacement: []const u8) ![]u8 { + const matches = try self.findAll(alloc, input); + defer { + for (matches) |*m| @constCast(m).deinit(alloc); + alloc.free(matches); + } + + var out: std.ArrayList(u8) = .empty; + defer out.deinit(alloc); + + var cursor: usize = 0; + for (matches) |m| { + try out.appendSlice(alloc, input[cursor..m.span.start]); + try appendReplacement(alloc, &out, replacement, m, input); + cursor = m.span.end; + } + try out.appendSlice(alloc, input[cursor..]); + return try out.toOwnedSlice(alloc); + } +}; + +// ── Literal fast paths ── + +fn literalFirst(alloc: std.mem.Allocator, needle: []const u8, haystack: []const u8) !?Match { + if (needle.len == 0) { + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = 0, .end = 0 }; + return .{ .span = .{ .start = 0, .end = 0 }, .captures = captures }; + } + const idx = std.mem.indexOf(u8, haystack, needle) orelse return null; + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = idx, .end = idx + needle.len }; + return .{ .span = .{ .start = idx, .end = idx + needle.len }, .captures = captures }; +} + +fn literalAll(alloc: std.mem.Allocator, needle: []const u8, haystack: []const u8) ![]Match { + var results: std.ArrayList(Match) = .empty; + errdefer { + for (results.items) |*m| @constCast(m).deinit(alloc); + results.deinit(alloc); + } + if (needle.len == 0) return try results.toOwnedSlice(alloc); + + var pos: usize = 0; + while (pos <= haystack.len) { + const idx = std.mem.indexOfPos(u8, haystack, pos, needle) orelse break; + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = idx, .end = idx + needle.len }; + try results.append(alloc, .{ .span = .{ .start = idx, .end = idx + needle.len }, .captures = captures }); + pos = idx + needle.len; + } + return try results.toOwnedSlice(alloc); +} + +// ── DFA wrappers ── +// +// Adapter from `dfa.Dfa`'s span-only output to the 
public `Match` shape +// (which carries a captures slice). DFA mode has no captures so we emit a +// 1-element captures array containing just the whole-match span. + +fn dfaFirst(alloc: std.mem.Allocator, d: *dfa.Dfa, input: []const u8) !?Match { + var p: usize = 0; + while (p <= input.len) : (p += 1) { + const end_opt = try d.matchAt(input, p); + if (end_opt) |end| { + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = p, .end = end }; + return .{ .span = .{ .start = p, .end = end }, .captures = captures }; + } + } + return null; +} + +fn dfaAll(alloc: std.mem.Allocator, d: *dfa.Dfa, input: []const u8) ![]Match { + const spans = try d.findAll(alloc, input); + defer alloc.free(spans); + + var results = try alloc.alloc(Match, spans.len); + var built: usize = 0; + errdefer { + for (results[0..built]) |*m| m.deinit(alloc); + alloc.free(results); + } + for (spans) |span| { + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = span.start, .end = span.end }; + results[built] = .{ .span = .{ .start = span.start, .end = span.end }, .captures = captures }; + built += 1; + } + return results; +} + +/// Like `dfaFirst` but uses `prefix` to skip directly to candidate match +/// starts via `std.mem.indexOfPos`. Far fewer engine invocations for +/// sparse literal-prefixed patterns. +fn dfaFirstWithPrefix(alloc: std.mem.Allocator, d: *dfa.Dfa, input: []const u8, prefix: []const u8) !?Match { + var pos: usize = 0; + while (true) { + const hit = std.mem.indexOfPos(u8, input, pos, prefix) orelse return null; + if (try d.matchAt(input, hit)) |end| { + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = hit, .end = end }; + return .{ .span = .{ .start = hit, .end = end }, .captures = captures }; + } + // DFA didn't accept at this hit (the prefix matched but the rest + // of the pattern didn't). Advance one byte past this hit and + // resume the indexOf scan. 
+ pos = hit + 1; + } +} + +fn dfaAllWithPrefix(alloc: std.mem.Allocator, d: *dfa.Dfa, input: []const u8, prefix: []const u8) ![]Match { + var results: std.ArrayList(Match) = .empty; + errdefer { + for (results.items) |*m| @constCast(m).deinit(alloc); + results.deinit(alloc); + } + + var pos: usize = 0; + while (true) { + const hit = std.mem.indexOfPos(u8, input, pos, prefix) orelse break; + if (try d.matchAt(input, hit)) |end| { + const captures = try alloc.alloc(?Span, 1); + captures[0] = .{ .start = hit, .end = end }; + try results.append(alloc, .{ + .span = .{ .start = hit, .end = end }, + .captures = captures, + }); + // Skip past the match end. Zero-width match falls back to + // hit+1 so we don't infinite-loop. + pos = if (end > hit) end else hit + 1; + } else { + pos = hit + 1; + } + } + return try results.toOwnedSlice(alloc); +} + +fn appendReplacement( + alloc: std.mem.Allocator, + out: *std.ArrayList(u8), + replacement: []const u8, + m: Match, + input: []const u8, +) !void { + var i: usize = 0; + while (i < replacement.len) { + const c = replacement[i]; + if (c == '\\' and i + 1 < replacement.len) { + const n = replacement[i + 1]; + switch (n) { + '0'...'9' => { + const idx: usize = n - '0'; + if (idx < m.captures.len) { + if (m.captures[idx]) |span| try out.appendSlice(alloc, input[span.start..span.end]); + } + i += 2; + continue; + }, + 'n' => { try out.append(alloc, '\n'); i += 2; continue; }, + 't' => { try out.append(alloc, '\t'); i += 2; continue; }, + 'r' => { try out.append(alloc, '\r'); i += 2; continue; }, + '\\' => { try out.append(alloc, '\\'); i += 2; continue; }, + else => { try out.append(alloc, '\\'); i += 1; continue; }, + } + } + try out.append(alloc, c); + i += 1; + } +} + +test "module imports compile" { + std.testing.refAllDecls(@This()); + std.testing.refAllDecls(ast); + std.testing.refAllDecls(parser); + std.testing.refAllDecls(nfa); + std.testing.refAllDecls(exec); + std.testing.refAllDecls(prefilter); + 
std.testing.refAllDecls(dfa); +} + +test "Regex.search basic" { + var r = try Regex.compile(std.testing.allocator, "[a-z]+"); + defer r.deinit(); + var m = (try r.search(std.testing.allocator, "Hello World")).?; + defer m.deinit(std.testing.allocator); + try std.testing.expectEqual(@as(usize, 1), m.span.start); + try std.testing.expectEqual(@as(usize, 5), m.span.end); +} + +test "Regex.findAll" { + var r = try Regex.compile(std.testing.allocator, "\\d+"); + defer r.deinit(); + const ms = try r.findAll(std.testing.allocator, "abc 42 xyz 1234"); + defer { + for (ms) |*m| @constCast(m).deinit(std.testing.allocator); + std.testing.allocator.free(ms); + } + try std.testing.expectEqual(@as(usize, 2), ms.len); +} + +test "Regex pure literal fast path" { + var r = try Regex.compile(std.testing.allocator, "compileAllocFlags"); + defer r.deinit(); + try std.testing.expect(r.pure_literal != null); + try std.testing.expectEqualStrings("compileAllocFlags", r.pure_literal.?); + const ms = try r.findAll(std.testing.allocator, "abc compileAllocFlags xyz compileAllocFlags"); + defer { + for (ms) |*m| @constCast(m).deinit(std.testing.allocator); + std.testing.allocator.free(ms); + } + try std.testing.expectEqual(@as(usize, 2), ms.len); +} + +test "Regex DFA engine is built when eligible" { + var r = try Regex.compile(std.testing.allocator, "[a-z]+"); + defer r.deinit(); + // [a-z]+ has no captures, no anchors → DFA should be built. 
+ try std.testing.expect(r.dfa_engine != null); +} + +test "Regex falls back to Pike VM with captures" { + var r = try Regex.compile(std.testing.allocator, "(abc)"); + defer r.deinit(); + try std.testing.expect(r.dfa_engine == null); +} + +test "Regex falls back to Pike VM with anchors" { + var r = try Regex.compile(std.testing.allocator, "^foo"); + defer r.deinit(); + try std.testing.expect(r.dfa_engine == null); +} + +test "Regex required-literal pre-filter skips haystack with no candidates" { + var r = try Regex.compile(std.testing.allocator, "hello\\d+"); + defer r.deinit(); + try std.testing.expectEqualStrings("hello", r.required_literal.?); + const ms = try r.findAll(std.testing.allocator, "no candidates anywhere here"); + defer std.testing.allocator.free(ms); + try std.testing.expectEqual(@as(usize, 0), ms.len); +} + +test "Regex.replaceAll with backreference" { + var r = try Regex.compile(std.testing.allocator, "(\\w+)@(\\w+)"); + defer r.deinit(); + const out = try r.replaceAll(std.testing.allocator, "alice@example bob@host", "\\2/\\1"); + defer std.testing.allocator.free(out); + try std.testing.expectEqualStrings("example/alice host/bob", out); +} diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/001_literal.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/001_literal.txt new file mode 100644 index 0000000..24a9ec9 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/001_literal.txt @@ -0,0 +1,3 @@ +abc + +the quick abc jumps over abc lazy abc diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/002_dot_star.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/002_dot_star.txt new file mode 100644 index 0000000..d195989 --- /dev/null +++ 
b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/002_dot_star.txt @@ -0,0 +1,3 @@ +a.*b + +start aXYZb middle a___b end diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/003_char_class.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/003_char_class.txt new file mode 100644 index 0000000..e2ff782 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/003_char_class.txt @@ -0,0 +1,3 @@ +[a-z]+ + +Hello World 42 diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/004_anchors.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/004_anchors.txt new file mode 100644 index 0000000..bb24a3b --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/004_anchors.txt @@ -0,0 +1,5 @@ +^foo +m +foo +bar foo +foo bar diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/005_alternation.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/005_alternation.txt new file mode 100644 index 0000000..764da96 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/005_alternation.txt @@ -0,0 +1,3 @@ +cat|dog|bird + +the cat saw a dog and a bird diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/006_shorthand_digit.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/006_shorthand_digit.txt new file mode 100644 index 0000000..9487302 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/006_shorthand_digit.txt @@ -0,0 +1,3 @@ +\d+ + +version 
1.2.345 build 678 diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/007_group_capture.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/007_group_capture.txt new file mode 100644 index 0000000..f9dd704 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/007_group_capture.txt @@ -0,0 +1,3 @@ +(\w+)@(\w+) + +alice@example bob@host diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/008_lazy_star.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/008_lazy_star.txt new file mode 100644 index 0000000..a6140ed --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/008_lazy_star.txt @@ -0,0 +1,3 @@ +a.*?b + +start aXXXb middle aYYYb end diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/009_lazy_plus.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/009_lazy_plus.txt new file mode 100644 index 0000000..d421e36 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/009_lazy_plus.txt @@ -0,0 +1,3 @@ +<.+?> + + diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/010_nested_groups.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/010_nested_groups.txt new file mode 100644 index 0000000..58c877e --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/010_nested_groups.txt @@ -0,0 +1,3 @@ +((a)(b))+ + +ababab xyz abab diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/011_escape_dot.txt 
b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/011_escape_dot.txt new file mode 100644 index 0000000..7c85091 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/011_escape_dot.txt @@ -0,0 +1,3 @@ +\. + +foo.bar baz.qux .end diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/012_word_boundary.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/012_word_boundary.txt new file mode 100644 index 0000000..4b919c6 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/012_word_boundary.txt @@ -0,0 +1,3 @@ +\bcat\b + +the cat sat on a catnap but a cat is a cat diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/013_dollar_multiline.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/013_dollar_multiline.txt new file mode 100644 index 0000000..0e4a6f5 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/013_dollar_multiline.txt @@ -0,0 +1,6 @@ +foo$ +m +foo +bar foo +foo bar +foo diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/014_case_insensitive.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/014_case_insensitive.txt new file mode 100644 index 0000000..3e78a7f --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/014_case_insensitive.txt @@ -0,0 +1,3 @@ +hello +i +HELLO Hello hello hElLo diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/015_dot_all.txt 
b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/015_dot_all.txt new file mode 100644 index 0000000..94be27f --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/015_dot_all.txt @@ -0,0 +1,6 @@ +a.b +s +aXb +a +b a +b diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/016_counted_range.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/016_counted_range.txt new file mode 100644 index 0000000..3c4f120 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/016_counted_range.txt @@ -0,0 +1,3 @@ +a{2,4} + +a aa aaa aaaa aaaaa aaaaaa diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/017_alternation_anchors.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/017_alternation_anchors.txt new file mode 100644 index 0000000..b4c3eb6 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/017_alternation_anchors.txt @@ -0,0 +1,6 @@ +^cat|dog$ +m +cat sees dog +dog sees cat +catnap +bulldog diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/018_catastrophic_backtrack.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/018_catastrophic_backtrack.txt new file mode 100644 index 0000000..ef772ce --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/018_catastrophic_backtrack.txt @@ -0,0 +1,3 @@ +(a+)+b + +aaaaaaaaaab end diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/019_char_class_negation.txt 
b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/019_char_class_negation.txt new file mode 100644 index 0000000..4bfefae --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/019_char_class_negation.txt @@ -0,0 +1,3 @@ +[^aeiou]+ + +hello world diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/020_non_capturing_group.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/020_non_capturing_group.txt new file mode 100644 index 0000000..4b71115 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/020_non_capturing_group.txt @@ -0,0 +1,3 @@ +(?:foo|bar)+ + +foo barbar foobarfoo xxx baz diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/021_optional_quantifier.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/021_optional_quantifier.txt new file mode 100644 index 0000000..98b3b53 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/021_optional_quantifier.txt @@ -0,0 +1,3 @@ +colou?r + +color and colour are both valid diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/022_min_zero_quantifier.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/022_min_zero_quantifier.txt new file mode 100644 index 0000000..ed8924a --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/022_min_zero_quantifier.txt @@ -0,0 +1,3 @@ +x{0,3}y + +y xy xxy xxxy xxxxy diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/023_email_like.txt 
b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/023_email_like.txt new file mode 100644 index 0000000..da21f6b --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/023_email_like.txt @@ -0,0 +1,3 @@ +[a-zA-Z0-9_.]+@[a-zA-Z0-9.]+ + +contact me at alice@example.com or bob.test@host.io diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/024_string_anchor.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/024_string_anchor.txt new file mode 100644 index 0000000..118ddb0 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/024_string_anchor.txt @@ -0,0 +1,4 @@ +\Aabc +m +abc +abc again diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/025_function_def_pattern.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/025_function_def_pattern.txt new file mode 100644 index 0000000..75621ef --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/025_function_def_pattern.txt @@ -0,0 +1,5 @@ +fn [A-Za-z_][A-Za-z0-9_]*\( + +pub fn main() void +fn helper(x: i32) i32 +const fn_ptr = foo diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/026_unbounded_min.txt b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/026_unbounded_min.txt new file mode 100644 index 0000000..9185539 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/026_unbounded_min.txt @@ -0,0 +1,3 @@ +a{3,} + +a aa aaa aaaa aaaaaaaa diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/027_zero_width_loop.txt 
b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/027_zero_width_loop.txt new file mode 100644 index 0000000..ba281a7 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/fixtures/027_zero_width_loop.txt @@ -0,0 +1,3 @@ +a* + +bbb diff --git a/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/run.sh b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/run.sh new file mode 100755 index 0000000..e308f41 --- /dev/null +++ b/zig-pkg/nanoregex-0.0.1-EdkhcXoqAgC6HvjxEqNIedT0YWMRfouWk8dh4G5ZC2L9/tests/parity/run.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# parity test harness: zigregex vs python re +# +# Each fixture in $FIXTURES is a 3+ line text file: +# line 1: regex pattern (raw, no quoting) +# line 2: flags (comma-separated subset of: i, m, s — empty line if none) +# line 3+: haystack (joined with \n if multiline) +# +# For each fixture we run python3's re.findall and the zigregex_probe +# binary on the same input. They must produce byte-identical output. +# +# Until exec.zig lands, the matcher is a panic — we tolerate "matcher not +# yet implemented" on the zig side and only verify the python reference +# itself runs. Flip $REQUIRE_MATCH to 1 once the matcher is wired. + +set -uo pipefail + +PROBE="${1:-}" +FIXTURES="${2:-tests/parity/fixtures}" +REQUIRE_MATCH="${REQUIRE_MATCH:-1}" + +if [ -z "$PROBE" ] || [ ! -x "$PROBE" ]; then + echo "usage: $0 " >&2 + exit 2 +fi + +if [ ! 
-d "$FIXTURES" ]; then + echo "fixtures dir not found: $FIXTURES" >&2 + exit 2 +fi + +pass=0 +fail=0 +skip=0 + +for f in "$FIXTURES"/*.txt; do + [ -e "$f" ] || continue + name=$(basename "$f" .txt) + + pattern=$(sed -n '1p' "$f") + flags=$(sed -n '2p' "$f") + haystack=$(sed -n '3,$p' "$f") + + py_out=$( + PATTERN="$pattern" FLAGS="$flags" HAYSTACK="$haystack" \ + python3 - <<'PY' +import os, re, sys +pattern = os.environ["PATTERN"] +flags_str = os.environ.get("FLAGS", "") +haystack = os.environ["HAYSTACK"] +flag_bits = 0 +if "i" in flags_str: flag_bits |= re.IGNORECASE +if "m" in flags_str: flag_bits |= re.MULTILINE +if "s" in flags_str: flag_bits |= re.DOTALL +try: + pat = re.compile(pattern, flag_bits) +except re.error as e: + print(f"PARSE_ERROR: {e}") + sys.exit(0) +for m in pat.finditer(haystack): + print(f"{m.start()}..{m.end()}\t{m.group(0)}") +PY + ) + + if [ "$REQUIRE_MATCH" != "1" ]; then + # Phase 1: just verify the python reference computes a result. + # We don't yet compare against zig output because the matcher + # panics. Once exec.zig lands, set REQUIRE_MATCH=1. + skip=$((skip + 1)) + echo "SKIP $name (python ref: $(echo "$py_out" | wc -l | tr -d ' ') lines)" + continue + fi + + zig_out=$("$PROBE" "$pattern" "$haystack" "$flags" 2>&1) + if [ "$py_out" = "$zig_out" ]; then + pass=$((pass + 1)) + echo "PASS $name" + else + fail=$((fail + 1)) + echo "FAIL $name" + echo " pattern: $pattern" + echo " py: $(echo "$py_out" | head -5)" + echo " zig: $(echo "$zig_out" | head -5)" + fi +done + +echo "" +echo "parity: $pass passed, $fail failed, $skip skipped" +exit $(( fail > 0 ? 1 : 0 ))