From 930fc77361da595831bcf0082b8beb60597f69ec Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Tue, 12 May 2026 00:40:47 +0800
Subject: [PATCH 1/2] test: failing tests for #447 and #451 (skip-trigram
 invisibility)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/tests.zig | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/src/tests.zig b/src/tests.zig
index 3d3624b..ac21c78 100644
--- a/src/tests.zig
+++ b/src/tests.zig
@@ -11275,3 +11275,83 @@ test "rerank-trace: single-result query records non-zero rerank score" {
     try testing.expect(std.mem.indexOf(u8, data, "\"score\":0.0000") == null);
     try testing.expect(std.mem.indexOf(u8, data, "src/loneSym.zig") != null);
 }
+
+test "issue-447: searchContent surfaces large (>64KB) skip-trigram files for common identifiers" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    var explorer = Explorer.init(arena.allocator());
+    // Regression test for issue #447: index 12 small files that each mention widgetX once.
+    var i: usize = 0;
+    while (i < 12) : (i += 1) {
+        var path_buf: [32]u8 = undefined;
+        const path = try std.fmt.bufPrint(&path_buf, "small_{d}.zig", .{i});
+        try explorer.indexFile(path, "fn s() void { _ = widgetX; }\n");
+    }
+    // canonical.zig mentions widgetX five times; the content itself is small, not >64KB.
+    const canonical_content =
+        "fn canonical() void {\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "}\n";
+    try explorer.indexFileSkipTrigram("canonical.zig", canonical_content); // NOTE(review): presumably routes the file into skip_trigram_files — confirm
+    // 13 files now contain the query; the last argument (5) is presumably max_results — confirm.
+    const results = try explorer.searchContent("widgetX", testing.allocator, 5);
+    defer { // the result slice and each entry's path/line_text are freed with testing.allocator
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
+        testing.allocator.free(results);
+    }
+
+    var found_canonical = false;
+    for (results) |r| {
+        if (std.mem.eql(u8, r.path, "canonical.zig")) {
+            found_canonical = true;
+            break;
+        }
+    }
+    try testing.expect(found_canonical); // fails when canonical.zig is absent from the returned results
+}
+
+test "issue-451: scope search surfaces skip-trigram canonical file" {
+    var arena = std.heap.ArenaAllocator.init(testing.allocator);
+    defer arena.deinit();
+    var explorer = Explorer.init(arena.allocator());
+    // Regression test for issue #451: index 12 small files that each mention widgetX once.
+    var i: usize = 0;
+    while (i < 12) : (i += 1) {
+        var path_buf: [32]u8 = undefined;
+        const path = try std.fmt.bufPrint(&path_buf, "small_{d}.zig", .{i});
+        try explorer.indexFile(path, "fn s() void { _ = widgetX; }\n");
+    }
+    // canonical.zig mentions widgetX five times and is indexed through indexFileSkipTrigram.
+    const canonical_content =
+        "fn canonical() void {\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "    _ = widgetX;\n" ++
+        "}\n";
+    try explorer.indexFileSkipTrigram("canonical.zig", canonical_content); // NOTE(review): presumably routes the file into skip_trigram_files — confirm
+    // Same scenario as the searchContent test, exercised through searchContentWithScope.
+    const results = try explorer.searchContentWithScope("widgetX", testing.allocator, 5);
+    defer { // frees each entry's line_text, path and (when non-null) scope_name, then the slice
+        for (results) |r| {
+            testing.allocator.free(r.line_text);
+            testing.allocator.free(r.path);
+            if (r.scope_name) |s| testing.allocator.free(s);
+        }
+        testing.allocator.free(results);
+    }
+
+    var found_canonical = false;
+    for (results) |r| {
+        if (std.mem.eql(u8, r.path, "canonical.zig")) found_canonical = true;
+    }
+    try testing.expect(found_canonical); // fails when canonical.zig is absent from the returned results
+}

From 6d3fa0e3259b7bfcb4670a4869d374c039e3fbcd Mon Sep 17 00:00:00 2001
From: justrach <54503978+justrach@users.noreply.github.com>
Date: Tue, 12 May 2026 00:49:07 +0800
Subject: [PATCH 2/2] fix(explore): merge skip_trigram_files into Tier 1
 candidate pool (#447, #451)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Files >64KB skip trigram indexing and land in skip_trigram_files. Pre-fix,
searchContent deferred these to Tier 3, which never ran when Tier 1 (trigram
candidates) had already filled max_results — making the canonical definition
site invisible. The same bug existed in searchContentWithScope.

Fix: build word-hit counts per file and merge skip_trigram_files paths that
have word-index hits into the Tier 1 candidate pool, sorted by hit count desc
alongside trigram candidates. This ensures definition-dense files (high word
hit counts) surface even when max_results fills during Tier 1 traversal.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/explore.zig | 145 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 103 insertions(+), 42 deletions(-)

diff --git a/src/explore.zig b/src/explore.zig
index a1c18e6..2851c0e 100644
--- a/src/explore.zig
+++ b/src/explore.zig
@@ -1639,44 +1639,57 @@ pub const Explorer = struct {
         const candidate_paths = self.trigram_index.candidates(query, allocator);
         defer if (candidate_paths) |cp| allocator.free(cp);
 
-        // Tier 1: trigram candidates — fast path, skips files already found by Tier 0.
-        if (candidate_paths) |cp| {
-            if (cp.len > 0) {
-                // Issue #427: rank candidates by per-file word-index hit count
-                // (desc) so the definition-dense file scans first; fall back to
-                // file content length (asc) so small files still come before
-                // unrelated large files at the same hit count. Pre-fix the
-                // sort key was content length alone, which buried the canonical
-                // file behind unrelated short files when max_per_file was 1.
-                var hits_per_file = std.StringHashMap(u32).init(allocator);
-                defer hits_per_file.deinit();
-                for (word_hits) |hit| {
-                    const hp = self.word_index.hitPath(hit);
-                    if (hp.len == 0) continue;
-                    const gop_h = try hits_per_file.getOrPut(hp);
-                    if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
-                    gop_h.value_ptr.* += 1;
+        // Tier 1: trigram candidates merged with skip_trigram_files that have
+        // word-index hits — all sorted by per-file word-hit count desc so the
+        // definition-dense file scans first. Pre-fix: skip_trigram_files were
+        // deferred to Tier 3 which never ran when Tier 1 filled max_results.
+        {
+            var hits_per_file = std.StringHashMap(u32).init(allocator);
+            defer hits_per_file.deinit();
+            for (word_hits) |hit| {
+                const hp = self.word_index.hitPath(hit);
+                if (hp.len == 0) continue;
+                const gop_h = try hits_per_file.getOrPut(hp);
+                if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
+                gop_h.value_ptr.* += 1;
+            }
+
+            // Build a combined list: trigram candidates + skip_trigram_files
+            // paths that the word index knows about for this query.
+            var combined: std.ArrayList([]const u8) = .empty;
+            defer combined.deinit(allocator);
+            if (candidate_paths) |cp| {
+                for (cp) |p| try combined.append(allocator, p);
+            }
+            var skip_iter_t1 = self.skip_trigram_files.keyIterator();
+            while (skip_iter_t1.next()) |key_ptr| {
+                if (hits_per_file.contains(key_ptr.*)) {
+                    try combined.append(allocator, key_ptr.*);
                 }
-                const SortCtx = struct {
-                    contents: *const std.StringHashMap([]const u8),
-                    counts: *const std.StringHashMap(u32),
-                    pub fn lessThan(ctx: @This(), a: []const u8, b: []const u8) bool {
-                        const a_count = ctx.counts.get(a) orelse 0;
-                        const b_count = ctx.counts.get(b) orelse 0;
-                        if (a_count != b_count) return a_count > b_count;
-                        const a_len = if (ctx.contents.get(a)) |c| c.len else std.math.maxInt(usize);
-                        const b_len = if (ctx.contents.get(b)) |c| c.len else std.math.maxInt(usize);
-                        return a_len < b_len;
-                    }
-                };
-                std.mem.sort([]const u8, @constCast(cp), SortCtx{ .contents = &self.contents, .counts = &hits_per_file }, SortCtx.lessThan);
+            }
+
+            const SortCtx = struct { // comparator context for sorting the merged candidate paths
+                contents: *const std.StringHashMap([]const u8), // looked up by path; only the value's length is read
+                counts: *const std.StringHashMap(u32), // path -> word-index hit count for the query
+                pub fn lessThan(ctx: @This(), a: []const u8, b: []const u8) bool {
+                    const a_count = ctx.counts.get(a) orelse 0; // a path missing from counts ranks as 0 hits
+                    const b_count = ctx.counts.get(b) orelse 0;
+                    if (a_count != b_count) return a_count > b_count; // more hits sorts first
+                    const a_len = if (ctx.contents.get(a)) |c| c.len else std.math.maxInt(usize); // missing entry counts as maximally long
+                    const b_len = if (ctx.contents.get(b)) |c| c.len else std.math.maxInt(usize);
+                    return a_len < b_len; // tie-break: shorter content first
+                }
+            };
+            std.mem.sort([]const u8, combined.items, SortCtx{ .contents = &self.contents, .counts = &hits_per_file }, SortCtx.lessThan);
 
-                const estimated_total = cp.len + self.skip_trigram_files.count();
+            if (combined.items.len > 0) {
+                const estimated_total = combined.items.len + self.skip_trigram_files.count();
                 const max_per_file = @max(@as(usize, 1), max_results / @max(@as(usize, 1), estimated_total));
-                for (cp) |path| {
+                for (combined.items) |path| {
                     if (searched.contains(path)) continue;
                     const ref = self.readContentForSearch(path, allocator) orelse continue;
                     defer ref.deinit();
+                    searched.put(path, {}) catch {};
                     try searchInContent(path, ref.data, query, allocator, max_per_file, max_results, &result_list);
                     if (result_list.items.len >= max_results)
                         return self.rerankAndFinalize(&result_list, query, allocator);
@@ -1684,11 +1697,6 @@ pub const Explorer = struct {
             }
         }
 
-        // Mark all Tier 1 candidates as searched.
-        if (candidate_paths) |cp| {
-            for (cp) |p| searched.put(p, {}) catch {};
-        }
-
         // Tier 2: sparse candidates — LAZY, only computed when Tier 1 found nothing.
         if (result_list.items.len == 0) {
             const sparse_paths = self.sparse_ngram_index.candidates(query, allocator);
@@ -1705,7 +1713,7 @@ pub const Explorer = struct {
             }
         }
 
-        // Tier 3: skip_trigram_files not already searched.
+        // Tier 3: skip_trigram_files not already searched (no word-index hits).
         if (result_list.items.len < max_results) {
             var skip_iter = self.skip_trigram_files.keyIterator();
             while (skip_iter.next()) |key_ptr| {
@@ -1718,6 +1726,7 @@ pub const Explorer = struct {
             }
         }
 
+
         // Tier 4: word index scan — for files not yet searched.
         if (result_list.items.len < max_results) {
             const tier4_hits = self.word_index.search(query);
@@ -3870,13 +3879,47 @@ pub const Explorer = struct {
         var searched = std.StringHashMap(void).init(allocator);
         defer searched.deinit();
 
+        // Build word-hit counts per file so skip_trigram_files with hits can
+        // be merged into the primary candidate loops, sorted by hit count desc,
+        // so other candidates cannot fill max_results before those files are scanned.
+        const word_hits_scope = self.word_index.search(query);
+        var hits_per_file_scope = std.StringHashMap(u32).init(allocator);
+        defer hits_per_file_scope.deinit();
+        for (word_hits_scope) |hit| {
+            const hp = self.word_index.hitPath(hit);
+            if (hp.len == 0) continue;
+            const gop_h = try hits_per_file_scope.getOrPut(hp);
+            if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
+            gop_h.value_ptr.* += 1;
+        }
+
+        const SortCtxS = struct { // comparator context for sorting candidate paths by word-hit count
+            counts: *const std.StringHashMap(u32), // path -> word-index hit count for the query
+            pub fn lessThan(ctx: @This(), a: []const u8, b: []const u8) bool {
+                const a_count = ctx.counts.get(a) orelse 0; // a path missing from counts ranks as 0 hits
+                const b_count = ctx.counts.get(b) orelse 0;
+                return a_count > b_count; // descending by hit count; no tie-break on equal counts
+            }
+        };
+
         if (sparse_paths != null and sparse_paths.?.len > 0) {
             if (candidate_paths != null and candidate_paths.?.len > 0) {
                 var sparse_set = std.StringHashMap(void).init(allocator);
                 defer sparse_set.deinit();
                 for (sparse_paths.?) |p| try sparse_set.put(p, {});
-                for (candidate_paths.?) |path| {
-                    if (!sparse_set.contains(path)) continue;
+
+                var combined_s: std.ArrayList([]const u8) = .empty;
+                defer combined_s.deinit(allocator);
+                for (candidate_paths.?) |p| {
+                    if (sparse_set.contains(p)) try combined_s.append(allocator, p);
+                }
+                var skip_it = self.skip_trigram_files.keyIterator();
+                while (skip_it.next()) |kp| {
+                    if (hits_per_file_scope.contains(kp.*)) try combined_s.append(allocator, kp.*);
+                }
+                std.mem.sort([]const u8, combined_s.items, SortCtxS{ .counts = &hits_per_file_scope }, SortCtxS.lessThan);
+                for (combined_s.items) |path| {
+                    if (searched.contains(path)) continue;
                     const ref = self.readContentForSearch(path, allocator) orelse continue;
                     defer ref.deinit();
                     try searched.put(path, {});
@@ -3884,7 +3927,16 @@ pub const Explorer = struct {
                     if (result_list.items.len >= max_results) break;
                 }
             } else {
-                for (sparse_paths.?) |path| {
+                var combined_s: std.ArrayList([]const u8) = .empty;
+                defer combined_s.deinit(allocator);
+                for (sparse_paths.?) |p| try combined_s.append(allocator, p);
+                var skip_it = self.skip_trigram_files.keyIterator();
+                while (skip_it.next()) |kp| {
+                    if (hits_per_file_scope.contains(kp.*)) try combined_s.append(allocator, kp.*);
+                }
+                std.mem.sort([]const u8, combined_s.items, SortCtxS{ .counts = &hits_per_file_scope }, SortCtxS.lessThan);
+                for (combined_s.items) |path| {
+                    if (searched.contains(path)) continue;
                     const ref = self.readContentForSearch(path, allocator) orelse continue;
                     defer ref.deinit();
                     try searched.put(path, {});
@@ -3895,7 +3947,16 @@ pub const Explorer = struct {
         } else {
             const use_trigram = candidate_paths != null and candidate_paths.?.len > 0;
             if (use_trigram) {
-                for (candidate_paths.?) |path| {
+                var combined_t: std.ArrayList([]const u8) = .empty;
+                defer combined_t.deinit(allocator);
+                for (candidate_paths.?) |p| try combined_t.append(allocator, p);
+                var skip_it = self.skip_trigram_files.keyIterator();
+                while (skip_it.next()) |kp| {
+                    if (hits_per_file_scope.contains(kp.*)) try combined_t.append(allocator, kp.*);
+                }
+                std.mem.sort([]const u8, combined_t.items, SortCtxS{ .counts = &hits_per_file_scope }, SortCtxS.lessThan);
+                for (combined_t.items) |path| {
+                    if (searched.contains(path)) continue;
                     const ref = self.readContentForSearch(path, allocator) orelse continue;
                     defer ref.deinit();
                     try searched.put(path, {});