Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 103 additions & 42 deletions src/explore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1639,56 +1639,64 @@ pub const Explorer = struct {
const candidate_paths = self.trigram_index.candidates(query, allocator);
defer if (candidate_paths) |cp| allocator.free(cp);

// Tier 1: trigram candidates — fast path, skips files already found by Tier 0.
if (candidate_paths) |cp| {
if (cp.len > 0) {
// Issue #427: rank candidates by per-file word-index hit count
// (desc) so the definition-dense file scans first; fall back to
// file content length (asc) so small files still come before
// unrelated large files at the same hit count. Pre-fix the
// sort key was content length alone, which buried the canonical
// file behind unrelated short files when max_per_file was 1.
var hits_per_file = std.StringHashMap(u32).init(allocator);
defer hits_per_file.deinit();
for (word_hits) |hit| {
const hp = self.word_index.hitPath(hit);
if (hp.len == 0) continue;
const gop_h = try hits_per_file.getOrPut(hp);
if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
gop_h.value_ptr.* += 1;
// Tier 1: trigram candidates merged with skip_trigram_files that have
// word-index hits — all sorted by per-file word-hit count desc so the
// definition-dense file scans first. Pre-fix: skip_trigram_files were
// deferred to Tier 3 which never ran when Tier 1 filled max_results.
{
var hits_per_file = std.StringHashMap(u32).init(allocator);
defer hits_per_file.deinit();
for (word_hits) |hit| {
const hp = self.word_index.hitPath(hit);
if (hp.len == 0) continue;
const gop_h = try hits_per_file.getOrPut(hp);
if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
gop_h.value_ptr.* += 1;
}

// Build a combined list: trigram candidates + skip_trigram_files
// paths that the word index knows about for this query.
var combined: std.ArrayList([]const u8) = .empty;
defer combined.deinit(allocator);
if (candidate_paths) |cp| {
for (cp) |p| try combined.append(allocator, p);
}
var skip_iter_t1 = self.skip_trigram_files.keyIterator();
while (skip_iter_t1.next()) |key_ptr| {
if (hits_per_file.contains(key_ptr.*)) {
try combined.append(allocator, key_ptr.*);
Comment on lines +1664 to +1667
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid scanning every skipped file per search

In repositories with many files above the trigram limit, this loop makes every searchContent call walk the entire skip_trigram_files map just to find the few paths that also have word hits, even when the normal trigram candidates are sufficient. searchContent is the benchmarked query path, so this adds O(number of large skipped files) work before the fast path can return; consider iterating the word_hits paths and checking membership in skip_trigram_files instead.

Useful? React with 👍 / 👎.

}
const SortCtx = struct {
contents: *const std.StringHashMap([]const u8),
counts: *const std.StringHashMap(u32),
pub fn lessThan(ctx: @This(), a: []const u8, b: []const u8) bool {
const a_count = ctx.counts.get(a) orelse 0;
const b_count = ctx.counts.get(b) orelse 0;
if (a_count != b_count) return a_count > b_count;
const a_len = if (ctx.contents.get(a)) |c| c.len else std.math.maxInt(usize);
const b_len = if (ctx.contents.get(b)) |c| c.len else std.math.maxInt(usize);
return a_len < b_len;
}
};
std.mem.sort([]const u8, @constCast(cp), SortCtx{ .contents = &self.contents, .counts = &hits_per_file }, SortCtx.lessThan);
}

const SortCtx = struct {
contents: *const std.StringHashMap([]const u8),
counts: *const std.StringHashMap(u32),
pub fn lessThan(ctx: @This(), a: []const u8, b: []const u8) bool {
const a_count = ctx.counts.get(a) orelse 0;
const b_count = ctx.counts.get(b) orelse 0;
if (a_count != b_count) return a_count > b_count;
const a_len = if (ctx.contents.get(a)) |c| c.len else std.math.maxInt(usize);
const b_len = if (ctx.contents.get(b)) |c| c.len else std.math.maxInt(usize);
return a_len < b_len;
}
};
std.mem.sort([]const u8, combined.items, SortCtx{ .contents = &self.contents, .counts = &hits_per_file }, SortCtx.lessThan);

const estimated_total = cp.len + self.skip_trigram_files.count();
if (combined.items.len > 0) {
const estimated_total = combined.items.len + self.skip_trigram_files.count();
const max_per_file = @max(@as(usize, 1), max_results / @max(@as(usize, 1), estimated_total));
for (cp) |path| {
for (combined.items) |path| {
if (searched.contains(path)) continue;
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
searched.put(path, {}) catch {};
try searchInContent(path, ref.data, query, allocator, max_per_file, max_results, &result_list);
if (result_list.items.len >= max_results)
return self.rerankAndFinalize(&result_list, query, allocator);
}
}
}

// Mark all Tier 1 candidates as searched.
if (candidate_paths) |cp| {
for (cp) |p| searched.put(p, {}) catch {};
}

// Tier 2: sparse candidates — LAZY, only computed when Tier 1 found nothing.
if (result_list.items.len == 0) {
const sparse_paths = self.sparse_ngram_index.candidates(query, allocator);
Expand All @@ -1705,7 +1713,7 @@ pub const Explorer = struct {
}
}

// Tier 3: skip_trigram_files not already searched.
// Tier 3: skip_trigram_files not already searched (no word-index hits).
if (result_list.items.len < max_results) {
var skip_iter = self.skip_trigram_files.keyIterator();
while (skip_iter.next()) |key_ptr| {
Expand All @@ -1718,6 +1726,7 @@ pub const Explorer = struct {
}
}


// Tier 4: word index scan — for files not yet searched.
if (result_list.items.len < max_results) {
const tier4_hits = self.word_index.search(query);
Expand Down Expand Up @@ -3870,21 +3879,64 @@ pub const Explorer = struct {
var searched = std.StringHashMap(void).init(allocator);
defer searched.deinit();

// Build word-hit counts per file so skip_trigram_files with hits can
// be merged into the primary candidate loop sorted by relevance,
// preventing the other candidates from filling max_results before the
// canonical (definition-dense) skip-trigram file is reached.
const word_hits_scope = self.word_index.search(query);
var hits_per_file_scope = std.StringHashMap(u32).init(allocator);
defer hits_per_file_scope.deinit();
for (word_hits_scope) |hit| {
const hp = self.word_index.hitPath(hit);
if (hp.len == 0) continue;
const gop_h = try hits_per_file_scope.getOrPut(hp);
if (!gop_h.found_existing) gop_h.value_ptr.* = 0;
gop_h.value_ptr.* += 1;
}

const SortCtxS = struct {
counts: *const std.StringHashMap(u32),
pub fn lessThan(ctx: @This(), a: []const u8, b: []const u8) bool {
const a_count = ctx.counts.get(a) orelse 0;
const b_count = ctx.counts.get(b) orelse 0;
return a_count > b_count;
}
};

if (sparse_paths != null and sparse_paths.?.len > 0) {
if (candidate_paths != null and candidate_paths.?.len > 0) {
var sparse_set = std.StringHashMap(void).init(allocator);
defer sparse_set.deinit();
for (sparse_paths.?) |p| try sparse_set.put(p, {});
for (candidate_paths.?) |path| {
if (!sparse_set.contains(path)) continue;

var combined_s: std.ArrayList([]const u8) = .empty;
defer combined_s.deinit(allocator);
for (candidate_paths.?) |p| {
if (sparse_set.contains(p)) try combined_s.append(allocator, p);
}
var skip_it = self.skip_trigram_files.keyIterator();
while (skip_it.next()) |kp| {
if (hits_per_file_scope.contains(kp.*)) try combined_s.append(allocator, kp.*);
}
std.mem.sort([]const u8, combined_s.items, SortCtxS{ .counts = &hits_per_file_scope }, SortCtxS.lessThan);
for (combined_s.items) |path| {
if (searched.contains(path)) continue;
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
try searched.put(path, {});
try self.searchInContentWithScope(path, ref.data, query, allocator, max_results, &result_list);
if (result_list.items.len >= max_results) break;
}
} else {
for (sparse_paths.?) |path| {
var combined_s: std.ArrayList([]const u8) = .empty;
defer combined_s.deinit(allocator);
for (sparse_paths.?) |p| try combined_s.append(allocator, p);
var skip_it = self.skip_trigram_files.keyIterator();
while (skip_it.next()) |kp| {
if (hits_per_file_scope.contains(kp.*)) try combined_s.append(allocator, kp.*);
}
std.mem.sort([]const u8, combined_s.items, SortCtxS{ .counts = &hits_per_file_scope }, SortCtxS.lessThan);
for (combined_s.items) |path| {
if (searched.contains(path)) continue;
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
try searched.put(path, {});
Expand All @@ -3895,7 +3947,16 @@ pub const Explorer = struct {
} else {
const use_trigram = candidate_paths != null and candidate_paths.?.len > 0;
if (use_trigram) {
for (candidate_paths.?) |path| {
var combined_t: std.ArrayList([]const u8) = .empty;
defer combined_t.deinit(allocator);
for (candidate_paths.?) |p| try combined_t.append(allocator, p);
var skip_it = self.skip_trigram_files.keyIterator();
while (skip_it.next()) |kp| {
if (hits_per_file_scope.contains(kp.*)) try combined_t.append(allocator, kp.*);
}
std.mem.sort([]const u8, combined_t.items, SortCtxS{ .counts = &hits_per_file_scope }, SortCtxS.lessThan);
for (combined_t.items) |path| {
if (searched.contains(path)) continue;
const ref = self.readContentForSearch(path, allocator) orelse continue;
defer ref.deinit();
try searched.put(path, {});
Expand Down
80 changes: 80 additions & 0 deletions src/tests.zig
Original file line number Diff line number Diff line change
Expand Up @@ -11275,3 +11275,83 @@ test "rerank-trace: single-result query records non-zero rerank score" {
try testing.expect(std.mem.indexOf(u8, data, "\"score\":0.0000") == null);
try testing.expect(std.mem.indexOf(u8, data, "src/loneSym.zig") != null);
}

test "issue-447: searchContent surfaces large (>64KB) skip-trigram files for common identifiers" {
    // Checks that a file indexed through indexFileSkipTrigram still shows up
    // in searchContent results when many trigram-indexed files also match.
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    var explorer = Explorer.init(arena.allocator());

    // Twelve small files, each mentioning the identifier exactly once,
    // indexed through the regular indexFile path.
    for (0..12) |file_no| {
        var name_buf: [32]u8 = undefined;
        const name = try std.fmt.bufPrint(&name_buf, "small_{d}.zig", .{file_no});
        try explorer.indexFile(name, "fn s() void { _ = widgetX; }\n");
    }

    // One file mentioning the identifier five times, indexed via the
    // skip-trigram path (stands in for the ">64KB" file from the test name).
    const dense_source = "fn canonical() void {\n" ++
        (" _ = widgetX;\n" ** 5) ++
        "}\n";
    try explorer.indexFileSkipTrigram("canonical.zig", dense_source);

    // Third argument is 5 — presumably the result cap; it is smaller than
    // the number of matching files indexed above.
    const hits = try explorer.searchContent("widgetX", testing.allocator, 5);
    defer {
        // Each result's path and line_text are freed with the allocator
        // that was handed to searchContent, then the slice itself.
        for (hits) |hit| {
            testing.allocator.free(hit.path);
            testing.allocator.free(hit.line_text);
        }
        testing.allocator.free(hits);
    }

    // Stop at the first result that points at the skip-trigram file.
    const saw_canonical = for (hits) |hit| {
        if (std.mem.eql(u8, hit.path, "canonical.zig")) break true;
    } else false;
    try testing.expect(saw_canonical);
}

test "issue-451: scope search surfaces skip-trigram canonical file" {
    // Same fixture as the searchContent variant, but exercised through
    // searchContentWithScope: the skip-trigram file must appear in results.
    var arena = std.heap.ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    var explorer = Explorer.init(arena.allocator());

    // Twelve single-mention files indexed through the regular path.
    for (0..12) |file_no| {
        var name_buf: [32]u8 = undefined;
        const name = try std.fmt.bufPrint(&name_buf, "small_{d}.zig", .{file_no});
        try explorer.indexFile(name, "fn s() void { _ = widgetX; }\n");
    }

    // Five-mention file indexed via indexFileSkipTrigram.
    const dense_source = "fn canonical() void {\n" ++
        (" _ = widgetX;\n" ** 5) ++
        "}\n";
    try explorer.indexFileSkipTrigram("canonical.zig", dense_source);

    const hits = try explorer.searchContentWithScope("widgetX", testing.allocator, 5);
    defer {
        // Results carry line_text, path and an optional scope_name; all are
        // freed with the allocator passed to the search call.
        for (hits) |hit| {
            testing.allocator.free(hit.line_text);
            testing.allocator.free(hit.path);
            if (hit.scope_name) |scope| testing.allocator.free(scope);
        }
        testing.allocator.free(hits);
    }

    // Walk every result (no early exit) and count those from canonical.zig.
    var canonical_hits: usize = 0;
    for (hits) |hit| {
        if (std.mem.eql(u8, hit.path, "canonical.zig")) canonical_hits += 1;
    }
    try testing.expect(canonical_hits > 0);
}
Loading