Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 213 additions & 7 deletions src/explore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,196 @@ pub const Explorer = struct {
if (prior_outline) |*old_outline| old_outline.deinit();
}

fn computeSymbolEnds(content: []const u8, outline: *FileOutline) void {
    if (outline.symbols.items.len == 0) return;

    // Build a table of byte offsets where each line begins (line N starts at
    // line_offsets[N-1]) so the per-language end scanners get O(1) line lookups.
    var line_offsets: std.ArrayList(usize) = .{};
    defer line_offsets.deinit(outline.allocator);
    line_offsets.append(outline.allocator, 0) catch return; // line 1 starts at offset 0
    for (content, 0..) |c, i| {
        // The previous guard `i + 1 <= content.len` was always true here
        // (the loop guarantees i < content.len), so it is removed as dead
        // code. Behavior is unchanged: a trailing '\n' still records one
        // final phantom line start at content.len, which the scanners only
        // use as a clamp/terminator.
        if (c == '\n') {
            // Best-effort on OOM: bail and leave remaining line_end defaults.
            line_offsets.append(outline.allocator, i + 1) catch return;
        }
    }
    const total_lines: u32 = @intCast(line_offsets.items.len);

    // Brace-delimited languages share one scanner; Python and Ruby use
    // indentation-based and `end`-keyword heuristics respectively.
    const is_brace_lang = outline.language == .zig or outline.language == .c or
        outline.language == .cpp or outline.language == .typescript or
        outline.language == .javascript or outline.language == .rust or
        outline.language == .go_lang or outline.language == .php;

    for (outline.symbols.items) |*sym| {
        // Single-line symbol kinds never get a computed end line.
        switch (sym.kind) {
            .import, .variable, .constant, .comment_block, .type_alias, .macro_def => continue,
            else => {},
        }

        // Defensive: skip symbols whose recorded start line is out of range.
        if (sym.line_start == 0 or sym.line_start > total_lines) continue;

        if (is_brace_lang) {
            sym.line_end = findBraceEnd(content, line_offsets.items, sym.line_start, total_lines);
        } else if (outline.language == .python) {
            sym.line_end = findPythonEnd(content, line_offsets.items, sym.line_start, total_lines);
        } else if (outline.language == .ruby) {
            sym.line_end = findRubyEnd(content, line_offsets.items, sym.line_start, total_lines);
        }
    }
}

/// Scan forward from `line_start` and return the line holding the brace that
/// closes the symbol's body. Strings, line comments, and block comments are
/// skipped so braces inside them are not counted. If no opening brace shows up
/// within 10 lines (or at all), `line_start` is returned; an opening brace
/// that is never closed yields `total_lines`.
fn findBraceEnd(content: []const u8, line_offsets: []const usize, line_start: u32, total_lines: u32) u32 {
    const State = enum { code, line_comment, block_comment, string };

    var state: State = .code;
    var quote: u8 = 0; // active string delimiter while state == .string
    var nesting: i32 = 0;
    var saw_open = false;
    var line_no = line_start;

    var pos = line_offsets[line_start - 1];
    while (pos < content.len) : (pos += 1) {
        const ch = content[pos];

        if (ch == '\n') {
            line_no += 1;
            if (state == .line_comment) state = .code;
            // Give up if no opening brace appeared within 10 lines.
            if (!saw_open and line_no > line_start + 10) return line_start;
            continue;
        }

        switch (state) {
            .line_comment => {},
            .block_comment => {
                if (ch == '*' and pos + 1 < content.len and content[pos + 1] == '/') {
                    state = .code;
                    pos += 1;
                }
            },
            .string => {
                if (ch == '\\') {
                    pos += 1; // skip the escaped character
                } else if (ch == quote) {
                    state = .code;
                }
            },
            .code => {
                if (ch == '/' and pos + 1 < content.len and content[pos + 1] == '/') {
                    state = .line_comment;
                } else if (ch == '/' and pos + 1 < content.len and content[pos + 1] == '*') {
                    state = .block_comment;
                    pos += 1;
                } else if (ch == '"' or ch == '\'') {
                    state = .string;
                    quote = ch;
                } else if (ch == '{') {
                    nesting += 1;
                    saw_open = true;
                } else if (ch == '}') {
                    nesting -= 1;
                    if (saw_open and nesting == 0) return @min(line_no, total_lines);
                }
            },
        }
    }

    return if (saw_open) total_lines else line_start;
}

/// Return the last line of a Python symbol's body using indentation.
/// The body is every following line indented deeper than the signature;
/// blank and comment-only lines never terminate it. Returns `line_start`
/// when no body is found.
fn findPythonEnd(content: []const u8, line_offsets: []const usize, line_start: u32, total_lines: u32) u32 {
    if (line_start >= total_lines) return line_start;

    // Indentation of the `def`/`class` line; the body must be indented deeper.
    const sig_offset = line_offsets[line_start - 1];
    const sig_indent = countIndent(content, sig_offset);

    // Locate the colon that terminates the signature header. Annotation
    // colons (e.g. `def f(a: int,` in a multi-line signature) sit inside
    // brackets, so only a ':' at bracket depth 0 ends the header. The old
    // code accepted the first ':' on any line, ending the header too early
    // for annotated multi-line signatures.
    var body_start = line_start + 1;
    {
        var depth: i32 = 0;
        var hln = line_start;
        scan: while (hln <= total_lines) : (hln += 1) {
            const lo = line_offsets[hln - 1];
            const le = if (hln < total_lines) line_offsets[hln] else content.len;
            for (content[lo..le]) |c| {
                switch (c) {
                    '(', '[', '{' => depth += 1,
                    ')', ']', '}' => depth -= 1,
                    ':' => if (depth <= 0) {
                        body_start = hln + 1;
                        break :scan;
                    },
                    '#' => break, // rest of the line is a comment
                    else => {},
                }
            }
        }
    }

    // Walk the body: it ends at the first non-blank, non-comment line whose
    // indentation drops back to (or below) the signature's indentation.
    var last_body_line = line_start;
    var ln = body_start;
    while (ln <= total_lines) : (ln += 1) {
        const lo = line_offsets[ln - 1];
        const le = if (ln < total_lines) line_offsets[ln] else content.len;
        const trimmed = std.mem.trim(u8, content[lo..le], " \t\r\n");

        // Blank lines and comment-only lines do not terminate the body.
        if (trimmed.len == 0 or std.mem.startsWith(u8, trimmed, "#")) continue;

        if (countIndent(content, lo) <= sig_indent) break;
        last_body_line = ln;
    }

    return if (last_body_line > line_start) last_body_line else line_start;
}

/// Return the line of the `end` keyword that closes a Ruby definition:
/// the first line consisting solely of `end` whose indentation is at or
/// below the definition line's indentation. Returns `line_start` when no
/// such line exists.
fn findRubyEnd(content: []const u8, line_offsets: []const usize, line_start: u32, total_lines: u32) u32 {
    if (line_start >= total_lines) return line_start;

    const def_offset = line_offsets[line_start - 1];
    const def_indent = countIndent(content, def_offset);

    var line_no: u32 = line_start + 1;
    while (line_no <= total_lines) : (line_no += 1) {
        const begin = line_offsets[line_no - 1];
        const stop = if (line_no < total_lines) line_offsets[line_no] else content.len;
        const stripped = std.mem.trim(u8, content[begin..stop], " \t\r\n");

        // Only a bare `end` at the definition's indent level (or shallower)
        // closes the body; deeper `end`s belong to nested blocks.
        if (!std.mem.eql(u8, stripped, "end")) continue;
        if (countIndent(content, begin) <= def_indent) return line_no;
    }

    return line_start;
}

/// Measure the leading whitespace width starting at `offset`: each space
/// counts as 1 column and each tab as 4. Stops at the first non-indent byte.
fn countIndent(content: []const u8, offset: usize) usize {
    var width: usize = 0;
    for (content[offset..]) |c| {
        switch (c) {
            ' ' => width += 1,
            '\t' => width += 4,
            else => break,
        }
    }
    return width;
}

fn parseOutlineWithParser(parser: *Explorer, path: []const u8, content: []const u8) !FileOutline {
var outline = FileOutline.init(parser.allocator, path);
errdefer outline.deinit();
Expand Down Expand Up @@ -422,6 +612,7 @@ fn parseOutlineWithParser(parser: *Explorer, path: []const u8, content: []const
prev_line_trimmed = trimmed;
}
outline.line_count = line_num;
computeSymbolEnds(content, &outline);
return outline;
}

Expand Down Expand Up @@ -809,14 +1000,29 @@ pub fn parseContentForIndexing(allocator: std.mem.Allocator, path: []const u8, c
if (result_list.items.len >= max_results) break;
}
} else {
var iter = self.outlines.keyIterator();
while (iter.next()) |key_ptr| {
const ref = self.readContentForSearch(key_ptr.*, allocator) orelse continue;
defer ref.deinit();
try searchInContent(key_ptr.*, ref.data, query, allocator, max_results, &result_list);
if (result_list.items.len >= max_results) break;
// No trigram/sparse candidates — use word_index to narrow (#250)
const word_hits = self.word_index.search(query);
if (word_hits.len > 0) {
var word_paths = std.StringHashMap(void).init(allocator);
Comment on lines +1004 to +1006
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve full-scan fallback when word index yields partial candidates

When trigram/sparse candidates are empty, this path searches only word_index.search(query) hits if any exist. That lookup is exact-token based (WordIndex.search), but searchInContent matches case-insensitive substrings, so files containing valid substring matches (for example foo inside foobar, especially for short queries where trigram is disabled) are skipped whenever at least one exact-token hit exists. This changes searchContent from exhaustive to lossy and can silently miss real results.

Useful? React with 👍 / 👎.

defer word_paths.deinit();
for (word_hits) |hit| word_paths.put(hit.path, {}) catch {};
var wp_iter = word_paths.keyIterator();
while (wp_iter.next()) |key_ptr| {
const ref = self.readContentForSearch(key_ptr.*, allocator) orelse continue;
defer ref.deinit();
try searched.put(key_ptr.*, {});
try searchInContent(key_ptr.*, ref.data, query, allocator, max_results, &result_list);
if (result_list.items.len >= max_results) break;
}
} else {
var iter = self.outlines.keyIterator();
while (iter.next()) |key_ptr| {
const ref = self.readContentForSearch(key_ptr.*, allocator) orelse continue;
defer ref.deinit();
try searchInContent(key_ptr.*, ref.data, query, allocator, max_results, &result_list);
if (result_list.items.len >= max_results) break;
}
}
return result_list.toOwnedSlice(allocator);
}
}

Expand Down
8 changes: 8 additions & 0 deletions src/mcp.zig
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,14 @@ const ProjectCache = struct {

loadProjectTrigramFromDiskIfPresent(&new_entry.explorer, p, self.alloc);

// Release raw file contents retained by the snapshot load — outlines,
// trigram index, and word index are sufficient for all query tools.
const fc = new_entry.explorer.outlines.count();
if (fc > 1000) {
new_entry.explorer.releaseContents();
new_entry.explorer.releaseSecondaryIndexes();
}

// Find free slot or evict LRU
var target_slot: usize = 0;
var found_free = false;
Expand Down
42 changes: 16 additions & 26 deletions src/snapshot.zig
Original file line number Diff line number Diff line change
Expand Up @@ -413,32 +413,27 @@ pub fn loadSnapshotValidated(
const file = std.fs.cwd().openFile(snapshot_path, .{}) catch return false;
defer file.close();

// Validate magic
var magic_buf: [4]u8 = undefined;
const lmn = file.readAll(&magic_buf) catch return false;
if (lmn != 4) return false;
if (!std.mem.eql(u8, &magic_buf, &MAGIC)) return false;

// Read section table
const sections_opt = readSections(snapshot_path, allocator) catch return false;
var sections = sections_opt orelse return false;
// Read section table (validates magic internally) — reuse already-open file (#253)
file.seekTo(0) catch return false;
var sections = (readSectionsFromFile(file, allocator) catch return false) orelse return false;
defer sections.deinit();

// Parse META section to get expected file_count and root_hash
var expected_file_count: ?u32 = null;
var meta_root_hash: ?u64 = null;
if (sections.get(@intFromEnum(SectionId.meta))) |meta_entry| {
const meta_bytes = readSectionBytes(snapshot_path, .meta, allocator) catch null;
if (meta_bytes) |mb| {
if (meta_entry.length <= 256 * 1024 * 1024) blk: {
file.seekTo(meta_entry.offset) catch break :blk;
const mb = allocator.alloc(u8, @intCast(meta_entry.length)) catch break :blk;
defer allocator.free(mb);
// Simple integer extraction from JSON: "file_count":NNN
const nr = file.readAll(mb) catch break :blk;
if (nr != mb.len) break :blk;
if (parseJsonU32(mb, "file_count")) |fc| {
expected_file_count = fc;
}
if (parseJsonU64(mb, "root_hash")) |rh| {
meta_root_hash = rh;
}
_ = meta_entry;
}
}

Expand All @@ -460,23 +455,20 @@ pub fn loadSnapshotValidated(
// Load CONTENT section — this is the core data
const content_entry = sections.get(@intFromEnum(SectionId.content)) orelse return false;

const content_file = std.fs.cwd().openFile(snapshot_path, .{}) catch return false;
defer content_file.close();

// Validate content section fits within actual file size (issue-40: truncation detection)
const file_stat = compat.fileStat(content_file) catch return false;
const file_stat = compat.fileStat(file) catch return false;
const file_size = file_stat.size;
if (content_entry.offset + content_entry.length > file_size) return false;

content_file.seekTo(content_entry.offset) catch return false;
file.seekTo(content_entry.offset) catch return false;

const snap_mtime: i128 = file_stat.mtime;
var bytes_read: u64 = 0;
var file_count: u32 = 0;
while (bytes_read < content_entry.length) {
// Read path_len(u16)
var pl_buf: [2]u8 = undefined;
const pln = content_file.readAll(&pl_buf) catch return false;
const pln = file.readAll(&pl_buf) catch return false;
if (pln != 2) break;
const path_len = std.mem.readInt(u16, &pl_buf, .little);
if (path_len == 0 or path_len > 4096) break; // sanity cap
Expand All @@ -485,13 +477,13 @@ pub fn loadSnapshotValidated(
// Read path
const path_buf = allocator.alloc(u8, path_len) catch return false;
defer allocator.free(path_buf);
const prn = content_file.readAll(path_buf) catch return false;
const prn = file.readAll(path_buf) catch return false;
if (prn != path_len) break;
bytes_read += path_len;

// Read content_len(u32)
var cl_buf: [4]u8 = undefined;
const cln = content_file.readAll(&cl_buf) catch return false;
const cln = file.readAll(&cl_buf) catch return false;
if (cln != 4) break;
const content_len = std.mem.readInt(u32, &cl_buf, .little);
if (content_len > 64 * 1024 * 1024) break; // sanity cap: 64MB per file
Expand All @@ -500,7 +492,7 @@ pub fn loadSnapshotValidated(
// Read content
const content = allocator.alloc(u8, content_len) catch return false;
defer allocator.free(content);
const crn = content_file.readAll(content) catch return false;
const crn = file.readAll(content) catch return false;
if (crn != content_len) break;
bytes_read += content_len;

Expand Down Expand Up @@ -539,15 +531,13 @@ pub fn loadSnapshotValidated(
if (freq_entry.length == 256 * 256 * 2) {
const index_mod = @import("index.zig");
const ft = allocator.create([256][256]u16) catch return file_count > 0;
const freq_file = std.fs.cwd().openFile(snapshot_path, .{}) catch return file_count > 0;
defer freq_file.close();
freq_file.seekTo(freq_entry.offset) catch {
file.seekTo(freq_entry.offset) catch {
allocator.destroy(ft);
return file_count > 0;
};
var row_buf: [256 * 2]u8 = undefined;
for (0..256) |a| {
if (freq_file.readAll(&row_buf) catch {
if (file.readAll(&row_buf) catch {
allocator.destroy(ft);
return file_count > 0;
} != 512) {
Expand Down
Loading
Loading