feat(read_file): bounded chunked reads via start_line / max_lines

Harvested from PR #1451 by @Oliver-ZPLiu (closes part of #1450).
Token-budget control for read_file: large files no longer drop their
entire contents into the conversation context on every turn.

Default window is 200 lines / ~16 KB; the hard cap is 500 lines.
Small files (≤ 200 lines AND ≤ 16 KB, no explicit range params) keep
the historical raw-contents return so existing prompts that read
config files / single source files see no behavior change. Otherwise
the response is wrapped in a `<file …>` tag with line-numbered
content, `shown_lines`, `truncated`, and `next_start_line`
attributes, plus a `[TRUNCATED]` hint so the model can page through
in 16 KB slices.

Cleanups from the original PR while harvesting:
- shown_lines is now 1-based inclusive (e.g. "3-6"), matching
  start_line / next_start_line / the line-number prefix on each
  rendered row. The original PR mixed 0-based indices in attributes
  with 1-based numbers in body output, which was confusing.
- The continuation hint mirrors that 1-based range so model
  reasoning over "what did I just see?" is unambiguous.
- Added 6 unit tests covering: small-file fast path, explicit
  range wrap-in-file-tag with 1-based lines, out-of-range
  no-content sentinel, zero start_line / max_lines rejection,
  hard-cap clamp at 500 lines, and large-file no-range default
  window. The original PR shipped without tests.
This commit is contained in:
Hunter Bown
2026-05-11 22:07:07 -05:00
parent fd82f85800
commit b4158dcc1b
2 changed files with 282 additions and 2 deletions
+12
View File
@@ -37,6 +37,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2026 cleanly; it is purely a rendering-quality knob, not a
correctness one. Set via `/set synchronized_output <auto|on|off>`
or in `~/.config/deepseek/settings.toml`.
- **`read_file` accepts `start_line` and `max_lines`** for chunked,
bounded reads of large files (#1450, harvested from PR #1451 by
**@Oliver-ZPLiu**). Default window is 200 lines / ~16 KB; the hard
cap is 500 lines. Small files (≤ 200 lines AND ≤ 16 KB) read without
`start_line` / `max_lines` still return their contents unchanged, so
existing prompts that read config files / single source files see no
behavior change. Larger files, and any read that passes an explicit
range, return a `<file …>`-wrapped, line-numbered window with
`shown_lines`, `truncated`, and `next_start_line` attributes plus
a `[TRUNCATED]` continuation hint — so the model can page through
a 50 KB file in 16 KB slices instead of dragging the whole thing
into the conversation context on every turn. PDFs continue to use
`pages`; `start_line` / `max_lines` apply to text files only.
## [0.8.30] - 2026-05-11
+270 -2
View File
@@ -26,7 +26,7 @@ impl ToolSpec for ReadFileTool {
}
/// Returns the fixed, human-readable description string for this tool.
///
/// NOTE(review): two string literals appear back to back below. This looks
/// like the previous text followed by its replacement (the second one adds
/// the `start_line` / `max_lines` paging guidance) — confirm that only the
/// second literal is present in the real source file.
fn description(&self) -> &'static str {
"Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via `pdftotext` (poppler) when available. Cannot read images or non-PDF binaries."
"Read a UTF-8 file from the workspace. Use this instead of `cat`, `head`, `tail`, or `sed -n '..p'` in `exec_shell` — it's faster, sandbox-aware, and skips the approval prompt. Plain text is returned as-is; PDFs are auto-extracted via `pdftotext` (poppler) when available. Cannot read images or non-PDF binaries.\n\nFor large files, use `start_line` and `max_lines` to read in chunks. By default, returns at most 200 lines (~16KB). If `truncated=\"true\"` in the response, use `next_start_line` to continue reading. For PDFs, use `pages` instead — `start_line`/`max_lines` only apply to text files."
}
fn input_schema(&self) -> Value {
@@ -37,6 +37,14 @@ impl ToolSpec for ReadFileTool {
"type": "string",
"description": "Path to the file (relative to workspace or absolute)"
},
"start_line": {
"type": "integer",
"description": "Starting line (1-based, default 1)"
},
"max_lines": {
"type": "integer",
"description": "Maximum lines to return (default 200, max 500)"
},
"pages": {
"type": "string",
"description": "PDF only: page range to extract, e.g. \"1-5\" or \"10\". Ignored for non-PDF files."
@@ -55,6 +63,20 @@ impl ToolSpec for ReadFileTool {
}
async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
// Bounded output for large files. The small-file fast path keeps the
// historical "return contents unchanged" behavior so existing flows
// (small configs, single source files, etc.) don't suddenly start
// seeing wrapped output. Once a file is large or the caller asks
// for an explicit range, we switch to a numbered, line-tagged
// window with continuation hints so the model can page through
// without re-loading the entire file on every turn. Harvested
// from PR #1451 by @Oliver-ZPLiu, closes part of #1450.
const DEFAULT_READ_LINES: usize = 200;
const HARD_MAX_READ_LINES: usize = 500;
const MAX_VISIBLE_BYTES: usize = 16 * 1024;
const SMALL_FILE_LINES: usize = 200;
const SMALL_FILE_BYTES: usize = 16 * 1024;
let path_str = required_str(&input, "path")?;
let file_path = context.resolve_path(path_str)?;
let pages = optional_str(&input, "pages");
@@ -67,7 +89,102 @@ impl ToolSpec for ReadFileTool {
ToolError::execution_failed(format!("Failed to read {}: {}", file_path.display(), e))
})?;
Ok(ToolResult::success(contents))
// Totals over the whole file: they gate the small-file fast path and are
// reported back in the `total_lines` attribute of the wrapped response.
let total_lines = contents.lines().count();
let total_bytes = contents.len();
// A range counts as explicit as soon as either paging key is present in
// the input object.
let explicit_range = input.get("start_line").is_some() || input.get("max_lines").is_some();

// Small-file fast path. Only applies when the caller didn't pass an
// explicit range — otherwise an explicit `start_line = 5` on a tiny file
// would silently ignore the request.
if !explicit_range && total_lines <= SMALL_FILE_LINES && total_bytes <= SMALL_FILE_BYTES {
    return Ok(ToolResult::success(contents));
}

let start_line = match input.get("start_line").and_then(Value::as_u64) {
    Some(0) => {
        return Err(ToolError::invalid_input(
            "start_line must be 1-based and greater than 0".to_string(),
        ));
    }
    // Clamp before narrowing so a huge value cannot wrap around to 0 on
    // targets where `usize` is smaller than `u64`; it simply lands in the
    // out-of-range sentinel below.
    Some(v) => v.min(usize::MAX as u64) as usize,
    None => 1,
};
let max_lines = match input.get("max_lines").and_then(Value::as_u64) {
    Some(0) => {
        return Err(ToolError::invalid_input(
            "max_lines must be greater than 0".to_string(),
        ));
    }
    // Clamp to the hard cap in u64 space, then narrow (always lossless).
    Some(v) => v.min(HARD_MAX_READ_LINES as u64) as usize,
    None => DEFAULT_READ_LINES,
};

// `start_line > total_lines` is not an error — it lets the model page
// past the end without raising. Returns an empty-content sentinel so
// subsequent reads can stop.
if start_line > total_lines {
    let output = format!(
        "<file path=\"{path_str}\" total_lines=\"{total_lines}\" shown_lines=\"none\" truncated=\"false\">\n\
         \n\
         [NO CONTENT] start_line {start_line} is beyond total_lines {total_lines}.\n\
         </file>"
    );
    return Ok(ToolResult::success(output));
}

// Render the window row by row as `<right-aligned line number>│ <text>`
// and stop at the last *complete* row that fits the byte budget. Cutting
// on a row boundary keeps `shown_lines` / `next_start_line` in step with
// what was actually emitted, so paging never skips lines.
let mut numbered = String::new();
let mut rendered_rows: usize = 0;
let mut truncated_by_bytes = false;
for (offset, line) in contents
    .lines()
    .skip(start_line - 1)
    .take(max_lines)
    .enumerate()
{
    let line_no = start_line + offset;
    let row = format!("{line_no:>6}│ {line}\n");
    if numbered.len() + row.len() > MAX_VISIBLE_BYTES {
        truncated_by_bytes = true;
        if rendered_rows == 0 {
            // A single row larger than the whole budget: emit a UTF-8-safe
            // prefix and count the line as shown so the caller can still
            // advance past it instead of looping on the same start_line.
            let mut end = MAX_VISIBLE_BYTES;
            while end > 0 && !row.is_char_boundary(end) {
                end -= 1;
            }
            numbered.push_str(&row[..end]);
            numbered.push('\n');
            rendered_rows = 1;
        }
        break;
    }
    numbered.push_str(&row);
    rendered_rows += 1;
}

// 1-based inclusive bounds of what was emitted. `rendered_rows >= 1` here
// because `start_line <= total_lines` and `max_lines >= 1`.
let shown_first = start_line;
let shown_last = start_line + rendered_rows - 1;
let next_start = shown_last + 1;
let has_more_lines = shown_last < total_lines;
let truncated = has_more_lines || truncated_by_bytes;

let mut attrs = format!(
    "path=\"{path_str}\" total_lines=\"{total_lines}\" shown_lines=\"{shown_first}-{shown_last}\" truncated=\"{truncated}\""
);
if has_more_lines {
    attrs.push_str(&format!(" next_start_line=\"{next_start}\""));
}

let mut output = format!("<file {attrs}>\n{numbered}");
if has_more_lines {
    output.push_str(&format!(
        "\n[TRUNCATED] Showing lines {shown_first}-{shown_last} of {total_lines}. To continue, call read_file with path=\"{path_str}\" start_line={next_start} max_lines={max_lines}\n"
    ));
}
if truncated_by_bytes {
    output.push_str(&format!(
        "\n[TRUNCATED] The selected range exceeded 16KB. Output stops at line {shown_last}; a single line longer than 16KB is cut short.\n"
    ));
}
output.push_str("</file>");
Ok(ToolResult::success(output))
}
}
@@ -531,6 +648,157 @@ mod tests {
assert!(result.is_err());
}
#[tokio::test]
async fn read_file_small_file_returns_unwrapped_contents() {
    // Fast-path contract (harvested from #1451): a file of at most 200
    // lines and 16KB, read with no range parameters, must come back
    // verbatim so existing prompts never see a <file> wrapper appear.
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    let target = workspace.path().join("small.txt");
    fs::write(&target, "line 1\nline 2\nline 3\n").expect("write");

    let reader = ReadFileTool;
    let request = json!({ "path": "small.txt" });
    let result = reader.execute(request, &context).await.expect("execute");

    assert!(result.success);
    assert_eq!(result.content, "line 1\nline 2\nline 3\n");
    assert!(
        !result.content.contains("<file"),
        "small-file fast path must not wrap output"
    );
}
#[tokio::test]
async fn read_file_explicit_range_wraps_in_file_tag_with_one_based_lines() {
    // Ten numbered lines; ask for four of them starting at line 3 and
    // check that every reported position is 1-based and inclusive.
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    let target = workspace.path().join("ranged.txt");
    let mut body = String::new();
    for n in 1..=10 {
        body.push_str(&format!("line {n}\n"));
    }
    fs::write(&target, &body).expect("write");

    let reader = ReadFileTool;
    let request = json!({ "path": "ranged.txt", "start_line": 3, "max_lines": 4 });
    let result = reader.execute(request, &context).await.expect("execute");

    assert!(result.success);

    // Attributes on the <file> tag.
    assert!(
        result.content.contains("shown_lines=\"3-6\""),
        "1-based inclusive range must be reflected in shown_lines: {}",
        result.content
    );
    assert!(
        result.content.contains("next_start_line=\"7\""),
        "next_start_line must point one past the last shown line: {}",
        result.content
    );

    // Rendered rows: first, last, and the first one that must be absent.
    assert!(
        result.content.contains(" 3│ line 3"),
        "rendered lines must start at the requested line number"
    );
    assert!(
        result.content.contains(" 6│ line 6"),
        "rendered lines must end at the last in-range line"
    );
    assert!(
        !result.content.contains(" 7│ line 7"),
        "lines past max_lines must be excluded"
    );

    assert!(result.content.contains("truncated=\"true\""));
}
#[tokio::test]
async fn read_file_range_beyond_total_returns_no_content_sentinel() {
    // Paging past the end of a three-line file is a successful call that
    // yields the empty-window sentinel rather than an error.
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    let target = workspace.path().join("short.txt");
    fs::write(&target, "only\nthree\nlines\n").expect("write");

    let reader = ReadFileTool;
    let request = json!({ "path": "short.txt", "start_line": 99 });
    let result = reader.execute(request, &context).await.expect("execute");

    assert!(
        result.success,
        "out-of-range must not raise — it's a sentinel"
    );
    assert!(result.content.contains("[NO CONTENT]"));
    assert!(result.content.contains("shown_lines=\"none\""));
    assert!(result.content.contains("truncated=\"false\""));
}
#[tokio::test]
async fn read_file_rejects_zero_start_line_and_zero_max_lines() {
    // Both paging parameters must be strictly positive; zero is an input
    // error for either one.
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    fs::write(workspace.path().join("any.txt"), "x\n").expect("write");
    let reader = ReadFileTool;

    let start_request = json!({ "path": "any.txt", "start_line": 0 });
    let zero_start = reader.execute(start_request, &context).await;
    assert!(zero_start.is_err(), "start_line=0 must error (1-based)");

    let max_request = json!({ "path": "any.txt", "max_lines": 0 });
    let zero_max = reader.execute(max_request, &context).await;
    assert!(zero_max.is_err(), "max_lines=0 must error");
}
#[tokio::test]
async fn read_file_clamps_max_lines_to_hard_cap() {
    // 600 short lines, then a request for far more than the cap allows.
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    let target = workspace.path().join("bigish.txt");
    let mut body = String::new();
    for n in 1..=600 {
        body.push_str(&format!("L{n}\n"));
    }
    fs::write(&target, &body).expect("write");

    let reader = ReadFileTool;
    let request = json!({ "path": "bigish.txt", "max_lines": 5000 });
    let result = reader.execute(request, &context).await.expect("execute");

    // The window is clamped to 500 lines: row 500 is the last one present
    // and row 501 is where the next page starts.
    assert!(
        result.content.contains(" 500│ L500"),
        "line 500 should be in the window (max_lines clamped to 500)"
    );
    assert!(
        !result.content.contains(" 501│ L501"),
        "line 501 must be outside the clamped window"
    );
    assert!(result.content.contains("next_start_line=\"501\""));
    assert!(result.content.contains("truncated=\"true\""));
}
#[tokio::test]
async fn read_file_large_file_without_range_uses_default_window() {
    // Token-budget control, the point of the patch: a 250-line file read
    // with no range parameters is served as the default 200-line window
    // instead of the raw, unbounded contents.
    let workspace = tempdir().expect("tempdir");
    let context = ToolContext::new(workspace.path().to_path_buf());
    let target = workspace.path().join("big.txt");
    let mut body = String::new();
    for n in 1..=250 {
        body.push_str(&format!("row {n}\n"));
    }
    fs::write(&target, &body).expect("write");

    let reader = ReadFileTool;
    let request = json!({ "path": "big.txt" });
    let result = reader.execute(request, &context).await.expect("execute");

    // Wrapper and paging attributes.
    assert!(result.content.contains("<file "));
    assert!(result.content.contains("shown_lines=\"1-200\""));
    assert!(result.content.contains("next_start_line=\"201\""));

    // First and last rows of the window, and the first row beyond it.
    assert!(result.content.contains(" 1│ row 1"));
    assert!(result.content.contains(" 200│ row 200"));
    assert!(
        !result.content.contains(" 201│ row 201"),
        "default max_lines=200 must hold"
    );
}
#[tokio::test]
async fn test_read_file_missing_path() {
let tmp = tempdir().expect("tempdir");