fix(release): harden Windows shell env and PDF extraction

(cherry picked from commit 2d5f2d235849cd2fa520937a529935759e3782a3)
This commit is contained in:
CodeWhale Agent
2026-06-12 11:58:52 -07:00
parent a448194b81
commit a0724bc5af
3 changed files with 140 additions and 3 deletions
+63
View File
@@ -35,6 +35,8 @@ where
value.as_ref().to_os_string(),
);
}
#[cfg(windows)]
fill_windows_common_program_files(&mut env);
env
}
@@ -184,6 +186,9 @@ fn is_allowed_parent_env_key(key: &OsStr) -> bool {
| "PROGRAMFILES"
| "PROGRAMFILES(X86)"
| "PROGRAMW6432"
| "COMMONPROGRAMFILES"
| "COMMONPROGRAMFILES(X86)"
| "COMMONPROGRAMW6432"
| "PROCESSOR_ARCHITECTURE"
| "NUGET_PACKAGES"
| "NUGET_HTTP_CACHE_PATH"
@@ -266,6 +271,26 @@ fn upsert_env(env: &mut Vec<(OsString, OsString)>, key: OsString, value: OsStrin
env.push((key, value));
}
#[cfg(any(windows, test))]
fn fill_windows_common_program_files(env: &mut Vec<(OsString, OsString)>) {
for (key, default) in [
("CommonProgramFiles", r"C:\Program Files\Common Files"),
(
"CommonProgramFiles(x86)",
r"C:\Program Files (x86)\Common Files",
),
("CommonProgramW6432", r"C:\Program Files\Common Files"),
] {
let existing = env
.iter()
.find(|(existing, _)| normalize_key(existing) == normalize_key(OsStr::new(key)))
.map(|(_, value)| value.to_string_lossy().trim().is_empty());
if existing.unwrap_or(true) {
upsert_env(env, OsString::from(key), OsString::from(default));
}
}
}
fn normalize_key(key: &OsStr) -> String {
key.to_string_lossy().to_ascii_uppercase()
}
@@ -387,6 +412,9 @@ mod tests {
"PROGRAMFILES",
"PROGRAMFILES(X86)",
"PROGRAMW6432",
"COMMONPROGRAMFILES",
"COMMONPROGRAMFILES(X86)",
"COMMONPROGRAMW6432",
"PROCESSOR_ARCHITECTURE",
"NUGET_PACKAGES",
"DOTNET_ROOT",
@@ -408,6 +436,41 @@ mod tests {
);
}
#[test]
fn windows_common_program_files_defaults_replace_empty_values() {
let mut env = vec![
(OsString::from("CommonProgramFiles"), OsString::new()),
(
OsString::from("CommonProgramFiles(x86)"),
OsString::from(" "),
),
(
OsString::from("CommonProgramW6432"),
OsString::from(r"D:\Common Files"),
),
];
fill_windows_common_program_files(&mut env);
let get = |name: &str| {
env.iter()
.find(|(key, _)| normalize_key(key) == normalize_key(OsStr::new(name)))
.map(|(_, value)| value.to_string_lossy().into_owned())
};
assert_eq!(
get("CommonProgramFiles").as_deref(),
Some(r"C:\Program Files\Common Files")
);
assert_eq!(
get("CommonProgramFiles(x86)").as_deref(),
Some(r"C:\Program Files (x86)\Common Files")
);
assert_eq!(
get("CommonProgramW6432").as_deref(),
Some(r"D:\Common Files")
);
}
#[test]
fn mcp_env_allowlist_includes_python_bootstrap_keys() {
for key in [
+39 -2
View File
@@ -10,6 +10,7 @@ use super::spec::{
};
use async_trait::async_trait;
use serde_json::{Value, json};
use std::fmt::Display;
use std::fs;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
@@ -356,7 +357,7 @@ fn read_pdf_via_pdf_extract(
// pdf-extract returns pages in document order; `start`/`end` are
// 1-indexed inclusive (validated above), so we convert to a
// 0-indexed half-open slice with bounds clamping.
let pages = pdf_extract::extract_text_by_pages(path).map_err(|e| {
let pages = guard_pdf_extract(|| pdf_extract::extract_text_by_pages(path)).map_err(|e| {
ToolError::execution_failed(format!(
"pdf-extract failed on {}: {e} (set `prefer_external_pdftotext = true` in settings.toml to retry via pdftotext)",
path.display()
@@ -379,7 +380,7 @@ fn read_pdf_via_pdf_extract(
// extract_text uses an internal codepath that can hang on certain PDF
// cross-reference tables or font encodings (#2641). The per-page path
// avoids that hang and produces identical output when joined.
pdf_extract::extract_text_by_pages(path)
guard_pdf_extract(|| pdf_extract::extract_text_by_pages(path))
.map(|pages| pages.join("\n"))
.map_err(|e| {
ToolError::execution_failed(format!(
@@ -391,6 +392,31 @@ fn read_pdf_via_pdf_extract(
Ok(ToolResult::success(clean_pdf_text(&text)))
}
fn guard_pdf_extract<T, E, F>(extract: F) -> Result<T, String>
where
E: Display,
F: FnOnce() -> Result<T, E>,
{
match std::panic::catch_unwind(std::panic::AssertUnwindSafe(extract)) {
Ok(Ok(value)) => Ok(value),
Ok(Err(err)) => Err(err.to_string()),
Err(payload) => Err(format!(
"extractor panicked: {}",
panic_payload_message(payload.as_ref())
)),
}
}
fn panic_payload_message(payload: &(dyn std::any::Any + Send)) -> String {
if let Some(message) = payload.downcast_ref::<&str>() {
(*message).to_string()
} else if let Some(message) = payload.downcast_ref::<String>() {
message.clone()
} else {
"unknown panic".to_string()
}
}
fn read_pdf_via_pdftotext(
path: &Path,
page_range: Option<(u32, u32)>,
@@ -1380,6 +1406,17 @@ mod tests {
assert!(single.content.contains("Recursive Language Models"));
}
#[test]
fn pdf_extract_panic_is_returned_as_tool_error_text() {
let err = guard_pdf_extract(|| -> Result<String, &'static str> {
panic!("assertion failed: name == \"Identity-H\"");
})
.expect_err("panic should become an error");
assert!(err.contains("extractor panicked"));
assert!(err.contains("Identity-H"));
}
#[tokio::test]
async fn read_file_pdf_path_uses_pdf_extract_by_default() {
if !sample_pdf_present() {
+38 -1
View File
@@ -14,6 +14,7 @@ use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use std::collections::{HashMap, VecDeque};
use std::fmt::Display;
use std::hash::{Hash, Hasher};
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant};
@@ -1204,10 +1205,35 @@ fn parse_pdf_page(
}
fn pdf_extract_text(bytes: &[u8]) -> Result<String, ToolError> {
pdf_extract::extract_text_from_mem(bytes)
guard_pdf_extract(|| pdf_extract::extract_text_from_mem(bytes))
.map_err(|e| ToolError::execution_failed(format!("PDF extract failed: {e}")))
}
fn guard_pdf_extract<T, E, F>(extract: F) -> Result<T, String>
where
E: Display,
F: FnOnce() -> Result<T, E>,
{
match std::panic::catch_unwind(std::panic::AssertUnwindSafe(extract)) {
Ok(Ok(value)) => Ok(value),
Ok(Err(err)) => Err(err.to_string()),
Err(payload) => Err(format!(
"extractor panicked: {}",
panic_payload_message(payload.as_ref())
)),
}
}
fn panic_payload_message(payload: &(dyn std::any::Any + Send)) -> String {
if let Some(message) = payload.downcast_ref::<&str>() {
(*message).to_string()
} else if let Some(message) = payload.downcast_ref::<String>() {
message.clone()
} else {
"unknown panic".to_string()
}
}
fn split_pdf_pages(text: &str) -> Vec<Vec<String>> {
let raw_pages: Vec<&str> = text.split('\x0C').collect();
raw_pages
@@ -1794,6 +1820,17 @@ mod tests {
assert_eq!(percent_decode("foo+bar%20baz"), "foo+bar baz");
}
#[test]
fn pdf_extract_panic_is_returned_as_tool_error_text() {
let err = guard_pdf_extract(|| -> Result<String, &'static str> {
panic!("assertion failed: name == \"Identity-H\"");
})
.expect_err("panic should become an error");
assert!(err.contains("extractor panicked"));
assert!(err.contains("Identity-H"));
}
#[test]
fn scoped_ref_prefix_is_session_specific() {
let _lock = lock_web_run_test_state();