feat(tools): add byte-level schema canonicalize for prefix-cache stability

When MCP servers return tool schemas, the field order within each schema
object and the order of entries in required / dependentRequired arrays
can vary across reconnections. This causes the serialized tool catalog
bytes to change even when the logical schema is unchanged, busting
DeepSeek's KV prefix cache.

Add schema_canonicalize::canonicalize_schema which recursively:
- Sorts every required array alphabetically
- Sorts every dependentRequired sub-array alphabetically
- Rebuilds object keys in alphabetical order
- Recurses into all nested objects and arrays

The canonicalize step runs after schema_sanitize in build_api_tools,
so each tool's input_schema is first cleaned then byte-stabilized.
The existing OnceLock api_cache pins the result, ensuring the tool
catalog bytes are identical across reads and across process restarts.

8 unit tests cover: required sorting, dependentRequired sorting,
equivalent-ordering byte match, recursive nesting, empty schemas,
deeply nested schemas, non-required array preservation, and key
ordering.

(cherry picked from commit 7cee9cd5e12a74e8072bf2f6a1b18555ed0db0bf)
This commit is contained in:
Hu Qiantao
2026-06-01 20:30:24 +08:00
committed by Hunter B
parent 69cff93754
commit 49791905f9
3 changed files with 210 additions and 0 deletions
+1
View File
@@ -41,6 +41,7 @@ pub mod remember;
pub mod revert_turn;
pub mod review;
pub mod rlm;
pub mod schema_canonicalize;
pub mod schema_sanitize;
pub mod search;
pub mod shell;
+2
View File
@@ -16,6 +16,7 @@ use serde_json::Value;
use crate::client::DeepSeekClient;
use crate::models::Tool;
use super::schema_canonicalize;
use super::schema_sanitize;
use super::spec::{
ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec,
@@ -224,6 +225,7 @@ impl ToolRegistry {
.map(|tool| {
let mut schema = tool.input_schema();
schema_sanitize::sanitize(&mut schema);
schema_canonicalize::canonicalize_schema(&mut schema);
Tool {
tool_type: None,
name: tool.name().to_string(),
+207
View File
@@ -0,0 +1,207 @@
//! Byte-level canonicalization of JSON Schema for prefix-cache stability.
//!
//! When MCP servers return tool schemas, the field order within each schema
//! object and the order of entries in `required` / `dependentRequired` arrays
//! can vary across reconnections. This module normalizes those orderings so
//! that two logically equivalent schemas always produce identical bytes after
//! serialization.
//!
//! The approach mirrors `reasonix/internal/provider/schema_canonicalize.go`:
//!
//! 1. Sort every `"required"` array alphabetically.
//! 2. Sort every `"dependentRequired"` sub-array alphabetically.
//! 3. Recurse into all nested objects and arrays.
//!
//! `serde_json::Value::Object` uses `IndexMap` when `preserve_order` is
//! enabled (which this crate does). We therefore rebuild the map with sorted
//! keys to guarantee deterministic key ordering.
use serde_json::Value;
/// Recursively canonicalize a JSON Schema value in-place.
///
/// After canonicalization, two schemas that are semantically equivalent
/// (same keys, same `required` set, same `dependentRequired` sets) will
/// serialize to byte-identical JSON regardless of the original field or
/// array order.
pub fn canonicalize_schema(value: &mut Value) {
match value {
Value::Object(map) => {
// Sort `required` arrays (they are sets per JSON Schema spec).
if let Some(Value::Array(req)) = map.get_mut("required") {
sort_string_array(req);
}
// Sort `dependentRequired` sub-arrays.
if let Some(Value::Object(deps)) = map.get_mut("dependentRequired") {
for dep_value in deps.values_mut() {
if let Value::Array(arr) = dep_value {
sort_string_array(arr);
}
}
}
// Recurse into every child value.
for v in map.values_mut() {
canonicalize_schema(v);
}
// Rebuild the map with sorted keys so serialization is deterministic.
// serde_json::Map backed by IndexMap (preserve_order) doesn't have
// drain(), so we swap to a temporary and rebuild.
let old = std::mem::take(map);
let mut entries: Vec<(String, Value)> = old.into_iter().collect();
entries.sort_by(|a, b| a.0.cmp(&b.0));
for (k, v) in entries {
map.insert(k, v);
}
}
Value::Array(arr) => {
for v in arr.iter_mut() {
canonicalize_schema(v);
}
}
_ => {}
}
}
/// Sort a JSON array of string values alphabetically in-place.
///
/// Non-string entries are left at the end in their original relative order.
fn sort_string_array(arr: &mut [Value]) {
arr.sort_by(|a, b| match (a.as_str(), b.as_str()) {
(Some(x), Some(y)) => x.cmp(y),
(Some(_), None) => std::cmp::Ordering::Less,
(None, Some(_)) => std::cmp::Ordering::Greater,
(None, None) => std::cmp::Ordering::Equal,
});
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn sorts_required_array() {
let mut schema = json!({
"type": "object",
"required": ["z", "a", "m"],
"properties": {}
});
canonicalize_schema(&mut schema);
assert_eq!(schema["required"], json!(["a", "m", "z"]));
}
#[test]
fn equivalent_ordering_matches() {
// Two schemas that differ only in field order and required order
// must serialize to identical bytes.
let mut a = json!({
"required": ["b", "a"],
"properties": {"x": {}, "y": {}},
"type": "object"
});
let mut b = json!({
"type": "object",
"properties": {"y": {}, "x": {}},
"required": ["a", "b"]
});
canonicalize_schema(&mut a);
canonicalize_schema(&mut b);
assert_eq!(
serde_json::to_string(&a).unwrap(),
serde_json::to_string(&b).unwrap(),
"logically equivalent schemas must produce identical bytes"
);
}
#[test]
fn sorts_dependent_required() {
let mut schema = json!({
"type": "object",
"dependentRequired": {
"x": ["z", "a"],
"y": ["m", "b"]
}
});
canonicalize_schema(&mut schema);
assert_eq!(schema["dependentRequired"]["x"], json!(["a", "z"]));
assert_eq!(schema["dependentRequired"]["y"], json!(["b", "m"]));
}
#[test]
fn recursive_into_properties() {
let mut schema = json!({
"type": "object",
"properties": {
"nested": {
"type": "object",
"required": ["z", "a"],
"properties": {}
}
}
});
canonicalize_schema(&mut schema);
assert_eq!(
schema["properties"]["nested"]["required"],
json!(["a", "z"])
);
}
#[test]
fn preserves_non_required_array_order() {
// Arrays that are not `required` or `dependentRequired` should
// keep their semantic order (e.g. enum values, oneOf items).
let mut schema = json!({
"type": "string",
"enum": ["z", "a", "m"]
});
canonicalize_schema(&mut schema);
assert_eq!(schema["enum"], json!(["z", "a", "m"]));
}
#[test]
fn handles_empty_schema() {
let mut schema = json!({});
canonicalize_schema(&mut schema);
assert_eq!(schema, json!({}));
}
#[test]
fn handles_deeply_nested() {
let mut schema = json!({
"type": "object",
"properties": {
"level1": {
"type": "object",
"properties": {
"level2": {
"type": "object",
"required": ["z", "a"]
}
}
}
}
});
canonicalize_schema(&mut schema);
assert_eq!(
schema["properties"]["level1"]["properties"]["level2"]["required"],
json!(["a", "z"])
);
}
#[test]
fn key_order_is_alphabetical_after_canonicalize() {
let mut schema = json!({
"z_field": 1,
"a_field": 2,
"m_field": 3
});
canonicalize_schema(&mut schema);
let keys: Vec<&str> = schema
.as_object()
.unwrap()
.keys()
.map(|s| s.as_str())
.collect();
assert_eq!(keys, vec!["a_field", "m_field", "z_field"]);
}
}