diff --git a/crates/tui/src/pricing.rs b/crates/tui/src/pricing.rs
index 2da0eab9..89395f88 100644
--- a/crates/tui/src/pricing.rs
+++ b/crates/tui/src/pricing.rs
@@ -166,6 +166,23 @@ fn deepseek_v4_flash_pricing() -> ModelPricing {
     }
 }
 
+/// Return a one-line cost note for the given model, suitable for the
+/// sub-agent economics section of the system prompt (#3025).
+///
+/// The quoted price is the model's cache-miss input rate in USD per million
+/// tokens, formatted to two decimal places.
+///
+/// Returns `None` when pricing is unknown — the prompt should use
+/// cost-agnostic wording instead.
+#[must_use]
+pub fn input_cost_note(model: &str) -> Option<String> {
+    let pricing = pricing_for_model(model)?;
+    Some(format!(
+        "Sub-agents are cheap — {} costs ${:.2} per million input tokens.",
+        model, pricing.usd.input_cache_miss_per_million
+    ))
+}
+
 /// Calculate cost for a turn given token usage and model.
 #[must_use]
 #[allow(dead_code)]
@@ -299,6 +316,22 @@ mod tests {
         assert!(calculate_turn_cost("deepseek-ai/deepseek-v4-pro", 1_000, 1_000).is_none());
     }
 
+    #[test]
+    fn input_cost_note_for_flash_names_official_price() {
+        let note = input_cost_note("deepseek-v4-flash").expect("flash pricing is known");
+        assert!(
+            note.contains("$0.14"),
+            "flash cost note must name the official $0.14/M input price, got: {note}"
+        );
+        assert!(note.contains("deepseek-v4-flash"));
+    }
+
+    #[test]
+    fn input_cost_note_unknown_model_returns_none() {
+        assert!(input_cost_note("llama3.3:70b").is_none());
+        assert!(input_cost_note("moonshotai/kimi-k2.6").is_none());
+    }
+
     #[test]
     fn v4_pro_uses_limited_time_discount_before_expiry() {
         let before_expiry = Utc
diff --git a/crates/tui/src/prompts.rs b/crates/tui/src/prompts.rs
index c4d1511c..039339b3 100644
--- a/crates/tui/src/prompts.rs
+++ b/crates/tui/src/prompts.rs
@@ -796,9 +796,82 @@ pub(crate) fn render_runtime_policy_reference() -> String {
 /// says "You are deepseek-v4-pro" or "You are deepseek-v4-flash" instead
 /// of a static placeholder.
fn apply_model_template(prompt: &str, model_id: &str) -> String { - prompt.replace("{model_id}", model_id) + let mut prompt = prompt.replace("{model_id}", model_id); + + // #3025: Substitute model-specific facts so non-DeepSeek models don't + // get V4 architecture claims, 1M-window assumptions, or Flash pricing. + let ctx_window = crate::models::context_window_for_model(model_id); + let window_note = if let Some(window) = ctx_window { + format!( + "You have a {}-token context window. Do not summarize or delete \ + earlier turns just because the transcript has crossed an older \ + threshold.", + if window >= 1_000_000 { + "one-million".to_string() + } else { + format!("{}", window) + } + ) + } else { + "Your context window is provider-dependent and not known to the \ + harness; treat the app's context-pressure indicator as authoritative \ + and suggest /compact when it reports high pressure." + .to_string() + }; + prompt = prompt.replace("{context_window_note}", &window_note); + + let subagent_econ = crate::pricing::input_cost_note(model_id).unwrap_or_else(|| { + "Sub-agents keep your main context clean; their pricing depends on \ + your provider." + .to_string() + }); + prompt = prompt.replace("{subagent_economics}", &subagent_econ); + + let thinking_note = if crate::models::model_supports_reasoning(model_id) { + "Models may emit *thinking tokens* before final answers. These are \ + invisible to the user but count against context. Use them strategically: \ + skip for lookups, light for simple code generation, deep for debugging." 
+ .to_string() + } else { + String::new() + }; + prompt = prompt.replace("{model_thinking_note}", &thinking_note); + + let model_lower = model_id.to_ascii_lowercase(); + let is_v4 = model_lower.contains("deepseek") && model_lower.contains("v4"); + let characteristics = if is_v4 { + V4_MODEL_CHARACTERISTICS + } else { + GENERIC_MODEL_CHARACTERISTICS + }; + prompt = prompt.replace("{model_characteristics}", characteristics); + + prompt } +/// Architecture self-management section injected for DeepSeek V4 model ids +/// (the original hardcoded base.md section, now model-gated — #3025). +const V4_MODEL_CHARACTERISTICS: &str = "## Your V4 Characteristics + +You run on V4 architecture. Understanding the internals helps you self-manage: + +**Degradation curve.** Retrieval quality holds well through large V4 contexts and remains usable deep into the 1M window. Do not summarize or delete earlier turns just because the transcript has crossed an older 128K-era threshold. Prefer appending stable evidence and suggest `/compact` only near real pressure or when the user asks. + +**Prefix cache economics.** V4 caches shared prefixes at 128-token granularity with ~90% cost discount. Prefer appending to existing messages over mutating old ones — deletion or replacement breaks the cache and increases cost. Structure output to maximize prefix reuse across turns. + +**Thinking token strategy.** Thinking tokens count against context and replay across turns (the `reasoning_content` rule). Use them strategically: skip for lookups, light for simple code generation, deep for architecture and debugging. Cache conclusions in concise inline summaries rather than re-deriving each turn. + +**Parallel execution.** Batch independent reads, searches, and greps into a single turn. 
Never serialize operations that can run concurrently — parallel tool calls share the same turn and finish faster."; + +/// Provider-neutral fallback for non-V4 models: only claims that hold across +/// providers (prefix caching is widespread; parallel tool calls are harness +/// behavior, not model behavior). +const GENERIC_MODEL_CHARACTERISTICS: &str = "## Model Characteristics + +**Prefix-cache hygiene.** Many providers cache shared prompt prefixes. Prefer appending to existing messages over mutating old ones — deletion or replacement can break the cache and increase cost. Structure output to maximize prefix reuse across turns. + +**Parallel execution.** Batch independent reads, searches, and greps into a single turn. Never serialize operations that can run concurrently — parallel tool calls share the same turn and finish faster."; + const TOOL_TAXONOMY_DISCOVERY: &[&str] = &["grep_files", "file_search"]; const TOOL_TAXONOMY_GIT: &[&str] = &["git_status", "git_diff"]; const TOOL_TAXONOMY_VERIFICATION: &[&str] = &["run_tests", "run_verifiers"]; @@ -1371,6 +1444,77 @@ mod tests { BASE_PROMPT.contains("{model_id}"), "BASE_PROMPT must contain the {{model_id}} template for dynamic injection" ); + // #3025: the model-facts placeholders must exist in base.md or the + // apply_model_template substitutions are inert. 
+        for placeholder in [
+            "{context_window_note}",
+            "{subagent_economics}",
+            "{model_thinking_note}",
+            "{model_characteristics}",
+        ] {
+            assert!(
+                BASE_PROMPT.contains(placeholder),
+                "BASE_PROMPT must contain the {placeholder} template"
+            );
+        }
+    }
+
+    fn assert_no_unresolved_model_placeholders(prompt: &str) { // panics if any template token survives
+        for placeholder in [
+            "{model_id}",
+            "{context_window_note}",
+            "{subagent_economics}",
+            "{model_thinking_note}",
+            "{model_characteristics}",
+        ] {
+            assert!(
+                !prompt.contains(placeholder),
+                "composed prompt must not contain unresolved {placeholder}"
+            );
+        }
+    }
+
+    #[test]
+    fn compose_prompt_for_v4_model_keeps_v4_facts() {
+        let prompt =
+            compose_prompt_with_approval_model_and_shell(Personality::Calm, "deepseek-v4-pro");
+        assert!(prompt.contains("Your V4 Characteristics"));
+        assert!(prompt.contains("one-million-token context window"));
+        assert!(
+            !prompt.contains("one-million-token-token"),
+            "window wording must not duplicate the -token suffix"
+        );
+        assert_no_unresolved_model_placeholders(&prompt);
+    }
+
+    #[test]
+    fn compose_prompt_for_kimi_uses_model_accurate_facts() {
+        let prompt =
+            compose_prompt_with_approval_model_and_shell(Personality::Calm, "moonshotai/kimi-k2.6");
+        assert!(!prompt.contains("Your V4 Characteristics"));
+        assert!(!prompt.contains("one-million"));
+        assert!(!prompt.contains("$0.14")); // Flash pricing must not leak into this prompt
+        assert!(prompt.contains("262144-token context window")); // sub-1M windows render as a raw count
+        assert!(
+            prompt.contains("Models may emit *thinking tokens*"),
+            "kimi-k2.6 supports reasoning so the thinking note must appear"
+        );
+        assert_no_unresolved_model_placeholders(&prompt);
+    }
+
+    #[test]
+    fn compose_prompt_for_unknown_model_uses_honest_fallbacks() {
+        let prompt =
+            compose_prompt_with_approval_model_and_shell(Personality::Calm, "llama3.3:70b");
+        assert!(!prompt.contains("Your V4 Characteristics"));
+        assert!(!prompt.contains("one-million"));
+        assert!(!prompt.contains("$0.14"));
+        assert!(prompt.contains("provider-dependent and not known")); // unknown-window fallback wording
+        assert!(
+            !prompt.contains("Models may emit *thinking tokens*"),
+            "unknown models must not get the thinking-token note"
+        );
+        assert_no_unresolved_model_placeholders(&prompt);
     }
 
     #[test]
diff --git a/crates/tui/src/prompts/base.md b/crates/tui/src/prompts/base.md
index da456e8b..81c2cd6f 100644
--- a/crates/tui/src/prompts/base.md
+++ b/crates/tui/src/prompts/base.md
@@ -163,7 +163,7 @@ For any task estimated to take 5+ concrete steps:
 
 ## Sub-Agent Strategy
 
-Sub-agents are cheap — DeepSeek V4 Flash costs $0.14/M input. Use them liberally for parallel work:
+{subagent_economics} Use them liberally for parallel work:
 
 - **Parallel investigation**: When you need to understand 3+ independent files or modules, open one read-only sub-agent session per target. They run concurrently in one turn and return structured findings you synthesize. This is faster AND more thorough than reading sequentially.
 - **Parallel implementation**: After a plan is laid out, open one sub-agent session per independent leaf task. Each does one thing well; you integrate results.
@@ -204,21 +204,13 @@ For exact counts or structured aggregates, compute them directly in Python insid
 
 ## Context Management
 
-You have a 1M-token context window. During long coding sessions, suggest `/compact` or Ctrl+L when usage approaches ~60% or when the app marks context pressure as high. If auto_compact is enabled, the engine can compact before the next send once the configured threshold is crossed. Compaction summarizes earlier turns so you can keep working without losing thread.
+{context_window_note} During long coding sessions, suggest `/compact` or Ctrl+L when usage approaches ~60% or when the app marks context pressure as high. If auto_compact is enabled, the engine can compact before the next send once the configured threshold is crossed. Compaction summarizes earlier turns so you can keep working without losing thread.
-Model notes: DeepSeek V4 models emit *thinking tokens* (`ContentBlock::Thinking`) before final answers. These are invisible to the user but count against context. Cost/token estimates are approximate; treat them as a rough guide. +{model_thinking_note} -## Your V4 Characteristics +Cost/token estimates are approximate; treat them as a rough guide. -You run on V4 architecture. Understanding the internals helps you self-manage: - -**Degradation curve.** Retrieval quality holds well through large V4 contexts and remains usable deep into the 1M window. Do not summarize or delete earlier turns just because the transcript has crossed an older 128K-era threshold. Prefer appending stable evidence and suggest `/compact` only near real pressure or when the user asks. - -**Prefix cache economics.** V4 caches shared prefixes at 128-token granularity with ~90% cost discount. Prefer appending to existing messages over mutating old ones — deletion or replacement breaks the cache and increases cost. Structure output to maximize prefix reuse across turns. - -**Thinking token strategy.** Thinking tokens count against context and replay across turns (the `reasoning_content` rule). Use them strategically: skip for lookups, light for simple code generation, deep for architecture and debugging. Cache conclusions in concise inline summaries rather than re-deriving each turn. - -**Parallel execution.** Batch independent reads, searches, and greps into a single turn. Never serialize operations that can run concurrently — parallel tool calls share the same turn and finish faster. +{model_characteristics} ## Thinking Budget @@ -270,7 +262,7 @@ Use `exec_shell` for shell-native diagnostics, pipelines, and bounded commands. ### `agent_open` / `agent_eval` / `agent_close` / `tool_agent` Use `agent_open` for independent investigations or implementation slices that can run while you continue coordinating. 
Fresh sessions are the default and are best when the child only needs the assignment you pass. Use `fork_context: true` when multiple perspectives should share the same parent context: the runtime preserves the parent prefill/prompt prefix byte-identically where available so DeepSeek prefix-cache reuse stays high, then appends the child instructions and task at the tail. -Use `tool_agent` for the experimental Fin fast lane: simple OCR, search, fetch, or command-probe tasks where Flash V4 with thinking off should execute tools while the parent keeps planning and synthesis context clean. Do not use it for nuanced implementation, architecture, release decisions, or anything that needs careful reasoning. +Use `tool_agent` for the experimental Fin fast lane: simple OCR, search, fetch, or command-probe tasks where a fast low-cost model with thinking off should execute tools while the parent keeps planning and synthesis context clean. Do not use it for nuanced implementation, architecture, release decisions, or anything that needs careful reasoning. Use `agent_eval` to send follow-up input, block for completion, or retrieve the current session projection. Use `agent_close` to cancel or release a session that is no longer useful. Keep tiny single-read/search tasks local so the transcript stays compact.