From d90031f06fbc14d6614359027016a730fad5c432 Mon Sep 17 00:00:00 2001
From: Paulo Aboim Pinto <aboimpinto@gmail.com>
Date: Sun, 7 Jun 2026 16:12:12 +0200
Subject: [PATCH] Add Gherkin acceptance E2E harness example

---
 Cargo.lock                                    | 364 +++++++++-
 crates/tui/Cargo.toml                         |   1 +
 .../tui/tests/directory_listing_acceptance.rs | 182 +++++
 crates/tui/tests/eval_harness.rs              |  95 ++-
 .../features/list_dir_happy_path.feature      |  10 +
 .../tool_lifecycle_happy_path.feature         |  31 +
 crates/tui/tests/tool_lifecycle_acceptance.rs | 630 ++++++++++++++++++
 7 files changed, 1270 insertions(+), 43 deletions(-)
 create mode 100644 crates/tui/tests/directory_listing_acceptance.rs
 create mode 100644 crates/tui/tests/features/list_dir_happy_path.feature
 create mode 100644 crates/tui/tests/features/tool_lifecycle_happy_path.feature
 create mode 100644 crates/tui/tests/tool_lifecycle_acceptance.rs

diff --git a/Cargo.lock b/Cargo.lock
index f3b55c66..c1011ba2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -160,7 +160,7 @@ version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -171,7 +171,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
 dependencies = [
  "anstyle",
  "once_cell_polyfill",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -724,6 +724,7 @@ dependencies = [
  "anstyle",
  "clap_lex",
  "strsim 0.11.1",
+ "terminal_size",
 ]
 
 [[package]]
@@ -973,6 +974,7 @@ dependencies = [
  "codewhale-tools",
  "colored",
  "crossterm 0.28.1",
+ "cucumber",
  "dirs",
  "dotenvy",
  "fd-lock",
@@ -1106,6 +1108,18 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "console"
+version = "0.16.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "unicode-width 0.2.2",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.6.0"
@@ -1299,6 +1313,63 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "cucumber"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96a87e18d925b19ebe0fd47ea45316abd216d81ec0879c2448c3f9a0e9da62be"
+dependencies = [
+ "anyhow",
+ "clap",
+ "console",
+ "cucumber-codegen",
+ "cucumber-expressions",
+ "derive_more 2.1.1",
+ "either",
+ "futures",
+ "gherkin",
+ "globwalk",
+ "humantime",
+ "inventory",
+ "itertools 0.14.0",
+ "linked-hash-map",
+ "pin-project",
+ "ref-cast",
+ "regex",
+ "sealed",
+ "smart-default",
+]
+
+[[package]]
+name = "cucumber-codegen"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2fc8a8bbb73af3230db699e8690c5c786655f75eb89e5f18d76055fa1a9a4d"
+dependencies = [
+ "cucumber-expressions",
+ "inflections",
+ "itertools 0.14.0",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "syn 2.0.117",
+ "synthez",
+]
+
+[[package]]
+name = "cucumber-expressions"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6401038de3af44fe74e6fccdb8a5b7db7ba418f480c8e9ad584c6f65c05a27a6"
+dependencies = [
+ "derive_more 2.1.1",
+ "either",
+ "nom 8.0.0",
+ "nom_locate",
+ "regex",
+ "regex-syntax 0.8.8",
+]
+
 [[package]]
 name = "darling"
 version = "0.23.0"
@@ -1465,6 +1536,7 @@ dependencies = [
  "quote",
  "rustc_version",
  "syn 2.0.117",
+ "unicode-xid",
 ]
 
 [[package]]
@@ -1522,7 +1594,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users 0.5.2",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -1638,6 +1710,12 @@ dependencies = [
  "log",
 ]
 
+[[package]]
+name = "encode_unicode"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+
 [[package]]
 name = "encoding_rs"
 version = "0.8.35"
@@ -1702,7 +1780,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -1923,9 +2001,9 @@ dependencies = [
 
 [[package]]
 name = "futures"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -1938,9 +2016,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -1948,15 +2026,15 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
 
 [[package]]
 name = "futures-executor"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -1965,9 +2043,9 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
 
 [[package]]
 name = "futures-lite"
@@ -1984,9 +2062,9 @@ dependencies = [
 
 [[package]]
 name = "futures-macro"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1995,21 +2073,21 @@ dependencies = [
 
 [[package]]
 name = "futures-sink"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
 
 [[package]]
 name = "futures-task"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
 
 [[package]]
 name = "futures-util"
-version = "0.3.31"
+version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -2019,7 +2097,6 @@ dependencies = [
  "futures-task",
  "memchr",
  "pin-project-lite",
- "pin-utils",
  "slab",
 ]
 
@@ -2077,6 +2154,23 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "gherkin"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e2c0d8c632f8a251ce9a8198079b1022adc586ff4e3d33e18debd40eb463b31"
+dependencies = [
+ "heck",
+ "peg",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.117",
+ "textwrap 0.16.2",
+ "thiserror 2.0.18",
+ "typed-builder",
+]
+
 [[package]]
 name = "globset"
 version = "0.4.18"
@@ -2090,6 +2184,17 @@ dependencies = [
  "regex-syntax 0.8.8",
 ]
 
+[[package]]
+name = "globwalk"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757"
+dependencies = [
+ "bitflags 2.12.1",
+ "ignore",
+ "walkdir",
+]
+
 [[package]]
 name = "h2"
 version = "0.4.13"
@@ -2240,6 +2345,12 @@ version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 
+[[package]]
+name = "humantime"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424"
+
 [[package]]
 name = "hybrid-array"
 version = "0.4.11"
@@ -2520,6 +2631,12 @@ dependencies = [
  "rustversion",
 ]
 
+[[package]]
+name = "inflections"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a257582fdcde896fd96463bf2d40eefea0580021c0712a0e2b028b60b47a837a"
+
 [[package]]
 name = "inout"
 version = "0.1.4"
@@ -2576,7 +2693,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
 dependencies = [
  "hermit-abi",
  "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2794,6 +2911,12 @@ dependencies = [
  "bitflags 2.12.1",
 ]
 
+[[package]]
+name = "linked-hash-map"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
+
 [[package]]
 name = "linux-keyutils"
 version = "0.2.5"
@@ -2878,7 +3001,7 @@ dependencies = [
  "itoa",
  "log",
  "md-5",
- "nom",
+ "nom 7.1.3",
  "rangemap",
  "time",
  "weezl",
@@ -3096,13 +3219,33 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nom"
+version = "8.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "nom_locate"
+version = "5.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
+dependencies = [
+ "bytecount",
+ "memchr",
+ "nom 8.0.0",
+]
+
 [[package]]
 name = "nu-ansi-term"
 version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -3392,6 +3535,33 @@ dependencies = [
  "unicode-normalization",
 ]
 
+[[package]]
+name = "peg"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f76678828272f177ac33b7e2ac2e3e73cc6c1cd1e3e387928aa69562fa51367"
+dependencies = [
+ "peg-macros",
+ "peg-runtime",
+]
+
+[[package]]
+name = "peg-macros"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "636d60acf97633e48d266d7415a9355d4389cea327a193f87df395d88cd2b14d"
+dependencies = [
+ "peg-runtime",
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "peg-runtime"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9555b1514d2d99d78150d3c799d4c357a3e2c2a8062cd108e93a06d9057629c5"
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
@@ -3503,6 +3673,26 @@ dependencies = [
  "siphasher",
 ]
 
+[[package]]
+name = "pin-project"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "pin-project-lite"
 version = "0.2.16"
@@ -4038,7 +4228,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys 0.11.0",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4094,7 +4284,7 @@ dependencies = [
  "security-framework 3.5.1",
  "security-framework-sys",
  "webpki-root-certs",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4262,6 +4452,17 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
+[[package]]
+name = "sealed"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22f968c5ea23d555e670b449c1c5e7b2fc399fdaec1d304a17cd48e288abc107"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "secret-service"
 version = "4.0.0"
@@ -4566,6 +4767,23 @@ version = "1.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
 
+[[package]]
+name = "smart-default"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eb01866308440fc64d6c44d9e86c5cc17adfe33c4d6eed55da9145044d0ffc1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "smawk"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c"
+
 [[package]]
 name = "socket2"
 version = "0.6.1"
@@ -4618,7 +4836,7 @@ dependencies = [
  "starlark_syntax",
  "static_assertions",
  "strsim 0.10.0",
- "textwrap",
+ "textwrap 0.11.0",
  "thiserror 1.0.69",
 ]
 
@@ -4771,6 +4989,39 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "synthez"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d8a928f38f1bc873f28e0d2ba8298ad65374a6ac2241dabd297271531a736cd"
+dependencies = [
+ "syn 2.0.117",
+ "synthez-codegen",
+ "synthez-core",
+]
+
+[[package]]
+name = "synthez-codegen"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fb83b8df4238e11746984dfb3819b155cd270de0e25847f45abad56b3671047"
+dependencies = [
+ "syn 2.0.117",
+ "synthez-core",
+]
+
+[[package]]
+name = "synthez-core"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "906fba967105d822e7c7ed60477b5e76116724d33de68a585681fb253fc30d5c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "sealed",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "tar"
 version = "0.4.46"
@@ -4792,7 +5043,7 @@ dependencies = [
  "getrandom 0.3.4",
  "once_cell",
  "rustix 1.1.3",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4806,6 +5057,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "terminal_size"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874"
+dependencies = [
+ "rustix 1.1.3",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "terminfo"
 version = "0.9.0"
@@ -4813,7 +5074,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662"
 dependencies = [
  "fnv",
- "nom",
+ "nom 7.1.3",
  "phf",
  "phf_codegen",
 ]
@@ -4878,6 +5139,17 @@ dependencies = [
  "unicode-width 0.1.14",
 ]
 
+[[package]]
+name = "textwrap"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057"
+dependencies = [
+ "smawk",
+ "unicode-linebreak",
+ "unicode-width 0.2.2",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -5286,6 +5558,26 @@ dependencies = [
  "pom",
 ]
 
+[[package]]
+name = "typed-builder"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31aa81521b70f94402501d848ccc0ecaa8f93c8eb6999eb9747e72287757ffda"
+dependencies = [
+ "typed-builder-macro",
+]
+
+[[package]]
+name = "typed-builder-macro"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "076a02dc54dd46795c2e9c8282ed40bcfb1e22747e955de9389a1de28190fb26"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "typenum"
 version = "1.20.0"
@@ -5306,7 +5598,7 @@ checksum = "f2f6fb2847f6742cd76af783a2a2c49e9375d0a111c7bef6f71cd9e738c72d6e"
 dependencies = [
  "memoffset 0.9.1",
  "tempfile",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5327,6 +5619,12 @@ version = "1.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
 
+[[package]]
+name = "unicode-linebreak"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f"
+
 [[package]]
 name = "unicode-normalization"
 version = "0.1.25"
@@ -5723,7 +6021,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
diff --git a/crates/tui/Cargo.toml b/crates/tui/Cargo.toml
index c4efe10e..c0bb09a2 100644
--- a/crates/tui/Cargo.toml
+++ b/crates/tui/Cargo.toml
@@ -76,6 +76,7 @@ flate2 = "1.1"
 sha2 = "0.10"
 
 [dev-dependencies]
+cucumber = "0.23.0"
 wiremock = "0.6"
 pretty_assertions = "1.4"
 vt100 = "0.15"
diff --git a/crates/tui/tests/directory_listing_acceptance.rs b/crates/tui/tests/directory_listing_acceptance.rs
new file mode 100644
index 00000000..c998dcf6
--- /dev/null
+++ b/crates/tui/tests/directory_listing_acceptance.rs
@@ -0,0 +1,182 @@
+//! Cucumber acceptance test for directory listing.
+
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+use cucumber::{World as _, gherkin::Step, given, then, when, writer::Stats as _};
+use tempfile::TempDir;
+
+const FEATURE_NAME: &str = "Directory listing acceptance";
+const FEATURE_PATH: &str = concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/tests/features/list_dir_happy_path.feature"
+);
+const HAPPY_PATH_SCENARIO: &str = "Happy path lists a workspace directory";
+
+#[derive(Debug, Default, cucumber::World)]
+struct DirectoryListingWorld {
+    record_dir: Option<TempDir>, // created by the Given step; passed to `eval --record`
+    report: Option<serde_json::Value>, // stdout of `eval --json`, parsed by the When step
+    fixture_records: Vec<serde_json::Value>, // parsed lines of `offline-tool-loop.jsonl`
+}
+
+#[given("an offline CodeWhale evaluation workspace")]
+fn offline_codewhale_evaluation_workspace(world: &mut DirectoryListingWorld) {
+    world.record_dir = Some(TempDir::new().expect("record tempdir")); // `--record` target
+}
+
+/// When step: runs the offline `eval` subcommand and stores its JSON report and fixtures.
+#[when(regex = r#"^the user asks "([^"]+)"$"#)]
+fn user_asks(world: &mut DirectoryListingWorld, prompt: String) {
+    assert_eq!(prompt, "list the current directory");
+
+    let Some(record_dir) = world.record_dir.as_ref() else {
+        panic!("offline evaluation workspace should be initialized");
+    };
+    let record_path = record_dir.path();
+    // The prompt itself is not forwarded; the run is driven entirely by these CLI flags.
+    let mut eval = Command::new(codewhale_tui_binary());
+    eval.args(["eval", "--json", "--shell-command", "echo eval-harness"])
+        .arg("--record")
+        .arg(record_path);
+    let output = eval.output().expect("run codewhale-tui eval");
+    let (stdout, stderr) = (&output.stdout, &output.stderr);
+
+    assert!(
+        output.status.success(),
+        "eval command failed\nstdout:\n{}\nstderr:\n{}",
+        String::from_utf8_lossy(stdout),
+        String::from_utf8_lossy(stderr)
+    );
+
+    let report = serde_json::from_slice::<serde_json::Value>(stdout);
+    world.report = Some(report.expect("eval --json should emit a serializable report"));
+    world.fixture_records = read_jsonl_records(&record_path.join("offline-tool-loop.jsonl"));
+}
+
+/// Then step: the first report step and the first recorded fixture must both name the tool.
+#[then(regex = r#"^the simulated LLM should call the "([^"]+)" tool$"#)]
+fn simulated_llm_should_call_tool(world: &mut DirectoryListingWorld, expected_tool: String) {
+    let expected = Some(expected_tool.as_str());
+    let report_step = first_report_step(world);
+
+    let kind = report_step.get("kind").and_then(serde_json::Value::as_str);
+    assert_eq!(kind, Some("List"));
+
+    let tool_name = report_step
+        .get("tool_name")
+        .and_then(serde_json::Value::as_str);
+    assert_eq!(tool_name, expected);
+
+    let succeeded = report_step
+        .get("success")
+        .and_then(serde_json::Value::as_bool);
+    assert_eq!(succeeded, Some(true));
+
+    // The recorded fixture must agree with the report about which tool ran first.
+    let Some(fixture) = world.fixture_records.first() else {
+        panic!("recorded list_dir fixture");
+    };
+    let requested_step = fixture
+        .get("request")
+        .and_then(|request| request.get("step"))
+        .and_then(serde_json::Value::as_str);
+    assert_eq!(requested_step, expected);
+}
+
+/// Then step: every `entry` cell of the step's table must occur in the first step's output.
+#[then("the tool output should include:")]
+fn tool_output_should_include(world: &mut DirectoryListingWorld, step: &Step) {
+    let Some(list_output) = first_report_step(world)
+        .get("output")
+        .and_then(serde_json::Value::as_str)
+    else {
+        panic!("list_dir output");
+    };
+
+    data_table_column(step, "entry").iter().for_each(|expected_entry| {
+        let found = list_output.contains(expected_entry.as_str());
+        assert!(found, "list_dir output should include {expected_entry}: {list_output}");
+    });
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn happy_path_lists_a_workspace_directory() {
+    run_scenario(HAPPY_PATH_SCENARIO).await; // panics unless every step ran and passed
+}
+
+async fn run_scenario(name: &'static str) {
+    let writer = DirectoryListingWorld::cucumber()
+        .fail_on_skipped()
+        .with_default_cli() // NOTE(review): presumably skips argv parsing — confirm
+        .filter_run(FEATURE_PATH, move |feature, _, scenario| {
+            feature.name == FEATURE_NAME && scenario.name == name // run only this scenario
+        })
+        .await;
+    assert_eq!(writer.failed_steps(), 0, "scenario failed: {name}");
+    assert_eq!(writer.skipped_steps(), 0, "scenario skipped steps: {name}");
+    assert_eq!(writer.passed_steps(), 4, "scenario did not run: {name}"); // Given/When/Then/And
+}
+
+/// Borrows the first element of the report's `steps` array; panics when it is missing.
+fn first_report_step(world: &DirectoryListingWorld) -> &serde_json::Value {
+    let Some(report) = world.report.as_ref() else {
+        panic!("evaluation report should exist");
+    };
+    let steps = report.get("steps").and_then(serde_json::Value::as_array);
+    steps
+        .and_then(|entries| entries.first())
+        .expect("report should include at least one step")
+}
+
+/// Collects the `header` column of the step's data table; panics if anything is missing.
+fn data_table_column(step: &Step, header: &str) -> Vec<String> {
+    let Some(table) = step.table.as_ref() else {
+        panic!("step should include a data table");
+    };
+    let Some((header_row, data_rows)) = table.rows.split_first() else {
+        panic!("data table should include a header");
+    };
+    let Some(column_index) = header_row.iter().position(|cell| cell == header) else {
+        panic!("data table should include expected header");
+    };
+
+    let mut values = Vec::with_capacity(data_rows.len());
+    for row in data_rows {
+        match row.get(column_index) {
+            Some(cell) => values.push(cell.clone()),
+            None => panic!("data table row missing {header} value"),
+        }
+    }
+    assert!(
+        !values.is_empty(),
+        "data table should include at least one {header} value"
+    );
+    values
+}
+
+/// Parses every non-blank line of the file at `path` as JSON; panics on I/O or parse errors.
+fn read_jsonl_records(path: &Path) -> Vec<serde_json::Value> {
+    let contents = std::fs::read_to_string(path).expect("read fixture records");
+    let non_blank = contents.lines().filter(|line| !line.trim().is_empty());
+    non_blank
+        .map(|line| serde_json::from_str(line).expect("fixture line should parse"))
+        .collect()
+}
+
+/// Locates the `codewhale-tui` binary: Cargo's `CARGO_BIN_EXE_*` path, else a sibling guess.
+fn codewhale_tui_binary() -> PathBuf {
+    let built_in = option_env!("CARGO_BIN_EXE_codewhale-tui").map(PathBuf::from);
+    // `var_os`, not `var`: a non-UTF-8 path must not silently fall through to the guess.
+    let from_env = || std::env::var_os("CARGO_BIN_EXE_codewhale-tui").map(PathBuf::from);
+    if let Some(path) = built_in.or_else(from_env) {
+        return path;
+    }
+    let mut path = std::env::current_exe().expect("current test executable path");
+    path.pop();
+    if path.ends_with("deps") {
+        path.pop();
+    }
+    path.push(format!("codewhale-tui{}", std::env::consts::EXE_SUFFIX));
+    path
+}
diff --git a/crates/tui/tests/eval_harness.rs b/crates/tui/tests/eval_harness.rs
index 00a5d26b..f44d8cd9 100644
--- a/crates/tui/tests/eval_harness.rs
+++ b/crates/tui/tests/eval_harness.rs
@@ -2,13 +2,23 @@
 
 use std::fs;
 
+use tempfile::tempdir;
+
 #[path = "../src/eval.rs"]
 mod eval;
 #[path = "../src/shell_dispatcher.rs"]
 mod shell_dispatcher;
 
-use eval::{EvalHarness, EvalHarnessConfig, ScenarioStepKind};
-use tempfile::tempdir;
+use eval::{EvalHarness, EvalHarnessConfig, FixtureRecord, ScenarioStepKind};
+
+const HAPPY_PATH_TOOL_LOOP: [ScenarioStepKind; 6] = [
+    ScenarioStepKind::List, // order matters: tests compare it pairwise with recorded steps
+    ScenarioStepKind::Read,
+    ScenarioStepKind::Search,
+    ScenarioStepKind::Edit,
+    ScenarioStepKind::ApplyPatch,
+    ScenarioStepKind::ExecShell,
+];
 
 #[test]
 fn runs_offline_tool_loop_successfully() {
@@ -26,14 +36,7 @@ fn runs_offline_tool_loop_successfully() {
     assert!(!run.scenario_name.is_empty());
     assert!(run.workspace_summary.file_count >= 3);
 
-    for kind in [
-        ScenarioStepKind::List,
-        ScenarioStepKind::Read,
-        ScenarioStepKind::Search,
-        ScenarioStepKind::Edit,
-        ScenarioStepKind::ApplyPatch,
-        ScenarioStepKind::ExecShell,
-    ] {
+    for kind in HAPPY_PATH_TOOL_LOOP {
         let stats = run
             .metrics
             .per_tool
@@ -53,6 +56,78 @@ fn runs_offline_tool_loop_successfully() {
     assert_eq!(report.metrics.success, run.metrics.success);
 }
 
+#[test]
+fn acceptance_happy_path_records_simulated_llm_tool_plan() {
+    let record_dir = tempdir().expect("tempdir");
+    let scenario_name = "issue-2791-happy-path-tool-loop";
+    let config = EvalHarnessConfig {
+        scenario_name: scenario_name.to_string(),
+        record_dir: Some(record_dir.path().to_path_buf()),
+        ..EvalHarnessConfig::default()
+    };
+    let harness = EvalHarness::new(config);
+    // The test expects this run to write `<scenario_name>.jsonl` into `record_dir`.
+    let run = harness.run().expect("happy-path acceptance run");
+
+    assert!(run.metrics.success, "expected success metrics: {run:#?}");
+    assert_eq!(run.metrics.tool_errors, 0);
+    assert_eq!(run.metrics.steps, HAPPY_PATH_TOOL_LOOP.len());
+    // Reported tool names must match the expected step kinds, in order.
+    let actual_tool_names: Vec<&str> = run.steps.iter().map(|step| step.tool_name).collect();
+    let expected_tool_names: Vec<&str> = HAPPY_PATH_TOOL_LOOP
+        .iter()
+        .map(|kind| kind.tool_name())
+        .collect();
+    assert_eq!(actual_tool_names, expected_tool_names);
+    // The recorded fixture file must hold exactly one record per expected step.
+    let scenario_file = record_dir.path().join(format!("{scenario_name}.jsonl"));
+    let records = read_fixture_records(&scenario_file);
+    assert_eq!(records.len(), HAPPY_PATH_TOOL_LOOP.len());
+
+    for (record, kind) in records.iter().zip(HAPPY_PATH_TOOL_LOOP) {
+        assert_eq!(
+            record.request.get("step").and_then(|value| value.as_str()),
+            Some(kind.tool_name())
+        );
+
+        let expected_kind = format!("{kind:?}");
+        assert_eq!(
+            record.request.get("kind").and_then(|value| value.as_str()),
+            Some(expected_kind.as_str())
+        );
+
+        let event = record
+            .response_events
+            .first()
+            .expect("simulated LLM fixture should include a response event");
+        assert_eq!(
+            event.get("type").and_then(|value| value.as_str()),
+            Some("ok")
+        );
+        assert!(
+            event
+                .get("output")
+                .and_then(|value| value.as_str())
+                .is_some_and(|output| !output.is_empty()),
+            "fixture event should include non-empty tool output"
+        );
+    }
+    // NOTE(review): presumably the Edit/ApplyPatch steps write these markers — confirm in eval.rs.
+    let notes_path = run.workspace_root().join("notes.txt");
+    let notes = fs::read_to_string(&notes_path).expect("notes.txt should exist");
+    assert!(notes.contains("edited = true"));
+    assert!(notes.contains("todo: offline metrics (patched)"));
+}
+
+/// Deserializes each non-blank line of `path` into a `FixtureRecord`; panics on any failure.
+fn read_fixture_records(path: &std::path::Path) -> Vec<FixtureRecord> {
+    let contents = fs::read_to_string(path).expect("read fixture records");
+    let non_blank = contents.lines().filter(|line| !line.trim().is_empty());
+    non_blank
+        .map(|line| serde_json::from_str(line).expect("fixture line should parse"))
+        .collect()
+}
+
 #[test]
 fn records_tool_errors_when_step_fails() {
     let config = EvalHarnessConfig {
diff --git a/crates/tui/tests/features/list_dir_happy_path.feature b/crates/tui/tests/features/list_dir_happy_path.feature
new file mode 100644
index 00000000..c677ff82
--- /dev/null
+++ b/crates/tui/tests/features/list_dir_happy_path.feature
@@ -0,0 +1,10 @@
+Feature: Directory listing acceptance
+  Scenario: Happy path lists a workspace directory
+    Given an offline CodeWhale evaluation workspace
+    When the user asks "list the current directory"
+    Then the simulated LLM should call the "list_dir" tool
+    And the tool output should include:
+      | entry     |
+      | README.md |
+      | notes.txt |
+      | src       |
diff --git a/crates/tui/tests/features/tool_lifecycle_happy_path.feature b/crates/tui/tests/features/tool_lifecycle_happy_path.feature
new file mode 100644
index 00000000..43c13f1c
--- /dev/null
+++ b/crates/tui/tests/features/tool_lifecycle_happy_path.feature
@@ -0,0 +1,31 @@
+Feature: Tool call lifecycle
+  Scenario: Happy path lists the current directory through a tool
+    # This executable slice asserts on the public exec stream and the mocked LLM boundary.
+    # The PTY screen slice should also assert Statusline state and BlueWhale activity:
+    # running while the tool is executing, stopped or completed when the turn finishes.
+    Given an offline CodeWhale workspace containing:
+      | path      | kind   |
+      | README.md | file   |
+      | notes.txt | file   |
+      | src       | folder |
+    And the mocked LLM will request the "list_dir" tool with:
+      | path |
+      | .    |
+    And the mocked LLM will answer after the tool result:
+      | content                                                    |
+      | The directory contains README.md, notes.txt, and src/.      |
+    When the user asks "list the current directory"
+    Then CodeWhale should send the user request to the mocked LLM
+    And the public tool lifecycle should show a running tool:
+      | status  | marker | tool     | input |
+      | running | [~]    | list_dir | .     |
+    And the public tool result should return directory entries:
+      | entry     | kind   |
+      | README.md | file   |
+      | notes.txt | file   |
+      | src       | folder |
+    And CodeWhale should send the tool result back to the mocked LLM
+    And the public tool lifecycle should show a completed tool:
+      | status    | marker | tool     | input |
+      | completed | ✓      | list_dir | .     |
+    And the public output should include "The directory contains README.md, notes.txt, and src/."
diff --git a/crates/tui/tests/tool_lifecycle_acceptance.rs b/crates/tui/tests/tool_lifecycle_acceptance.rs
new file mode 100644
index 00000000..21ecd28c
--- /dev/null
+++ b/crates/tui/tests/tool_lifecycle_acceptance.rs
@@ -0,0 +1,630 @@
+//! Cucumber acceptance test for the public LLM/tool lifecycle.
+
+use std::io::Read;
+use std::path::PathBuf;
+use std::process::{Command, Stdio};
+use std::time::Duration;
+
+use cucumber::{World as _, gherkin::Step, given, then, when, writer::Stats as _};
+use serde_json::{Value, json};
+use tempfile::TempDir;
+use wait_timeout::ChildExt;
+use wiremock::matchers::{method, path};
+use wiremock::{Mock, MockServer, Request, ResponseTemplate};
+
+const FEATURE_NAME: &str = "Tool call lifecycle";
+const FEATURE_PATH: &str = concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/tests/features/tool_lifecycle_happy_path.feature"
+);
+const HAPPY_PATH_SCENARIO: &str = "Happy path lists the current directory through a tool";
+const TOOL_CALL_ID: &str = "call_list_dir";
+const TEST_MODEL: &str = "acceptance-model";
+
+#[derive(Debug, Default, cucumber::World)]
+struct ToolLifecycleWorld {
+    workspace: Option<TempDir>, // temp dir populated by the workspace `given` step
+    home: Option<TempDir>, // temp dir passed to the child as HOME / USERPROFILE
+    llm_server: Option<MockServer>, // mock server, stored at the end of the `when` step
+    tool_name: Option<String>, // tool the mocked LLM asks for in its first reply
+    tool_input: Option<Value>, // JSON object built from the tool-input table row
+    final_answer: Option<String>, // content the mocked LLM returns after the tool result
+    stdout: String, // captured stdout of the `exec` child process
+    stderr: String, // captured stderr of the `exec` child process
+    events: Vec<Value>, // JSON events parsed from `stdout`, one per line
+    requests: Vec<Value>, // JSON bodies of the recorded `/chat/completions` requests
+}
+
+/// Creates fresh workspace/home temp dirs and materializes each table row as a file or folder.
+#[given("an offline CodeWhale workspace containing:")]
+fn offline_codewhale_workspace_containing(world: &mut ToolLifecycleWorld, step: &Step) {
+    let workspace = TempDir::new().expect("workspace tempdir");
+    let home = TempDir::new().expect("home tempdir");
+    let root = workspace.path();
+
+    for entry in data_table_rows(step) {
+        let target = root.join(row_value(&entry, "path"));
+        match row_value(&entry, "kind").as_str() {
+            "file" => std::fs::write(&target, "").expect("write workspace file"),
+            "folder" => std::fs::create_dir_all(&target).expect("create workspace folder"),
+            other => panic!("unsupported workspace entry kind: {other}"),
+        }
+    }
+
+    world.workspace = Some(workspace);
+    world.home = Some(home);
+}
+
+/// Records the requested tool name plus a JSON-object input built from the one-row table.
+#[given(regex = r#"^the mocked LLM will request the "([^"]+)" tool with:$"#)]
+fn mocked_llm_will_request_tool(world: &mut ToolLifecycleWorld, tool_name: String, step: &Step) {
+    let rows = data_table_rows(step);
+    assert_eq!(rows.len(), 1, "tool input table should contain one row");
+    // Table cells are plain text, so every argument value is sent as a JSON string.
+    let mut arguments = serde_json::Map::new();
+    for (key, value) in &rows[0] {
+        arguments.insert(key.clone(), Value::String(value.clone()));
+    }
+
+    world.tool_name = Some(tool_name);
+    world.tool_input = Some(Value::Object(arguments));
+}
+
+#[given("the mocked LLM will answer after the tool result:")]
+fn mocked_llm_will_answer_after_tool_result(world: &mut ToolLifecycleWorld, step: &Step) {
+    let answers = data_table_rows(step); // data rows only; the header row is not included
+    assert_eq!(answers.len(), 1, "final answer table should contain one row");
+    world.final_answer = answers.first().map(|answer| row_value(answer, "content"));
+}
+
+/// Runs `codewhale-tui exec` against a fresh mock LLM and records output, events and requests.
+#[when(regex = r#"^the user asks "([^"]+)"$"#)]
+async fn user_asks(world: &mut ToolLifecycleWorld, prompt: String) {
+    let server = start_mock_llm(world).await;
+    let output = run_codewhale_exec(world, &server, &prompt);
+
+    world.stdout = String::from_utf8_lossy(&output.stdout).into_owned();
+    world.stderr = String::from_utf8_lossy(&output.stderr).into_owned();
+    if !output.status.success() {
+        let (stdout, stderr) = (&world.stdout, &world.stderr);
+        panic!("codewhale-tui exec failed\nstdout:\n{stdout}\nstderr:\n{stderr}");
+    }
+    world.events = parse_stream_events(&world.stdout);
+
+    // Only chat-completion calls are kept; any other recorded request is dropped.
+    let recorded = server
+        .received_requests()
+        .await
+        .expect("mock server should record requests");
+    let mut chat_bodies = Vec::new();
+    for request in recorded {
+        if request.url.path().ends_with("/chat/completions") {
+            let body: Value = request.body_json().expect("chat request body should be JSON");
+            chat_bodies.push(body);
+        }
+    }
+    world.requests = chat_bodies;
+    world.llm_server = Some(server);
+}
+
+/// Checks that the first recorded chat request carries the prompt and no tool result yet.
+#[then("CodeWhale should send the user request to the mocked LLM")]
+fn codewhale_should_send_user_request_to_mocked_llm(world: &mut ToolLifecycleWorld) {
+    let Some(initial) = world.requests.first() else {
+        panic!("expected an initial chat request");
+    };
+
+    // NOTE(review): the prompt is hard-coded here instead of reusing the `when` step's text.
+    let has_prompt = request_contains_user_text(initial, "list the current directory");
+    assert!(has_prompt, "initial request should include the user prompt:\n{initial:#}");
+
+    assert!(
+        !request_contains_tool_result(initial),
+        "initial request should not include a tool result:\n{initial:#}"
+    );
+}
+
+/// Matches the table row against the `tool_use` event; status/marker are only sanity-checked.
+#[then("the public tool lifecycle should show a running tool:")]
+fn public_tool_lifecycle_should_show_running_tool(world: &mut ToolLifecycleWorld, step: &Step) {
+    let row = one_table_row(step);
+    assert_eq!(row_value(&row, "status"), "running");
+    assert_eq!(row_value(&row, "marker"), "[~]");
+
+    let tool_use = tool_use_event(world, &row_value(&row, "tool"));
+    let expected_path = Value::String(row_value(&row, "input"));
+    let actual_path = tool_use.get("input").and_then(|input| input.get("path"));
+    assert_eq!(actual_path, Some(&expected_path));
+}
+
+/// Parses the `tool_result` output as a JSON array and looks up every expected name/kind pair.
+#[then("the public tool result should return directory entries:")]
+fn public_tool_result_should_return_directory_entries(world: &mut ToolLifecycleWorld, step: &Step) {
+    let output = tool_result_output(world);
+    let listed: Vec<Value> =
+        serde_json::from_str(output).expect("list_dir result should be JSON entries");
+
+    for row in data_table_rows(step) {
+        let wanted_name = row_value(&row, "entry");
+        let wanted_dir = match row_value(&row, "kind").as_str() {
+            "folder" => true,
+            "file" => false,
+            other => panic!("unsupported expected entry kind: {other}"),
+        };
+        let found = listed.iter().any(|entry| {
+            let name = entry.get("name").and_then(Value::as_str);
+            let is_dir = entry.get("is_dir").and_then(Value::as_bool);
+            name == Some(wanted_name.as_str()) && is_dir == Some(wanted_dir)
+        });
+        assert!(found, "missing {wanted_name} in list_dir result:\n{output}");
+    }
+}
+
+/// Verifies that a recorded chat request carries a tool message with the expected call id
+/// and a string `content` naming every workspace entry.
+#[then("CodeWhale should send the tool result back to the mocked LLM")]
+fn codewhale_should_send_tool_result_back_to_mocked_llm(world: &mut ToolLifecycleWorld) {
+    // The first request containing a `role: "tool"` message is the follow-up turn.
+    let tool_message = world
+        .requests
+        .iter()
+        .find_map(tool_result_message)
+        .expect("expected a follow-up chat request containing the tool result");
+    let call_id = tool_message.get("tool_call_id").and_then(Value::as_str);
+    assert_eq!(call_id, Some(TOOL_CALL_ID));
+
+    let Some(content) = tool_message.get("content").and_then(Value::as_str) else {
+        panic!("tool result content");
+    };
+
+    // NOTE(review): entry names are hard-coded; they mirror the feature's workspace table.
+    let expected_entries = ["README.md", "notes.txt", "src"];
+    for entry in expected_entries {
+        assert!(
+            content.contains(entry),
+            "tool result sent to LLM should include {entry}:\n{content}"
+        );
+    }
+}
+
+/// Expects a successful `tool_result` event plus the matching `tool_use` input path.
+#[then("the public tool lifecycle should show a completed tool:")]
+fn public_tool_lifecycle_should_show_completed_tool(world: &mut ToolLifecycleWorld, step: &Step) {
+    let row = one_table_row(step);
+    assert_eq!(row_value(&row, "status"), "completed");
+    assert_eq!(row_value(&row, "marker"), "✓");
+
+    let result_status = tool_result_event(world).get("status").and_then(Value::as_str);
+    assert_eq!(result_status, Some("success"));
+
+    let tool_use = tool_use_event(world, &row_value(&row, "tool"));
+    let expected_path = Value::String(row_value(&row, "input"));
+    let actual_path = tool_use.get("input").and_then(|input| input.get("path"));
+    assert_eq!(actual_path, Some(&expected_path));
+}
+
+/// Joins the text of every `content` event and checks that it contains `expected`.
+#[then(regex = r#"^the public output should include "([^"]+)"$"#)]
+fn public_output_should_include(world: &mut ToolLifecycleWorld, expected: String) {
+    let content_events = world.events.iter().filter(|event| event["type"] == "content");
+    let content: String = content_events
+        .filter_map(|event| event["content"].as_str())
+        .collect();
+
+    assert!(
+        content.contains(&expected),
+        "public content output should include {expected:?}:\nstdout:\n{}\nstderr:\n{}",
+        world.stdout,
+        world.stderr
+    );
+}
+
+#[tokio::test(flavor = "current_thread")]
+async fn happy_path_lists_current_directory_through_tool() {
+    run_scenario(HAPPY_PATH_SCENARIO).await; // runs only the scenario with this exact name
+}
+
+/// Runs the single scenario called `name` and asserts that all 10 of its steps passed.
+async fn run_scenario(name: &'static str) {
+    let runner = ToolLifecycleWorld::cucumber().fail_on_skipped().with_default_cli();
+    let run = runner.filter_run(FEATURE_PATH, move |feature, _, scenario| {
+        scenario.name == name && feature.name == FEATURE_NAME
+    });
+    let writer = run.await;
+
+    assert_eq!(writer.failed_steps(), 0, "scenario failed: {name}");
+    assert_eq!(writer.skipped_steps(), 0, "scenario skipped steps: {name}");
+    assert_eq!(writer.passed_steps(), 10, "scenario did not run: {name}");
+}
+
+/// Starts a wiremock server with three mocks: the model list, the final answer for chat
+/// requests that already contain a tool result, and the tool call for all other chat requests.
+async fn start_mock_llm(world: &ToolLifecycleWorld) -> MockServer {
+    let server = MockServer::start().await;
+    let chat = || Mock::given(method("POST")).and(path("/v1/chat/completions"));
+
+    // Model list endpoint.
+    let models = json!({ "object": "list", "data": [{ "id": TEST_MODEL, "object": "model" }] });
+    Mock::given(method("GET"))
+        .and(path("/v1/models"))
+        .respond_with(json_response(models))
+        .mount(&server)
+        .await;
+
+    // Second turn: the request carries a `role: "tool"` message.
+    let answer = world.final_answer.as_ref().expect("final LLM answer");
+    chat()
+        .and(request_has_tool_result)
+        .respond_with(sse_response(&final_answer_sse(answer)))
+        .mount(&server)
+        .await;
+
+    // First turn: no tool message yet, so the mocked LLM requests the tool.
+    let tool_name = world.tool_name.as_ref().expect("tool name");
+    let tool_input = world.tool_input.as_ref().expect("tool input");
+    chat()
+        .and(request_has_no_tool_result)
+        .respond_with(sse_response(&tool_call_sse(tool_name, tool_input)))
+        .mount(&server)
+        .await;
+
+    server
+}
+
+/// Builds and runs `codewhale-tui exec` in the scenario workspace against the mock server.
+///
+/// The child starts from a cleared environment with home/config paths inside the temp home.
+fn run_codewhale_exec(
+    world: &ToolLifecycleWorld,
+    server: &MockServer,
+    prompt: &str,
+) -> std::process::Output {
+    let workspace = world.workspace.as_ref().expect("workspace").path();
+    let home = world.home.as_ref().expect("home").path();
+    let codewhale_config = home.join(".codewhale").join("config.toml");
+    let deepseek_config = home.join(".deepseek").join("config.toml");
+    let base_url = server.uri();
+
+    let mut command = Command::new(codewhale_tui_binary());
+    preserve_host_env(&mut command);
+
+    // Working directory, CLI arguments (prompt last) and piped output.
+    command
+        .current_dir(workspace)
+        .arg("--workspace")
+        .arg(workspace)
+        .args(["--no-project-config", "exec", "--auto"])
+        .args(["--model", TEST_MODEL])
+        .args(["--output-format", "stream-json"])
+        .arg(prompt)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped());
+
+    // HOME, XDG and config-path variables all point into the temp home directory.
+    command
+        .env("HOME", home)
+        .env("USERPROFILE", home)
+        .env("XDG_CONFIG_HOME", home.join(".config"))
+        .env("XDG_DATA_HOME", home.join(".local").join("share"))
+        .env("XDG_CACHE_HOME", home.join(".cache"))
+        .env("CODEWHALE_CONFIG_PATH", codewhale_config)
+        .env("DEEPSEEK_CONFIG_PATH", deepseek_config);
+
+    // Dummy API key plus the mock server URL and test model under both env-var prefixes.
+    command
+        .env("DEEPSEEK_API_KEY", "ci-test-key-not-real")
+        .env("DEEPSEEK_BASE_URL", &base_url)
+        .env("CODEWHALE_BASE_URL", &base_url)
+        .env("DEEPSEEK_MODEL", TEST_MODEL)
+        .env("CODEWHALE_MODEL", TEST_MODEL)
+        .env("RUST_LOG", "warn");
+
+    // Only the config directories are created; no config file is written here.
+    std::fs::create_dir_all(home.join(".codewhale")).expect("create codewhale home config dir");
+    std::fs::create_dir_all(home.join(".deepseek")).expect("create deepseek home config dir");
+
+    run_with_timeout(command, Duration::from_secs(45))
+}
+
+/// Runs `command` to completion, panicking if it does not exit within `timeout`.
+///
+/// Both pipes are drained on background threads while waiting: a child that writes more
+/// than the OS pipe buffer holds would otherwise block on write and always hit the timeout.
+fn run_with_timeout(mut command: Command, timeout: Duration) -> std::process::Output {
+    let mut child = command.spawn().expect("spawn codewhale-tui exec");
+    let mut stdout_pipe = child.stdout.take().expect("stdout pipe");
+    let mut stderr_pipe = child.stderr.take().expect("stderr pipe");
+    // Readers start before the wait so output is consumed as the child produces it.
+    let stdout_reader = std::thread::spawn(move || {
+        let mut buffer = Vec::new();
+        stdout_pipe.read_to_end(&mut buffer).map(|_| buffer)
+    });
+    let stderr_reader = std::thread::spawn(move || {
+        let mut buffer = Vec::new();
+        stderr_pipe.read_to_end(&mut buffer).map(|_| buffer)
+    });
+
+    let Some(status) = child.wait_timeout(timeout).expect("wait for codewhale-tui") else {
+        let _ = child.kill();
+        let _ = child.wait();
+        panic!("codewhale-tui exec timed out after {timeout:?}");
+    };
+    let stdout = stdout_reader.join().expect("stdout reader thread").expect("read stdout");
+    let stderr = stderr_reader.join().expect("stderr reader thread").expect("read stderr");
+
+    std::process::Output {
+        status,
+        stdout,
+        stderr,
+    }
+}
+
+/// Clears the child's environment, then copies back only the allow-listed host variables.
+fn preserve_host_env(command: &mut Command) {
+    const KEPT_KEYS: &[&str] = &[
+        "PATH",
+        "PATHEXT",
+        "SystemRoot",
+        "SystemDrive",
+        "WINDIR",
+        "COMSPEC",
+        "TEMP",
+        "TMP",
+    ];
+    let kept = KEPT_KEYS
+        .iter()
+        .filter_map(|key| std::env::var_os(key).map(|value| (key, value)));
+    command.env_clear().envs(kept);
+}
+
+/// Builds the SSE body for the first LLM turn: one tool-call delta, a finish chunk, `[DONE]`.
+fn tool_call_sse(tool_name: &str, tool_input: &Value) -> String {
+    // `arguments` is the tool input serialized to a JSON string, not a nested object.
+    let arguments = serde_json::to_string(tool_input).expect("tool input arguments");
+    let call_delta = sse_chunk(json!({
+        "id": "chatcmpl-tool",
+        "object": "chat.completion.chunk",
+        "model": TEST_MODEL,
+        "choices": [{
+            "index": 0,
+            "delta": {
+                "tool_calls": [{
+                    "index": 0,
+                    "id": TOOL_CALL_ID,
+                    "type": "function",
+                    "function": {
+                        "name": tool_name,
+                        "arguments": arguments
+                    }
+                }]
+            },
+            "finish_reason": null
+        }]
+    }));
+    let finish = sse_chunk(json!({
+        "id": "chatcmpl-tool",
+        "object": "chat.completion.chunk",
+        "model": TEST_MODEL,
+        "choices": [{
+            "index": 0,
+            "delta": {},
+            "finish_reason": "tool_calls"
+        }],
+        "usage": {
+            "prompt_tokens": 10,
+            "completion_tokens": 2,
+            "total_tokens": 12
+        }
+    }));
+
+    format!("{call_delta}{finish}data: [DONE]\n\n")
+}
+
+/// Builds the SSE body for the final LLM turn: one content delta, a stop chunk, `[DONE]`.
+fn final_answer_sse(answer: &str) -> String {
+    let content_delta = sse_chunk(json!({
+        "id": "chatcmpl-final",
+        "object": "chat.completion.chunk",
+        "model": TEST_MODEL,
+        "choices": [{
+            "index": 0,
+            "delta": { "content": answer },
+            "finish_reason": null
+        }]
+    }));
+    // The usage numbers below are fixed literals, not derived from `answer`.
+    let finish = sse_chunk(json!({
+        "id": "chatcmpl-final",
+        "object": "chat.completion.chunk",
+        "model": TEST_MODEL,
+        "choices": [{
+            "index": 0,
+            "delta": {},
+            "finish_reason": "stop"
+        }],
+        "usage": {
+            "prompt_tokens": 20,
+            "completion_tokens": 8,
+            "total_tokens": 28
+        }
+    }));
+
+    format!("{content_delta}{finish}data: [DONE]\n\n")
+}
+
+/// Frames one JSON value as a single SSE `data:` event: compact JSON on one line followed
+/// by the blank line that terminates the event.
+fn sse_chunk(value: Value) -> String {
+    let payload = serde_json::to_string(&value).expect("SSE JSON");
+    format!("data: {payload}\n\n")
+}
+
+/// Wraps a pre-rendered SSE body in a 200 response whose MIME type is `text/event-stream`
+/// (`set_body_string` labels the body `text/plain`, so `set_body_raw` is used instead).
+fn sse_response(body: &str) -> ResponseTemplate {
+    let template = ResponseTemplate::new(200).insert_header("cache-control", "no-cache");
+    template.set_body_raw(body, "text/event-stream")
+}
+
+/// Builds a 200 response with a JSON body and an explicit JSON content-type header.
+fn json_response(value: Value) -> ResponseTemplate {
+    let typed = ResponseTemplate::new(200).insert_header("content-type", "application/json");
+    typed.set_body_json(value)
+}
+
+/// Wiremock matcher: true when the body parses as JSON and contains a `role: "tool"` message.
+fn request_has_tool_result(request: &Request) -> bool {
+    let parsed = request.body_json::<Value>();
+    matches!(&parsed, Ok(body) if request_contains_tool_result(body))
+}
+
+fn request_has_no_tool_result(request: &Request) -> bool {
+    !request_has_tool_result(request) // also true when the body is not valid JSON
+}
+
+fn request_contains_tool_result(request: &Value) -> bool {
+    tool_result_message(request).is_some() // i.e. some message has role "tool"
+}
+
+/// Returns the first element of the request's `messages` array whose `role` is `"tool"`,
+/// or `None` when `messages` is missing, not an array, or has no such element.
+fn tool_result_message(request: &Value) -> Option<&Value> {
+    let messages = request.get("messages")?.as_array()?;
+    let is_tool = |message: &&Value| message["role"] == "tool";
+    messages.iter().find(is_tool)
+}
+
+/// True when some `user` message in `messages` has a `content` value containing `expected`.
+fn request_contains_user_text(request: &Value, expected: &str) -> bool {
+    let Some(messages) = request.get("messages").and_then(Value::as_array) else {
+        return false;
+    };
+    for message in messages {
+        let from_user = message.get("role").and_then(Value::as_str) == Some("user");
+        if from_user && message.get("content").is_some_and(|c| value_contains_text(c, expected)) {
+            return true;
+        }
+    }
+    false
+}
+
+/// Recursively searches strings nested in arrays and object values (not keys) for `expected`.
+fn value_contains_text(value: &Value, expected: &str) -> bool {
+    if let Some(text) = value.as_str() {
+        text.contains(expected)
+    } else if let Some(items) = value.as_array() {
+        items.iter().any(|item| value_contains_text(item, expected))
+    } else if let Some(fields) = value.as_object() {
+        fields.values().any(|field| value_contains_text(field, expected))
+    } else {
+        false
+    }
+}
+
+/// Parses every stdout line from its first `{` onward as JSON; lines without `{` are skipped.
+fn parse_stream_events(stdout: &str) -> Vec<Value> {
+    let mut events = Vec::new();
+    for line in stdout.lines() {
+        let Some(at) = line.find('{') else { continue };
+        let json_line = &line[at..];
+        let parsed: Result<Value, _> = serde_json::from_str(json_line);
+        events.push(parsed.unwrap_or_else(|err| {
+            panic!(
+                "stream-json line should parse: {err}\nline: {line}\njson: {json_line}\nstdout:\n{stdout}"
+            )
+        }));
+    }
+    events
+}
+
+/// First `tool_use` event named `expected_tool`; panics with the captured output if absent.
+fn tool_use_event<'a>(world: &'a ToolLifecycleWorld, expected_tool: &str) -> &'a Value {
+    for event in &world.events {
+        let kind = event.get("type").and_then(Value::as_str);
+        let name = event.get("name").and_then(Value::as_str);
+        if kind == Some("tool_use") && name == Some(expected_tool) {
+            return event;
+        }
+    }
+
+    panic!(
+        "expected tool_use event for {expected_tool}\nstdout:\n{}\nstderr:\n{}",
+        world.stdout, world.stderr
+    )
+}
+
+/// First `tool_result` event in the stream; panics with the captured output if absent.
+fn tool_result_event(world: &ToolLifecycleWorld) -> &Value {
+    // NOTE(review): only the first result is inspected; fine while the scenario runs one tool.
+    let is_tool_result = |event: &&Value| event["type"] == "tool_result";
+    let Some(event) = world.events.iter().find(is_tool_result) else {
+        panic!(
+            "expected tool_result event\nstdout:\n{}\nstderr:\n{}",
+            world.stdout, world.stderr
+        );
+    };
+    event
+}
+
+/// Returns the `output` field of the first `tool_result` event; panics when the field is
+/// missing or is not a JSON string.
+fn tool_result_output(world: &ToolLifecycleWorld) -> &str {
+    let output = tool_result_event(world).get("output").and_then(Value::as_str);
+    output.expect("tool_result output")
+}
+
+fn one_table_row(step: &Step) -> Vec<(String, String)> {
+    let mut rows = data_table_rows(step); // data rows only; the header row is not included
+    assert_eq!(rows.len(), 1, "expected exactly one data table row");
+    rows.pop().expect("one row") // len == 1, so the last row is the only row
+}
+
+/// Converts the step's data table into rows of `(header, cell)` pairs.
+///
+/// The first table row is the header and is not returned. Panics when the step has no
+/// table, the table has no header row, or no data row follows the header.
+fn data_table_rows(step: &Step) -> Vec<Vec<(String, String)>> {
+    let Some(table) = step.table.as_ref() else {
+        panic!("step should include a data table");
+    };
+    // Row 0 names the columns; the remaining rows are data.
+    let Some((headers, data_rows)) = table.rows.split_first() else {
+        panic!("data table should include a header");
+    };
+
+    let mut values = Vec::with_capacity(data_rows.len());
+    for cells in data_rows {
+        // `zip` stops at the shorter side, so surplus headers or cells are dropped silently.
+        let pairs = headers.iter().cloned().zip(cells.iter().cloned());
+        values.push(pairs.collect::<Vec<(String, String)>>());
+    }
+
+    assert!(
+        !values.is_empty(),
+        "data table should include at least one row"
+    );
+    values
+}
+
+/// Returns a copy of the cell under `header`; panics if the row has no such column.
+fn row_value(row: &[(String, String)], header: &str) -> String {
+    let cell = row.iter().find(|(key, _)| key == header).map(|(_, value)| value.clone());
+    cell.unwrap_or_else(|| panic!("data table row missing {header} value"))
+}
+
+/// Locates the `codewhale-tui` binary: Cargo's compile-time path, then the runtime env var,
+/// then a sibling of the test executable (stepping out of `deps/` when present).
+fn codewhale_tui_binary() -> PathBuf {
+    let baked = option_env!("CARGO_BIN_EXE_codewhale-tui").map(str::to_owned);
+    if let Some(path) = baked.or_else(|| std::env::var("CARGO_BIN_EXE_codewhale-tui").ok()) {
+        return PathBuf::from(path);
+    }
+
+    let mut binary = std::env::current_exe().expect("current test executable path");
+    binary.pop();
+    if binary.ends_with("deps") {
+        binary.pop();
+    }
+    binary.push(format!("codewhale-tui{}", std::env::consts::EXE_SUFFIX));
+    binary
+}