diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 157c09a2d7b..7eec10020b5 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -636,6 +636,7 @@ version = "0.0.0" dependencies = [ "anyhow", "assert_cmd", + "once_cell", "pretty_assertions", "similar", "tempfile", diff --git a/codex-rs/apply-patch/Cargo.toml b/codex-rs/apply-patch/Cargo.toml index 32c7f6e43ff..84d3630cfad 100644 --- a/codex-rs/apply-patch/Cargo.toml +++ b/codex-rs/apply-patch/Cargo.toml @@ -20,6 +20,7 @@ similar = "2.7.0" thiserror = "2.0.12" tree-sitter = "0.25.8" tree-sitter-bash = "0.25.0" +once_cell = "1" [dev-dependencies] assert_cmd = "2" diff --git a/codex-rs/apply-patch/src/lib.rs b/codex-rs/apply-patch/src/lib.rs index 46ddf056c8e..18d3bd4d325 100644 --- a/codex-rs/apply-patch/src/lib.rs +++ b/codex-rs/apply-patch/src/lib.rs @@ -9,6 +9,7 @@ use std::str::Utf8Error; use anyhow::Context; use anyhow::Result; +use once_cell::sync::Lazy; pub use parser::Hunk; pub use parser::ParseError; use parser::ParseError::*; @@ -18,6 +19,9 @@ use similar::TextDiff; use thiserror::Error; use tree_sitter::LanguageError; use tree_sitter::Parser; +use tree_sitter::Query; +use tree_sitter::QueryCursor; +use tree_sitter::StreamingIterator; use tree_sitter_bash::LANGUAGE as BASH; pub use standalone_executable::main; @@ -84,6 +88,7 @@ pub enum MaybeApplyPatch { pub struct ApplyPatchArgs { pub patch: String, pub hunks: Vec, + pub workdir: Option, } pub fn maybe_parse_apply_patch(argv: &[String]) -> MaybeApplyPatch { @@ -92,18 +97,18 @@ pub fn maybe_parse_apply_patch(argv: &[String]) -> MaybeApplyPatch { Ok(source) => MaybeApplyPatch::Body(source), Err(e) => MaybeApplyPatch::PatchParseError(e), }, - [bash, flag, script] - if bash == "bash" - && flag == "-lc" - && APPLY_PATCH_COMMANDS - .iter() - .any(|cmd| script.trim_start().starts_with(cmd)) => - { - match extract_heredoc_body_from_apply_patch_command(script) { - Ok(body) => match parse_patch(&body) { - Ok(source) => MaybeApplyPatch::Body(source), + [bash, flag, script] if bash == "bash" && flag == "-lc" => { + match extract_apply_patch_from_bash(script) { + Ok((body, workdir)) => match parse_patch(&body) { + Ok(mut source) => { + source.workdir = workdir; + MaybeApplyPatch::Body(source) + } Err(e) => MaybeApplyPatch::PatchParseError(e), }, + Err(ExtractHeredocError::CommandDidNotStartWithApplyPatch) => { + MaybeApplyPatch::NotApplyPatch + } Err(e) => MaybeApplyPatch::ShellParseError(e), } } @@ -203,10 +208,25 @@ impl ApplyPatchAction { /// patch. pub fn maybe_parse_apply_patch_verified(argv: &[String], cwd: &Path) -> MaybeApplyPatchVerified { match maybe_parse_apply_patch(argv) { - MaybeApplyPatch::Body(ApplyPatchArgs { patch, hunks }) => { + MaybeApplyPatch::Body(ApplyPatchArgs { + patch, + hunks, + workdir, + }) => { + let effective_cwd = workdir + .as_ref() + .map(|dir| { + let path = Path::new(dir); + if path.is_absolute() { + path.to_path_buf() + } else { + cwd.join(path) + } + }) + .unwrap_or_else(|| cwd.to_path_buf()); let mut changes = HashMap::new(); for hunk in hunks { - let path = hunk.resolve_path(cwd); + let path = hunk.resolve_path(&effective_cwd); match hunk { Hunk::AddFile { contents, .. } => { changes.insert(path, ApplyPatchFileChange::Add { content: contents }); @@ -251,7 +271,7 @@ pub fn maybe_parse_apply_patch_verified(argv: &[String], cwd: &Path) -> MaybeApp MaybeApplyPatchVerified::Body(ApplyPatchAction { changes, patch, - cwd: cwd.to_path_buf(), + cwd: effective_cwd, }) } MaybeApplyPatch::ShellParseError(e) => MaybeApplyPatchVerified::ShellParseError(e), @@ -260,33 +280,96 @@ pub fn maybe_parse_apply_patch_verified(argv: &[String], cwd: &Path) -> MaybeApp } } -/// Attempts to extract a heredoc_body object from a string bash command like: -/// Optimistically -/// -/// ```bash -/// bash -lc 'apply_patch < && apply_patch <<'EOF'\n...\nEOF` /// -/// * `src` - A string slice that holds the full command +/// Notes about matching: +/// - Parsed with Tree‑sitter Bash and a strict query that uses anchors so the +/// heredoc‑redirected statement is the only top‑level statement. +/// - The connector between `cd` and `apply_patch` must be `&&` (not `|` or `||`). +/// - Exactly one positional `word` argument is allowed for `cd` (no flags, no quoted +/// strings, no second argument). +/// - The apply command is validated in‑query via `#any-of?` to allow `apply_patch` +/// or `applypatch`. +/// - Preceding or trailing commands (e.g., `echo ...;` or `... && echo done`) do not match. /// -/// # Returns -/// -/// This function returns a `Result` which is: -/// -/// * `Ok(String)` - The heredoc body if the extraction is successful. -/// * `Err(anyhow::Error)` - An error if the extraction fails. -/// -fn extract_heredoc_body_from_apply_patch_command( +/// Returns `(heredoc_body, Some(path))` when the `cd` variant matches, or +/// `(heredoc_body, None)` for the direct form. Errors are returned if the script +/// cannot be parsed or does not match the allowed patterns. +fn extract_apply_patch_from_bash( src: &str, -) -> std::result::Result { - if !APPLY_PATCH_COMMANDS - .iter() - .any(|cmd| src.trim_start().starts_with(cmd)) - { - return Err(ExtractHeredocError::CommandDidNotStartWithApplyPatch); - } +) -> std::result::Result<(String, Option), ExtractHeredocError> { + // This function uses a Tree-sitter query to recognize one of two + // whole-script forms, each expressed as a single top-level statement: + // + // 1. apply_patch <<'EOF'\n...\nEOF + // 2. cd && apply_patch <<'EOF'\n...\nEOF + // + // Key ideas when reading the query: + // - dots (`.`) between named nodes enforces adjacency among named children and + // anchor to the start/end of the expression. + // - we match a single redirected_statement directly under program with leading + // and trailing anchors (`.`). This ensures it is the only top-level statement + // (so prefixes like `echo ...;` or suffixes like `... && echo done` do not match). + // + // Overall, we want to be conservative and only match the intended forms, as other + // forms are likely to be model errors, or incorrectly interpreted by later code. + // + // If you're editing this query, it's helpful to start by creating a debugging binary + // which will let you see the AST of an arbitrary bash script passed in, and optionally + // also run an arbitrary query against the AST. This is useful for understanding + // how tree-sitter parses the script and whether the query syntax is correct. Be sure + // to test both positive and negative cases. + static APPLY_PATCH_QUERY: Lazy = Lazy::new(|| { + let language = BASH.into(); + #[expect(clippy::expect_used)] + Query::new( + &language, + r#" + ( + program + . (redirected_statement + body: (command + name: (command_name (word) @apply_name) .) + (#any-of? @apply_name "apply_patch" "applypatch") + redirect: (heredoc_redirect + . (heredoc_start) + . (heredoc_body) @heredoc + . (heredoc_end) + .)) + .) + + ( + program + . (redirected_statement + body: (list + . (command + name: (command_name (word) @cd_name) . + argument: [ + (word) @cd_path + (string (string_content) @cd_path) + (raw_string) @cd_raw_string + ] .) + "&&" + . (command + name: (command_name (word) @apply_name)) + .) + (#eq? @cd_name "cd") + (#any-of? @apply_name "apply_patch" "applypatch") + redirect: (heredoc_redirect + . (heredoc_start) + . (heredoc_body) @heredoc + . (heredoc_end) + .)) + .) + "#, + ) + .expect("valid bash query") + }); let lang = BASH.into(); let mut parser = Parser::new(); @@ -298,26 +381,55 @@ fn extract_heredoc_body_from_apply_patch_command( .ok_or(ExtractHeredocError::FailedToParsePatchIntoAst)?; let bytes = src.as_bytes(); - let mut c = tree.root_node().walk(); - - loop { - let node = c.node(); - if node.kind() == "heredoc_body" { - let text = node - .utf8_text(bytes) - .map_err(ExtractHeredocError::HeredocNotUtf8)?; - return Ok(text.trim_end_matches('\n').to_owned()); + let root = tree.root_node(); + + let mut cursor = QueryCursor::new(); + let mut matches = cursor.matches(&APPLY_PATCH_QUERY, root, bytes); + while let Some(m) = matches.next() { + let mut heredoc_text: Option = None; + let mut cd_path: Option = None; + + for capture in m.captures.iter() { + let name = APPLY_PATCH_QUERY.capture_names()[capture.index as usize]; + match name { + "heredoc" => { + let text = capture + .node + .utf8_text(bytes) + .map_err(ExtractHeredocError::HeredocNotUtf8)? + .trim_end_matches('\n') + .to_string(); + heredoc_text = Some(text); + } + "cd_path" => { + let text = capture + .node + .utf8_text(bytes) + .map_err(ExtractHeredocError::HeredocNotUtf8)? + .to_string(); + cd_path = Some(text); + } + "cd_raw_string" => { + let raw = capture + .node + .utf8_text(bytes) + .map_err(ExtractHeredocError::HeredocNotUtf8)?; + let trimmed = raw + .strip_prefix('\'') + .and_then(|s| s.strip_suffix('\'')) + .unwrap_or(raw); + cd_path = Some(trimmed.to_string()); + } + _ => {} + } } - if c.goto_first_child() { - continue; - } - while !c.goto_next_sibling() { - if !c.goto_parent() { - return Err(ExtractHeredocError::FailedToFindHeredocBody); - } + if let Some(heredoc) = heredoc_text { + return Ok((heredoc, cd_path)); } } + + Err(ExtractHeredocError::CommandDidNotStartWithApplyPatch) } #[derive(Debug, PartialEq)] @@ -718,6 +830,49 @@ mod tests { strs.iter().map(|s| s.to_string()).collect() } + // Test helpers to reduce repetition when building bash -lc heredoc scripts + fn args_bash(script: &str) -> Vec { + strs_to_strings(&["bash", "-lc", script]) + } + + fn heredoc_script(prefix: &str) -> String { + format!( + "{prefix}apply_patch <<'PATCH'\n*** Begin Patch\n*** Add File: foo\n+hi\n*** End Patch\nPATCH" + ) + } + + fn heredoc_script_ps(prefix: &str, suffix: &str) -> String { + format!( + "{prefix}apply_patch <<'PATCH'\n*** Begin Patch\n*** Add File: foo\n+hi\n*** End Patch\nPATCH{suffix}" + ) + } + + fn expected_single_add() -> Vec { + vec![Hunk::AddFile { + path: PathBuf::from("foo"), + contents: "hi\n".to_string(), + }] + } + + fn assert_match(script: &str, expected_workdir: Option<&str>) { + let args = args_bash(script); + match maybe_parse_apply_patch(&args) { + MaybeApplyPatch::Body(ApplyPatchArgs { hunks, workdir, .. }) => { + assert_eq!(workdir.as_deref(), expected_workdir); + assert_eq!(hunks, expected_single_add()); + } + result => panic!("expected MaybeApplyPatch::Body got {result:?}"), + } + } + + fn assert_not_match(script: &str) { + let args = args_bash(script); + assert!(matches!( + maybe_parse_apply_patch(&args), + MaybeApplyPatch::NotApplyPatch + )); + } + #[test] fn test_literal() { let args = strs_to_strings(&[ @@ -730,7 +885,7 @@ mod tests { ]); match maybe_parse_apply_patch(&args) { - MaybeApplyPatch::Body(ApplyPatchArgs { hunks, patch: _ }) => { + MaybeApplyPatch::Body(ApplyPatchArgs { hunks, .. }) => { assert_eq!( hunks, vec![Hunk::AddFile { @@ -755,7 +910,7 @@ mod tests { ]); match maybe_parse_apply_patch(&args) { - MaybeApplyPatch::Body(ApplyPatchArgs { hunks, patch: _ }) => { + MaybeApplyPatch::Body(ApplyPatchArgs { hunks, .. }) => { assert_eq!( hunks, vec![Hunk::AddFile { @@ -770,29 +925,7 @@ mod tests { #[test] fn test_heredoc() { - let args = strs_to_strings(&[ - "bash", - "-lc", - r#"apply_patch <<'PATCH' -*** Begin Patch -*** Add File: foo -+hi -*** End Patch -PATCH"#, - ]); - - match maybe_parse_apply_patch(&args) { - MaybeApplyPatch::Body(ApplyPatchArgs { hunks, patch: _ }) => { - assert_eq!( - hunks, - vec![Hunk::AddFile { - path: PathBuf::from("foo"), - contents: "hi\n".to_string() - }] - ); - } - result => panic!("expected MaybeApplyPatch::Body got {result:?}"), - } + assert_match(&heredoc_script(""), None); } #[test] @@ -809,7 +942,8 @@ PATCH"#, ]); match maybe_parse_apply_patch(&args) { - MaybeApplyPatch::Body(ApplyPatchArgs { hunks, patch: _ }) => { + MaybeApplyPatch::Body(ApplyPatchArgs { hunks, workdir, .. }) => { + assert_eq!(workdir, None); assert_eq!( hunks, vec![Hunk::AddFile { @@ -822,6 +956,69 @@ PATCH"#, } } + #[test] + fn test_heredoc_with_leading_cd() { + assert_match(&heredoc_script("cd foo && "), Some("foo")); + } + + #[test] + fn test_cd_with_semicolon_is_ignored() { + assert_not_match(&heredoc_script("cd foo; ")); + } + + #[test] + fn test_cd_or_apply_patch_is_ignored() { + assert_not_match(&heredoc_script("cd bar || ")); + } + + #[test] + fn test_cd_pipe_apply_patch_is_ignored() { + assert_not_match(&heredoc_script("cd bar | ")); + } + + #[test] + fn test_cd_single_quoted_path_with_spaces() { + assert_match(&heredoc_script("cd 'foo bar' && "), Some("foo bar")); + } + + #[test] + fn test_cd_double_quoted_path_with_spaces() { + assert_match(&heredoc_script("cd \"foo bar\" && "), Some("foo bar")); + } + + #[test] + fn test_echo_and_apply_patch_is_ignored() { + assert_not_match(&heredoc_script("echo foo && ")); + } + + #[test] + fn test_apply_patch_with_arg_is_ignored() { + let script = "apply_patch foo <<'PATCH'\n*** Begin Patch\n*** Add File: foo\n+hi\n*** End Patch\nPATCH"; + assert_not_match(script); + } + + #[test] + fn test_double_cd_then_apply_patch_is_ignored() { + assert_not_match(&heredoc_script("cd foo && cd bar && ")); + } + + #[test] + fn test_cd_two_args_is_ignored() { + assert_not_match(&heredoc_script("cd foo bar && ")); + } + + #[test] + fn test_cd_then_apply_patch_then_extra_is_ignored() { + let script = heredoc_script_ps("cd bar && ", " && echo done"); + assert_not_match(&script); + } + + #[test] + fn test_echo_then_cd_and_apply_patch_is_ignored() { + // Ensure preceding commands before the `cd && apply_patch <<...` sequence do not match. + assert_not_match(&heredoc_script("echo foo; cd bar && ")); + } + #[test] fn test_add_file_hunk_creates_file_with_contents() { let dir = tempdir().unwrap(); diff --git a/codex-rs/apply-patch/src/parser.rs b/codex-rs/apply-patch/src/parser.rs index ff9dfd6f8fc..3b89c4da852 100644 --- a/codex-rs/apply-patch/src/parser.rs +++ b/codex-rs/apply-patch/src/parser.rs @@ -175,7 +175,11 @@ fn parse_patch_text(patch: &str, mode: ParseMode) -> Result Option> { .collect(); if commands.len() > 1 { commands.retain(|pc| !matches!(pc, ParsedCommand::Unknown { cmd } if cmd == "true")); + // Apply the same simplifications used for non-bash parsing, e.g., drop leading `cd`. + while let Some(next) = simplify_once(&commands) { + commands = next; + } } if commands.len() == 1 { // If we reduced to a single command, attribute the full original script