Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
fix: escape invalid UTF-8 bytes in debug output for Match
  • Loading branch information
notJoon committed Jun 8, 2024
commit 5881c81204bcf4bfb7ac73d36712f0bc24632dec
78 changes: 68 additions & 10 deletions src/regex/bytes.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use alloc::{borrow::Cow, string::String, sync::Arc, vec::Vec};
use alloc::{borrow::Cow, format, string::String, sync::Arc, vec::Vec};

use regex_automata::{meta, util::captures, Input, PatternID};

Expand Down Expand Up @@ -1557,18 +1557,25 @@ impl<'h> core::fmt::Debug for Match<'h> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let mut fmt = f.debug_struct("Match");
fmt.field("start", &self.start).field("end", &self.end);
if let Ok(s) = core::str::from_utf8(self.as_bytes()) {
fmt.field("bytes", &s);

let bytes = self.as_bytes();
let formatted = bytes_to_string_with_invalid_utf8_escaped(bytes);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use regex_automata::util::escape::DebugHaystack instead? It will basically do what you have here, but will only escape invalid UTF-8. What you've implemented here will escape not only invalid UTF-8, but all UTF-8 that isn't ASCII. (I think that would be a cure worse than the disease.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modified to use DebugHaystack. I thought there would be such a feature but couldn't find it. Thanks for your suggestion. 88112b3

fmt.field("bytes", &formatted);

fmt.finish()
}
}

/// Converts `bytes` to a `String`, replacing only the bytes that are not
/// part of valid UTF-8 with `\xNN` hex escapes (two uppercase hex digits).
///
/// Valid UTF-8 text, including non-ASCII characters, is copied through
/// unchanged. An incomplete multi-byte sequence at the end of the input is
/// escaped byte by byte. Empty input yields an empty string.
fn bytes_to_string_with_invalid_utf8_escaped(bytes: &[u8]) -> String {
    use core::fmt::Write;

    let mut result = String::with_capacity(bytes.len());
    let mut rest = bytes;
    while !rest.is_empty() {
        match core::str::from_utf8(rest) {
            Ok(valid) => {
                // Everything that is left is valid UTF-8.
                result.push_str(valid);
                break;
            }
            Err(err) => {
                let (valid, after) = rest.split_at(err.valid_up_to());
                // The prefix up to `valid_up_to` is guaranteed to be valid,
                // so this second validation always succeeds.
                if let Ok(prefix) = core::str::from_utf8(valid) {
                    result.push_str(prefix);
                }
                // `error_len` is `None` when the input ends in the middle of
                // a multi-byte sequence; every remaining byte is then bad.
                let bad_len = err.error_len().unwrap_or(after.len());
                let (bad, tail) = after.split_at(bad_len);
                for &byte in bad {
                    // Writing into a `String` cannot fail.
                    let _ = write!(result, "\\x{:02X}", byte);
                }
                rest = tail;
            }
        }
    }
    result
}

impl<'h> From<Match<'h>> for &'h [u8] {
Expand Down Expand Up @@ -2620,3 +2627,54 @@ fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> {
None => Some(Cow::Borrowed(replacement)),
}
}

#[cfg(test)]
mod tests {
use super::*;
use alloc::format;

// Checks the basic accessors of a `Match` covering "world" inside
// "Hello, world!" (byte offsets 7..12).
#[test]
fn test_match_properties() {
    let found = Match::new(b"Hello, world!", 7, 12);

    // Offsets are reported exactly as given to the constructor.
    assert_eq!(found.start(), 7);
    assert_eq!(found.end(), 12);

    // A five-byte match is not empty.
    assert!(!found.is_empty());
    assert_eq!(found.len(), 5);

    // The matched slice is the haystack restricted to start..end.
    assert_eq!(found.as_bytes(), b"world");
}

// A zero-width match on an empty haystack reports itself as empty with
// length zero.
#[test]
fn test_empty_match() {
    let found = Match::new(b"", 0, 0);

    assert!(found.is_empty());
    assert_eq!(found.len(), 0);
}

// The `Debug` rendering of a match over plain ASCII text shows the matched
// bytes as a quoted string.
#[test]
fn test_debug_output_valid_utf8() {
    let expected = r#"Match { start: 7, end: 12, bytes: "world" }"#;

    let found = Match::new(b"Hello, world!", 7, 12);
    let rendered = format!("{:?}", found);

    assert_eq!(rendered, expected);
}

// The `Debug` rendering of a match that starts with the invalid byte 0xFF
// shows that byte as a hex escape in the `bytes` field.
#[test]
fn test_debug_output_invalid_utf8() {
    // The raw string below contains two backslashes before `xFF`.
    let expected = r#"Match { start: 7, end: 13, bytes: "\\xFFworld" }"#;

    let found = Match::new(b"Hello, \xFFworld!", 7, 13);
    let rendered = format!("{:?}", found);

    assert_eq!(rendered, expected);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add some tests with non-ASCII UTF-8.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added along with other tests.
d18841e

}