-
Notifications
You must be signed in to change notification settings - Fork 489
fix: escape invalid UTF-8 bytes in debug output for Match #1203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| use alloc::{borrow::Cow, string::String, sync::Arc, vec::Vec}; | ||
| use alloc::{borrow::Cow, format, string::String, sync::Arc, vec::Vec}; | ||
|
|
||
| use regex_automata::{meta, util::captures, Input, PatternID}; | ||
|
|
||
|
|
@@ -1557,18 +1557,25 @@ impl<'h> core::fmt::Debug for Match<'h> { | |
| fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { | ||
| let mut fmt = f.debug_struct("Match"); | ||
| fmt.field("start", &self.start).field("end", &self.end); | ||
| if let Ok(s) = core::str::from_utf8(self.as_bytes()) { | ||
| fmt.field("bytes", &s); | ||
|
|
||
| let bytes = self.as_bytes(); | ||
| let formatted = bytes_to_string_with_invalid_utf8_escaped(bytes); | ||
| fmt.field("bytes", &formatted); | ||
|
|
||
| fmt.finish() | ||
| } | ||
| } | ||
|
|
||
| fn bytes_to_string_with_invalid_utf8_escaped(bytes: &[u8]) -> String { | ||
| let mut result = String::new(); | ||
| for &byte in bytes { | ||
| if byte.is_ascii() { | ||
|
||
| result.push(byte as char); | ||
| } else { | ||
| // FIXME: It would be nice if this could be printed as a string | ||
| // with invalid UTF-8 replaced with hex escapes. An alloc would | ||
| // probably be okay if that makes it easier, but regex-automata does | ||
| // (at time of writing) have internal routines that do this. So | ||
| // maybe we should expose them. | ||
| fmt.field("bytes", &self.as_bytes()); | ||
| result.push_str(&format!("\\x{:02X}", byte)); | ||
| } | ||
| fmt.finish() | ||
| } | ||
| result | ||
| } | ||
|
|
||
| impl<'h> From<Match<'h>> for &'h [u8] { | ||
|
|
@@ -2620,3 +2627,54 @@ fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> { | |
| None => Some(Cow::Borrowed(replacement)), | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
| use alloc::format; | ||
|
|
||
| #[test] | ||
| fn test_match_properties() { | ||
| let haystack = b"Hello, world!"; | ||
| let m = Match::new(haystack, 7, 12); | ||
|
|
||
| assert_eq!(m.start(), 7); | ||
| assert_eq!(m.end(), 12); | ||
| assert_eq!(m.is_empty(), false); | ||
| assert_eq!(m.len(), 5); | ||
| assert_eq!(m.as_bytes(), b"world"); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_empty_match() { | ||
| let haystack = b""; | ||
| let m = Match::new(haystack, 0, 0); | ||
|
|
||
| assert_eq!(m.is_empty(), true); | ||
| assert_eq!(m.len(), 0); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_debug_output_valid_utf8() { | ||
| let haystack = b"Hello, world!"; | ||
| let m = Match::new(haystack, 7, 12); | ||
| let debug_str = format!("{:?}", m); | ||
|
|
||
| assert_eq!( | ||
| debug_str, | ||
| r#"Match { start: 7, end: 12, bytes: "world" }"# | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_debug_output_invalid_utf8() { | ||
| let haystack = b"Hello, \xFFworld!"; | ||
| let m = Match::new(haystack, 7, 13); | ||
| let debug_str = format!("{:?}", m); | ||
|
|
||
| assert_eq!( | ||
| debug_str, | ||
| r#"Match { start: 7, end: 13, bytes: "\\xFFworld" }"# | ||
| ); | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add some tests with non-ASCII UTF-8.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added along with other tests. |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you use `regex_automata::util::escape::DebugHaystack` instead? It will basically do what you have here, but will only escape invalid UTF-8. What you've implemented here will escape not only invalid UTF-8, but all UTF-8 that isn't ASCII. (I think that would be a cure worse than the disease.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Modified to use DebugHaystack. I thought there would be such a feature but couldn't find it. Thanks for your suggestion. 88112b3