Skip to content

Commit 9287d4f

Browse files
Fix UTF-8 hex escapes in string literals
Co-authored-by: Raj_Pabnani <RajPabnani03@users.noreply.github.com>
1 parent 514b442 commit 9287d4f

5 files changed

Lines changed: 70 additions & 33 deletions

File tree

cli/src/template_parser.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,9 @@ fn parse_term_node(pair: Pair<Rule>) -> TemplateParseResult<ExpressionNode> {
465465
let span = expr.as_span();
466466
let primary = match expr.as_rule() {
467467
Rule::string_literal => {
468-
let text = STRING_LITERAL_PARSER.parse(expr.into_inner());
468+
let text = STRING_LITERAL_PARSER
469+
.parse(expr.into_inner())
470+
.map_err(|err| TemplateParseError::expression(err.to_string(), span))?;
469471
ExpressionNode::new(ExpressionKind::String(text), span)
470472
}
471473
Rule::raw_string_literal => {
@@ -1122,9 +1124,15 @@ mod tests {
11221124
Ok(ExpressionKind::String("aeiou".to_owned())),
11231125
);
11241126
assert_eq!(
1125-
parse_into_kind(r#""\xe0\xe8\xec\xf0\xf9""#),
1127+
parse_into_kind(r#""\xc3\xa0\xc3\xa8\xc3\xac\xc3\xb0\xc3\xb9""#),
11261128
Ok(ExpressionKind::String("àèìðù".to_owned())),
11271129
);
1130+
assert_eq!(
1131+
parse_into_kind(r#""\xc3""#),
1132+
Err(TemplateParseErrorKind::Expression(
1133+
"Invalid UTF-8 sequence".to_owned(),
1134+
)),
1135+
);
11281136
assert_eq!(
11291137
parse_into_kind(r#""\x""#),
11301138
Err(TemplateParseErrorKind::SyntaxError),

docs/templates.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ A double-quoted string literal supports the following escape sequences:
458458
* `\n`: new line
459459
* `\0`: null
460460
* `\e`: escape (i.e., `\x1b`)
461-
* `\xHH`: byte with hex value `HH`
461+
* `\xHH`: byte with hex value `HH` (must be part of a valid UTF-8 string)
462462

463463
Other escape sequences are not supported. Any UTF-8 characters are allowed
464464
inside a string literal, with two exceptions: unescaped `"`-s and uses of `\`

lib/src/dsl_util.rs

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
1717
use std::ascii;
1818
use std::collections::HashMap;
19+
use std::error;
1920
use std::fmt;
2021
use std::slice;
2122

@@ -397,34 +398,44 @@ pub struct StringLiteralParser<R> {
397398
pub escape_rule: R,
398399
}
399400

401+
#[expect(missing_docs)]
402+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
403+
pub struct StringLiteralParseError;
404+
405+
impl fmt::Display for StringLiteralParseError {
406+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
407+
f.write_str("Invalid UTF-8 sequence")
408+
}
409+
}
410+
411+
impl error::Error for StringLiteralParseError {}
412+
400413
impl<R: RuleType> StringLiteralParser<R> {
401414
/// Parses the given string literal `pairs` into string.
402-
pub fn parse(&self, pairs: Pairs<R>) -> String {
403-
let mut result = String::new();
415+
pub fn parse(&self, pairs: Pairs<R>) -> Result<String, StringLiteralParseError> {
416+
let mut result = Vec::new();
404417
for part in pairs {
405418
if part.as_rule() == self.content_rule {
406-
result.push_str(part.as_str());
419+
result.extend_from_slice(part.as_str().as_bytes());
407420
} else if part.as_rule() == self.escape_rule {
408421
match &part.as_str()[1..] {
409-
"\"" => result.push('"'),
410-
"\\" => result.push('\\'),
411-
"t" => result.push('\t'),
412-
"r" => result.push('\r'),
413-
"n" => result.push('\n'),
414-
"0" => result.push('\0'),
415-
"e" => result.push('\x1b'),
422+
"\"" => result.push(b'"'),
423+
"\\" => result.push(b'\\'),
424+
"t" => result.push(b'\t'),
425+
"r" => result.push(b'\r'),
426+
"n" => result.push(b'\n'),
427+
"0" => result.push(b'\0'),
428+
"e" => result.push(b'\x1b'),
416429
hex if hex.starts_with('x') => {
417-
result.push(char::from(
418-
u8::from_str_radix(&hex[1..], 16).expect("hex characters"),
419-
));
430+
result.push(u8::from_str_radix(&hex[1..], 16).expect("hex characters"));
420431
}
421432
char => panic!("invalid escape: \\{char:?}"),
422433
}
423434
} else {
424435
panic!("unexpected part of string: {part:?}");
425436
}
426437
}
427-
result
438+
String::from_utf8(result).map_err(|_| StringLiteralParseError)
428439
}
429440
}
430441

lib/src/fileset_parser.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -236,14 +236,17 @@ fn parse_function_call_node(pair: Pair<Rule>) -> FilesetParseResult<FunctionCall
236236
})
237237
}
238238

239-
fn parse_as_string_literal(pair: Pair<Rule>) -> String {
239+
fn parse_as_string_literal(pair: Pair<Rule>) -> FilesetParseResult<String> {
240+
let span = pair.as_span();
240241
match pair.as_rule() {
241-
Rule::identifier => pair.as_str().to_owned(),
242-
Rule::string_literal => STRING_LITERAL_PARSER.parse(pair.into_inner()),
242+
Rule::identifier => Ok(pair.as_str().to_owned()),
243+
Rule::string_literal => STRING_LITERAL_PARSER
244+
.parse(pair.into_inner())
245+
.map_err(|err| FilesetParseError::expression(err.to_string(), span)),
243246
Rule::raw_string_literal => {
244247
let [content] = pair.into_inner().collect_array().unwrap();
245248
assert_eq!(content.as_rule(), Rule::raw_string_content);
246-
content.as_str().to_owned()
249+
Ok(content.as_str().to_owned())
247250
}
248251
r => panic!("unexpected string literal rule: {r:?}"),
249252
}
@@ -264,12 +267,12 @@ fn parse_primary_node(pair: Pair<Rule>) -> FilesetParseResult<ExpressionNode> {
264267
assert_eq!(lhs.as_rule(), Rule::strict_identifier);
265268
assert_eq!(op.as_rule(), Rule::pattern_kind_op);
266269
let kind = lhs.as_str();
267-
let value = parse_as_string_literal(rhs);
270+
let value = parse_as_string_literal(rhs)?;
268271
ExpressionKind::StringPattern { kind, value }
269272
}
270273
Rule::identifier => ExpressionKind::Identifier(first.as_str()),
271274
Rule::string_literal | Rule::raw_string_literal => {
272-
ExpressionKind::String(parse_as_string_literal(first))
275+
ExpressionKind::String(parse_as_string_literal(first)?)
273276
}
274277
r => panic!("unexpected primary rule: {r:?}"),
275278
};
@@ -519,9 +522,15 @@ mod tests {
519522
Ok(ExpressionKind::String("aeiou".to_owned())),
520523
);
521524
assert_eq!(
522-
parse_into_kind(r#""\xe0\xe8\xec\xf0\xf9""#),
525+
parse_into_kind(r#""\xc3\xa0\xc3\xa8\xc3\xac\xc3\xb0\xc3\xb9""#),
523526
Ok(ExpressionKind::String("àèìðù".to_owned())),
524527
);
528+
assert_eq!(
529+
parse_into_kind(r#""\xc3""#),
530+
Err(FilesetParseErrorKind::Expression(
531+
"Invalid UTF-8 sequence".to_owned(),
532+
)),
533+
);
525534
assert_eq!(
526535
parse_into_kind(r#""\x""#),
527536
Err(FilesetParseErrorKind::SyntaxError),

lib/src/revset_parser.rs

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -636,14 +636,14 @@ fn parse_primary_node(pair: Pair<Rule>) -> Result<ExpressionNode, RevsetParseErr
636636
assert_eq!(lhs.as_rule(), Rule::strict_identifier);
637637
assert_eq!(op.as_rule(), Rule::pattern_kind_op);
638638
let kind = lhs.as_str();
639-
let value = parse_as_string_literal(rhs);
639+
let value = parse_as_string_literal(rhs)?;
640640
ExpressionKind::StringPattern { kind, value }
641641
}
642642
// Identifier without "@" may be substituted by aliases. Primary expression including "@"
643643
// is considered an indecomposable unit, and no alias substitution would be made.
644644
Rule::identifier if pairs.peek().is_none() => ExpressionKind::Identifier(first.as_str()),
645645
Rule::identifier | Rule::string_literal | Rule::raw_string_literal => {
646-
let name = parse_as_string_literal(first);
646+
let name = parse_as_string_literal(first)?;
647647
match pairs.next() {
648648
None => ExpressionKind::String(name),
649649
Some(op) => {
@@ -654,7 +654,7 @@ fn parse_primary_node(pair: Pair<Rule>) -> Result<ExpressionNode, RevsetParseErr
654654
// infix "<name>@<remote>"
655655
Some(second) => {
656656
let name: RefNameBuf = name.into();
657-
let remote: RemoteNameBuf = parse_as_string_literal(second).into();
657+
let remote: RemoteNameBuf = parse_as_string_literal(second)?.into();
658658
ExpressionKind::RemoteSymbol(RemoteRefSymbolBuf { name, remote })
659659
}
660660
}
@@ -669,14 +669,17 @@ fn parse_primary_node(pair: Pair<Rule>) -> Result<ExpressionNode, RevsetParseErr
669669
}
670670

671671
/// Parses part of compound symbol to string.
672-
fn parse_as_string_literal(pair: Pair<Rule>) -> String {
672+
fn parse_as_string_literal(pair: Pair<Rule>) -> Result<String, RevsetParseError> {
673+
let span = pair.as_span();
673674
match pair.as_rule() {
674-
Rule::identifier => pair.as_str().to_owned(),
675-
Rule::string_literal => STRING_LITERAL_PARSER.parse(pair.into_inner()),
675+
Rule::identifier => Ok(pair.as_str().to_owned()),
676+
Rule::string_literal => STRING_LITERAL_PARSER
677+
.parse(pair.into_inner())
678+
.map_err(|err| RevsetParseError::expression(err.to_string(), span)),
676679
Rule::raw_string_literal => {
677680
let [content] = pair.into_inner().collect_array().unwrap();
678681
assert_eq!(content.as_rule(), Rule::raw_string_content);
679-
content.as_str().to_owned()
682+
Ok(content.as_str().to_owned())
680683
}
681684
_ => {
682685
panic!("unexpected string literal rule: {:?}", pair.as_str());
@@ -697,7 +700,7 @@ pub fn parse_symbol(text: &str) -> Result<String, RevsetParseError> {
697700
let mut pairs = RevsetParser::parse(Rule::symbol_name, text)?;
698701
let first = pairs.next().unwrap();
699702
let span = first.as_span();
700-
let name = parse_as_string_literal(first);
703+
let name = parse_as_string_literal(first)?;
701704
if name.is_empty() {
702705
Err(RevsetParseError::expression(
703706
"Expected non-empty string",
@@ -1284,9 +1287,15 @@ mod tests {
12841287
Ok(ExpressionKind::String("aeiou".to_owned()))
12851288
);
12861289
assert_eq!(
1287-
parse_into_kind(r#""\xe0\xe8\xec\xf0\xf9""#),
1290+
parse_into_kind(r#""\xc3\xa0\xc3\xa8\xc3\xac\xc3\xb0\xc3\xb9""#),
12881291
Ok(ExpressionKind::String("àèìðù".to_owned()))
12891292
);
1293+
assert_eq!(
1294+
parse_into_kind(r#""\xc3""#),
1295+
Err(RevsetParseErrorKind::Expression(
1296+
"Invalid UTF-8 sequence".to_owned()
1297+
))
1298+
);
12901299
assert_eq!(
12911300
parse_into_kind(r#""\x""#),
12921301
Err(RevsetParseErrorKind::SyntaxError)

0 commit comments

Comments
 (0)