diff --git a/CHANGELOG.md b/CHANGELOG.md index 9063a22a3c..2606af8bd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,24 @@ +1.3.2 (2020-01-09) +================== +This is a small maintenance release with some house cleaning and bug fixes. + +New features: + +* [FEATURE #631](https://github.com/rust-lang/regex/issues/631): + Add a `Match::range` method and a `From<Match> for Range` impl. + +Bug fixes: + +* [BUG #521](https://github.com/rust-lang/regex/issues/521): + Corrects `/-/.splitn("a", 2)` to return `["a"]` instead of `["a", ""]`. +* [BUG #594](https://github.com/rust-lang/regex/pull/594): + Improve error reporting when writing `\p\`. +* [BUG #627](https://github.com/rust-lang/regex/issues/627): + Corrects `/-/.split("a-")` to return `["a", ""]` instead of `["a"]`. +* [BUG #633](https://github.com/rust-lang/regex/pull/633): + Squash deprecation warnings for the `std::error::Error::description` method. + + 1.3.1 (2019-09-04) ================== This is a maintenance release with no changes in order to try to work-around diff --git a/bench/src/ffi/tcl.rs b/bench/src/ffi/tcl.rs index 25d182344c..0c5c52e190 100644 --- a/bench/src/ffi/tcl.rs +++ b/bench/src/ffi/tcl.rs @@ -2,12 +2,12 @@ use std::mem; use std::ptr; -use std::sync::{Once, ONCE_INIT}; +use std::sync::Once; use libc::{c_char, c_int, c_long, c_void}; // Used to initialize the TCL interpreter exactly once. -static ONCE: Once = ONCE_INIT; +static ONCE: Once = Once::new(); /// Text is a TCL string object backed by a Rust string. /// diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6cd0bc48f4..7179f2d403 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -156,6 +156,9 @@ pub enum ErrorKind { /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. RepetitionMissing, + /// The Unicode class is not valid. 
This typically occurs when a `\p` is + /// followed by something other than a `{`. + UnicodeClassInvalid, /// When octal support is disabled, this error is produced when an octal /// escape is used. The octal escape is assumed to be an invocation of /// a backreference, which is the common case. @@ -176,6 +179,8 @@ pub enum ErrorKind { } impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] fn description(&self) -> &str { use self::ErrorKind::*; match self.kind { @@ -206,6 +211,7 @@ impl error::Error for Error { RepetitionCountInvalid => "invalid repetition count range", RepetitionCountUnclosed => "unclosed counted repetition", RepetitionMissing => "repetition operator missing expression", + UnicodeClassInvalid => "invalid Unicode character class", UnsupportedBackreference => "backreferences are not supported", UnsupportedLookAround => "look-around is not supported", _ => unreachable!(), @@ -293,6 +299,9 @@ impl fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + UnicodeClassInvalid => { + write!(f, "invalid Unicode character class") + } UnsupportedBackreference => { write!(f, "backreferences are not supported") } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index c063ea9dc2..f5b4548b23 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2095,6 +2095,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { } else { let start = self.pos(); let c = self.char(); + if c == '\\' { + return Err(self.error( + self.span_char(), + ast::ErrorKind::UnicodeClassInvalid, + )); + } self.bump_and_bump_space(); let kind = ast::ClassUnicodeKind::OneLetter(c); (start, kind) @@ -5713,6 +5719,20 @@ bar ], })) ); + assert_eq!( + parser(r"\p\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + assert_eq!( + parser(r"\P\{").parse().unwrap_err(), + 
TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); } #[test] diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs index 464148e3dd..93c2b0dd93 100644 --- a/regex-syntax/src/error.rs +++ b/regex-syntax/src/error.rs @@ -40,6 +40,8 @@ impl From for Error { } impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] fn description(&self) -> &str { match *self { Error::Parse(ref x) => x.description(), diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ee08e83dba..67411fcdc3 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -91,6 +91,8 @@ pub enum ErrorKind { } impl ErrorKind { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] fn description(&self) -> &str { use self::ErrorKind::*; match *self { @@ -113,6 +115,8 @@ impl ErrorKind { } impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] fn description(&self) -> &str { self.kind.description() } @@ -126,6 +130,8 @@ impl fmt::Display for Error { impl fmt::Display for ErrorKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // TODO: Remove this on the next breaking semver release. + #[allow(deprecated)] f.write_str(self.description()) } } diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 38a996aa05..506529d117 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -277,7 +277,7 @@ enum CanonicalClassQuery { /// Looks up a Unicode class given a query. If one doesn't exist, then /// `None` is returned. -pub fn class<'a>(query: ClassQuery<'a>) -> Result { +pub fn class(query: ClassQuery) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? 
{ diff --git a/regex-syntax/src/unicode_tables/property_bool.rs b/regex-syntax/src/unicode_tables/property_bool.rs index 59713a882e..efe6eb369d 100644 --- a/regex-syntax/src/unicode_tables/property_bool.rs +++ b/regex-syntax/src/unicode_tables/property_bool.rs @@ -8203,12 +8203,8 @@ pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ pub const OTHER_ID_CONTINUE: &'static [(char, char)] = &[('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚')]; -pub const OTHER_ID_START: &'static [(char, char)] = &[ - ('\u{1885}', '\u{1886}'), - ('℘', '℘'), - ('℮', '℮'), - ('゛', '゜'), -]; +pub const OTHER_ID_START: &'static [(char, char)] = + &[('\u{1885}', '\u{1886}'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜')]; pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ ('ª', 'ª'), @@ -8370,13 +8366,8 @@ pub const OTHER_MATH: &'static [(char, char)] = &[ ('𞺫', '𞺻'), ]; -pub const OTHER_UPPERCASE: &'static [(char, char)] = &[ - ('Ⅰ', 'Ⅿ'), - ('Ⓐ', 'Ⓩ'), - ('🄰', '🅉'), - ('🅐', '🅩'), - ('🅰', '🆉'), -]; +pub const OTHER_UPPERCASE: &'static [(char, char)] = + &[('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉')]; pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ ('!', '/'), diff --git a/regex-syntax/src/unicode_tables/script.rs b/regex-syntax/src/unicode_tables/script.rs index 12ddf0167f..19dbc38e56 100644 --- a/regex-syntax/src/unicode_tables/script.rs +++ b/regex-syntax/src/unicode_tables/script.rs @@ -230,11 +230,9 @@ pub const ARABIC: &'static [(char, char)] = &[ pub const ARMENIAN: &'static [(char, char)] = &[('Ա', 'Ֆ'), ('ՙ', 'ֈ'), ('֊', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; -pub const AVESTAN: &'static [(char, char)] = - &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; +pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; -pub const BALINESE: &'static [(char, char)] = - &[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')]; +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')]; pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; @@ -260,12 +258,8 @@ pub const 
BENGALI: &'static [(char, char)] = &[ ('০', '\u{9fe}'), ]; -pub const BHAIKSUKI: &'static [(char, char)] = &[ - ('𑰀', '𑰈'), - ('𑰊', '\u{11c36}'), - ('\u{11c38}', '𑱅'), - ('𑱐', '𑱬'), -]; +pub const BHAIKSUKI: &'static [(char, char)] = + &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; pub const BOPOMOFO: &'static [(char, char)] = &[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆺ')]; @@ -275,8 +269,7 @@ pub const BRAHMI: &'static [(char, char)] = pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; -pub const BUGINESE: &'static [(char, char)] = - &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')]; +pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')]; pub const BUHID: &'static [(char, char)] = &[('ᝀ', '\u{1753}')]; @@ -478,14 +471,8 @@ pub const COPTIC: &'static [(char, char)] = pub const CUNEIFORM: &'static [(char, char)] = &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; -pub const CYPRIOT: &'static [(char, char)] = &[ - ('𐠀', '𐠅'), - ('𐠈', '𐠈'), - ('𐠊', '𐠵'), - ('𐠷', '𐠸'), - ('𐠼', '𐠼'), - ('𐠿', '𐠿'), -]; +pub const CYPRIOT: &'static [(char, char)] = + &[('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿')]; pub const CYRILLIC: &'static [(char, char)] = &[ ('Ѐ', '\u{484}'), @@ -509,13 +496,8 @@ pub const DEVANAGARI: &'static [(char, char)] = &[ pub const DOGRA: &'static [(char, char)] = &[('𑠀', '𑠻')]; -pub const DUPLOYAN: &'static [(char, char)] = &[ - ('𛰀', '𛱪'), - ('𛱰', '𛱼'), - ('𛲀', '𛲈'), - ('𛲐', '𛲙'), - ('𛲜', '𛲟'), -]; +pub const DUPLOYAN: &'static [(char, char)] = + &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲟')]; pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𓀀', '𓐮'), ('\u{13430}', '\u{13438}')]; @@ -825,8 +807,7 @@ pub const KATAKANA: &'static [(char, char)] = &[ ('\u{1b164}', '\u{1b167}'), ]; -pub const KAYAH_LI: &'static [(char, char)] = - &[('꤀', '\u{a92d}'), ('꤯', '꤯')]; +pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '\u{a92d}'), ('꤯', '꤯')]; pub const KHAROSHTHI: &'static 
[(char, char)] = &[ ('𐨀', '\u{10a03}'), @@ -842,8 +823,7 @@ pub const KHAROSHTHI: &'static [(char, char)] = &[ pub const KHMER: &'static [(char, char)] = &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')]; -pub const KHOJKI: &'static [(char, char)] = - &[('𑈀', '𑈑'), ('𑈓', '\u{1123e}')]; +pub const KHOJKI: &'static [(char, char)] = &[('𑈀', '𑈑'), ('𑈓', '\u{1123e}')]; pub const KHUDAWADI: &'static [(char, char)] = &[('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; @@ -925,8 +905,7 @@ pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿')]; pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; -pub const LYDIAN: &'static [(char, char)] = - &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; +pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; pub const MAHAJANI: &'static [(char, char)] = &[('𑅐', '𑅶')]; @@ -943,8 +922,7 @@ pub const MALAYALAM: &'static [(char, char)] = &[ ('൦', 'ൿ'), ]; -pub const MANDAIC: &'static [(char, char)] = - &[('ࡀ', '\u{85b}'), ('࡞', '࡞')]; +pub const MANDAIC: &'static [(char, char)] = &[('ࡀ', '\u{85b}'), ('࡞', '࡞')]; pub const MANICHAEAN: &'static [(char, char)] = &[('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; @@ -975,14 +953,10 @@ pub const MEROITIC_CURSIVE: &'static [(char, char)] = pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; -pub const MIAO: &'static [(char, char)] = &[ - ('𖼀', '\u{16f4a}'), - ('\u{16f4f}', '\u{16f87}'), - ('\u{16f8f}', '𖾟'), -]; +pub const MIAO: &'static [(char, char)] = + &[('𖼀', '\u{16f4a}'), ('\u{16f4f}', '\u{16f87}'), ('\u{16f8f}', '𖾟')]; -pub const MODI: &'static [(char, char)] = - &[('𑘀', '𑙄'), ('𑙐', '𑙙')]; +pub const MODI: &'static [(char, char)] = &[('𑘀', '𑙄'), ('𑙐', '𑙙')]; pub const MONGOLIAN: &'static [(char, char)] = &[ ('᠀', '᠁'), @@ -994,22 +968,15 @@ pub const MONGOLIAN: &'static [(char, char)] = &[ ('𑙠', '𑙬'), ]; -pub const MRO: &'static [(char, char)] = - &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; +pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; -pub const MULTANI: &'static 
[(char, char)] = &[ - ('𑊀', '𑊆'), - ('𑊈', '𑊈'), - ('𑊊', '𑊍'), - ('𑊏', '𑊝'), - ('𑊟', '𑊩'), -]; +pub const MULTANI: &'static [(char, char)] = + &[('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; pub const MYANMAR: &'static [(char, char)] = &[('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; -pub const NABATAEAN: &'static [(char, char)] = - &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; +pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; pub const NANDINAGARI: &'static [(char, char)] = &[ ('\u{119a0}', '\u{119a7}'), @@ -1025,8 +992,7 @@ pub const NEWA: &'static [(char, char)] = pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')]; -pub const NUSHU: &'static [(char, char)] = - &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; +pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = &[ ('\u{1e100}', '\u{1e12c}'), @@ -1042,15 +1008,13 @@ pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; pub const OLD_HUNGARIAN: &'static [(char, char)] = &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; -pub const OLD_ITALIC: &'static [(char, char)] = - &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; +pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; pub const OLD_PERMIC: &'static [(char, char)] = &[('𐍐', '\u{1037a}')]; -pub const OLD_PERSIAN: &'static [(char, char)] = - &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; +pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; @@ -1075,19 +1039,12 @@ pub const ORIYA: &'static [(char, char)] = &[ ('୦', '୷'), ]; -pub const OSAGE: &'static [(char, char)] = - &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; +pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; -pub const OSMANYA: &'static [(char, char)] = - &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; +pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; -pub const PAHAWH_HMONG: &'static 
[(char, char)] = &[ - ('𖬀', '𖭅'), - ('𖭐', '𖭙'), - ('𖭛', '𖭡'), - ('𖭣', '𖭷'), - ('𖭽', '𖮏'), -]; +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; @@ -1095,8 +1052,7 @@ pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; pub const PHAGS_PA: &'static [(char, char)] = &[('ꡀ', '꡷')]; -pub const PHOENICIAN: &'static [(char, char)] = - &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; +pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; @@ -1105,25 +1061,20 @@ pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; -pub const SAMARITAN: &'static [(char, char)] = - &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; +pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; pub const SAURASHTRA: &'static [(char, char)] = &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; -pub const SHARADA: &'static [(char, char)] = - &[('\u{11180}', '𑇍'), ('𑇐', '𑇟')]; +pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', '𑇍'), ('𑇐', '𑇟')]; pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; pub const SIDDHAM: &'static [(char, char)] = &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; -pub const SIGNWRITING: &'static [(char, char)] = &[ - ('𝠀', '𝪋'), - ('\u{1da9b}', '\u{1da9f}'), - ('\u{1daa1}', '\u{1daaf}'), -]; +pub const SIGNWRITING: &'static [(char, char)] = + &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; pub const SINHALA: &'static [(char, char)] = &[ ('ං', 'ඃ'), @@ -1143,8 +1094,7 @@ pub const SINHALA: &'static [(char, char)] = &[ pub const SOGDIAN: &'static [(char, char)] = &[('𐼰', '𐽙')]; -pub const SORA_SOMPENG: &'static [(char, char)] = - &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; +pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; pub const 
SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; @@ -1156,8 +1106,7 @@ pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '꠫')]; pub const SYRIAC: &'static [(char, char)] = &[('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ')]; -pub const TAGALOG: &'static [(char, char)] = - &[('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}')]; +pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}')]; pub const TAGBANWA: &'static [(char, char)] = &[('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; @@ -1172,11 +1121,9 @@ pub const TAI_THAM: &'static [(char, char)] = &[ ('᪠', '᪭'), ]; -pub const TAI_VIET: &'static [(char, char)] = - &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; +pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; -pub const TAKRI: &'static [(char, char)] = - &[('𑚀', '\u{116b8}'), ('𑛀', '𑛉')]; +pub const TAKRI: &'static [(char, char)] = &[('𑚀', '\u{116b8}'), ('𑛀', '𑛉')]; pub const TAMIL: &'static [(char, char)] = &[ ('\u{b82}', 'ஃ'), @@ -1219,8 +1166,7 @@ pub const TELUGU: &'static [(char, char)] = &[ pub const THAANA: &'static [(char, char)] = &[('ހ', 'ޱ')]; -pub const THAI: &'static [(char, char)] = - &[('ก', '\u{e3a}'), ('เ', '๛')]; +pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; pub const TIBETAN: &'static [(char, char)] = &[ ('ༀ', 'ཇ'), @@ -1235,19 +1181,16 @@ pub const TIBETAN: &'static [(char, char)] = &[ pub const TIFINAGH: &'static [(char, char)] = &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; -pub const TIRHUTA: &'static [(char, char)] = - &[('𑒀', '𑓇'), ('𑓐', '𑓙')]; +pub const TIRHUTA: &'static [(char, char)] = &[('𑒀', '𑓇'), ('𑓐', '𑓙')]; -pub const UGARITIC: &'static [(char, char)] = - &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; +pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; pub const WANCHO: &'static [(char, char)] = &[('\u{1e2c0}', '\u{1e2f9}'), ('\u{1e2ff}', '\u{1e2ff}')]; -pub const WARANG_CITI: &'static [(char, char)] = - &[('𑢠', 
'𑣲'), ('𑣿', '𑣿')]; +pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; pub const YI: &'static [(char, char)] = &[('ꀀ', 'ꒌ'), ('꒐', '꓆')]; diff --git a/regex-syntax/src/unicode_tables/script_extension.rs b/regex-syntax/src/unicode_tables/script_extension.rs index a86b17eb22..aa3894e575 100644 --- a/regex-syntax/src/unicode_tables/script_extension.rs +++ b/regex-syntax/src/unicode_tables/script_extension.rs @@ -225,11 +225,9 @@ pub const ARABIC: &'static [(char, char)] = &[ pub const ARMENIAN: &'static [(char, char)] = &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; -pub const AVESTAN: &'static [(char, char)] = - &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; +pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; -pub const BALINESE: &'static [(char, char)] = - &[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')]; +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')]; pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; @@ -267,12 +265,8 @@ pub const BENGALI: &'static [(char, char)] = &[ ('\u{a8f1}', '\u{a8f1}'), ]; -pub const BHAIKSUKI: &'static [(char, char)] = &[ - ('𑰀', '𑰈'), - ('𑰊', '\u{11c36}'), - ('\u{11c38}', '𑱅'), - ('𑱐', '𑱬'), -]; +pub const BHAIKSUKI: &'static [(char, char)] = + &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; pub const BOPOMOFO: &'static [(char, char)] = &[ ('˪', '˫'), @@ -297,8 +291,7 @@ pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ꧏ', 'ꧏ')]; -pub const BUHID: &'static [(char, char)] = - &[('᜵', '᜶'), ('ᝀ', '\u{1753}')]; +pub const BUHID: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝀ', '\u{1753}')]; pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ')]; @@ -308,12 +301,8 @@ pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; -pub const CHAKMA: &'static [(char, 
char)] = &[ - ('০', '৯'), - ('၀', '၉'), - ('\u{11100}', '\u{11134}'), - ('𑄶', '𑅆'), -]; +pub const CHAKMA: &'static [(char, char)] = + &[('০', '৯'), ('၀', '၉'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑅆')]; pub const CHAM: &'static [(char, char)] = &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; @@ -514,13 +503,8 @@ pub const DEVANAGARI: &'static [(char, char)] = &[ pub const DOGRA: &'static [(char, char)] = &[('।', '९'), ('꠰', '꠹'), ('𑠀', '𑠻')]; -pub const DUPLOYAN: &'static [(char, char)] = &[ - ('𛰀', '𛱪'), - ('𛱰', '𛱼'), - ('𛲀', '𛲈'), - ('𛲐', '𛲙'), - ('𛲜', '\u{1bca3}'), -]; +pub const DUPLOYAN: &'static [(char, char)] = + &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}')]; pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𓀀', '𓐮'), ('\u{13430}', '\u{13438}')]; @@ -852,12 +836,8 @@ pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = pub const JAVANESE: &'static [(char, char)] = &[('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟')]; -pub const KAITHI: &'static [(char, char)] = &[ - ('०', '९'), - ('꠰', '꠹'), - ('\u{11080}', '𑃁'), - ('\u{110cd}', '\u{110cd}'), -]; +pub const KAITHI: &'static [(char, char)] = + &[('०', '९'), ('꠰', '꠹'), ('\u{11080}', '𑃁'), ('\u{110cd}', '\u{110cd}')]; pub const KANNADA: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), @@ -1011,8 +991,7 @@ pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿')]; pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; -pub const LYDIAN: &'static [(char, char)] = - &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; +pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; pub const MAHAJANI: &'static [(char, char)] = &[('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶')]; @@ -1067,11 +1046,8 @@ pub const MEROITIC_CURSIVE: &'static [(char, char)] = pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; -pub const MIAO: &'static [(char, char)] = &[ - ('𖼀', '\u{16f4a}'), - ('\u{16f4f}', '\u{16f87}'), - ('\u{16f8f}', '𖾟'), -]; +pub const MIAO: &'static [(char, char)] = + 
&[('𖼀', '\u{16f4a}'), ('\u{16f4f}', '\u{16f87}'), ('\u{16f8f}', '𖾟')]; pub const MODI: &'static [(char, char)] = &[('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙')]; @@ -1085,23 +1061,15 @@ pub const MONGOLIAN: &'static [(char, char)] = &[ ('𑙠', '𑙬'), ]; -pub const MRO: &'static [(char, char)] = - &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; +pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; -pub const MULTANI: &'static [(char, char)] = &[ - ('੦', '੯'), - ('𑊀', '𑊆'), - ('𑊈', '𑊈'), - ('𑊊', '𑊍'), - ('𑊏', '𑊝'), - ('𑊟', '𑊩'), -]; +pub const MULTANI: &'static [(char, char)] = + &[('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; pub const MYANMAR: &'static [(char, char)] = &[('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; -pub const NABATAEAN: &'static [(char, char)] = - &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; +pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; pub const NANDINAGARI: &'static [(char, char)] = &[ ('।', '॥'), @@ -1123,8 +1091,7 @@ pub const NEWA: &'static [(char, char)] = pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')]; -pub const NUSHU: &'static [(char, char)] = - &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; +pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = &[ ('\u{1e100}', '\u{1e12c}'), @@ -1140,16 +1107,14 @@ pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; pub const OLD_HUNGARIAN: &'static [(char, char)] = &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; -pub const OLD_ITALIC: &'static [(char, char)] = - &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; +pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; pub const OLD_PERMIC: &'static [(char, char)] = &[('\u{483}', '\u{483}'), ('𐍐', '\u{1037a}')]; -pub const OLD_PERSIAN: &'static [(char, char)] = - &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; +pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; 
pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; @@ -1178,19 +1143,12 @@ pub const ORIYA: &'static [(char, char)] = &[ ('ᳲ', 'ᳲ'), ]; -pub const OSAGE: &'static [(char, char)] = - &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; +pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; -pub const OSMANYA: &'static [(char, char)] = - &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; +pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; -pub const PAHAWH_HMONG: &'static [(char, char)] = &[ - ('𖬀', '𖭅'), - ('𖭐', '𖭙'), - ('𖭛', '𖭡'), - ('𖭣', '𖭷'), - ('𖭽', '𖮏'), -]; +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; @@ -1199,8 +1157,7 @@ pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; pub const PHAGS_PA: &'static [(char, char)] = &[('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷')]; -pub const PHOENICIAN: &'static [(char, char)] = - &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; +pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; @@ -1209,8 +1166,7 @@ pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; -pub const SAMARITAN: &'static [(char, char)] = - &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; +pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; pub const SAURASHTRA: &'static [(char, char)] = &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; @@ -1230,11 +1186,8 @@ pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; pub const SIDDHAM: &'static [(char, char)] = &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; -pub const SIGNWRITING: &'static [(char, char)] = &[ - ('𝠀', '𝪋'), - ('\u{1da9b}', '\u{1da9f}'), - ('\u{1daa1}', '\u{1daaf}'), -]; +pub const SIGNWRITING: &'static [(char, char)] = + &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', 
'\u{1daaf}')]; pub const SINHALA: &'static [(char, char)] = &[ ('।', '॥'), @@ -1255,8 +1208,7 @@ pub const SINHALA: &'static [(char, char)] = &[ pub const SOGDIAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐼰', '𐽙')]; -pub const SORA_SOMPENG: &'static [(char, char)] = - &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; +pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; @@ -1282,12 +1234,8 @@ pub const SYRIAC: &'static [(char, char)] = &[ pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}'), ('᜵', '᜶')]; -pub const TAGBANWA: &'static [(char, char)] = &[ - ('᜵', '᜶'), - ('ᝠ', 'ᝬ'), - ('ᝮ', 'ᝰ'), - ('\u{1772}', '\u{1773}'), -]; +pub const TAGBANWA: &'static [(char, char)] = + &[('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; pub const TAI_LE: &'static [(char, char)] = &[('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; @@ -1300,8 +1248,7 @@ pub const TAI_THAM: &'static [(char, char)] = &[ ('᪠', '᪭'), ]; -pub const TAI_VIET: &'static [(char, char)] = - &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; +pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; pub const TAKRI: &'static [(char, char)] = &[('।', '॥'), ('꠰', '꠹'), ('𑚀', '\u{116b8}'), ('𑛀', '𑛉')]; @@ -1366,8 +1313,7 @@ pub const THAANA: &'static [(char, char)] = &[ ('﷽', '﷽'), ]; -pub const THAI: &'static [(char, char)] = - &[('ก', '\u{e3a}'), ('เ', '๛')]; +pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; pub const TIBETAN: &'static [(char, char)] = &[ ('ༀ', 'ཇ'), @@ -1391,16 +1337,14 @@ pub const TIRHUTA: &'static [(char, char)] = &[ ('𑓐', '𑓙'), ]; -pub const UGARITIC: &'static [(char, char)] = - &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; +pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; pub const WANCHO: &'static [(char, char)] = &[('\u{1e2c0}', '\u{1e2f9}'), ('\u{1e2ff}', '\u{1e2ff}')]; -pub const WARANG_CITI: &'static [(char, char)] = 
- &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; +pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; pub const YI: &'static [(char, char)] = &[ ('、', '。'), diff --git a/src/compile.rs b/src/compile.rs index 1f69967192..ac706f8bad 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -110,7 +110,7 @@ impl Compiler { /// specified size limit. If the size limit is exceeded, then compilation /// stops and returns an error. pub fn compile(mut self, exprs: &[Hir]) -> result::Result { - debug_assert!(exprs.len() >= 1); + debug_assert!(!exprs.is_empty()); self.num_exprs = exprs.len(); if exprs.len() == 1 { self.compile_one(&exprs[0]) diff --git a/src/error.rs b/src/error.rs index eb7f121a83..1c32c85b99 100644 --- a/src/error.rs +++ b/src/error.rs @@ -19,6 +19,8 @@ pub enum Error { } impl ::std::error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] fn description(&self) -> &str { match *self { Error::Syntax(ref err) => err, diff --git a/src/exec.rs b/src/exec.rs index 2ae7842204..acca2dccb6 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -691,9 +691,7 @@ impl<'c> ExecNoSync<'c> { } AnchoredStart => { let lits = &self.ro.nfa.prefixes; - if !self.ro.nfa.is_anchored_start - || (self.ro.nfa.is_anchored_start && start == 0) - { + if start == 0 || !self.ro.nfa.is_anchored_start { lits.find_start(&text[start..]) .map(|(s, e)| (start + s, start + e)) } else { diff --git a/src/literal/imp.rs b/src/literal/imp.rs index 38ebd295f4..fe07ffccd8 100644 --- a/src/literal/imp.rs +++ b/src/literal/imp.rs @@ -570,7 +570,7 @@ impl BoyerMooreSearch { /// Create a new string searcher, performing whatever /// compilation steps are required. 
fn new(pattern: Vec) -> Self { - debug_assert!(pattern.len() > 0); + debug_assert!(!pattern.is_empty()); let (g, gi) = Self::select_guard(pattern.as_slice()); let skip_table = Self::compile_skip_table(pattern.as_slice()); diff --git a/src/prog.rs b/src/prog.rs index 6cf4961830..74e5f2f6f8 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -410,7 +410,7 @@ impl InstRanges { self.ranges .iter() .map(|&(s, e)| 1 + (e as u32) - (s as u32)) - .fold(0, |acc, len| acc + len) as usize + .sum::() as usize } } diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 2e38c10ca8..69f0b335de 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::fmt; -use std::ops::Index; +use std::ops::{Index, Range}; use std::str::FromStr; use std::sync::Arc; @@ -36,10 +36,17 @@ impl<'t> Match<'t> { self.end } + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range { + self.start..self.end + } + /// Returns the matched text. #[inline] pub fn as_bytes(&self) -> &'t [u8] { - &self.text[self.start..self.end] + &self.text[self.range()] } /// Creates a new match from the given haystack and byte offsets. @@ -49,6 +56,12 @@ impl<'t> Match<'t> { } } +impl<'t> From> for Range { + fn from(m: Match<'t>) -> Range { + m.range() + } +} + /// A compiled regular expression for matching arbitrary bytes. /// /// It can be used to search, split or replace text. 
All searching is done with @@ -726,11 +739,11 @@ impl<'r, 't> Iterator for Split<'r, 't> { let text = self.finder.0.text(); match self.finder.next() { None => { - if self.last >= text.len() { + if self.last > text.len() { None } else { let s = &text[self.last..]; - self.last = text.len(); + self.last = text.len() + 1; // Next call will return None Some(s) } } @@ -761,12 +774,19 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { if self.n == 0 { return None; } + self.n -= 1; - if self.n == 0 { - let text = self.splits.finder.0.text(); - Some(&text[self.splits.last..]) + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None } else { - self.splits.next() + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) } } } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 81aac15260..b746599088 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::fmt; -use std::ops::Index; +use std::ops::{Index, Range}; use std::str::FromStr; use std::sync::Arc; @@ -45,10 +45,17 @@ impl<'t> Match<'t> { self.end } + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range { + self.start..self.end + } + /// Returns the matched text. #[inline] pub fn as_str(&self) -> &'t str { - &self.text[self.start..self.end] + &self.text[self.range()] } /// Creates a new match from the given haystack and byte offsets. @@ -64,6 +71,12 @@ impl<'t> From> for &'t str { } } +impl<'t> From> for Range { + fn from(m: Match<'t>) -> Range { + m.range() + } +} + /// A compiled regular expression for matching Unicode strings. 
/// /// It is represented as either a sequence of bytecode instructions (dynamic) @@ -766,11 +779,11 @@ impl<'r, 't> Iterator for Split<'r, 't> { let text = self.finder.0.text(); match self.finder.next() { None => { - if self.last >= text.len() { + if self.last > text.len() { None } else { let s = &text[self.last..]; - self.last = text.len(); + self.last = text.len() + 1; // Next call will return None Some(s) } } @@ -801,12 +814,19 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { if self.n == 0 { return None; } + self.n -= 1; - if self.n == 0 { - let text = self.splits.finder.0.text(); - Some(&text[self.splits.last..]) + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None } else { - self.splits.next() + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) } } } diff --git a/tests/api.rs b/tests/api.rs index ff136217e1..0d4962cc9f 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -205,6 +205,18 @@ split!( split2, r"(?-u)\b", "a b c", - &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")] + &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c"), t!("")] ); -split!(split3, r"a$", "a", &[t!("")]); +split!(split3, r"a$", "a", &[t!(""), t!("")]); +split!(split_none, r"-", r"a", &[t!("a")]); +split!(split_trailing_blank, r"-", r"a-", &[t!("a"), t!("")]); +split!(split_trailing_blanks, r"-", r"a--", &[t!("a"), t!(""), t!("")]); +split!(split_empty, r"-", r"", &[t!("")]); + +splitn!(splitn_below_limit, r"-", r"a", 2, &[t!("a")]); +splitn!(splitn_at_limit, r"-", r"a-b", 2, &[t!("a"), t!("b")]); +splitn!(splitn_above_limit, r"-", r"a-b-c", 2, &[t!("a"), t!("b-c")]); +splitn!(splitn_zero_limit, r"-", r"a-b", 0, empty_vec!()); +splitn!(splitn_trailing_blank, r"-", r"a-", 2, &[t!("a"), t!("")]); +splitn!(splitn_trailing_separator, r"-", r"a--", 2, &[t!("a"), t!("-")]); +splitn!(splitn_empty, r"-", r"", 1, 
&[t!("")]); diff --git a/tests/bytes.rs b/tests/bytes.rs index 6c5a11ac77..d05f138edf 100644 --- a/tests/bytes.rs +++ b/tests/bytes.rs @@ -69,10 +69,12 @@ matiter!( R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), (0, 0) ); -matiter!(invalidutf8_anchor2, - r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$", - R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), - (22, 22)); +matiter!( + invalidutf8_anchor2, + r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (22, 22) +); matiter!( invalidutf8_anchor3, r"(?-u)^|ddp\xff\xffdddddlQd@\x80", diff --git a/tests/consistent.rs b/tests/consistent.rs index 2d7bdcf8d1..0f9ea53f35 100644 --- a/tests/consistent.rs +++ b/tests/consistent.rs @@ -231,7 +231,6 @@ macro_rules! checker { TestResult::from_bool(true) } } - } // mod }; // rule case } // macro_rules! 
diff --git a/tests/crazy.rs b/tests/crazy.rs index 20a3371b2e..8c72273d93 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -29,8 +29,12 @@ mat!( "mine is jam.slam@gmail ", None ); -mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", - "mine is jam.slam@gmail.com ", Some((8, 26))); +mat!( + match_email_big, + r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", + "mine is jam.slam@gmail.com ", + Some((8, 26)) +); mat!( match_date1, r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", diff --git a/tests/fowler.rs b/tests/fowler.rs index 5da32935e7..7f56a758d3 100644 --- a/tests/fowler.rs +++ b/tests/fowler.rs @@ -215,7 +215,13 @@ mat!( Some((1, 2)), Some((1, 2)) ); -mat!(match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81))); +mat!( + match_basic_76, + r"a?(ab|ba)*", + r"ababababababababababababababababababababababababababababababababababababababababa", + Some((0, 81)), + Some((79, 81)) +); mat!( match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", diff --git a/tests/macros.rs b/tests/macros.rs index 3c4b888b20..e70e9489fd 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -147,3 +147,14 @@ macro_rules! split { } } } + +macro_rules! splitn { + ($name:ident, $re:expr, $text:expr, $limit:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let splitted: Vec<_> = re.splitn(t!($text), $limit).collect(); + assert_eq!($expected, &*splitted); + } + } +} diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs index 7605d69b21..03c370d698 100644 --- a/tests/macros_bytes.rs +++ b/tests/macros_bytes.rs @@ -3,6 +3,7 @@ macro_rules! text { ($text:expr) => { $text.as_bytes() } } macro_rules! t { ($re:expr) => { text!($re) } } macro_rules! 
match_text { ($text:expr) => { $text.as_bytes() } } macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } } +macro_rules! empty_vec { () => { >::new() } } macro_rules! bytes { ($text:expr) => { $text } } diff --git a/tests/macros_str.rs b/tests/macros_str.rs index fda5814b8c..9b996b33b9 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -3,6 +3,7 @@ macro_rules! text { ($text:expr) => { $text } } macro_rules! t { ($text:expr) => { text!($text) } } macro_rules! match_text { ($text:expr) => { $text.as_str() } } macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } } +macro_rules! empty_vec { () => { >::new() } } macro_rules! no_expand { ($text:expr) => {{ diff --git a/tests/unicode.rs b/tests/unicode.rs index 597f86873a..52522f41c6 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -60,12 +60,7 @@ mat!( "〰", Some((0, 3)) ); -mat!( - uni_class_gencat_decimal_numer, - r"\p{Decimal_Number}", - "𑓙", - Some((0, 4)) -); +mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "𑓙", Some((0, 4))); mat!( uni_class_gencat_enclosing_mark, r"\p{Enclosing_Mark}", @@ -86,12 +81,7 @@ mat!( Some((0, 3)) ); mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2))); -mat!( - uni_class_gencat_letter_number, - r"\p{Letter_Number}", - "ↂ", - Some((0, 3)) -); +mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3))); mat!( uni_class_gencat_line_separator, r"\p{Line_Separator}",