Skip to content

Commit 11f02ad

Browse files
authored
Two-phase matching algorithm for NonBacktracking (dotnet#68199)
* Switch to 2-phase matching in NonBacktracking. The first phase now finds the true match end position. The implicit .* is now a lazy .*? to prioritize the earliest match. The third phase is now run only for subcaptures and no longer needs to find the match end position. Remove the counter optimization that no longer applies with OrderedOr. Fix a problem in SymbolicRegexInfo where begin/end anchors were marked as line anchors. Also remove dead fields from SymbolicRegexInfo. Fix captures not being handled for empty matches at the start of input. * Improve comments for NonBacktracking, especially the comments for the new 2-phase match generation algorithm. * Add a test that failed with the earlier NonBacktracking * Avoid transitions to dead ends for capturing NFA
1 parent c8be3f3 commit 11f02ad

File tree

7 files changed

+309
-516
lines changed

7 files changed

+309
-516
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ internal int FixedLength
5151
/// <summary>If true then the state is a dead-end, rejects all inputs.</summary>
5252
internal bool IsNothing => Node.IsNothing;
5353

54-
/// <summary>If true then state starts with a ^ or $ or \A or \z or \Z</summary>
54+
/// <summary>If true then state starts with a ^ or $ or \Z</summary>
5555
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
5656

5757
/// <summary>
@@ -134,7 +134,9 @@ internal DfaMatchingState<TSet> Next(TSet minterm)
134134
// nextCharKind will be the PrevCharKind of the target state
135135
// use an existing state instead if one exists already
136136
// otherwise create a new new id for it
137-
list.Add((Node._builder.CreateState(node, nextCharKind, capturing: true), effects));
137+
DfaMatchingState<TSet> state = Node._builder.CreateState(node, nextCharKind, capturing: true);
138+
if (!state.IsDeadend)
139+
list.Add((state, effects));
138140
}
139141
return list;
140142
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher<TSet> srm, bool nfa, bool addDotStar,
191191
{
192192
_builder = srm._builder;
193193
uint startId = reverse ?
194-
(srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) :
195-
(srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0);
194+
(srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) :
195+
(srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0);
196196

197197
// Create the initial state
198198
_initialState = _builder.CreateState(

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>
2424
internal readonly SymbolicRegexNode<TSet> _nothing;
2525
internal readonly SymbolicRegexNode<TSet> _anyChar;
2626
internal readonly SymbolicRegexNode<TSet> _anyStar;
27+
internal readonly SymbolicRegexNode<TSet> _anyStarLazy;
2728

2829
private SymbolicRegexNode<TSet>? _epsilon;
2930
internal SymbolicRegexNode<TSet> Epsilon => _epsilon ??= SymbolicRegexNode<TSet>.CreateEpsilon(this);
@@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
173174
_nothing = SymbolicRegexNode<TSet>.CreateFalse(this);
174175
_anyChar = SymbolicRegexNode<TSet>.CreateTrue(this);
175176
_anyStar = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false);
177+
_anyStarLazy = SymbolicRegexNode<TSet>.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: true);
176178

177179
// --- initialize singletonCache ---
178180
_singletonCache[_solver.Empty] = _nothing;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs

Lines changed: 17 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic
1111
private const uint IsLazyMask = 4;
1212
private const uint CanBeNullableMask = 8;
1313
private const uint ContainsSomeAnchorMask = 16;
14-
private const uint ContainsLineAnchorMask = 32;
15-
private const uint ContainsSomeCharacterMask = 64;
16-
private const uint StartsWithBoundaryAnchorMask = 128;
14+
private const uint StartsWithSomeAnchorMask = 32;
1715

1816
private readonly uint _info;
1917

2018
private SymbolicRegexInfo(uint i) => _info = i;
2119

22-
internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false,
23-
bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false,
24-
bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true)
20+
internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false,
21+
bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isLazy = true)
2522
{
2623
uint i = 0;
2724

@@ -35,31 +32,21 @@ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can
3532
}
3633
}
3734

38-
if (startsWithLineAnchor || containsLineAnchor || startsWithBoundaryAnchor || containsSomeAnchor)
35+
if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
3936
{
4037
i |= ContainsSomeAnchorMask;
4138

42-
if (startsWithLineAnchor || containsLineAnchor)
39+
if (startsWithLineAnchor)
4340
{
44-
i |= ContainsLineAnchorMask;
45-
46-
if (startsWithLineAnchor)
47-
{
48-
i |= StartsWithLineAnchorMask;
49-
}
41+
i |= StartsWithLineAnchorMask;
5042
}
5143

52-
if (startsWithBoundaryAnchor)
44+
if (startsWithLineAnchor || startsWithSomeAnchor)
5345
{
54-
i |= StartsWithBoundaryAnchorMask;
46+
i |= StartsWithSomeAnchorMask;
5547
}
5648
}
5749

58-
if (containsSomeCharacter)
59-
{
60-
i |= ContainsSomeCharacterMask;
61-
}
62-
6350
if (isLazy)
6451
{
6552
i |= IsLazyMask;
@@ -72,18 +59,12 @@ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can
7259

7360
public bool CanBeNullable => (_info & CanBeNullableMask) != 0;
7461

75-
public bool StartsWithSomeAnchor => (_info & (StartsWithLineAnchorMask | StartsWithBoundaryAnchorMask)) != 0;
76-
7762
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
7863

79-
public bool StartsWithBoundaryAnchor => (_info & StartsWithBoundaryAnchorMask) != 0;
64+
public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
8065

8166
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
8267

83-
public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
84-
85-
public bool ContainsSomeCharacter => (_info & ContainsSomeCharacterMask) != 0;
86-
8768
public bool IsLazy => (_info & IsLazyMask) != 0;
8869

8970
public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos)
@@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos)
121102
return new SymbolicRegexInfo(i);
122103
}
123104

124-
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info)
125-
{
126-
bool isNullable = left_info.IsNullable && right_info.IsNullable;
127-
bool canBeNullable = left_info.CanBeNullable && right_info.CanBeNullable;
128-
bool isLazy = left_info.IsLazy && right_info.IsLazy;
129-
130-
bool startsWithLineAnchor = left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor);
131-
bool startsWithBoundaryAnchor = left_info.StartsWithBoundaryAnchor || (left_info.CanBeNullable && right_info.StartsWithBoundaryAnchor);
132-
bool containsSomeAnchor = left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor;
133-
bool containsLineAnchor = left_info.ContainsLineAnchor || right_info.ContainsLineAnchor;
134-
bool containsSomeCharacter = left_info.ContainsSomeCharacter || right_info.ContainsSomeCharacter;
135-
136-
return Create(isNullable, canBeNullable, startsWithLineAnchor, startsWithBoundaryAnchor, containsSomeAnchor, containsLineAnchor, containsSomeCharacter, isLazy);
137-
}
105+
public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) =>
106+
Create(
107+
isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
108+
canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
109+
startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
110+
startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
111+
containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
112+
isLazy: left_info.IsLazy && right_info.IsLazy);
138113

139114
public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy)
140115
{
@@ -171,10 +146,7 @@ public static SymbolicRegexInfo Not(SymbolicRegexInfo info) =>
171146
Create(isAlwaysNullable: !info.CanBeNullable,
172147
canBeNullable: !info.IsNullable,
173148
startsWithLineAnchor: info.StartsWithLineAnchor,
174-
startsWithBoundaryAnchor: info.StartsWithBoundaryAnchor,
175149
containsSomeAnchor: info.ContainsSomeAnchor,
176-
containsLineAnchor: info.ContainsLineAnchor,
177-
containsSomeCharacter: info.ContainsSomeCharacter,
178150
isLazy: info.IsLazy);
179151

180152
public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i);

0 commit comments

Comments
 (0)