Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,32 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture)
// for the whole expression, we can use that to quickly jump to the right location in the input.
if (!_rightToLeft) // haven't added FindNextStartingPositionMode support for RTL
{
bool triedToComputeMaxLength = false;

TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(tree.Root);
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ &&
tree.Root.ComputeMaxLength() is int maxLength)
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ)
{
Debug.Assert(maxLength >= _minRequiredLength, $"{maxLength} should have been greater than {_minRequiredLength} minimum");
MaxPossibleLength = maxLength;
if (_minRequiredLength == maxLength)
triedToComputeMaxLength = true;
if (tree.Root.ComputeMaxLength() is int maxLength)
{
FindMode = TrailingAnchor == RegexNodeKind.End ?
FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End :
FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ;
return;
Debug.Assert(maxLength >= _minRequiredLength, $"{maxLength} should have been greater than {_minRequiredLength} minimum");
MaxPossibleLength = maxLength;
if (_minRequiredLength == maxLength)
{
FindMode = TrailingAnchor == RegexNodeKind.End ?
FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End :
FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ;
return;
}
}
}

if ((tree.Options & RegexOptions.NonBacktracking) != 0 && !triedToComputeMaxLength)
{
// NonBacktracking also benefits from knowing whether the pattern is a fixed length, as it can use that
// knowledge to avoid multiple match phases in some situations.
MaxPossibleLength = tree.Root.ComputeMaxLength();
}
}

// If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ internal sealed class SymbolicRegexMatcher<TSetType> : ISymbolicRegexMatcher whe
/// <summary>Number of capture groups.</summary>
private readonly int _capsize;

/// <summary>Fixed-length of any match, if there is one.</summary>
private readonly int? _fixedMatchLength;

/// <summary>This determines whether the matcher uses the special capturing NFA simulation mode.</summary>
internal bool HasSubcaptures => _capsize > 1;

Expand All @@ -154,7 +157,7 @@ private TSetType GetMinterm(int c)
}

/// <summary>Constructs matcher for given symbolic regex.</summary>
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
{
Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");

Expand All @@ -170,6 +173,11 @@ internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, Ch
};
_capsize = code.CapSize;

if (code.Tree.MinRequiredLength == code.FindOptimizations.MaxPossibleLength)
{
_fixedMatchLength = code.Tree.MinRequiredLength;
}

if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
code.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match.
{
Expand Down Expand Up @@ -493,65 +501,88 @@ private void DoCheckTimeout(int timeoutOccursAt)
/// <param name="perThreadData">Per thread data reused between calls.</param>
public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan<char> input, int startat, PerThreadData perThreadData)
{
// If we need to perform timeout checks, store the absolute timeout value.
int timeoutOccursAt = 0;
if (_checkTimeout)
{
// Using Environment.TickCount for efficiency instead of Stopwatch -- as in the non-DFA case.
timeoutOccursAt = Environment.TickCount + (int)(_timeout + 0.5);
}

// If we're starting at the end of the input, we don't need to do any work other than
// determine whether an empty match is valid, i.e. whether the pattern is "nullable"
// given the kinds of characters at and just before the end.
if (startat == input.Length)
{
// Covers the special-case of an empty match at the end of the input.
uint prevKind = GetCharKind(input, startat - 1);
uint nextKind = GetCharKind(input, startat);

bool emptyMatchExists = _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind));
return emptyMatchExists ?
return _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind)) ?
new SymbolicMatch(startat, 0) :
SymbolicMatch.NoMatch;
}
Comment on lines 515 to 522
Copy link
Contributor

@olsaarik olsaarik Feb 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the new comment, I realized this optimization should additionally handle the case where there are capture groups, applying the same ApplyEffects handling that FindEndPositionCapturing does. The difference will be visible for some patterns with nullable capture groups that have anchors in them.

Edit: Oh, actually this affects any nullable pattern with nullable capture groups.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, @olsaarik. That issue predates this PR, yes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it shouldn't block this PR; it's just something I noticed.


// Find the first accepting state. Initial start position in the input is i == 0.
int i = startat;

// May return -1 as a legitimate value when the initial state is nullable and startat == 0.
// Returns NoMatchExists when there is no match.
i = FindFinalStatePosition(input, i, timeoutOccursAt, out int i_q0_A1, out int watchdog);
// Phase 1:
// Determine whether there is a match by finding the first final state position. This only tells
// us whether there is a match but needn't give us the longest possible match. This may return -1 as
// a legitimate value when the initial state is nullable and startat == 0. It returns NoMatchExists (-2)
// when there is no match. As an example, consider the pattern a{5,10}b* run against an input
// of aaaaaaaaaaaaaaabbbc: phase 1 will find the position of the first b: aaaaaaaaaaaaaaab.
int i = FindFinalStatePosition(input, startat, timeoutOccursAt, out int matchStartLowBoundary, out int watchdog);

// If there wasn't a match, we're done.
if (i == NoMatchExists)
{
return SymbolicMatch.NoMatch;
}

// A match exists. If we don't need further details, because IsMatch was used (and thus we don't
// need the exact bounds of the match, captures, etc.), we're done.
if (isMatch)
{
// this means success -- the original call was IsMatch
return SymbolicMatch.QuickMatch;
}

int i_start;
// Phase 2:
// Match backwards through the input matching against the reverse of the pattern, looking for the earliest
// start position. That tells us the actual starting position of the match. We can skip this phase if we
// recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that
// exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from
// that first b until it finds the 6th a: aaaaaaaaaab.
int matchStart;
if (watchdog >= 0)
{
i_start = i - watchdog + 1;
matchStart = i - watchdog + 1;
}
else
{
Debug.Assert(i >= startat - 1);
i_start = i < startat ?
matchStart = i < startat ?
startat :
FindStartPosition(input, i, i_q0_A1); // Walk in reverse to locate the start position of the match
FindStartPosition(input, i, matchStartLowBoundary);
}

// Phase 3:
// Match again, this time from the computed start position, to find the latest end position. That start
// and end then represent the bounds of the match. If the pattern has subcaptures (captures other than
// the top-level capture for the whole match), we need to do more work to compute their exact bounds, so we
// take a faster path if captures aren't required. Further, if captures aren't needed, and if any possible
// match of the whole pattern is a fixed length, we can skip this phase as well, just using that fixed-length
// to compute the ending position based on the starting position. Continuing the previous example, phase 3
// will walk forwards from the 6th a until it finds the end of the match: aaaaaaaaaabbb.
if (!HasSubcaptures)
{
int i_end = FindEndPosition(input, i_start);
return new SymbolicMatch(i_start, i_end + 1 - i_start);
if (_fixedMatchLength.HasValue)
{
return new SymbolicMatch(matchStart, _fixedMatchLength.GetValueOrDefault());
}

int matchEnd = FindEndPosition(input, matchStart);
return new SymbolicMatch(matchStart, matchEnd + 1 - matchStart);
}
else
{
int i_end = FindEndPositionCapturing(input, i_start, out Registers endRegisters, perThreadData);
return new SymbolicMatch(i_start, i_end + 1 - i_start, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
int matchEnd = FindEndPositionCapturing(input, matchStart, out Registers endRegisters, perThreadData);
return new SymbolicMatch(matchStart, matchEnd + 1 - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan

// Convert the BDD-based AST to BV-based AST
SymbolicRegexNode<BV> rootBV = converter._builder.Transform(root, builderBV, bdd => builderBV._solver.ConvertFromCharSet(solver, bdd));
_matcher = new SymbolicRegexMatcher<BV>(rootBV, code, solver, minterms, matchTimeout, culture);
_matcher = new SymbolicRegexMatcher<BV>(rootBV, code, minterms, matchTimeout, culture);
}
else
{
Expand All @@ -63,7 +63,7 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan

// Convert the BDD-based AST to ulong-based AST
SymbolicRegexNode<ulong> root64 = converter._builder.Transform(root, builder64, bdd => builder64._solver.ConvertFromCharSet(solver, bdd));
_matcher = new SymbolicRegexMatcher<ulong>(root64, code, solver, minterms, matchTimeout, culture);
_matcher = new SymbolicRegexMatcher<ulong>(root64, code, minterms, matchTimeout, culture);
}
}

Expand Down