Skip to content
Prev Previous commit
Next Next commit
More efficient .* in RegexInterpreter
  • Loading branch information
Prashanth Govindarajan committed Jun 30, 2021
commit 781541c3e1191a2857808fa140641e3507728faf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
// Strings and sets are indices into a string table.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;

Expand Down Expand Up @@ -107,13 +106,11 @@ internal sealed class RegexCode
public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc)
public readonly bool RightToLeft; // true if right to left

public readonly Dictionary<char, HashSet<int>> FirstLetterToStringTableIndices;

public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
Hashtable? caps, int capsize,
RegexBoyerMoore? boyerMoorePrefix,
(string CharClass, bool CaseInsensitive)[]? leadingCharClasses,
int leadingAnchor, bool rightToLeft, Dictionary<char, HashSet<int>> firstLetterToStringTableIndices)
int leadingAnchor, bool rightToLeft)
{
Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null);

Expand All @@ -128,7 +125,6 @@ public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
LeadingCharClasses = leadingCharClasses;
LeadingAnchor = leadingAnchor;
RightToLeft = rightToLeft;
FirstLetterToStringTableIndices = firstLetterToStringTableIndices;
}

public static bool OpcodeBacktracks(int Op)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
Expand All @@ -21,6 +20,7 @@ internal sealed class RegexInterpreter : RegexRunner
private int _codepos;
private bool _rightToLeft;
private bool _caseInsensitive;
private int _maxBacktrackPosition = -1;

public RegexInterpreter(RegexCode code, CultureInfo culture)
{
Expand Down Expand Up @@ -224,6 +224,20 @@ private bool MatchString(string str)
{
if (runtextend - runtextpos < c)
{
// If MatchString was called after a greedy op such as .*, runtextpos will have been zipped to the end without any characters really being examined. As an optimization, reset it to _maxBacktrackPosition here.
if (_maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
{
// If lastIndexOf is -1, we backtrack to the max extent possible.
runtextpos = _maxBacktrackPosition;
ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be bounded by runtextend?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, especially after seeing your PR from yesterday. Fixed now.

int lastIndexOf = runtextSpan.LastIndexOf(str);
if (lastIndexOf > -1)
{
// Found the next position to match. Move runtextpos here
runtextpos = _maxBacktrackPosition + lastIndexOf;
}
}

return false;
}

Expand Down Expand Up @@ -1059,17 +1073,7 @@ protected override void Go()
continue;

case RegexCode.Multi:
int stringTableIndex = Operand(0);
char textChar = runtext![runtextpos];
if (_code.FirstLetterToStringTableIndices.TryGetValue(textChar, out HashSet<int>? stringTableIndices))
{
if (!stringTableIndices.Contains(stringTableIndex))
{
// We are trying a pattern that doesn't start with the right char, so there's no way we can match.
break;
}
}
if (!MatchString(_code.Strings[stringTableIndex]))
if (!MatchString(_code.Strings[Operand(0)]))
{
break;
}
Expand Down Expand Up @@ -1196,6 +1200,7 @@ protected override void Go()
int len = Math.Min(Operand(1), Forwardchars());
char ch = (char)Operand(0);
int i;
int tempMaxBacktrackPosition = runtextpos;

if (!_rightToLeft && !_caseInsensitive)
{
Expand Down Expand Up @@ -1228,6 +1233,8 @@ protected override void Go()
if (len > i && _operator == RegexCode.Notoneloop)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder whether this should also happen for Notoneloopatomic.

{
TrackPush(len - i - 1, runtextpos - Bump());
Debug.Assert(_maxBacktrackPosition == -1);
_maxBacktrackPosition = tempMaxBacktrackPosition;
}
}
advance = 2;
Expand Down Expand Up @@ -1272,6 +1279,16 @@ protected override void Go()
{
int i = TrackPeek();
int pos = TrackPeek(1);
if (_maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
{
// The Multi node has bumped us along already
int difference = pos - _maxBacktrackPosition;
Debug.Assert(difference > 0);
pos = runtextpos;
i -= difference;
// We shouldn't be backtracking anymore.
_maxBacktrackPosition = -1;
}
runtextpos = pos;
if (i > 0)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,25 +172,14 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree)
int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree);

// Convert the string table into an ordered string array.
var firstLetterToStringTableIndices = new Dictionary<char, HashSet<int>>();
var strings = new string[_stringTable.Count];
foreach (KeyValuePair<string, int> stringEntry in _stringTable)
{
if (firstLetterToStringTableIndices.TryGetValue(stringEntry.Key[0], out HashSet<int>? indices))
{
indices.Add(stringEntry.Value);
}
else
{
firstLetterToStringTableIndices.Add(stringEntry.Key[0], new HashSet<int>() { stringEntry.Value });
}
strings[stringEntry.Value] = stringEntry.Key;
}



// Return all that in a RegexCode object.
return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl, firstLetterToStringTableIndices);
return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1176,16 +1176,38 @@ public void Synchronized()
AssertExtensions.Throws<ArgumentNullException>("inner", () => System.Text.RegularExpressions.Match.Synchronized(null));
}

// Regression cases for patterns containing a greedy ".*": every pattern below
// is expected to match, and the match must span the entire input string.
[Theory]
[InlineData(".*foo", "hifoo")]
[InlineData("ab.*foo", "abhifoo")]
[InlineData("ab.*foo.*bar", "abhifooabcbar")]
[InlineData("ab.*foo.*", "abhifooabcbar")]
[InlineData(".*abc|ghi", "ghi")]
public void TestRegressions(string pattern, string input)
{
    // Build the regex with default options and run a single match attempt.
    var result = new Regex(pattern).Match(input);

    // The match has to succeed...
    Assert.True(result.Success);

    // ...and it has to consume the whole input, not just a prefix or suffix.
    Assert.Equal(input, result.Value);
}

[Fact]
public void HowManyAlternationsAreChecked()
{
// We can statically determine if it's impossible for an alternation branch N + 1 to match after we've gotten to a certain place in matching branch N, e.g. given the alternation "abc|def" we know that once we match the 'a', there's no point in even considering the second branch. We should be able to utilize that knowledge to avoid unnecessarily checking branches when a previous one fails to match.

Debugger.Launch();
var regex = new Regex("(abc|def)xyz");
var match = regex.Match("abqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqdefxyz");
//Debugger.Launch();
var regex = new Regex(".*(ss)");
var match = regex.Match("Essential services are provided by regular exprs.");
Assert.True(match.Success);
Assert.Equal("defxyz", match.Value);
Assert.Equal("Ess", match.Value);
Assert.Equal(0, match.Index);
Assert.Equal(1, match.Groups[1].Index);

//var regex = new Regex(".*abc|ghi");
//var match = regex.Match("ghi");
//Assert.Equal(1, match.Groups.Count);
//Assert.Equal("ghi", match.Groups[0].Value);

Console.WriteLine("BH");
}
}
Expand Down