Skip to content
Next Next commit
First cut of look up table for speeding up Go()
  • Loading branch information
Prashanth Govindarajan committed Jun 30, 2021
commit 2511bfa5e3ff90d8424f6f16bc2f13389fcf0121
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
// Strings and sets are indices into a string table.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;

Expand Down Expand Up @@ -106,11 +107,13 @@ internal sealed class RegexCode
public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc)
public readonly bool RightToLeft; // true if right to left

public readonly Dictionary<char, HashSet<int>> FirstLetterToStringTableIndices;

public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
Hashtable? caps, int capsize,
RegexBoyerMoore? boyerMoorePrefix,
(string CharClass, bool CaseInsensitive)[]? leadingCharClasses,
int leadingAnchor, bool rightToLeft)
int leadingAnchor, bool rightToLeft, Dictionary<char, HashSet<int>> firstLetterToStringTableIndices)
{
Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null);

Expand All @@ -125,6 +128,7 @@ public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
LeadingCharClasses = leadingCharClasses;
LeadingAnchor = leadingAnchor;
RightToLeft = rightToLeft;
FirstLetterToStringTableIndices = firstLetterToStringTableIndices;
}

public static bool OpcodeBacktracks(int Op)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
Expand Down Expand Up @@ -1058,7 +1059,17 @@ protected override void Go()
continue;

case RegexCode.Multi:
if (!MatchString(_code.Strings[Operand(0)]))
int stringTableIndex = Operand(0);
char textChar = runtext![runtextpos];
if (_code.FirstLetterToStringTableIndices.TryGetValue(textChar, out HashSet<int>? stringTableIndices))
{
if (!stringTableIndices.Contains(stringTableIndex))
{
// We are trying a pattern that doesn't start with the right char, so there's no way we can match.
break;
}
}
if (!MatchString(_code.Strings[stringTableIndex]))
{
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,25 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree)
int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree);

// Convert the string table into an ordered string array.
var firstLetterToStringTableIndices = new Dictionary<char, HashSet<int>>();
var strings = new string[_stringTable.Count];
foreach (KeyValuePair<string, int> stringEntry in _stringTable)
{
if (firstLetterToStringTableIndices.TryGetValue(stringEntry.Key[0], out HashSet<int>? indices))
{
indices.Add(stringEntry.Value);
}
else
{
firstLetterToStringTableIndices.Add(stringEntry.Key[0], new HashSet<int>() { stringEntry.Value });
}
strings[stringEntry.Value] = stringEntry.Key;
}



// Return all that in a RegexCode object.
return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl);
return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl, firstLetterToStringTableIndices);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1175,5 +1175,18 @@ public void Synchronized()

AssertExtensions.Throws<ArgumentNullException>("inner", () => System.Text.RegularExpressions.Match.Synchronized(null));
}

[Fact]
public void HowManyAlternationsAreChecked()
{
    // For an alternation such as "abc|def" the first character of the input
    // decides which branches can possibly match: once the input is known to
    // start with 'a', the "def" branch cannot succeed (and vice versa).
    // This test feeds the engine many near-misses for the first branch ("abq")
    // followed by a real match through the second branch, and verifies that
    // the overall match result is still correct.
    //
    // The test must run unattended, so it does not launch a debugger or write
    // to the console.
    var regex = new Regex("(abc|def)xyz");

    var match = regex.Match("abqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqdefxyz");

    Assert.True(match.Success);
    Assert.Equal("defxyz", match.Value);
    // The capture group must report the alternation branch that actually matched.
    Assert.Equal("def", match.Groups[1].Value);
}
}
}