Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
First cut of look up table for speeding up Go()
  • Loading branch information
Prashanth Govindarajan committed Apr 13, 2021
commit 62eb98396baae3a8d31d0340500e53773b49d3af
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
// Strings and sets are indices into a string table.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;

Expand Down Expand Up @@ -106,11 +107,13 @@ internal sealed class RegexCode
public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc)
public readonly bool RightToLeft; // true if right to left

public readonly Dictionary<char, HashSet<int>> FirstLetterToStringTableIndices;

public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
Hashtable? caps, int capsize,
RegexBoyerMoore? boyerMoorePrefix,
(string CharClass, bool CaseInsensitive)[]? leadingCharClasses,
int leadingAnchor, bool rightToLeft)
int leadingAnchor, bool rightToLeft, Dictionary<char, HashSet<int>> firstLetterToStringTableIndices)
{
Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null);

Expand All @@ -125,6 +128,7 @@ public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount,
LeadingCharClasses = leadingCharClasses;
LeadingAnchor = leadingAnchor;
RightToLeft = rightToLeft;
FirstLetterToStringTableIndices = firstLetterToStringTableIndices;
}

public static bool OpcodeBacktracks(int Op)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
Expand Down Expand Up @@ -1058,7 +1059,17 @@ protected override void Go()
continue;

case RegexCode.Multi:
if (!MatchString(_code.Strings[Operand(0)]))
int stringTableIndex = Operand(0);
char textChar = runtext![runtextpos];
if (_code.FirstLetterToStringTableIndices.TryGetValue(textChar, out HashSet<int>? stringTableIndices))
{
if (!stringTableIndices.Contains(stringTableIndex))
{
// We are trying a pattern that doesn't start with the right char, so there's no way we can match.
break;
}
}
if (!MatchString(_code.Strings[stringTableIndex]))
{
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,25 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree)
int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree);

// Convert the string table into an ordered string array, and record for each first character the string table indices of the strings that start with it.
var firstLetterToStringTableIndices = new Dictionary<char, HashSet<int>>();
var strings = new string[_stringTable.Count];
foreach (KeyValuePair<string, int> stringEntry in _stringTable)
{
if (firstLetterToStringTableIndices.TryGetValue(stringEntry.Key[0], out HashSet<int>? indices))
{
indices.Add(stringEntry.Value);
}
else
{
firstLetterToStringTableIndices.Add(stringEntry.Key[0], new HashSet<int>() { stringEntry.Value });
}
strings[stringEntry.Value] = stringEntry.Key;
}



// Return all that in a RegexCode object.
return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl);
return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl, firstLetterToStringTableIndices);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1174,5 +1174,18 @@ public void Synchronized()

AssertExtensions.Throws<ArgumentNullException>("inner", () => System.Text.RegularExpressions.Match.Synchronized(null));
}

/// <summary>
/// Matches "(abc|def)xyz" against an input made of many "abq" near-misses of the first
/// alternation branch followed by "defxyz", and checks that the match found is "defxyz".
/// </summary>
[Fact]
public void HowManyAlternationsAreChecked()
{
    // We can statically determine if it's impossible for alternation branch N + 1 to match
    // after we've gotten to a certain place in matching branch N. For example, given the
    // alternation "abc|def", once we match the 'a' there's no point in even considering the
    // second branch. We should be able to use that knowledge to avoid unnecessarily checking
    // branches when a previous one fails to match.
    //
    // Note: no Debugger.Launch() or console output here; a test must be able to run
    // unattended, and launching a debugger blocks or prompts on every run.
    var regex = new Regex("(abc|def)xyz");

    var match = regex.Match("abqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqdefxyz");

    Assert.True(match.Success);
    Assert.Equal("defxyz", match.Value);
}
}
}