diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 927012a1be8d5c..dc2c33f5cf8a1c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -6,7 +6,6 @@ - @@ -65,25 +64,23 @@ - + - - - - - + + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs index 59367f305012d9..c6f2fb4a416282 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Debug.cs @@ -13,20 +13,16 @@ public partial class Regex { /// Unwind the regex and save the resulting state graph in DGML /// Writer to which the DGML is written. - /// True to create an NFA instead of a DFA. - /// True to prepend .*? onto the pattern (outside of the implicit root capture). - /// If true, then unwind the regex backwards (and is ignored). - /// The approximate maximum number of states to include; less than or equal to 0 for no maximum. /// maximum length of labels in nodes anything over that length is indicated with .. 
[ExcludeFromCodeCoverage(Justification = "Debug only")] - internal void SaveDGML(TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength) + internal void SaveDGML(TextWriter writer, int maxLabelLength) { if (factory is not SymbolicRegexRunnerFactory srmFactory) { throw new NotSupportedException(); } - srmFactory._matcher.SaveDGML(writer, nfa, addDotStar, reverse, maxStates, maxLabelLength); + srmFactory._matcher.SaveDGML(writer, maxLabelLength); } /// @@ -44,17 +40,43 @@ internal static void GenerateUnicodeTables(string path) /// /// upper bound on the number of generated strings /// random seed for the generator, 0 means no random seed - /// if true then generate inputs that do not match /// [ExcludeFromCodeCoverage(Justification = "Debug only")] - internal IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) + internal IEnumerable SampleMatches(int k, int randomseed) { if (factory is not SymbolicRegexRunnerFactory srmFactory) { throw new NotSupportedException(); } - return srmFactory._matcher.GenerateRandomMembers(k, randomseed, negative); + return srmFactory._matcher.SampleMatches(k, randomseed); + } + + /// + /// Explore transitions of the DFA and/or NFA exhaustively. DFA exploration, if requested, is done only up to the + /// DFA state limit. NFA exploration, if requested, continues from the states unexplored by the DFA exploration, + /// or from the initial states if DFA exploration was not requested. NFA exploration will always finish. + /// + /// + /// This may result in a different automaton being explored than matching would produce, since if the limit for + /// the number of DFA states is reached then the order in which states and transitions are explored is significant. + /// During matching that order is driven by the input, while this function may use any order (currently it is + /// breadth-first). + /// + /// whether to explore the .*? 
prefixed version of the pattern + /// whether to explore the reversed pattern + /// whether to explore the original pattern + /// whether to explore DFA transitions + /// whether to explore NFA transitions + [ExcludeFromCodeCoverage(Justification = "Debug only")] + internal void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa) + { + if (factory is not SymbolicRegexRunnerFactory srmFactory) + { + throw new NotSupportedException(); + } + + srmFactory._matcher.Explore(includeDotStarred, includeReverse, includeOriginal, exploreDfa, exploreNfa); } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 343e78be28f465..3d1e0a04fa3f8a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -37,7 +37,8 @@ public partial class Regex : ISerializable // so this is a convenient place to include them rather than needing a debug-only illink file. 
[DynamicDependency(nameof(SaveDGML))] [DynamicDependency(nameof(GenerateUnicodeTables))] - [DynamicDependency(nameof(GenerateRandomMembers))] + [DynamicDependency(nameof(SampleMatches))] + [DynamicDependency(nameof(Explore))] #endif protected Regex() { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs index 7cc1b97f932980..9b315652bba236 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs @@ -29,23 +29,11 @@ internal DfaMatchingState(SymbolicRegexNode node, uint prevCharKind) internal bool IsDeadend => Node.IsNothing; /// The node must be nullable here - internal int FixedLength + internal int FixedLength(uint nextCharKind) { - get - { - if (Node._kind == SymbolicRegexNodeKind.FixedLengthMarker) - { - return Node._lower; - } - - if (Node._kind == SymbolicRegexNodeKind.Or) - { - Debug.Assert(Node._alts is not null); - return Node._alts._maximumLength; - } - - return -1; - } + Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS); + uint context = CharKind.Context(PrevCharKind, nextCharKind); + return Node.ResolveFixedLength(context); } /// If true then the state is a dead-end, rejects all inputs. 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs deleted file mode 100644 index e6d0c01acfc05f..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs +++ /dev/null @@ -1,322 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if DEBUG -using System.Collections.Generic; -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; -using System.IO; -using System.Net; - -namespace System.Text.RegularExpressions.Symbolic -{ - [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] - internal static class DgmlWriter where TSet : IComparable, IEquatable - { - /// Write the DFA or NFA in DGML format into the TextWriter. - /// The for the regular expression. - /// Writer to which the DGML is written. - /// True to create an NFA instead of a DFA. - /// True to prepend .*? onto the pattern (outside of the implicit root capture). - /// If true, then unwind the regex backwards (and is ignored). - /// The approximate maximum number of states to include; less than or equal to 0 for no maximum. - /// maximum length of labels in nodes anything over that length is indicated with .. 
- public static void Write( - TextWriter writer, SymbolicRegexMatcher matcher, - bool nfa = false, bool addDotStar = true, bool reverse = false, int maxStates = -1, int maxLabelLength = -1) - { - var charSetSolver = new CharSetSolver(); - var explorer = new DfaExplorer(matcher, nfa, addDotStar, reverse, maxStates); - var nonEpsilonTransitions = new Dictionary<(int SourceState, int TargetState), List<(SymbolicRegexNode?, TSet)>>(); - var epsilonTransitions = new List(); - - foreach (Transition transition in explorer.GetTransitions()) - { - if (transition.IsEpsilon) - { - epsilonTransitions.Add(transition); - } - else - { - (int SourceState, int TargetState) p = (transition.SourceState, transition.TargetState); - if (!nonEpsilonTransitions.TryGetValue(p, out List<(SymbolicRegexNode?, TSet)>? rules)) - { - nonEpsilonTransitions[p] = rules = new List<(SymbolicRegexNode?, TSet)>(); - } - - rules.Add(transition.Label); - } - } - - writer.WriteLine(""); - writer.WriteLine(""); - writer.WriteLine(" "); - writer.WriteLine(" ", GetDFAInfo(explorer, charSetSolver)); - writer.WriteLine(" ", GetDFAInfo(explorer, charSetSolver)); - foreach (int state in explorer.GetStates()) - { - writer.WriteLine(" ", state, explorer.DescribeState(state)); - if (state == explorer.InitialState) - { - writer.WriteLine(" "); - } - if (explorer.IsFinalState(state)) - { - writer.WriteLine(" "); - } - writer.WriteLine(" "); - writer.WriteLine(" ", state, explorer.DescribeState(state)); - } - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" ", explorer.InitialState); - writer.WriteLine(" "); - - foreach (Transition transition in epsilonTransitions) - { - writer.WriteLine(" ", transition.SourceState, transition.TargetState); - } - - foreach (KeyValuePair<(int, int), List<(SymbolicRegexNode?, TSet)>> transition in nonEpsilonTransitions) - { - string label = string.Join($",{Environment.NewLine} ", DescribeLabels(explorer, transition.Value, charSetSolver)); - string info = ""; - if 
(label.Length > (uint)maxLabelLength) - { - info = $"FullLabel = \"{label}\" "; - label = string.Concat(label.AsSpan(0, maxLabelLength), ".."); - } - - writer.WriteLine($" "); - } - - foreach (int state in explorer.GetStates()) - { - writer.WriteLine(" ", state); - } - - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(" "); - writer.WriteLine(""); - } - - private static string GetDFAInfo(DfaExplorer explorer, CharSetSolver solver) - { - StringBuilder sb = new(); - sb.Append($"States = {explorer.StateCount} "); - sb.Append($"Transitions = {explorer.TransitionCount} "); - sb.Append($"Min Terms ({explorer._builder._solver.GetMinterms()!.Length}) = ").AppendJoin(',', DescribeLabels(explorer, explorer.Alphabet, solver)); - return sb.ToString(); - } - - private static IEnumerable DescribeLabels(DfaExplorer explorer, IList<(SymbolicRegexNode?, TSet)> items, CharSetSolver solver) - { - for (int i = 0; i < items.Count; i++) - { - yield return explorer.DescribeLabel(items[i], solver); - } - } - - /// Used to unwind a regex into a DFA up to a bound that limits the number of states - private sealed class DfaExplorer - { - private readonly DfaMatchingState _initialState; - private readonly List _states = new(); - private readonly List _transitions = new(); - private readonly SymbolicNFA? 
_nfa; - internal readonly SymbolicRegexBuilder _builder; - - internal DfaExplorer(SymbolicRegexMatcher srm, bool nfa, bool addDotStar, bool reverse, int maxStates) - { - _builder = srm._builder; - uint startId = reverse ? - (srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) : - (srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0); - - // Create the initial state - _initialState = _builder.CreateState( - reverse ? srm._reversePattern : - addDotStar ? srm._dotStarredPattern : - srm._pattern, startId); - - if (nfa) - { - _nfa = _initialState.Node.Explore(maxStates); - for (int q = 0; q < _nfa.StateCount; q++) - { - _states.Add(q); - foreach ((TSet, SymbolicRegexNode?, int) branch in _nfa.EnumeratePaths(q)) - { - _transitions.Add(new Transition(q, branch.Item3, (branch.Item2, branch.Item1))); - } - } - } - else - { - Dictionary<(int, int), TSet> normalizedMoves = new(); - Stack> stack = new(); - stack.Push(_initialState); - _states.Add(_initialState.Id); - - HashSet stateSet = new(); - stateSet.Add(_initialState.Id); - - TSet[]? minterms = _builder._solver.GetMinterms(); - Debug.Assert(minterms is not null); - - // Unwind until the stack is empty or the bound has been reached - while (stack.Count > 0 && (maxStates <= 0 || _states.Count < maxStates)) - { - DfaMatchingState q = stack.Pop(); - foreach (TSet c in minterms) - { - DfaMatchingState p = q.Next(c); - - // check that p is not a dead-end - if (!p.IsNothing) - { - if (stateSet.Add(p.Id)) - { - stack.Push(p); - _states.Add(p.Id); - } - - (int, int) qp = (q.Id, p.Id); - normalizedMoves[qp] = normalizedMoves.ContainsKey(qp) ? - _builder._solver.Or(normalizedMoves[qp], c) : - c; - } - } - } - - foreach (KeyValuePair<(int, int), TSet> entry in normalizedMoves) - { - _transitions.Add(new Transition(entry.Key.Item1, entry.Key.Item2, (null, entry.Value))); - } - } - } - - public (SymbolicRegexNode?, TSet)[] Alphabet - { - get - { - TSet[]? 
alphabet = _builder._solver.GetMinterms(); - Debug.Assert(alphabet is not null); - var results = new (SymbolicRegexNode?, TSet)[alphabet.Length]; - for (int i = 0; i < alphabet.Length; i++) - { - results[i] = (null, alphabet[i]); - } - return results; - } - } - - public int InitialState => _nfa is not null ? 0 : _initialState.Id; - - public int StateCount => _states.Count; - - public int TransitionCount => _transitions.Count; - - public string DescribeLabel((SymbolicRegexNode?, TSet) lab, CharSetSolver solver) => - WebUtility.HtmlEncode(lab.Item1 is null ? // Conditional nullability based on anchors - _builder._solver.PrettyPrint(lab.Item2, solver) : - $"{lab.Item1}/{_builder._solver.PrettyPrint(lab.Item2, solver)}"); - - public string DescribeState(int state) - { - if (_nfa is not null) - { - Debug.Assert(state < _nfa.StateCount); - string? str = WebUtility.HtmlEncode(_nfa.GetNode(state).ToString()); - return _nfa.IsUnexplored(state) ? $"Unexplored:{str}" : str; - } - - Debug.Assert(_builder._stateArray is not null); - return _builder._stateArray[state].DgmlView; - } - - public IEnumerable GetStates() => _states; - - public bool IsFinalState(int state) - { - if (_nfa is not null) - { - Debug.Assert(state < _nfa.StateCount); - return _nfa.CanBeNullable(state); - } - - Debug.Assert(_builder._stateArray is not null && state < _builder._stateArray.Length); - return _builder._stateArray[state].Node.CanBeNullable; - } - - public List GetTransitions() => _transitions; - } - - private sealed record Transition(int SourceState, int TargetState, (SymbolicRegexNode?, TSet) Label) - { - public bool IsEpsilon => Label.Equals(default); - } - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 2184b987e5b17f..ec9a5694f402d1 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -40,12 +40,12 @@ internal SymbolicRegexNode ConvertToSymbolicRegexNode(RegexNode root) DoublyLinkedList> rootResult = new(); // Create a stack to be processed in order to process iteratively rather than recursively, and push the root on. - Stack<(RegexNode Node, bool TryToMarkFixedLength, DoublyLinkedList> Result, DoublyLinkedList>[]? ChildResults)> stack = new(); - stack.Push((root, true, rootResult, CreateChildResultArray(root.ChildCount()))); + Stack<(RegexNode Node, DoublyLinkedList> Result, DoublyLinkedList>[]? ChildResults)> stack = new(); + stack.Push((root, rootResult, CreateChildResultArray(root.ChildCount()))); // Continue to iterate until the stack is empty, popping the next item on each iteration. // Some popped items may be pushed back on as part of processing. - while (stack.TryPop(out (RegexNode Node, bool TryToMarkFixedLength, DoublyLinkedList> Result, DoublyLinkedList>[]? ChildResults) popped)) + while (stack.TryPop(out (RegexNode Node, DoublyLinkedList> Result, DoublyLinkedList>[]? ChildResults) popped)) { RegexNode node = popped.Node; DoublyLinkedList> result = popped.Result; @@ -96,15 +96,13 @@ internal SymbolicRegexNode ConvertToSymbolicRegexNode(RegexNode root) Debug.Assert(childResults is not null && childResults.Length == node.ChildCount()); // Push back the temporarily popped item. Next time this work item is seen, its ChildResults list will be ready. - // Propagate the length mark check only in case of alternation. 
stack.Push(popped); - bool mark = node.Kind == RegexNodeKind.Alternate && popped.TryToMarkFixedLength; // Push all the children to be converted for (int i = 0; i < node.ChildCount(); ++i) { childResults[i] = new DoublyLinkedList>(); - stack.Push((node.Child(i), mark, childResults[i], CreateChildResultArray(node.Child(i).ChildCount()))); + stack.Push((node.Child(i), childResults[i], CreateChildResultArray(node.Child(i).ChildCount()))); } break; } @@ -228,8 +226,9 @@ static string UnexpectedNodeType(RegexNode node) case RegexNodeKind.Alternate: { - // Alternations are created by creating an Or of all of its children. - // This Or needs to be "ordered" to achieve the same semantics as the backtracking engines. + // Alternations in SymbolicRegexNode are binary and always normalized to right associative + // form, so here the list of children is built into a tree of alternations. + // The order is kept to achieve the same semantics as the backtracking engines. SymbolicRegexNode or = _builder._nothing; // Enumerate in reverse order through the child results @@ -240,7 +239,7 @@ static string UnexpectedNodeType(RegexNode node) // If childResult is a non-singleton list, then it denotes a concatenation that must be constructed at this point. SymbolicRegexNode elem = childResult.Count == 1 ? childResult.FirstElement : - _builder.CreateConcatAlreadyReversed(childResult.EnumerateLastToFirst(), popped.TryToMarkFixedLength); + _builder.CreateConcatAlreadyReversed(childResult.EnumerateLastToFirst()); if (elem.IsNothing) { continue; @@ -248,7 +247,7 @@ static string UnexpectedNodeType(RegexNode node) or = elem.IsAnyStar ? 
elem : // .* is the absorbing element - SymbolicRegexNode.OrderedOr(_builder, elem, or); + SymbolicRegexNode.CreateAlternate(_builder, elem, or); } result.AddLast(or); break; @@ -260,10 +259,10 @@ static string UnexpectedNodeType(RegexNode node) Debug.Assert(childResults.Length == 1); DoublyLinkedList> childResult = childResults[0]; - // Convert a list of nodes into a concatenation, do not propagate the length marker flag inside the loop body + // Convert a list of nodes into a concatenation SymbolicRegexNode body = childResult.Count == 1 ? childResult.FirstElement : - _builder.CreateConcatAlreadyReversed(childResult.EnumerateLastToFirst(), false); + _builder.CreateConcatAlreadyReversed(childResult.EnumerateLastToFirst()); result.AddLast(_builder.CreateLoop(body, node.Kind == RegexNodeKind.Lazyloop, node.M, node.N)); break; } @@ -291,10 +290,9 @@ static string UnexpectedNodeType(RegexNode node) // Only a top-level concatenation or capture node can result in a non-singleton list. Debug.Assert(rootResult.Count == 1 || root.Kind == RegexNodeKind.Concatenate || root.Kind == RegexNodeKind.Capture); - // If the root node is a concatenation, then the converted concatenation is built with length marker check being true. return rootResult.Count == 1 ? 
rootResult.FirstElement : - _builder.CreateConcatAlreadyReversed(rootResult.EnumerateLastToFirst(), tryCreateFixedLengthMarker: true); + _builder.CreateConcatAlreadyReversed(rootResult.EnumerateLastToFirst()); void EnsureNewlinePredicateInitialized() { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicNFA.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicNFA.cs deleted file mode 100644 index c3485d8a5712ab..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicNFA.cs +++ /dev/null @@ -1,387 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if DEBUG -using System.Collections; -using System.Collections.Generic; -using System.Diagnostics; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Represents the exploration of a symbolic regex as a symbolic NFA - internal sealed class SymbolicNFA where TSet : IComparable, IEquatable - { - private readonly ISolver _solver; - private readonly Transition[] _transitionFunction; - private readonly SymbolicRegexNode[] _finalCondition; - private readonly HashSet _unexplored; - private readonly SymbolicRegexNode[] _nodes; - - private const int DeadendState = -1; - private const int UnexploredState = -2; - - /// If true then some states have not been explored - public bool IsIncomplete => _unexplored.Count > 0; - - private SymbolicNFA(ISolver solver, Transition[] transitionFunction, HashSet unexplored, SymbolicRegexNode[] nodes) - { - Debug.Assert(transitionFunction.Length > 0 && nodes.Length == transitionFunction.Length); - _solver = solver; - _transitionFunction = transitionFunction; - _finalCondition = new SymbolicRegexNode[nodes.Length]; - for (int i = 0; i < nodes.Length; i++) - { - _finalCondition[i] = nodes[i].ExtractNullabilityTest(); - } - 
_unexplored = unexplored; - _nodes = nodes; - } - - /// Total number of states, 0 is the initial state, states are numbered from 0 to StateCount-1 - public int StateCount => _transitionFunction.Length; - - /// If true then the state has not been explored - public bool IsUnexplored(int state) => _transitionFunction[state]._leaf == UnexploredState; - - /// If true then the state has no outgoing transitions - public bool IsDeadend(int state) => _transitionFunction[state]._leaf == DeadendState; - - /// If true then backtracking terminates in this state - public bool IsBacktrackEnd(int state) => _nodes[state].IsHighPriorityNullable; - - /// Returns true if the state is nullable in the given context - public bool IsFinal(int state, uint context) => _finalCondition[state].IsNullableFor(context); - - /// Returns true if the state is nullable for some context - public bool CanBeNullable(int state) => _finalCondition[state].CanBeNullable; - - /// Returns true if the state is nullable for all contexts - public bool IsNullable(int state) => _finalCondition[state].IsNullable; - - /// Gets the underlying node of the state - public SymbolicRegexNode GetNode(int state) => _nodes[state]; - - /// Enumerates all target states from the given source state - /// must be a an integer between 0 and StateCount-1 - /// must be a value that acts as a minterm for the transitions emanating from the source state - /// reflects the immediate surrounding of the input and is used to determine nullability of anchors - public IEnumerable EnumerateTargetStates(int sourceState, TSet input, uint context) - { - Debug.Assert(sourceState >= 0 && sourceState < _transitionFunction.Length); - - // First operate in a mode assuming no Union happens by finding the target leaf state if one exists - Transition transition = _transitionFunction[sourceState]; - while (transition._kind != TransitionRegexKind.Union) - { - switch (transition._kind) - { - case TransitionRegexKind.Leaf: - // deadend and unexplored are 
negative - if (transition._leaf >= 0) - { - Debug.Assert(transition._leaf < _transitionFunction.Length); - yield return transition._leaf; - } - // The single target (or no target) state was found, so exit the whole enumeration - yield break; - - case TransitionRegexKind.Conditional: - Debug.Assert(transition._test is not null && transition._first is not null && transition._second is not null); - // Branch according to the input condition in relation to the test condition - if (!_solver.IsEmpty(_solver.And(input, transition._test))) - { - // in a conditional transition input must be exclusive - Debug.Assert(_solver.IsEmpty(_solver.And(input, _solver.Not(transition._test)))); - transition = transition._first; - } - else - { - transition = transition._second; - } - break; - - default: - Debug.Assert(transition._kind == TransitionRegexKind.Lookaround && transition._look is not null && transition._first is not null && transition._second is not null); - // Branch according to nullability of the lookaround condition in the given context - transition = transition._look.IsNullableFor(context) ? 
- transition._first : - transition._second; - break; - } - } - - // Continue operating in a mode where several target states can be yielded - Debug.Assert(transition._first is not null && transition._second is not null); - Stack todo = new(); - todo.Push(transition._second); - todo.Push(transition._first); - while (todo.TryPop(out _)) - { - switch (transition._kind) - { - case TransitionRegexKind.Leaf: - // dead-end - if (transition._leaf >= 0) - { - Debug.Assert(transition._leaf < _transitionFunction.Length); - yield return transition._leaf; - } - break; - - case TransitionRegexKind.Conditional: - Debug.Assert(transition._test is not null && transition._first is not null && transition._second is not null); - // Branch according to the input condition in relation to the test condition - if (!_solver.IsEmpty(_solver.And(input, transition._test))) - { - // in a conditional transition input must be exclusive - Debug.Assert(_solver.IsEmpty(_solver.And(input, _solver.Not(transition._test)))); - todo.Push(transition._first); - } - else - { - todo.Push(transition._second); - } - break; - - case TransitionRegexKind.Lookaround: - Debug.Assert(transition._look is not null && transition._first is not null && transition._second is not null); - // Branch according to nullability of the lookaround condition in the given context - todo.Push(transition._look.IsNullableFor(context) ? 
transition._first : transition._second); - break; - - default: - Debug.Assert(transition._kind == TransitionRegexKind.Union && transition._first is not null && transition._second is not null); - todo.Push(transition._second); - todo.Push(transition._first); - break; - } - } - } - - public IEnumerable<(TSet, SymbolicRegexNode?, int)> EnumeratePaths(int sourceState) => - _transitionFunction[sourceState].EnumeratePaths(_solver, _solver.Full); - - public static SymbolicNFA Explore(SymbolicRegexNode root, int bound) - { - (Dictionary, Transition> cache, - Dictionary, int> statemap, - List> nodes, - Stack front) workState = (new(), new(), new(), new()); - - workState.nodes.Add(root); - workState.statemap[root] = 0; - workState.front.Push(0); - - Dictionary transitions = new(); - Stack front = new(); - - while (workState.front.Count > 0) - { - Debug.Assert(front.Count == 0); - - // Work Breadth-First in layers, swap front with workState.front - Stack tmp = front; - front = workState.front; - workState.front = tmp; - - // Process all the states in front first - // Any new states detected in Convert are added to workState.front - while (front.Count > 0 && (bound <= 0 || workState.nodes.Count < bound)) - { - int q = front.Pop(); - - // If q was on the front it must be associated with a node but not have a transition yet - Debug.Assert(q >= 0 && q < workState.nodes.Count && !transitions.ContainsKey(q)); - transitions[q] = Convert(workState.nodes[q].CreateDerivative(), workState); - } - - if (front.Count > 0) - { - // The state bound was reached without completing the exploration so exit the loop - break; - } - } - - SymbolicRegexNode[] nodes_array = workState.nodes.ToArray(); - - // All states are numbered from 0 to nodes.Count-1 - Transition[] transition_array = new Transition[nodes_array.Length]; - foreach (KeyValuePair.Transition> entry in transitions) - { - transition_array[entry.Key] = entry.Value; - } - - HashSet unexplored = new(front); - 
unexplored.UnionWith(workState.front); - foreach (int q in unexplored) - { - transition_array[q] = Transition.s_unexplored; - } - - // At this point no entry can be null in the transition array - Debug.Assert(Array.TrueForAll(transition_array, tr => tr is not null)); - - var nfa = new SymbolicNFA(root._builder._solver, transition_array, unexplored, nodes_array); - return nfa; - } - - private static Transition Convert(TransitionRegex tregex, - (Dictionary, Transition> cache, - Dictionary, int> statemap, - List> nodes, - Stack front) args) - { - Transition? transition; - if (args.cache.TryGetValue(tregex, out transition)) - { - return transition; - } - - Stack<(TransitionRegex, bool)> work = new(); - work.Push((tregex, false)); - - while (work.TryPop(out (TransitionRegex, bool) top)) - { - TransitionRegex tr = top.Item1; - bool wasPushedSecondTime = top.Item2; - if (wasPushedSecondTime) - { - Debug.Assert(tr._kind != TransitionRegexKind.Leaf && tr._first is not null && tr._second is not null); - transition = new Transition(kind: tr._kind, - test: tr._test, - look: tr._node, - first: args.cache[tr._first], - second: args.cache[tr._second]); - args.cache[tr] = transition; - } - else - { - switch (tr._kind) - { - case TransitionRegexKind.Leaf: - Debug.Assert(tr._node is not null); - - if (tr._node.IsNothing) - { - args.cache[tr] = Transition.s_deadend; - } - else - { - int state; - if (!args.statemap.TryGetValue(tr._node, out state)) - { - state = args.nodes.Count; - args.nodes.Add(tr._node); - args.statemap[tr._node] = state; - args.front.Push(state); - } - transition = new Transition(kind: TransitionRegexKind.Leaf, leaf: state); - args.cache[tr] = transition; - } - break; - - default: - Debug.Assert(tr._first is not null && tr._second is not null); - - // Push the tr for the second time - work.Push((tr, true)); - - // Push the branches also, unless they have been computed already - if (!args.cache.ContainsKey(tr._second)) - { - work.Push((tr._second, false)); - } - - 
if (!args.cache.ContainsKey(tr._first)) - { - work.Push((tr._first, false)); - } - - break; - } - } - } - - return args.cache[tregex]; - } - - /// Representation of transitions inside the parent class - private sealed class Transition - { - public readonly TransitionRegexKind _kind; - public readonly int _leaf; - public readonly TSet? _test; - public readonly SymbolicRegexNode? _look; - public readonly Transition? _first; - public readonly Transition? _second; - - public static readonly Transition s_deadend = new Transition(TransitionRegexKind.Leaf, leaf: DeadendState); - public static readonly Transition s_unexplored = new Transition(TransitionRegexKind.Leaf, leaf: UnexploredState); - - internal Transition(TransitionRegexKind kind, int leaf = 0, TSet? test = default(TSet), SymbolicRegexNode? look = null, Transition? first = null, Transition? second = null) - { - _kind = kind; - _leaf = leaf; - _test = test; - _look = look; - _first = first; - _second = second; - } - - /// Enumerates all the paths in this transition excluding paths to dead-ends (and unexplored states if any) - internal IEnumerable<(TSet, SymbolicRegexNode?, int)> EnumeratePaths(ISolver solver, TSet pathCondition) - { - switch (_kind) - { - case TransitionRegexKind.Leaf: - // Omit any path that leads to a deadend or is unexplored - if (_leaf >= 0) - { - yield return (pathCondition, null, _leaf); - } - break; - - case TransitionRegexKind.Union: - Debug.Assert(_first is not null && _second is not null); - foreach ((TSet, SymbolicRegexNode?, int) path in _first.EnumeratePaths(solver, pathCondition)) - { - yield return path; - } - foreach ((TSet, SymbolicRegexNode?, int) path in _second.EnumeratePaths(solver, pathCondition)) - { - yield return path; - } - break; - - case TransitionRegexKind.Conditional: - Debug.Assert(_test is not null && _first is not null && _second is not null); - foreach ((TSet, SymbolicRegexNode?, int) path in _first.EnumeratePaths(solver, solver.And(pathCondition, _test))) - { - 
yield return path; - } - foreach ((TSet, SymbolicRegexNode?, int) path in _second.EnumeratePaths(solver, solver.And(pathCondition, solver.Not(_test)))) - { - yield return path; - } - break; - - default: - Debug.Assert(_kind is TransitionRegexKind.Lookaround && _look is not null && _first is not null && _second is not null); - foreach ((TSet, SymbolicRegexNode?, int) path in _first.EnumeratePaths(solver, pathCondition)) - { - SymbolicRegexNode nullabilityTest = path.Item2 is null ? _look : _look._builder.And(path.Item2, _look); - yield return (path.Item1, nullabilityTest, path.Item3); - } - foreach ((TSet, SymbolicRegexNode?, int) path in _second.EnumeratePaths(solver, pathCondition)) - { - // Complement the nullability test - SymbolicRegexNode nullabilityTest = path.Item2 is null ? _look._builder.Not(_look) : _look._builder.And(path.Item2, _look._builder.Not(_look)); - yield return (path.Item1, nullabilityTest, path.Item3); - } - break; - } - } - } - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs index b16ba24c754062..4a6db4bd74685e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs @@ -53,12 +53,6 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, private SymbolicRegexNode? _nwbAnchor; internal SymbolicRegexNode NonBoundaryAnchor => _nwbAnchor ??= SymbolicRegexNode.CreateBoundaryAnchor(this, SymbolicRegexNodeKind.NonBoundaryAnchor); - private SymbolicRegexSet? _fullSet; - internal SymbolicRegexSet FullSet => _fullSet ??= SymbolicRegexSet.CreateFull(this); - - private SymbolicRegexSet? 
_emptySet; - internal SymbolicRegexSet EmptySet => _emptySet ??= SymbolicRegexSet.CreateEmpty(this); - internal TSet _wordLetterForBoundariesSet; internal TSet _newLineSet; @@ -82,7 +76,6 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, SymbolicRegexNode?, // _left SymbolicRegexNode?, // _right int, int, TSet?, // _lower, _upper, _set - SymbolicRegexSet?, SymbolicRegexInfo), SymbolicRegexNode> _nodeCache = new(); // The following dictionaries are used as caches for operations that recurse over the structure of SymbolicRegexNode. @@ -115,16 +108,6 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable, /// internal readonly Dictionary<(SymbolicRegexNode, SymbolicRegexNode), bool> _subsumptionCache = new(); -#if DEBUG - internal readonly Dictionary<(TransitionRegexKind, // _kind - TSet?, // _test - TransitionRegex?, // _first - TransitionRegex?, // _second - SymbolicRegexNode?, // _leaf - DerivativeEffect?), // _effect - TransitionRegex> _trCache = new(); -#endif - /// /// Maps state ids to states, initial capacity is 1024 states. /// Each time more states are needed the length is increased by 1024. @@ -224,16 +207,28 @@ internal TSet GetMinterm(int mintermId) _solver.Empty; // minterm=False represents \Z } - /// - /// Make a disjunction of given nodes, simplify by eliminating any regex that accepts no inputs - /// - internal SymbolicRegexNode Or(params SymbolicRegexNode[] nodes) => - SymbolicRegexNode.Or(this, nodes); + /// Returns the span from that may contain transitions for the given state + internal Span?> GetDeltasFor(DfaMatchingState state) + { + if (_delta is null || _minterms is null) + return Span?>.Empty; + int numMinterms = state.StartsWithLineAnchor ? 
_minterms.Length + 1 : _minterms.Length; + return _delta.AsSpan(state.Id << _mintermsLog, numMinterms); + } + + /// Returns the span from that may contain transitions for the given state + internal Span GetNfaDeltasFor(DfaMatchingState state) + { + if (_nfaDelta is null || _minterms is null || !_nfaStateArrayInverse.TryGetValue(state.Id, out int nfaState)) + return Span.Empty; + int numMinterms = state.StartsWithLineAnchor ? _minterms.Length + 1 : _minterms.Length; + return _nfaDelta.AsSpan(nfaState << _mintermsLog, numMinterms); + } /// - /// Make an ordered disjunction of given nodes, simplify by eliminating any regex that accepts no inputs + /// Make an alternation of given nodes, simplify by eliminating any regex that accepts no inputs /// - internal SymbolicRegexNode OrderedOr(List> nodes) + internal SymbolicRegexNode Alternate(List> nodes) { HashSet> seenElems = new(); @@ -247,7 +242,7 @@ internal SymbolicRegexNode OrderedOr(List> nodes) } } - // Iterate backwards to avoid quadratic rebuilding of the Or nodes, which are always simplified to + // Iterate backwards to avoid quadratic rebuilding of the Alternate nodes, which are always simplified to // right associative form. Concretely: // In (a|(b|c)) | d -> (a|(b|(c|d)) the first argument is not a subtree of the result. // In a | (b|(c|d)) -> (a|(b|(c|d)) the second argument is a subtree of the result. 
@@ -255,50 +250,19 @@ internal SymbolicRegexNode OrderedOr(List> nodes) SymbolicRegexNode or = _nothing; for (int i = nodes.Count - 1; i >= 0; --i) { - or = SymbolicRegexNode.OrderedOr(this, nodes[i], or, deduplicated: true); + or = SymbolicRegexNode.CreateAlternate(this, nodes[i], or, deduplicated: true); } return or; } - /// - /// Make a conjunction of given nodes, simplify by eliminating nodes that accept everything - /// - internal SymbolicRegexNode And(params SymbolicRegexNode[] nodes) => - SymbolicRegexNode.And(this, nodes); - - /// - /// Make a disjunction of given set of nodes, simplify by eliminating any regex that accepts no inputs - /// - internal SymbolicRegexNode Or(SymbolicRegexSet set) => - set.IsNothing ? _nothing : - set.IsEverything ? _anyStar : - set.IsSingleton ? set.GetSingletonElement() : - SymbolicRegexNode.Or(this, set); - - internal SymbolicRegexNode Or(SymbolicRegexNode x, SymbolicRegexNode y) => - x == _anyStar || y == _anyStar ? _anyStar : - x == _nothing ? y : - y == _nothing ? x : - SymbolicRegexNode.Or(this, x, y); - - /// - /// Make a conjunction of given set, simplify by eliminating any regex that accepts all inputs, - /// returns the empty regex if the regex accepts nothing - /// - internal SymbolicRegexNode And(SymbolicRegexSet set) => - set.IsNothing ? _nothing : - set.IsEverything ? _anyStar : - set.IsSingleton ? set.GetSingletonElement() : - SymbolicRegexNode.And(this, set); - /// /// Make a concatenation of given nodes, if any regex is nothing then return nothing, eliminate /// intermediate epsilons, if tryCreateFixedLengthMarker and length is fixed, add a fixed length /// marker at the end. 
/// - internal SymbolicRegexNode CreateConcat(List> nodes, bool tryCreateFixedLengthMarker = false) => - CreateConcatAlreadyReversed(EnumerateNodesInReverse(nodes), tryCreateFixedLengthMarker); + internal SymbolicRegexNode CreateConcat(List> nodes) => + CreateConcatAlreadyReversed(EnumerateNodesInReverse(nodes)); private static IEnumerable> EnumerateNodesInReverse(List> nodes) { @@ -310,21 +274,10 @@ private static IEnumerable> EnumerateNodesInReverse(List /// /// If any regex is nothing, then return nothing. /// Eliminate intermediate epsilons. - /// If tryCreateFixedLengthMarker and length is fixed, add a fixed length marker at the end. /// - internal SymbolicRegexNode CreateConcatAlreadyReversed(IEnumerable> nodes, bool tryCreateFixedLengthMarker) + internal SymbolicRegexNode CreateConcatAlreadyReversed(IEnumerable> nodes) { SymbolicRegexNode result = Epsilon; - - if (tryCreateFixedLengthMarker) - { - int length = CalculateFixedLength(nodes); - if (length >= 0) - { - result = CreateFixedLengthMarker(length); - } - } - // Iterate through all the nodes concatenating them together in reverse order. // Here the nodes enumeration is already reversed, so reversing it back to the original concatenation order. foreach (SymbolicRegexNode node in nodes) @@ -343,23 +296,6 @@ internal SymbolicRegexNode CreateConcatAlreadyReversed(IEnumerable CreateConcat(SymbolicRegexNode left, SymbolicRegexNode right) => SymbolicRegexNode.CreateConcat(this, left, right); - private static int CalculateFixedLength(IEnumerable> nodes) - { - int length = 0; - foreach (SymbolicRegexNode node in nodes) - { - int k = node.GetFixedLength(); - if (k < 0) - { - return -1; - } - - length += k; - } - - return length; - } - /// /// Make loop regex /// @@ -416,13 +352,6 @@ internal SymbolicRegexNode CreateSingleton(TSet set) /// Creates a fixed length marker for the end of a sequence. 
internal SymbolicRegexNode CreateFixedLengthMarker(int length) => SymbolicRegexNode.CreateFixedLengthMarker(this, length); - /// - /// Make a complemented node - /// - /// node to be complemented - /// - internal SymbolicRegexNode Not(SymbolicRegexNode node) => SymbolicRegexNode.Not(this, node); - internal SymbolicRegexNode CreateEffect(SymbolicRegexNode node, SymbolicRegexNode effectNode) => SymbolicRegexNode.CreateEffect(this, node, effectNode); internal SymbolicRegexNode CreateCapture(SymbolicRegexNode child, int captureNum) => CreateConcat(CreateCaptureStart(captureNum), CreateConcat(child, CreateCaptureEnd(captureNum))); @@ -484,17 +413,12 @@ internal SymbolicRegexNode Transform(SymbolicRegexNode n Debug.Assert(node._left is not null); return builder.CreateLoop(Transform(node._left, builder, setTransformer), node.IsLazy, node._lower, node._upper); - case SymbolicRegexNodeKind.Or: - Debug.Assert(node._alts is not null); - return builder.Or(node._alts.Transform(builder, setTransformer)); - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(node._left is not null && node._right is not null); - return SymbolicRegexNode.OrderedOr(builder, Transform(node._left, builder, setTransformer), Transform(node._right, builder, setTransformer), deduplicated: true); - - case SymbolicRegexNodeKind.And: - Debug.Assert(node._alts is not null); - return builder.And(node._alts.Transform(builder, setTransformer)); + return SymbolicRegexNode.CreateAlternate(builder, + Transform(node._left, builder, setTransformer), + Transform(node._right, builder, setTransformer), + deduplicated: true); case SymbolicRegexNodeKind.CaptureStart: return builder.CreateCaptureStart(node._lower); @@ -510,7 +434,7 @@ internal SymbolicRegexNode Transform(SymbolicRegexNode n { reverseTransformed[i] = Transform(concatElements[^(i + 1)], builder, setTransformer); } - return builder.CreateConcatAlreadyReversed(reverseTransformed, tryCreateFixedLengthMarker: false); + 
return builder.CreateConcatAlreadyReversed(reverseTransformed); } case SymbolicRegexNodeKind.DisableBacktrackingSimulation: @@ -518,9 +442,8 @@ internal SymbolicRegexNode Transform(SymbolicRegexNode n return builder.CreateDisableBacktrackingSimulation(Transform(node._left, builder, setTransformer)); default: - Debug.Assert(node._kind == SymbolicRegexNodeKind.Not); - Debug.Assert(node._left is not null); - return builder.Not(Transform(node._left, builder, setTransformer)); + Debug.Fail($"{nameof(Transform)}:{node._kind}"); + return null; } } @@ -529,10 +452,9 @@ internal SymbolicRegexNode Transform(SymbolicRegexNode n /// /// the pattern that this state will represent /// the kind of the character that led to this state - /// if true, then state won't be cached /// whether to use the separate space of states with capturing transitions or not /// - public DfaMatchingState CreateState(SymbolicRegexNode node, uint prevCharKind, bool disableCaching = false, bool capturing = false) + public DfaMatchingState CreateState(SymbolicRegexNode node, uint prevCharKind, bool capturing = false) { //first prune the anchors in the node TSet wlbSet = _wordLetterForBoundariesSet; @@ -547,16 +469,7 @@ public DfaMatchingState CreateState(SymbolicRegexNode node, uint pre var s = new DfaMatchingState(pruned_node, prevCharKind); if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState? 
state)) { - // do not cache set of states as states in NFA mode - if (disableCaching && pruned_node.Kind == SymbolicRegexNodeKind.Or) - { - s.Id = -1; // mark the Id as invalid - state = s; - } - else - { - state = MakeNewState(s, capturing); - } + state = MakeNewState(s, capturing); } return state; @@ -602,7 +515,7 @@ private DfaMatchingState MakeNewState(DfaMatchingState state, bool c /// public int CreateNfaState(SymbolicRegexNode node, uint prevCharKind) { - Debug.Assert(node.Kind != SymbolicRegexNodeKind.OrderedOr); + Debug.Assert(node.Kind != SymbolicRegexNodeKind.Alternate); // First make the underlying core state DfaMatchingState coreState = CreateState(node, prevCharKind); @@ -700,11 +613,11 @@ public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset SymbolicRegexNode node = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? coreTarget.Node._left! : coreTarget.Node; - if (node.Kind == SymbolicRegexNodeKind.OrderedOr) + if (node.Kind == SymbolicRegexNodeKind.Alternate) { // Create separate NFA states for all members of a disjunction // Here duplicate NFA states cannot arise because there are no duplicate nodes in the disjunction - List> alts = node.ToList(listKind: SymbolicRegexNodeKind.OrderedOr); + List> alts = node.ToList(listKind: SymbolicRegexNodeKind.Alternate); targets = new int[alts.Count]; int targetIndex = 0; foreach (SymbolicRegexNode q in alts) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs index a07e817d97be55..c278442b8075cb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs @@ -84,22 +84,6 @@ internal static 
SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can public bool ContainsEffect => (_info & ContainsEffectMask) != 0; - public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos) - { - uint isLazy = IsLazyLoopMask; - uint i = 0; - - for (int j = 0; j < infos.Length; j++) - { - // Disjunction is lazy if ALL of its members are lazy - isLazy &= infos[j]._info; - i |= infos[j]._info; - } - - i = (i & ~IsLazyLoopMask) | isLazy; - return new SymbolicRegexInfo(i); - } - /// /// The alternation remains high priority nullable if the left alternative is so. /// All other info properties are the logical disjunction of the resepctive info properties @@ -115,25 +99,6 @@ public static SymbolicRegexInfo Alternate(SymbolicRegexInfo left_info, SymbolicR isHighPriorityNullable: left_info.IsHighPriorityNullable, containsEffect: left_info.ContainsEffect || right_info.ContainsEffect); - public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos) - { - uint isLazy = IsLazyLoopMask; - uint isNullable = IsAlwaysNullableMask | CanBeNullableMask; - uint i = 0; - - foreach (SymbolicRegexInfo info in infos) - { - //nullability and lazyness are conjunctive while other properties are disjunctive - isLazy &= info._info; - isNullable &= info._info; - i |= info._info; - } - - i = (i & ~IsLazyLoopMask) | isLazy; - i = (i & ~(IsAlwaysNullableMask | CanBeNullableMask)) | isNullable; - return new SymbolicRegexInfo(i); - } - /// /// Concatenation remains high priority nullable if both left and right are so. 
/// Nullability is conjunctive and other properies are essentially disjunctive, @@ -184,20 +149,6 @@ public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound public static SymbolicRegexInfo Effect(SymbolicRegexInfo childInfo) => new SymbolicRegexInfo(childInfo._info | ContainsEffectMask); - public static SymbolicRegexInfo Not(SymbolicRegexInfo info) => - // Nullability is complemented, all other properties remain the same - // The following rules are used to determine nullability of Not(node): - // Observe that this is used as an over-approximation, actual nullability is checked dynamically based on given context. - // - If node is never nullable (for any context, info.CanBeNullable=false) then Not(node) is always nullable - // - If node is always nullable (info.IsNullable=true) then Not(node) can never be nullable - // For example \B.CanBeNullable=true and \B.IsNullable=false - // and ~(\B).CanBeNullable=true and ~(\B).IsNullable=false - Create(isAlwaysNullable: !info.CanBeNullable, - canBeNullable: !info.IsNullable, - startsWithLineAnchor: info.StartsWithLineAnchor, - containsSomeAnchor: info.ContainsSomeAnchor, - isLazyLoop: info.IsLazyLoop); - public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i); public bool Equals(SymbolicRegexInfo other) => _info == other._info; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs index ae8dd4819f0318..ea1f68075000b6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexKind.cs @@ -14,17 +14,8 @@ internal enum SymbolicRegexNodeKind Concat, /// A node that matches a loop (e.g. , , , etc.). 
Loop, - - /// A node that matches if any of its nodes match. - /// This is typically used to combine singletons. - Or, /// A node that matches if any of its nodes match and that matches them in a fixed order that mirrors how the backtracking engines operate (e.g. ). - OrderedOr, - /// A node that matches if all of its nodes match. - /// This is typically used to combine singletons. - And, - /// A node that matches if its node doesn't (e.g. ). - Not, + Alternate, /// A node that represents a beginning anchor (i.e. ). BeginningAnchor, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs new file mode 100644 index 00000000000000..8a5dad8d85cee9 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs @@ -0,0 +1,210 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#if DEBUG +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Net; + +namespace System.Text.RegularExpressions.Symbolic +{ + internal sealed partial class SymbolicRegexMatcher + { + /// + [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] + public override void SaveDGML(TextWriter writer, int maxLabelLength) + { + if (maxLabelLength < 0) + maxLabelLength = int.MaxValue; + + Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> transitions = GatherTransitions(_builder); + + writer.WriteLine(""); + writer.WriteLine(""); + writer.WriteLine(" "); + writer.WriteLine(" ", FormatInfo(_builder, transitions.Count)); + writer.WriteLine(" ", FormatInfo(_builder, transitions.Count)); + foreach (DfaMatchingState state in _builder._stateCache) + { + writer.WriteLine(" ", state.Id, state.DgmlView); + if (state.IsInitialState) + { + writer.WriteLine(" "); + } + if (state.Node.CanBeNullable) + { + writer.WriteLine(" "); + } + writer.WriteLine(" "); + writer.WriteLine(" ", state.Id, state.DgmlView); + } + writer.WriteLine(" "); + writer.WriteLine(" "); + foreach (DfaMatchingState initialState in GetInitialStates(this)) + { + Debug.Assert(_builder._stateCache.Contains(initialState)); + writer.WriteLine(" ", initialState.Id); + } + writer.WriteLine(" "); + + foreach (KeyValuePair<(int Source, int Target), (TSet Rule, List NfaTargets)> transition in transitions) + { + string label = DescribeLabel(transition.Value.Rule, _builder); + string info = ""; + if (label.Length > maxLabelLength) + { + info = $"FullLabel = \"{label}\" "; + label = string.Concat(label.AsSpan(0, maxLabelLength), ".."); + } + + writer.WriteLine($" "); + // Render NFA transitions as labelless "epsilon" transitions (i.e. ones that don't consume a character) + // from the target of the DFA transition. 
+ foreach (int nfaTarget in transition.Value.NfaTargets) + { + writer.WriteLine($" "); + } + } + + foreach (DfaMatchingState state in _builder._stateCache) + { + writer.WriteLine(" ", state.Id); + } + + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(" "); + writer.WriteLine(""); + + // This function gathers all transitions in the given builder and groups them by (source,destination) state ID + static Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> GatherTransitions(SymbolicRegexBuilder builder) + { + Debug.Assert(builder._delta is not null); + Debug.Assert(builder._minterms is not null); + Dictionary<(int Source, int Target), (TSet Rule, List NfaTargets)> result = new(); + foreach (DfaMatchingState source in builder._stateCache) + { + // Get the span of entries in delta that gives the transitions for the different minterms (one extra entry for the end-of-line minterm if the state starts with a line anchor) + Span?> deltas = builder.GetDeltasFor(source); + Span nfaDeltas = builder.GetNfaDeltasFor(source); + Debug.Assert(deltas.Length == (source.StartsWithLineAnchor ? builder._minterms.Length + 1 : builder._minterms.Length)); + for (int i = 0; i < deltas.Length; ++i) + { + // null entries are transitions not explored yet, so skip them + if (deltas[i] is DfaMatchingState target) + { + // Get or create the data for this (source,destination) state ID pair + (int Source, int Target) key = (source.Id, target.Id); + if (!result.TryGetValue(key, out (TSet Rule, List NfaTargets) entry)) + { + entry = (builder._solver.Empty, new List()); + } + // If this state has an
NFA transition for the same minterm, then associate + // those with the transition. + if (nfaDeltas.Length > 0 && nfaDeltas[i] is int[] nfaTargets) + { + foreach (int nfaTarget in nfaTargets) + { + entry.NfaTargets.Add(builder._nfaStateArray[nfaTarget]); + } + } + // Expand the rule for this minterm; go through GetMinterm because the extra end-of-line id has no entry in _minterms + result[key] = (builder._solver.Or(entry.Rule, builder.GetMinterm(i)), entry.NfaTargets); + } + } + } + return result; + } + + static string FormatInfo(SymbolicRegexBuilder builder, int transitionCount) + { + StringBuilder sb = new(); + sb.Append($"States = {builder._stateCache.Count} "); + sb.Append($"Transitions = {transitionCount} "); + sb.Append($"Min Terms ({builder._solver.GetMinterms()!.Length}) = ").AppendJoin(',', + DescribeLabels(builder._solver.GetMinterms()!, builder)); + return sb.ToString(); + } + + static IEnumerable DescribeLabels(IEnumerable labels, SymbolicRegexBuilder builder) + { + foreach (TSet label in labels) + { + yield return DescribeLabel(label, builder); + } + } + + static string DescribeLabel(TSet label, SymbolicRegexBuilder builder) => + WebUtility.HtmlEncode(builder._solver.PrettyPrint(label, builder._charSetSolver)); + + static IEnumerable> GetInitialStates(SymbolicRegexMatcher matcher) + { + foreach (DfaMatchingState state in matcher._dotstarredInitialStates) + yield return state; + foreach (DfaMatchingState state in matcher._initialStates) + yield return state; + foreach (DfaMatchingState state in matcher._reverseInitialStates) + yield return state; + } + } + } +} +#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs new file mode 100644 index 00000000000000..6808434ef98419 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Explore.cs @@ -0,0 +1,111 @@
+// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if DEBUG +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Net; + +namespace System.Text.RegularExpressions.Symbolic +{ + internal sealed partial class SymbolicRegexMatcher + { + /// + [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] + public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa) + { + Debug.Assert(_builder._minterms is not null); + + // Track seen states to avoid exploring twice + HashSet> seen = new(); + // Use a queue for unexplored states + // This results in a breadth-first exploration + Queue> toExplore = new(); + + // Explore all initial states as requested + if (includeDotStarred) + EnqueueAll(_dotstarredInitialStates, seen, toExplore); + if (includeReverse) + EnqueueAll(_reverseInitialStates, seen, toExplore); + if (includeOriginal) + EnqueueAll(_initialStates, seen, toExplore); + + if (exploreDfa) + { + while (toExplore.Count > 0) + { + // Don't dequeue yet, because a transition might fail + DfaMatchingState state = toExplore.Peek(); + // Include the special minterm for the last end-of-line if the state is sensitive to it + int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1; + // Explore successor states for each minterm + for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + { + int offset = (state.Id << _builder._mintermsLog) | mintermId; + if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState? 
nextState)) + goto DfaLimitReached; + EnqueueIfUnseen(nextState, seen, toExplore); + } + // Safe to dequeue now that the state has been completely handled + toExplore.Dequeue(); + } + } + + DfaLimitReached: + if (exploreNfa && toExplore.Count > 0) + { + // DFA states are broken up into NFA states when they are alternations + DfaMatchingState[] toBreakUp = toExplore.ToArray(); + toExplore.Clear(); + foreach (DfaMatchingState dfaState in toBreakUp) + { + // Remove state from seen so that it can be added back in if necessary + seen.Remove(dfaState); + // Enqueue all elements of a top level alternation or the state itself + foreach (var element in dfaState.Node.EnumerateAlternationBranches()) + { + int nfaState = _builder.CreateNfaState(element, dfaState.PrevCharKind); + EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore); + } + } + + while (toExplore.Count > 0) + { + // NFA transitions can't fail, so it's safe to dequeue here + DfaMatchingState state = toExplore.Dequeue(); + // Include the special minterm for the last end-of-line if the state is sensitive to it + int maxMinterm = state.StartsWithLineAnchor ?
_builder._minterms.Length : _builder._minterms.Length - 1; + // Explore successor states for each minterm + for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId) + { + int nfaOffset = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId; + int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset); + foreach (int nextNfaState in nextNfaStates) + { + EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore); + } + } + } + } + + static void EnqueueAll(DfaMatchingState[] states, HashSet> seen, Queue> toExplore) + { + foreach (DfaMatchingState state in states) + { + EnqueueIfUnseen(state, seen, toExplore); + } + } + + static void EnqueueIfUnseen(DfaMatchingState state, HashSet> seen, Queue> queue) + { + if (seen.Add(state)) + { + queue.Enqueue(state); + } + } + } + } +} +#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs new file mode 100644 index 00000000000000..509a5b8516a7b8 --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs @@ -0,0 +1,193 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#if DEBUG +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; + +namespace System.Text.RegularExpressions.Symbolic +{ + internal sealed partial class SymbolicRegexMatcher + { + /// + /// The probability of stopping match sampling when a candidate is found. This influences the expected length + /// of the sampled matches. 
For a pattern .* that accepts anything the expected length is: + /// 1/p - 1, where the -1 comes from the fact that the first coin is tossed with the empty input. + /// + private const double SampleMatchesStoppingProbability = 0.2; + + /// + /// Maximum length to try to sample input to. + /// + /// + /// This is required for cases where the state space has a loop that is not detected as a deadend, + /// which would otherwise cause the sampling to hang. + /// + private const int SampleMatchesMaxInputLength = 100; + + /// + [ExcludeFromCodeCoverage(Justification = "Currently only used for testing")] + public override IEnumerable SampleMatches(int k, int randomseed) + { + // Zero is treated as no seed, instead using a system provided one + Random random = randomseed != 0 ? new Random(randomseed) : new Random(); + + ISolver solver = _builder._solver; + CharSetSolver charSetSolver = _builder._charSetSolver; + + // Create helper BDDs for handling anchors and preferentially generating ASCII inputs + BDD asciiWordCharacters = charSetSolver.Or(new BDD[] { + charSetSolver.CreateSetFromRange('A', 'Z'), + charSetSolver.CreateSetFromRange('a', 'z'), + charSetSolver.CreateFromChar('_'), + charSetSolver.CreateSetFromRange('0', '9')}); + // Visible ASCII range for input character generation + BDD ascii = charSetSolver.CreateSetFromRange('\x20', '\x7E'); + BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters)); + + // Set up two sets of minterms, one with the additional special minterm for the last end-of-line + Debug.Assert(_builder._minterms is not null); + int[] mintermIdsWithoutZ = new int[_builder._minterms.Length]; + int[] mintermIdsWithZ = new int[_builder._minterms.Length + 1]; + for (int i = 0; i < _builder._minterms.Length; ++i) + { + mintermIdsWithoutZ[i] = i; + mintermIdsWithZ[i] = i; + } + mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length; + + for (int i = 0; i < k; i++) + { + // Holds the generated input so 
far + StringBuilder inputSoFar = new(); + StringBuilder? latestCandidate = null; + + // The current set of reached states initially contains just the root + NfaMatchingState states = new(_builder); + // Here one could also consider previous characters for example for \b, \B, and ^ anchors + // and initialize inputSoFar accordingly + states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan.Empty, -1)]); + CurrentState statesWrapper = new(states); + + // Used for end suffixes + List possibleEndings = new(); + + while (true) + { + Debug.Assert(states.NfaStateSet.Count > 0); + + // Gather the possible endings for satisfying nullability + possibleEndings.Clear(); + if (NfaStateHandler.CanBeNullable(ref statesWrapper)) + { + // Unconditionally final state or end of the input due to \Z anchor for example + if (NfaStateHandler.IsNullable(ref statesWrapper) || + NfaStateHandler.IsNullable(ref statesWrapper, CharKind.BeginningEnd)) + { + possibleEndings.Add(""); + } + + // End of line due to end-of-line anchor + if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.Newline)) + { + possibleEndings.Add("\n"); + } + + // Ending with a word character: related to word boundary anchors (\b or \B) + if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.WordLetter)) + { + possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString()); + } + + // Ending with a non-word character: related to word boundary anchors (\b or \B) + if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.General)) + { + possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString()); + } + } + + // If we have a possible ending, then store a candidate input + if (possibleEndings.Count > 0) + { + latestCandidate ??= new(); + latestCandidate.Clear(); + latestCandidate.Append(inputSoFar); + // Choose some suffix that allows some anchor (if any) to be nullable + latestCandidate.Append(Choose(random, possibleEndings)); + + // Choose to stop here based on a coin-toss + if (FlipBiasedCoin(random,
SampleMatchesStoppingProbability)) + { + yield return latestCandidate.ToString(); + break; + } + } + + // Shuffle the minterms, including the last end-of-line marker if appropriate + int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(ref statesWrapper) ? + Shuffle(random, mintermIdsWithZ) : + Shuffle(random, mintermIdsWithoutZ); + foreach (int mintermId in mintermIds) + { + bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId); + Debug.Assert(success); + if (states.NfaStateSet.Count > 0) + { + TSet minterm = _builder.GetMinterm(mintermId); + // Append a random member of the minterm + inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver)); + break; + } + else + { + // The transition was a dead end, undo and continue to try another minterm + NfaStateHandler.UndoTransition(ref statesWrapper); + } + } + + // In the case that there are no next states or input has become too large: stop here + if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength) + { + // Ending up here without an ending is unlikely but possible for example for infeasible patterns + // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend. + if (latestCandidate != null) + { + yield return latestCandidate.ToString(); + } + break; + } + } + } + + static BDD ToBDD(TSet set, ISolver solver, CharSetSolver charSetSolver) => solver.ConvertToBDD(set, charSetSolver); + + static T Choose(Random random, IList elems) => elems[random.Next(elems.Count)]; + + static char ChooseChar(Random random, BDD bdd, BDD ascii, CharSetSolver charSetSolver) + { + Debug.Assert(!bdd.IsEmpty); + // Select characters from the visible ASCII range whenever possible + BDD bdd1 = charSetSolver.And(bdd, ascii); + (uint, uint) range = Choose(random, BDDRangeConverter.ToRanges(bdd1.IsEmpty ? 
bdd : bdd1)); + return (char)random.Next((int)range.Item1, (int)range.Item2 + 1); + } + + static bool FlipBiasedCoin(Random random, double probTrue) => random.NextDouble() < probTrue; + + static T[] Shuffle(Random random, T[] array) + { + // In-place Fisher-Yates shuffle + for (int i = 0; i < array.Length - 1; ++i) + { + int j = random.Next(i, array.Length); + var tmp = array[i]; + array[i] = array[j]; + array[j] = tmp; + } + return array; + } + } + } +} +#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index afc476d34a9cce..4109865c7ff4df 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -15,29 +15,20 @@ namespace System.Text.RegularExpressions.Symbolic internal abstract class SymbolicRegexMatcher { #if DEBUG - /// Unwind the regex of the matcher and save the resulting state graph in DGML - /// Writer to which the DGML is written. - /// True to create an NFA instead of a DFA. - /// True to prepend .*? onto the pattern (outside of the implicit root capture). - /// If true, then unwind the regex backwards. - /// The approximate maximum number of states to include; less than or equal to 0 for no maximum. - /// maximum length of labels in nodes anything over that length is indicated with .. 
- public abstract void SaveDGML(TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength); + /// + public abstract void SaveDGML(TextWriter writer, int maxLabelLength); - /// - /// Generates up to k random strings matched by the regex - /// - /// upper bound on the number of generated strings - /// random seed for the generator, 0 means no random seed - /// if true then generate inputs that do not match - /// - public abstract IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative); + /// + public abstract IEnumerable SampleMatches(int k, int randomseed); + + /// + public abstract void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa); #endif } /// Represents a regex matching engine that performs regex matching using symbolic derivatives. /// Character set type. - internal sealed class SymbolicRegexMatcher : SymbolicRegexMatcher where TSet : IComparable, IEquatable + internal sealed partial class SymbolicRegexMatcher : SymbolicRegexMatcher where TSet : IComparable, IEquatable { /// Maximum number of built states before switching over to NFA mode. /// @@ -534,7 +525,7 @@ private int FindEndPositionDeltas(SymbolicRegexBuilder buil { // Check whether there's a fixed-length marker for the current state. If there is, we can // use that length to optimize subsequent matching phases. - matchLength = TStateHandler.FixedLength(ref state); + matchLength = TStateHandler.FixedLength(ref state, GetCharKind(input, pos)); endPos = pos; // A match is known to exist. If that's all we need to know, we're done. @@ -945,24 +936,12 @@ public void InitializeFrom(DfaMatchingState dfaMatchingState) // If the DFA state is a union of multiple DFA states, loop through all of them // adding an NFA state for each. - SymbolicRegexNode node = dfaMatchingState.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? - dfaMatchingState.Node._left! 
: dfaMatchingState.Node; - while (node.Kind is SymbolicRegexNodeKind.OrderedOr) + foreach (SymbolicRegexNode element in dfaMatchingState.Node.EnumerateAlternationBranches()) { - Debug.Assert(node._left is not null && node._right is not null); - - // Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too - SymbolicRegexNode element = dfaMatchingState.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ? - Builder.CreateDisableBacktrackingSimulation(node._left) : node._left; - // Create (possibly new) NFA states for all the members. // Add their IDs to the current set of NFA states and into the list. NfaStateSet.Add(Builder.CreateNfaState(element, dfaMatchingState.PrevCharKind), out _); - node = node._right; } - - // Finally, just add an NFA state for the singular DFA state or last element of a union. - NfaStateSet.Add(Builder.CreateNfaState(node, dfaMatchingState.PrevCharKind), out _); } } @@ -996,7 +975,7 @@ private interface IStateHandler public static abstract bool StartsWithLineAnchor(ref CurrentState state); public static abstract bool IsNullable(ref CurrentState state, uint nextCharKind); public static abstract bool IsDeadend(ref CurrentState state); - public static abstract int FixedLength(ref CurrentState state); + public static abstract int FixedLength(ref CurrentState state, uint nextCharKind); public static abstract bool IsInitialState(ref CurrentState state); public static abstract bool TakeTransition(SymbolicRegexBuilder builder, ref CurrentState state, int mintermId); } @@ -1016,7 +995,7 @@ private interface IStateHandler /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int FixedLength(ref CurrentState state) => state.DfaState!.FixedLength; + public static int FixedLength(ref CurrentState state, uint nextCharKind) => state.DfaState!.FixedLength(nextCharKind); /// Gets whether this is an initial state. [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1089,7 +1068,7 @@ public static bool IsNullable(ref CurrentState state, uint nextCharKind) /// Gets the length of any fixed-length marker that exists for this state, or -1 if there is none. /// In NFA mode, there are no fixed-length markers. - public static int FixedLength(ref CurrentState state) => -1; + public static int FixedLength(ref CurrentState state, uint nextCharKind) => -1; /// Gets whether this is an initial state. /// In NFA mode, no set of states qualifies as an initial state. @@ -1147,14 +1126,56 @@ static int[] GetNextStates(int sourceState, int mintermId, SymbolicRegexBuilder< return builder._nfaDelta[nfaOffset] ?? builder.CreateNewNfaTransition(sourceState, mintermId, nfaOffset); } } - } #if DEBUG - public override void SaveDGML(TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength) => - DgmlWriter.Write(writer, this, nfa, addDotStar, reverse, maxStates, maxLabelLength); + /// Undo a previous call to . + public static void UndoTransition(ref CurrentState state) + { + Debug.Assert(state.DfaState is null, $"Expected null {nameof(state.DfaState)}."); + Debug.Assert(state.NfaState is not null, $"Expected non-null {nameof(state.NfaState)}."); - public override IEnumerable GenerateRandomMembers(int k, int randomseed, bool negative) => - new SymbolicRegexSampler(_pattern, randomseed, negative).GenerateRandomMembers(k); + NfaMatchingState nfaState = state.NfaState!; + + // Swap the current active states set with the scratch set to undo a previous transition. 
+ SparseIntMap nextStates = nfaState.NfaStateSet; + SparseIntMap sourceStates = nfaState.NfaStateSetScratch; + nfaState.NfaStateSet = sourceStates; + nfaState.NfaStateSetScratch = nextStates; + + // Sanity check: if there are any next states, then there must have been some source states. + Debug.Assert(nextStates.Count == 0 || sourceStates.Count > 0); + } + + /// Check if any underlying core state is unconditionally nullable. + public static bool IsNullable(ref CurrentState state) + { + SymbolicRegexBuilder builder = state.NfaState!.Builder; + foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) + { + if (builder.GetCoreState(nfaState.Key).Node.IsNullable) + { + return true; + } + } + + return false; + } + + /// Check if any underlying core state can be nullable. + public static bool CanBeNullable(ref CurrentState state) + { + SymbolicRegexBuilder builder = state.NfaState!.Builder; + foreach (ref KeyValuePair nfaState in CollectionsMarshal.AsSpan(state.NfaState!.NfaStateSet.Values)) + { + if (builder.GetCoreState(nfaState.Key).Node.CanBeNullable) + { + return true; + } + } + + return false; + } #endif + } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index cd6eadf78ae97b..0be82814332535 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Globalization; using System.Runtime.CompilerServices; using System.Threading; @@ -39,7 +40,6 @@ internal sealed class SymbolicRegexNode where TSet : IComparable, IE internal readonly 
TSet? _set; internal readonly SymbolicRegexNode? _left; internal readonly SymbolicRegexNode? _right; - internal readonly SymbolicRegexSet? _alts; /// /// Caches nullability of this node for any given context (0 <= context < ContextLimit) @@ -57,9 +57,8 @@ internal sealed class SymbolicRegexNode where TSet : IComparable, IE /// lower bound of a loop /// upper boubd of a loop /// singelton set - /// alternatives set of a disjunction or conjunction /// misc flags including laziness - private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexSet? alts, SymbolicRegexInfo info) + private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexInfo info) { _builder = builder; _kind = kind; @@ -68,61 +67,23 @@ private SymbolicRegexNode(SymbolicRegexBuilder builder, SymbolicRegexNodeK _lower = lower; _upper = upper; _set = set; - _alts = alts; _info = info; - _hashcode = ComputeHashCode(); _startSet = ComputeStartSet(); _nullabilityCache = info.StartsWithSomeAnchor && info.CanBeNullable ? new byte[CharKind.ContextLimit] : null; } - private bool _isInternalizedUnion; - /// Create a new node or retrieve one from the builder _nodeCache - private static SymbolicRegexNode Create(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexSet? alts, SymbolicRegexInfo info) + private static SymbolicRegexNode Create(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexNode? left, SymbolicRegexNode? right, int lower, int upper, TSet? set, SymbolicRegexInfo info) { - SymbolicRegexNode? 
node; - var key = (kind, left, right, lower, upper, set, alts, info); - if (!builder._nodeCache.TryGetValue(key, out node)) + var key = (kind, left, right, lower, upper, set, info); + if (!builder._nodeCache.TryGetValue(key, out SymbolicRegexNode? node)) { - // Do not internalize top level Or-nodes or else NFA mode will become ineffective - if (kind == SymbolicRegexNodeKind.Or) - { - node = new SymbolicRegexNode(builder, kind, left, right, lower, upper, set, alts, info); - return node; - } - - left = left == null || left._kind != SymbolicRegexNodeKind.Or || left._isInternalizedUnion ? left : Internalize(left); - right = right == null || right._kind != SymbolicRegexNodeKind.Or || right._isInternalizedUnion ? right : Internalize(right); - - node = new SymbolicRegexNode(builder, kind, left, right, lower, upper, set, alts, info); + node = new SymbolicRegexNode(builder, kind, left, right, lower, upper, set, info); builder._nodeCache[key] = node; } - - Debug.Assert(node is not null); return node; } - /// Internalize an Or-node that is not yet internalized - private static SymbolicRegexNode Internalize(SymbolicRegexNode node) - { - Debug.Assert(node._kind == SymbolicRegexNodeKind.Or && !node._isInternalizedUnion); - - (SymbolicRegexNodeKind, SymbolicRegexNode?, SymbolicRegexNode?, int, int, TSet?, SymbolicRegexSet?, SymbolicRegexInfo) node_key = - (SymbolicRegexNodeKind.Or, null, null, -1, -1, default(TSet), node._alts, node._info); - SymbolicRegexNode? 
node1; - if (node._builder._nodeCache.TryGetValue(node_key, out node1)) - { - Debug.Assert(node1 is not null && node1._isInternalizedUnion); - return node1; - } - else - { - node._isInternalizedUnion = true; - node._builder._nodeCache[node_key] = node; - return node; - } - } - /// True if this node is lazy internal bool IsLazy => _info.IsLazyLoop; @@ -172,7 +133,7 @@ private static bool IsHighPriorityNullableFor(SymbolicRegexNode node, uint case SymbolicRegexNodeKind.DisableBacktrackingSimulation: case SymbolicRegexNodeKind.Effect: - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(node._left is not null); //the left alternative must be high-priority-nullable //nullability of the right alternative does not matter @@ -213,15 +174,16 @@ internal bool CanBeNullable internal SymbolicRegexInfo _info; - private readonly int _hashcode; - - /// Converts a Concat or OrderdOr into an array, returns anything else in a singleton array. + /// + /// Converts a list of a given kind, e.g. Concat or Alternate, into an array, + /// returns anything else in a singleton array. + /// /// a list to insert the elements into, or null to return results in a new list /// kind of node to consider as the list builder public List> ToList(List>? 
list = null, SymbolicRegexNodeKind listKind = SymbolicRegexNodeKind.Concat) { - Debug.Assert(listKind == SymbolicRegexNodeKind.Concat || listKind == SymbolicRegexNodeKind.OrderedOr); + Debug.Assert(listKind == SymbolicRegexNodeKind.Concat || listKind == SymbolicRegexNodeKind.Alternate); list ??= new List>(); AppendToList(this, list, listKind); return list; @@ -301,22 +263,11 @@ bool WithCache(uint context) is_nullable = _left.IsNullableFor(context) && _right.IsNullableFor(context); break; - case SymbolicRegexNodeKind.Or: - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); - is_nullable = _alts.IsNullableFor(context); - break; - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); is_nullable = _left.IsNullableFor(context) || _right.IsNullableFor(context); break; - case SymbolicRegexNodeKind.Not: - Debug.Assert(_left is not null); - is_nullable = !_left.IsNullableFor(context); - break; - case SymbolicRegexNodeKind.BeginningAnchor: is_nullable = CharKind.Prev(context) == CharKind.BeginningEnd; break; @@ -464,16 +415,16 @@ public bool IsNothing #region called only once, in the constructor of SymbolicRegexBuilder internal static SymbolicRegexNode CreateFalse(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, null, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, SymbolicRegexInfo.Create()); internal static SymbolicRegexNode CreateTrue(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, SymbolicRegexInfo.Create()); internal static SymbolicRegexNode CreateFixedLengthMarker(SymbolicRegexBuilder builder, int length) => - 
Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); internal static SymbolicRegexNode CreateEpsilon(SymbolicRegexBuilder builder) => - Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.Epsilon, null, null, -1, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); internal static SymbolicRegexNode CreateBeginEndAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) { @@ -481,7 +432,7 @@ internal static SymbolicRegexNode CreateBeginEndAnchor(SymbolicRegexBuilde SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor); - return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true, + return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true, startsWithLineAnchor: kind is SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor)); @@ -490,13 +441,13 @@ SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or internal static SymbolicRegexNode CreateBoundaryAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind) { Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor); - return Create(builder, kind, null, 
null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true)); + return Create(builder, kind, null, null, -1, -1, default, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true)); } #endregion internal static SymbolicRegexNode CreateSingleton(SymbolicRegexBuilder builder, TSet set) => - Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create()); + Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, SymbolicRegexInfo.Create()); internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder builder, SymbolicRegexNode body, int lower, int upper, bool isLazy) { @@ -508,25 +459,7 @@ internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder bu Debug.Assert(body._left is not null); return CreateLoop(builder, body._left, 0, 1, isLazy || body.IsLazy); } - return Create(builder, SymbolicRegexNodeKind.Loop, body, null, lower, upper, default, null, SymbolicRegexInfo.Loop(body._info, lower, isLazy)); - } - - internal static SymbolicRegexNode Or(SymbolicRegexBuilder builder, params SymbolicRegexNode[] disjuncts) => - CreateCollection(builder, SymbolicRegexNodeKind.Or, SymbolicRegexSet.CreateMulti(builder, disjuncts, SymbolicRegexNodeKind.Or), SymbolicRegexInfo.Or(GetInfos(disjuncts))); - - internal static SymbolicRegexNode Or(SymbolicRegexBuilder builder, SymbolicRegexSet disjuncts) - { - Debug.Assert(disjuncts._kind == SymbolicRegexNodeKind.Or || disjuncts.IsEverything); - return CreateCollection(builder, SymbolicRegexNodeKind.Or, disjuncts, SymbolicRegexInfo.Or(GetInfos(disjuncts))); - } - - internal static SymbolicRegexNode And(SymbolicRegexBuilder builder, params SymbolicRegexNode[] conjuncts) => - CreateCollection(builder, SymbolicRegexNodeKind.And, SymbolicRegexSet.CreateMulti(builder, conjuncts, SymbolicRegexNodeKind.And), SymbolicRegexInfo.And(GetInfos(conjuncts))); - - internal static SymbolicRegexNode And(SymbolicRegexBuilder 
builder, SymbolicRegexSet conjuncts) - { - Debug.Assert(conjuncts.IsNothing || conjuncts._kind == SymbolicRegexNodeKind.And); - return CreateCollection(builder, SymbolicRegexNodeKind.And, conjuncts, SymbolicRegexInfo.And(GetInfos(conjuncts))); + return Create(builder, SymbolicRegexNodeKind.Loop, body, null, lower, upper, default, SymbolicRegexInfo.Loop(body._info, lower, isLazy)); } internal static SymbolicRegexNode CreateEffect(SymbolicRegexBuilder builder, SymbolicRegexNode node, SymbolicRegexNode effectNode) @@ -543,46 +476,17 @@ internal static SymbolicRegexNode CreateEffect(SymbolicRegexBuilder return CreateEffect(builder, node._left, CreateConcat(builder, effectNode, node._right)); } - return Create(builder, SymbolicRegexNodeKind.Effect, node, effectNode, -1, -1, default, null, SymbolicRegexInfo.Effect(node._info)); + return Create(builder, SymbolicRegexNodeKind.Effect, node, effectNode, -1, -1, default, SymbolicRegexInfo.Effect(node._info)); } internal static SymbolicRegexNode CreateCaptureStart(SymbolicRegexBuilder builder, int captureNum) => - Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.CaptureStart, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); internal static SymbolicRegexNode CreateCaptureEnd(SymbolicRegexBuilder builder, int captureNum) => - Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); + Create(builder, SymbolicRegexNodeKind.CaptureEnd, null, null, captureNum, -1, default, SymbolicRegexInfo.Create(isAlwaysNullable: true, isHighPriorityNullable: true)); internal static SymbolicRegexNode CreateDisableBacktrackingSimulation(SymbolicRegexBuilder builder, SymbolicRegexNode child) => - 
Create(builder, SymbolicRegexNodeKind.DisableBacktrackingSimulation, child, null, -1, -1, default, null, child._info); - - private static SymbolicRegexNode CreateCollection(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, SymbolicRegexSet alts, SymbolicRegexInfo info) => - alts.IsNothing ? builder._nothing : - alts.IsEverything ? builder._anyStar : - alts.IsSingleton ? alts.GetSingletonElement() : - Create(builder, kind, null, null, -1, -1, default, alts, info); - - private static SymbolicRegexInfo[] GetInfos(SymbolicRegexNode[] nodes) - { - var infos = new SymbolicRegexInfo[nodes.Length]; - for (int i = 0; i < nodes.Length; i++) - { - infos[i] = nodes[i]._info; - } - return infos; - } - - private static SymbolicRegexInfo[] GetInfos(SymbolicRegexSet nodes) - { - var infos = new SymbolicRegexInfo[nodes.Count]; - int i = 0; - foreach (SymbolicRegexNode node in nodes) - { - Debug.Assert(i < nodes.Count); - infos[i++] = node._info; - } - Debug.Assert(i == nodes.Count); - return infos; - } + Create(builder, SymbolicRegexNodeKind.DisableBacktrackingSimulation, child, null, -1, -1, default, child._info); /// Make a concatenation of the supplied regex nodes. internal static SymbolicRegexNode CreateConcat(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right) @@ -605,12 +509,12 @@ internal static SymbolicRegexNode CreateConcat(SymbolicRegexBuilder return CreateEffect(builder, CreateConcat(builder, left._left, right), left._right); } - return Create(builder, SymbolicRegexNodeKind.Concat, left, right, -1, -1, default, null, SymbolicRegexInfo.Concat(left._info, right._info)); + return Create(builder, SymbolicRegexNodeKind.Concat, left, right, -1, -1, default, SymbolicRegexInfo.Concat(left._info, right._info)); } /// - /// Make an ordered or of given regexes, eliminate nothing regexes and treat .* as consuming element. - /// Keep the or flat, assuming both right and left are flat. 
+ /// Make an alternation of given regexes, eliminate nothing regexes and treat .* as consuming element. + /// Keep the alternation flat, assuming both right and left are flat. /// Apply subsumption/combining optimizations, such that e.g. a?b|b will be simplified to a?b and b|a?b will be combined to a??b /// /// @@ -624,7 +528,7 @@ internal static SymbolicRegexNode CreateConcat(SymbolicRegexBuilder /// whether to skip deduplication /// if true then simplification rules succeeding when the right hand side subsumes the left hand side are tried first /// - internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, bool deduplicated = false, bool hintRightLikelySubsumes = false) + internal static SymbolicRegexNode CreateAlternate(SymbolicRegexBuilder builder, SymbolicRegexNode left, SymbolicRegexNode right, bool deduplicated = false, bool hintRightLikelySubsumes = false) { if (left.IsAnyStar || right == builder._nothing || left == right || (left.IsNullable && right.IsEpsilon)) return left; @@ -633,33 +537,33 @@ internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder bui // Handle cases where right is an alternation or not uniformly. If right is R|S then the head is R and the // tail is S. If right is not an alternation then the head is right and the tail is nothing. - SymbolicRegexNode head = right._kind == SymbolicRegexNodeKind.OrderedOr ? right._left! : right; - SymbolicRegexNode tail = right._kind == SymbolicRegexNodeKind.OrderedOr ? right._right! : builder._nothing; + SymbolicRegexNode head = right._kind == SymbolicRegexNodeKind.Alternate ? right._left! : right; + SymbolicRegexNode tail = right._kind == SymbolicRegexNodeKind.Alternate ? right._right! : builder._nothing; // Simplify away right side if left side subsumes it. For example X?Y|Y|Z would simplify to just X?Y|Z. 
if (!hintRightLikelySubsumes && left.Subsumes(head)) - return OrderedOr(builder, left, tail); + return CreateAlternate(builder, left, tail); // Simplify by folding right side into left side if right side subsumes the left side. For example Y|X?Y|Z // would simplify to X??Y|Z. if (head.Subsumes(left) && TryFoldAlternation(left, head, out SymbolicRegexNode? result)) - return OrderedOr(builder, result, tail); + return CreateAlternate(builder, result, tail); // This is a repeat of a rule above, but for the case when the hint tells us to try reverse subsumption first. if (hintRightLikelySubsumes && left.Subsumes(head)) - return OrderedOr(builder, left, tail); + return CreateAlternate(builder, left, tail); - // If left is not an Or, try to avoid allocation by checking if deduplication is necessary - if (!deduplicated && left._kind != SymbolicRegexNodeKind.OrderedOr) + // If left is not an Alternate, try to avoid allocation by checking if deduplication is necessary + if (!deduplicated && left._kind != SymbolicRegexNodeKind.Alternate) { SymbolicRegexNode current = right; // Initially assume there are no duplicates deduplicated = true; - while (current._kind == SymbolicRegexNodeKind.OrderedOr) + while (current._kind == SymbolicRegexNodeKind.Alternate) { Debug.Assert(current._left is not null && current._right is not null); - // All Ors are supposed to be in a right associative normal form - Debug.Assert(current._left._kind != SymbolicRegexNodeKind.OrderedOr); + // All Alternates are supposed to be in a right associative normal form + Debug.Assert(current._left._kind != SymbolicRegexNodeKind.Alternate); if (current._left == left) { // Duplicate found, mark that and exit early @@ -673,12 +577,12 @@ internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder bui deduplicated = (current != left); } - if (!deduplicated || left._kind == SymbolicRegexNodeKind.OrderedOr) + if (!deduplicated || left._kind == SymbolicRegexNodeKind.Alternate) { // If the left side was an or, then 
it has to be flattened, gather the elements from both sides - List> elems = left.ToList(listKind: SymbolicRegexNodeKind.OrderedOr); + List> elems = left.ToList(listKind: SymbolicRegexNodeKind.Alternate); int firstRightElem = elems.Count; - right.ToList(elems, listKind: SymbolicRegexNodeKind.OrderedOr); + right.ToList(elems, listKind: SymbolicRegexNodeKind.Alternate); // Eliminate any duplicate elements, keeping the leftmost element HashSet> seenElems = new(); @@ -704,7 +608,7 @@ internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder bui SymbolicRegexNode or = builder._nothing; for (int i = elems.Count - 1; i >= 0; i--) { - or = OrderedOr(builder, elems[i], or, deduplicated: true); + or = CreateAlternate(builder, elems[i], or, deduplicated: true); } return or; } @@ -713,16 +617,16 @@ internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder bui SymbolicRegexNode or = right; for (int i = firstRightElem - 1; i >= 0; i--) { - or = OrderedOr(builder, elems[i], or, deduplicated: true); + or = CreateAlternate(builder, elems[i], or, deduplicated: true); } return or; } } - Debug.Assert(left._kind != SymbolicRegexNodeKind.OrderedOr); + Debug.Assert(left._kind != SymbolicRegexNodeKind.Alternate); Debug.Assert(deduplicated); - return Create(builder, SymbolicRegexNodeKind.OrderedOr, left, right, -1, -1, default, null, SymbolicRegexInfo.Alternate(left._info, right._info)); + return Create(builder, SymbolicRegexNodeKind.Alternate, left, right, -1, -1, default, SymbolicRegexInfo.Alternate(left._info, right._info)); } /// @@ -1011,96 +915,6 @@ static bool TrySplitConcatSubsumption(SymbolicRegexNode left, SymbolicRege } } - internal static SymbolicRegexNode Not(SymbolicRegexBuilder builder, SymbolicRegexNode root) - { - // Instead of just creating a negated root node - // Convert ~root to Negation Normal Form (NNF) by using deMorgan's laws and push ~ to the leaves - // This may avoid rather large overhead (such case was discovered with unit test PasswordSearchDual) 
- // Do this transformation in-line without recursion, to avoid any chance of deep recursion - // OBSERVE: NNF[node] represents the Negation Normal Form of ~node - Dictionary, SymbolicRegexNode> NNF = new(); - Stack<(SymbolicRegexNode, bool)> todo = new(); - todo.Push((root, false)); - while (todo.Count > 0) - { - (SymbolicRegexNode, bool) top = todo.Pop(); - bool secondTimePushed = top.Item2; - SymbolicRegexNode node = top.Item1; - if (secondTimePushed) - { - Debug.Assert((node._kind == SymbolicRegexNodeKind.Or || node._kind == SymbolicRegexNodeKind.And) && node._alts is not null); - // Here all members of _alts have been processed - List> alts_nnf = new(); - foreach (SymbolicRegexNode elem in node._alts) - { - alts_nnf.Add(NNF[elem]); - } - // Using deMorgan's laws, flip the kind: Or becomes And, And becomes Or - SymbolicRegexNode node_nnf = node._kind == SymbolicRegexNodeKind.Or ? And(builder, alts_nnf.ToArray()) : Or(builder, alts_nnf.ToArray()); - NNF[node] = node_nnf; - } - else - { - switch (node._kind) - { - case SymbolicRegexNodeKind.Not: - Debug.Assert(node._left is not null); - // Here we assume that top._left is already in NNF, double negation is cancelled out - NNF[node] = node._left; - break; - - case SymbolicRegexNodeKind.Or or SymbolicRegexNodeKind.And: - Debug.Assert(node._alts is not null); - // Push the node for the second time - todo.Push((node, true)); - // Compute the negation normal form of all the members - // Their computation is actually the same independent from being inside an 'Or' or 'And' node - foreach (SymbolicRegexNode elem in node._alts) - { - todo.Push((elem, false)); - } - break; - - case SymbolicRegexNodeKind.Epsilon: - // ~() = .+ - NNF[node] = SymbolicRegexNode.CreateLoop(builder, builder._anyChar, 1, int.MaxValue, isLazy: false); - break; - - case SymbolicRegexNodeKind.Singleton: - Debug.Assert(node._set is not null); - // ~[] = .* - if (node.IsNothing) - { - NNF[node] = builder._anyStar; - break; - } - goto default; - - case 
SymbolicRegexNodeKind.Loop: - Debug.Assert(node._left is not null); - // ~(.*) = [] and ~(.+) = () - if (node.IsAnyStar) - { - NNF[node] = builder._nothing; - break; - } - else if (node.IsPlus && node._left.IsAnyChar) - { - NNF[node] = builder.Epsilon; - break; - } - goto default; - - default: - // In all other cases construct the complement - NNF[node] = Create(builder, SymbolicRegexNodeKind.Not, node, null, -1, -1, default, null, SymbolicRegexInfo.Not(node._info)); - break; - } - } - } - return NNF[root]; - } - /// /// Returns the fixed matching length of the regex or -1 if the regex does not have a fixed matching length. /// @@ -1168,11 +982,7 @@ public int GetFixedLength() break; } - case SymbolicRegexNodeKind.Or: - Debug.Assert(_alts is not null); - return _alts.GetFixedLength(); - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); int length = _left.GetFixedLength(); @@ -1195,126 +1005,59 @@ public int GetFixedLength() return -1; } -#if DEBUG - private TransitionRegex? _transitionRegex; /// - /// Computes the symbolic derivative as a transition regex. - /// Transitions are in the tree left to right in the order the backtracking engine would explore them. + /// Insert nodes to mark paths in the regex that correspond + /// to matches of fixed length. For example, for abar|bar two markers would be added abar(4)|bar(3). /// - internal TransitionRegex CreateDerivative() + /// + /// This function will rebuild concatenations because it pushes the FixedLengthMarker into the rightmost element. + /// Because of this, this function should not be called on every character.
+ /// + /// accumulator used in the recursion for lengths of paths + /// the node with fixed length markers added + public SymbolicRegexNode AddFixedLengthMarkers(int lengthSoFar = 0) { - if (_transitionRegex is not null) - { - return _transitionRegex; - } - - if (IsNothing || IsEpsilon) - { - _transitionRegex = TransitionRegex.Leaf(_builder._nothing); - return _transitionRegex; - } - - if (IsAnyStar || IsAnyPlus) - { - _transitionRegex = TransitionRegex.Leaf(_builder._anyStar); - return _transitionRegex; - } - if (!StackHelper.TryEnsureSufficientExecutionStack()) { - return StackHelper.CallOnEmptyStack(CreateDerivative); + return this; } switch (_kind) { - case SymbolicRegexNodeKind.Singleton: - Debug.Assert(_set is not null); - _transitionRegex = TransitionRegex.Conditional(_set, TransitionRegex.Leaf(_builder.Epsilon), TransitionRegex.Leaf(_builder._nothing)); - break; + case SymbolicRegexNodeKind.Alternate: + Debug.Assert(_left is not null && _right is not null); + // For an Alternate attempt to add markers separately for each element + return CreateAlternate(_builder, + _left.AddFixedLengthMarkers(lengthSoFar), + _right.AddFixedLengthMarkers(lengthSoFar), deduplicated: true); case SymbolicRegexNodeKind.Concat: Debug.Assert(_left is not null && _right is not null); - TransitionRegex mainTransition = _left.CreateDerivative().Concat(_right); - - if (!_left.CanBeNullable) - { - // If _left is never nullable - _transitionRegex = mainTransition; - } - else if (_left.IsNullable) - { - // If _left is unconditionally nullable - _transitionRegex = TransitionRegex.Union(mainTransition, _right.CreateDerivative()); - } - else + // For a concat if the left side has a fixed length then accumulate that to the right side + int leftLength = _left.GetFixedLength(); + if (leftLength >= 0) { - // The left side contains anchors and can be nullable in some context - // Extract the nullability as the lookaround condition - SymbolicRegexNode leftNullabilityTest =
_left.ExtractNullabilityTest(); - _transitionRegex = TransitionRegex.Lookaround(leftNullabilityTest, TransitionRegex.Union(mainTransition, _right.CreateDerivative()), mainTransition); + return CreateConcat(_builder, _left, _right.AddFixedLengthMarkers(lengthSoFar + leftLength)); } - break; - - case SymbolicRegexNodeKind.Loop: - // d(R*) = d(R+) = d(R)R* - Debug.Assert(_left is not null); - Debug.Assert(_upper > 0); - TransitionRegex step = _left.CreateDerivative(); - - if (IsStar || IsPlus) - { - _transitionRegex = step.Concat(_builder.CreateLoop(_left, IsLazy)); - } - else - { - int newupper = _upper == int.MaxValue ? int.MaxValue : _upper - 1; - int newlower = _lower == 0 ? 0 : _lower - 1; - SymbolicRegexNode rest = _builder.CreateLoop(_left, IsLazy, newlower, newupper); - _transitionRegex = step.Concat(rest); - } - break; - - case SymbolicRegexNodeKind.Or: - Debug.Assert(_alts is not null); - _transitionRegex = TransitionRegex.Leaf(_builder._nothing); - foreach (SymbolicRegexNode elem in _alts) - { - _transitionRegex = TransitionRegex.Union(_transitionRegex, elem.CreateDerivative()); - } - break; - - case SymbolicRegexNodeKind.OrderedOr: - Debug.Assert(_left is not null && _right is not null); - _transitionRegex = TransitionRegex.Union(_left.CreateDerivative(), _right.CreateDerivative()); - break; - - case SymbolicRegexNodeKind.DisableBacktrackingSimulation: - Debug.Assert(_left is not null); - // The derivative to TransitionRegex does not support backtracking simulation, so ignore this node - _transitionRegex = _left.CreateDerivative(); - break; - - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); - _transitionRegex = TransitionRegex.Leaf(_builder._anyStar); - foreach (SymbolicRegexNode elem in _alts) + // If the right side is always zero length, then just recurse to the left side + int rightLength = _right.GetFixedLength(); + if (rightLength == 0) { - _transitionRegex = TransitionRegex.Intersect(_transitionRegex, elem.CreateDerivative()); + 
return CreateConcat(_builder, _left.AddFixedLengthMarkers(lengthSoFar), _right); } break; - case SymbolicRegexNodeKind.Not: - Debug.Assert(_left is not null); - _transitionRegex = _left.CreateDerivative().Complement(); - break; - - default: - _transitionRegex = TransitionRegex.Leaf(_builder._nothing); - break; + case SymbolicRegexNodeKind.FixedLengthMarker: + Debug.Assert(_lower == lengthSoFar); + return this; } - return _transitionRegex; + + // For all other nodes defer to GetFixedLength to figure out if there is a fixed length and add the marker + // if there is one. + int thisLength = GetFixedLength(); + return thisLength < 0 ? this : + CreateConcat(_builder, this, CreateFixedLengthMarker(_builder, lengthSoFar + thisLength)); } -#endif /// /// Create a derivative ( and ) and then strip @@ -1391,7 +1134,7 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) switch (_kind) { - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); // The left alternative, when nullable, has priority over the right alternative // Otherwise the left alternative is still active and the right alternative is pruned @@ -1399,7 +1142,7 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) // Thus, taking the higher-priority branch in backtracking that is known to lead to a match // at which point the other branches become irrelevant and must no longer be used. prunedNode = _left.IsNullableFor(context) ? 
_left.PruneLowerPriorityThanNullability(context) : - OrderedOr(_builder, _left, _right.PruneLowerPriorityThanNullability(context), deduplicated: true); + CreateAlternate(_builder, _left, _right.PruneLowerPriorityThanNullability(context), deduplicated: true); break; case SymbolicRegexNodeKind.Concat: @@ -1412,10 +1155,10 @@ private SymbolicRegexNode PruneLowerPriorityThanNullability(uint context) //--- //in a concatenation XZ where X is not an alternation, both X and Z are pruned //e.g. a{0,5}?b{0,5}? reduces to () - prunedNode = _left._kind == SymbolicRegexNodeKind.OrderedOr ? + prunedNode = _left._kind == SymbolicRegexNodeKind.Alternate ? (_left._left!.IsNullableFor(context) ? CreateConcat(_builder, _left._left, _right).PruneLowerPriorityThanNullability(context) : - OrderedOr(_builder, CreateConcat(_builder, _left._left, _right), CreateConcat(_builder, _left._right!, _right).PruneLowerPriorityThanNullability(context))) : + CreateAlternate(_builder, CreateConcat(_builder, _left._left, _right), CreateConcat(_builder, _left._right!, _right).PruneLowerPriorityThanNullability(context))) : CreateConcat(_builder, _left.PruneLowerPriorityThanNullability(context), _right.PruneLowerPriorityThanNullability(context)); break; @@ -1523,8 +1266,8 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) // In the second case backtracking would try to continue to follow (ab)* after reading b // This backtracking semantics is effectively being recorded into the order of the alternatives derivative = _left.IsHighPriorityNullableFor(context) ? 
- OrderedOr(_builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) : - OrderedOr(_builder, leftDerivative, rightDerivative); + CreateAlternate(_builder, rightDerivative, leftDerivative, hintRightLikelySubsumes: true) : + CreateAlternate(_builder, leftDerivative, rightDerivative); } break; } @@ -1553,10 +1296,10 @@ private SymbolicRegexNode CreateDerivative(TSet elem, uint context) break; } - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); - derivative = OrderedOr(_builder, _left.CreateDerivative(elem, context), _right.CreateDerivative(elem, context)); + derivative = CreateAlternate(_builder, _left.CreateDerivative(elem, context), _right.CreateDerivative(elem, context)); break; } @@ -1605,14 +1348,14 @@ internal SymbolicRegexNode StripEffects() Debug.Assert(_left._info.ContainsEffect && !_right._info.ContainsEffect); return _builder.CreateConcat(_left.StripEffects(), _right); - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); // This iterative handling of nested alternations is important to avoid quadratic work in deduplicating // the elements. We don't want to omit deduplication here, since he stripping may make nodes equal. 
- List> elems = ToList(listKind: SymbolicRegexNodeKind.OrderedOr); + List> elems = ToList(listKind: SymbolicRegexNodeKind.Alternate); for (int i = 0; i < elems.Count; i++) elems[i] = elems[i].StripEffects(); - return _builder.OrderedOr(elems); + return _builder.Alternate(elems); case SymbolicRegexNodeKind.DisableBacktrackingSimulation: Debug.Assert(_left is not null); @@ -1693,7 +1436,7 @@ internal void StripAndMapEffects(uint context, List<(SymbolicRegexNode, De break; } - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); _left.StripAndMapEffects(context, alternativesAndEffects, currentEffects); _right.StripAndMapEffects(context, alternativesAndEffects, currentEffects); @@ -1774,7 +1517,7 @@ internal void ApplyEffects(Action apply, uint cont } break; - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); if (_left.IsNullableFor(context)) { @@ -1801,200 +1544,10 @@ internal void ApplyEffects(Action apply, uint cont Debug.Assert(_left is not null); _left.ApplyEffects(apply, context, arg); break; - - case SymbolicRegexNodeKind.Or: - Debug.Assert(_alts is not null); - foreach (SymbolicRegexNode elem in _alts) - { - if (elem.IsNullableFor(context)) - elem.ApplyEffects(apply, context, arg); - } - break; - - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); - foreach (SymbolicRegexNode elem in _alts) - { - Debug.Assert(elem.IsNullableFor(context)); - elem.ApplyEffects(apply, context, arg); - } - break; - } - } - -#if DEBUG - /// - /// Computes the closure of CreateDerivative, by exploring all the leaves - /// of the transition regex until no more new leaves are found. - /// Converts the resulting transition system into a symbolic NFA. - /// If the exploration remains incomplete due to the given state bound - /// being reached then the InComplete property of the constructed NFA is true. 
- /// - internal SymbolicNFA Explore(int bound) => SymbolicNFA.Explore(this, bound); - - /// Extracts the nullability test as a Boolean combination of anchors - public SymbolicRegexNode ExtractNullabilityTest() - { - if (IsNullable) - { - return _builder._anyStar; - } - - if (!CanBeNullable) - { - return _builder._nothing; - } - - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(ExtractNullabilityTest); - } - - switch (_kind) - { - case SymbolicRegexNodeKind.BeginningAnchor: - case SymbolicRegexNodeKind.EndAnchor: - case SymbolicRegexNodeKind.BOLAnchor: - case SymbolicRegexNodeKind.EOLAnchor: - case SymbolicRegexNodeKind.BoundaryAnchor: - case SymbolicRegexNodeKind.NonBoundaryAnchor: - case SymbolicRegexNodeKind.EndAnchorZ: - case SymbolicRegexNodeKind.EndAnchorZReverse: - return this; - case SymbolicRegexNodeKind.Concat: - Debug.Assert(_left is not null && _right is not null); - return _builder.And(_left.ExtractNullabilityTest(), _right.ExtractNullabilityTest()); - case SymbolicRegexNodeKind.Or: - Debug.Assert(_alts is not null); - SymbolicRegexNode disjunction = _builder._nothing; - foreach (SymbolicRegexNode elem in _alts) - { - disjunction = _builder.Or(disjunction, elem.ExtractNullabilityTest()); - } - return disjunction; - case SymbolicRegexNodeKind.OrderedOr: - Debug.Assert(_left is not null && _right is not null); - return OrderedOr(_builder, _left.ExtractNullabilityTest(), _right.ExtractNullabilityTest()); - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); - SymbolicRegexNode conjunction = _builder._anyStar; - foreach (SymbolicRegexNode elem in _alts) - { - conjunction = _builder.And(conjunction, elem.ExtractNullabilityTest()); - } - return conjunction; - case SymbolicRegexNodeKind.Loop: - Debug.Assert(_left is not null); - return _left.ExtractNullabilityTest(); - default: - // All remaining cases could not be nullable or were trivially nullable - // Singleton cannot be nullable and Epsilon 
and FixedLengthMarker are trivially nullable - Debug.Assert(_kind == SymbolicRegexNodeKind.Not && _left is not null); - return _builder.Not(_left.ExtractNullabilityTest()); - } - } -#endif - - public override int GetHashCode() - { - return _hashcode; - } - - private int ComputeHashCode() - { - switch (_kind) - { - case SymbolicRegexNodeKind.EndAnchor: - case SymbolicRegexNodeKind.BeginningAnchor: - case SymbolicRegexNodeKind.BOLAnchor: - case SymbolicRegexNodeKind.EOLAnchor: - case SymbolicRegexNodeKind.Epsilon: - case SymbolicRegexNodeKind.BoundaryAnchor: - case SymbolicRegexNodeKind.NonBoundaryAnchor: - case SymbolicRegexNodeKind.EndAnchorZ: - case SymbolicRegexNodeKind.EndAnchorZReverse: - return HashCode.Combine(_kind, _info); - - case SymbolicRegexNodeKind.FixedLengthMarker: - case SymbolicRegexNodeKind.CaptureStart: - case SymbolicRegexNodeKind.CaptureEnd: - return HashCode.Combine(_kind, _lower); - - case SymbolicRegexNodeKind.Loop: - return HashCode.Combine(_kind, _left, _lower, _upper, _info); - - case SymbolicRegexNodeKind.Or or SymbolicRegexNodeKind.And: - return HashCode.Combine(_kind, _alts, _info); - - case SymbolicRegexNodeKind.Concat: - case SymbolicRegexNodeKind.OrderedOr: - case SymbolicRegexNodeKind.Effect: - return HashCode.Combine(_left, _right, _info); - - case SymbolicRegexNodeKind.DisableBacktrackingSimulation: - return HashCode.Combine(_left, _info); - - case SymbolicRegexNodeKind.Singleton: - return HashCode.Combine(_kind, _set); - - default: - Debug.Assert(_kind == SymbolicRegexNodeKind.Not); - return HashCode.Combine(_kind, _left, _info); - }; - } - - public override bool Equals([NotNullWhen(true)] object? 
obj) - { - if (obj is not SymbolicRegexNode that) - { - return false; - } - - if (this == that) - { - return true; - } - - if (_kind != that._kind) - { - return false; - } - - if (_kind == SymbolicRegexNodeKind.Or) - { - if (_isInternalizedUnion && that._isInternalizedUnion) - { - // Internalized nodes that are not identical are not equal - return false; - } - - // Check equality of the sets of regexes - Debug.Assert(_alts is not null && that._alts is not null); - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(_alts.Equals, that._alts); - } - return _alts.Equals(that._alts); } - - return false; } #if DEBUG - private void ToStringForLoop(StringBuilder sb) - { - if (_kind == SymbolicRegexNodeKind.Singleton) - { - ToStringHelper(sb); - } - else - { - sb.Append('('); - ToStringHelper(sb); - sb.Append(')'); - } - } - public override string ToString() { StringBuilder sb = new(); @@ -2030,7 +1583,12 @@ internal void ToStringHelper(StringBuilder sb) return; case SymbolicRegexNodeKind.Epsilon: + sb.Append('\u03B5'); + return; + case SymbolicRegexNodeKind.FixedLengthMarker: + sb.Append('\u02FF'); + AppendNumberSubscript(sb, _lower); return; case SymbolicRegexNodeKind.BoundaryAnchor: @@ -2049,13 +1607,7 @@ internal void ToStringHelper(StringBuilder sb) sb.Append("\\a"); return; - case SymbolicRegexNodeKind.Or: - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); - _alts.ToStringHelper(sb); - return; - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); sb.Append('('); _left.ToStringHelper(sb); @@ -2068,14 +1620,10 @@ internal void ToStringHelper(StringBuilder sb) Debug.Assert(_left is not null && _right is not null); //mark left associative case with parenthesis if (_left.Kind == SymbolicRegexNodeKind.Concat) - { sb.Append('('); - } _left.ToStringHelper(sb); if (_left.Kind == SymbolicRegexNodeKind.Concat) - { sb.Append(')'); - } 
_right.ToStringHelper(sb); return; @@ -2092,7 +1640,7 @@ internal void ToStringHelper(StringBuilder sb) } else if (_lower == 0 && _upper == 1) { - _left.ToStringForLoop(sb); + ToStringGrouped(_left, sb); sb.Append('?'); if (IsLazy) { @@ -2102,7 +1650,7 @@ internal void ToStringHelper(StringBuilder sb) } else if (IsStar) { - _left.ToStringForLoop(sb); + ToStringGrouped(_left, sb); sb.Append('*'); if (IsLazy) { @@ -2111,7 +1659,7 @@ internal void ToStringHelper(StringBuilder sb) } else if (IsPlus) { - _left.ToStringForLoop(sb); + ToStringGrouped(_left, sb); sb.Append('+'); if (IsLazy) { @@ -2124,7 +1672,7 @@ internal void ToStringHelper(StringBuilder sb) } else { - _left.ToStringForLoop(sb); + ToStringGrouped(_left, sb); sb.Append('{'); sb.Append(_lower); if (!IsBoundedLoop) @@ -2159,33 +1707,13 @@ internal void ToStringHelper(StringBuilder sb) sb.Append('\u230A'); // Left floor // Include group number as a subscript Debug.Assert(_lower >= 0); - foreach (char c in _lower.ToString()) - { - sb.Append((char)('\u2080' + (c - '0'))); - } + AppendNumberSubscript(sb, _lower); return; case SymbolicRegexNodeKind.CaptureEnd: // Include group number as a superscript Debug.Assert(_lower >= 0); - foreach (char c in _lower.ToString()) - { - switch (c) - { - case '1': - sb.Append('\u00B9'); - break; - case '2': - sb.Append('\u00B2'); - break; - case '3': - sb.Append('\u00B3'); - break; - default: - sb.Append((char)('\u2070' + (c - '0'))); - break; - } - } + AppendNumberSuperscript(sb, _lower); sb.Append('\u2309'); // Right ceiling return; @@ -2195,14 +1723,56 @@ internal void ToStringHelper(StringBuilder sb) return; default: - // Using the operator ~ for complement - Debug.Assert(_kind == SymbolicRegexNodeKind.Not); - Debug.Assert(_left is not null); - sb.Append("~("); - _left.ToStringHelper(sb); - sb.Append(')'); + Debug.Fail($"{nameof(ToStringHelper)}:{_kind}"); return; } + + static void ToStringGrouped(SymbolicRegexNode node, StringBuilder sb) + { + switch (node._kind) + { + 
case SymbolicRegexNodeKind.Singleton: + node.ToStringHelper(sb); + break; + + default: + sb.Append('('); + node.ToStringHelper(sb); + sb.Append(')'); + break; + + } + } + + static void AppendNumberSubscript(StringBuilder sb, int value) + { + foreach (char c in value.ToString(CultureInfo.InvariantCulture)) + { + sb.Append((char)('\u2080' + (c - '0'))); + } + } + + static void AppendNumberSuperscript(StringBuilder sb, int value) + { + foreach (char c in value.ToString(CultureInfo.InvariantCulture)) + { + switch (c) + { + case '1': + sb.Append('\u00B9'); + break; + case '2': + sb.Append('\u00B2'); + break; + case '3': + sb.Append('\u00B3'); + break; + default: + sb.Append((char)('\u2070' + (c - '0'))); + break; + } + } + } } #endif @@ -2252,16 +1822,7 @@ private void CollectSets(HashSet sets) _left.CollectSets(sets); return; - case SymbolicRegexNodeKind.Or: - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); - foreach (SymbolicRegexNode sr in _alts) - { - sr.CollectSets(sets); - } - return; - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); _left.CollectSets(sets); _right.CollectSets(sets); @@ -2284,11 +1845,6 @@ private void CollectSets(HashSet sets) _left.CollectSets(sets); return; - case SymbolicRegexNodeKind.Not: - Debug.Assert(_left is not null); - _left.CollectSets(sets); - return; - case SymbolicRegexNodeKind.NonBoundaryAnchor: case SymbolicRegexNodeKind.BoundaryAnchor: sets.Add(_builder._wordLetterForBoundariesSet); @@ -2342,21 +1898,9 @@ public SymbolicRegexNode Reverse() return rev; } - case SymbolicRegexNodeKind.Or: - Debug.Assert(_alts is not null); - return _builder.Or(_alts.Reverse()); - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); - return OrderedOr(_builder, _left.Reverse(), _right.Reverse()); - - case SymbolicRegexNodeKind.And: - Debug.Assert(_alts is not null); 
- return _builder.And(_alts.Reverse()); - - case SymbolicRegexNodeKind.Not: - Debug.Assert(_left is not null); - return _builder.Not(_left.Reverse()); + return CreateAlternate(_builder, _left.Reverse(), _right.Reverse()); case SymbolicRegexNodeKind.FixedLengthMarker: // Fixed length markers are omitted in reverse @@ -2415,11 +1959,7 @@ internal bool StartsWithLoop(int upperBoundLowestValue = 1) Debug.Assert(_left is not null && _right is not null); return _left.StartsWithLoop(upperBoundLowestValue) || (_left.IsNullable && _right.StartsWithLoop(upperBoundLowestValue)); - case SymbolicRegexNodeKind.Or: - Debug.Assert(_alts is not null); - return _alts.StartsWithLoop(upperBoundLowestValue); - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: Debug.Assert(_left is not null && _right is not null); return _left.StartsWithLoop(upperBoundLowestValue) || _right.StartsWithLoop(upperBoundLowestValue); @@ -2471,41 +2011,19 @@ private TSet ComputeStartSet() return startSet; } - case SymbolicRegexNodeKind.Or: - { - Debug.Assert(_alts is not null); - TSet startSet = _builder._solver.Empty; - foreach (SymbolicRegexNode alt in _alts) - { - startSet = _builder._solver.Or(startSet, alt._startSet); - } - return startSet; - } - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); return _builder._solver.Or(_left._startSet, _right._startSet); } - case SymbolicRegexNodeKind.And: - { - Debug.Assert(_alts is not null); - TSet startSet = _builder._solver.Full; - foreach (SymbolicRegexNode alt in _alts) - { - startSet = _builder._solver.And(startSet, alt._startSet); - } - return startSet; - } - case SymbolicRegexNodeKind.DisableBacktrackingSimulation: case SymbolicRegexNodeKind.Effect: Debug.Assert(_left is not null); return _left._startSet; default: - Debug.Assert(_kind == SymbolicRegexNodeKind.Not); + Debug.Fail($"{nameof(ComputeStartSet)}:{_kind}"); return 
_builder._solver.Full; } } @@ -2577,20 +2095,7 @@ internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bo CreateConcat(_builder, left1, right1); } - case SymbolicRegexNodeKind.Or: - { - Debug.Assert(_alts != null); - var elements = new SymbolicRegexNode[_alts.Count]; - int i = 0; - foreach (SymbolicRegexNode alt in _alts) - { - elements[i++] = alt.PruneAnchors(prevKind, contWithWL, contWithNWL); - } - Debug.Assert(i == elements.Length); - return Or(_builder, elements); - } - - case SymbolicRegexNodeKind.OrderedOr: + case SymbolicRegexNodeKind.Alternate: { Debug.Assert(_left is not null && _right is not null); SymbolicRegexNode left1 = _left.PruneAnchors(prevKind, contWithWL, contWithNWL); @@ -2599,7 +2104,7 @@ internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bo Debug.Assert(left1 is not null && right1 is not null); return left1 == _left && right1 == _right ? this : - OrderedOr(_builder, left1, right1); + CreateAlternate(_builder, left1, right1); } case SymbolicRegexNodeKind.Effect: @@ -2622,5 +2127,90 @@ internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bo return this; } } + + /// + /// Resolve the preferred fixed length when accepting a match for this node. For example, a pattern .*?(dada$(4)|ada(3)) + /// after "dada" would be in a state $(4)|(3)|... and this function would return 4 if the match is at the end of input + /// and 3 otherwise.
+ /// + /// the context for deciding nullability + /// the fixed length of any match ending in this state, if any, or -1 otherwise + internal int ResolveFixedLength(uint context) + { + Debug.Assert(IsNullableFor(context)); + + // Guard against stack overflow due to deep recursion + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return StackHelper.CallOnEmptyStack(ResolveFixedLength, context); + } + + switch (_kind) + { + case SymbolicRegexNodeKind.FixedLengthMarker: + return _lower; + + case SymbolicRegexNodeKind.Alternate: + Debug.Assert(_left is not null && _right is not null); + if (_left.IsNullableFor(context)) + { + // Left is nullable, so the match is from the left + return _left.ResolveFixedLength(context); + } + else + { + // Otherwise right must be nullable and thus the relevant match + Debug.Assert(_right.IsNullableFor(context)); + return _right.ResolveFixedLength(context); + } + + case SymbolicRegexNodeKind.Concat: + Debug.Assert(_left is not null && _right is not null); + int leftLength = _left.ResolveFixedLength(context); + return leftLength >= 0 ? leftLength : _right.ResolveFixedLength(context); + } + return -1; + } + + /// + /// Break up a top level alternation into its elements. This is used when transitioning from DFA mode to NFA mode. + /// A node on the top level will be unwrapped + /// and the resulting elements re-wrapped to maintain the metadata. 
+ /// + /// an enumeration of the elements of the alternation, or just the node itself if there is no alternation + internal IEnumerable> EnumerateAlternationBranches() + { + switch (_kind) + { + case SymbolicRegexNodeKind.DisableBacktrackingSimulation: + Debug.Assert(_left is not null); + // This call should never recurse more than one level + Debug.Assert(_left._kind is not SymbolicRegexNodeKind.DisableBacktrackingSimulation); + foreach (SymbolicRegexNode element in _left.EnumerateAlternationBranches()) + { + // Re-wrap the element nodes in DisableBacktrackingSimulation if the top level node was too + yield return _builder.CreateDisableBacktrackingSimulation(element); + } + break; + case SymbolicRegexNodeKind.Alternate: + // Loop through all the elements of an alternation + SymbolicRegexNode current = this; + while (current._kind is SymbolicRegexNodeKind.Alternate) + { + Debug.Assert(current._left is not null && current._right is not null); + Debug.Assert(current._left._kind is not SymbolicRegexNodeKind.Alternate); + // Alternations are in right associative form, so the left child is never an alternation and + // thus an element to be yielded here. 
+ yield return current._left; + current = current._right; + } + // Yield the last element + yield return current; + break; + default: + yield return this; + break; + } + } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index dacbdc243ed625..282ae2be7bfc6d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -22,6 +22,7 @@ public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, Tim var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping); SymbolicRegexNode rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root); + rootNode = rootNode.AddFixedLengthMarkers(); BDD[] minterms = rootNode.ComputeMinterms(); _matcher = minterms.Length > 64 ? diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs deleted file mode 100644 index 41872e6c8ee030..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSampler.cs +++ /dev/null @@ -1,241 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -#if DEBUG -using System.Collections.Generic; -using System.Diagnostics; - -namespace System.Text.RegularExpressions.Symbolic -{ - internal sealed class SymbolicRegexSampler where TSet : IComparable, IEquatable - { - private Random _random; - private SymbolicRegexNode _root; - /// The used random seed - public int RandomSeed { get; private set; } - private BDD _asciiWordCharacters; - private BDD _asciiNonWordCharacters; // omits all characters before ' ' - private BDD _ascii; // omits all characters before ' ' - private ISolver _solver; - private CharSetSolver _charSetSolver; - - public SymbolicRegexSampler(SymbolicRegexNode root, int randomseed, bool negative) - { - _root = negative ? root._builder.Not(root) : root; - // Treat 0 as no seed and instead choose a random seed randomly - RandomSeed = randomseed == 0 ? new Random().Next() : randomseed; - _random = new Random(RandomSeed); - _solver = root._builder._solver; - _charSetSolver = new CharSetSolver(); - _asciiWordCharacters = _charSetSolver.Or(new BDD[] { - _charSetSolver.CreateSetFromRange('A', 'Z'), - _charSetSolver.CreateSetFromRange('a', 'z'), - _charSetSolver.CreateFromChar('_'), - _charSetSolver.CreateSetFromRange('0', '9')}); - // Visible ASCII range for input character generation - _ascii = _charSetSolver.CreateSetFromRange('\x20', '\x7E'); - _asciiNonWordCharacters = _charSetSolver.And(_ascii, _charSetSolver.Not(_asciiWordCharacters)); - } - - /// Generates up to k random strings accepted by the regex - public IEnumerable GenerateRandomMembers(int k) - { - for (int i = 0; i < k; i++) - { - // Holds the generated input so far - StringBuilder input_so_far = new(); - - // Initially there is no previous character - // Here one could also consider previous characters for example for \b, \B, and ^ anchors - // and initialize input_so_far accordingly - uint prevCharKind = CharKind.BeginningEnd; - - // This flag is set to false in the unlikely situation that generation ends up in a dead-end - bool 
generationSucceeded = true; - - // Current set of states reached initially contains just the root - List> states = new(); - states.Add(_root); - - // Used for end suffixes - List possible_endings = new(); - - List> nextStates = new(); - - while (true) - { - Debug.Assert(states.Count > 0); - - if (CanBeFinal(states)) - { - // Unconditionally final state or end of the input due to \Z anchor for example - if (IsFinal(states) || IsFinal(states, CharKind.Context(prevCharKind, CharKind.BeginningEnd))) - { - possible_endings.Add(""); - } - - // End of line due to end-of-line anchor - if (IsFinal(states, CharKind.Context(prevCharKind, CharKind.Newline))) - { - possible_endings.Add("\n"); - } - - // Related to wordborder due to \b or \B - if (IsFinal(states, CharKind.Context(prevCharKind, CharKind.WordLetter))) - { - possible_endings.Add(ChooseChar(_asciiWordCharacters).ToString()); - } - - // Related to wordborder due to \b or \B - if (IsFinal(states, CharKind.Context(prevCharKind, CharKind.General))) - { - possible_endings.Add(ChooseChar(_asciiNonWordCharacters).ToString()); - } - } - - // Choose to stop here based on a coin-toss - if (possible_endings.Count > 0 && ChooseRandomlyTrueOrFalse()) - { - //Choose some suffix that allows some anchor (if any) to be nullable - input_so_far.Append(Choose(possible_endings)); - break; - } - - SymbolicRegexNode state = Choose(states); - char c = '\0'; - uint cKind = 0; - // Observe that state.CreateDerivative() can be a deadend - List<(TSet, SymbolicRegexNode?, SymbolicRegexNode)> paths = new(state.CreateDerivative().EnumeratePaths(_solver.Full)); - if (paths.Count > 0) - { - (TSet, SymbolicRegexNode?, SymbolicRegexNode) path = Choose(paths); - // Consider a random path from some random state in states and - // select a random member of the set on that path - c = ChooseChar(ToBDD(path.Item1)); - - // Map the character back into the corresponding character constraint of the solver - TSet c_set = _solver.CreateFromChar(c); - - // 
Determine the character kind of c - cKind = IsNewline(c_set) ? CharKind.Newline : (IsWordchar(c_set) ? CharKind.WordLetter : CharKind.General); - - // Construct the combined context of previous and c kind - uint context = CharKind.Context(prevCharKind, cKind); - - // Step into the next set of states - nextStates.AddRange(Step(states, c_set, context)); - } - - // In the case that there are no next states: stop here - if (nextStates.Count == 0) - { - if (possible_endings.Count > 0) - { - input_so_far.Append(Choose(possible_endings)); - } - else - { - // Ending up here is unlikely but possible for example for infeasible patterns such as @"no\bway" - // or due to poor choice of c -- no anchor is enabled -- so this is a deadend - generationSucceeded = false; - } - break; - } - - input_so_far.Append(c); - states.Clear(); - possible_endings.Clear(); - List> tmp = states; - states = nextStates; - nextStates = tmp; - prevCharKind = cKind; - } - - if (generationSucceeded) - { - yield return input_so_far.ToString(); - } - } - } - - private static IEnumerable> Step(List> states, TSet set, uint context) - { - HashSet> seen = new(); - foreach (SymbolicRegexNode state in states) - { - foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in state.CreateDerivative().EnumeratePaths(set)) - { - // Either there are no anchors or else check that the anchors are nullable in the given context - if (path.Item2 is null || path.Item2.IsNullableFor(context)) - { - // Omit repetitions from the enumeration - if (seen.Add(path.Item3)) - { - yield return path.Item3; - } - } - } - } - } - - private BDD ToBDD(TSet set) => _solver.ConvertToBDD(set, _charSetSolver); - - private T Choose(IList elems) => elems[_random.Next(elems.Count)]; - - private char ChooseChar((uint, uint) pair) => (char)_random.Next((int)pair.Item1, (int)pair.Item2 + 1); - - private char ChooseChar(BDD bdd) - { - Debug.Assert(!bdd.IsEmpty); - // Select characters from the visible ASCII range whenever possible - BDD bdd1 = 
_charSetSolver.And(bdd, _ascii); - return ChooseChar(Choose(BDDRangeConverter.ToRanges(bdd1.IsEmpty ? bdd : bdd1))); - } - - private bool ChooseRandomlyTrueOrFalse() => _random.Next(100) < 50; - /// Returns true if some state is unconditionally final - - private static bool IsFinal(List> states) - { - foreach (SymbolicRegexNode state in states) - { - if (state.IsNullable) - { - return true; - } - } - return false; - } - - /// Returns true if some state is final in the given context - private static bool IsFinal(List> states, uint context) - { - foreach (SymbolicRegexNode state in states) - { - if (state.IsNullableFor(context)) - { - return true; - } - } - return false; - } - - /// Returns true if some state can be final - private static bool CanBeFinal(List> states) - { - foreach (SymbolicRegexNode state in states) - { - if (state.CanBeNullable) - { - return true; - } - } - return false; - } - - private bool IsWordchar(TSet set) => !_solver.IsEmpty(_solver.And(set, _root._builder._wordLetterForBoundariesSet)); - - private bool IsNewline(TSet set) => !_solver.IsEmpty(_solver.And(set, _root._builder._newLineSet)); - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSet.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSet.cs deleted file mode 100644 index c52e14f893de74..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexSet.cs +++ /dev/null @@ -1,576 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -using System.Collections; -using System.Collections.Generic; -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; -using System.Runtime.CompilerServices; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Represents a set of symbolic regexes that is either a disjunction or a conjunction - internal sealed class SymbolicRegexSet : IEnumerable> where TSet : IComparable, IEquatable - { - internal readonly SymbolicRegexBuilder _builder; - - private readonly HashSet> _set; - - /// - /// Symbolic regex A{0,k}?B is stored as (A,B,true) -> k -- lazy - /// Symbolic regex A{0,k}? is stored as (A,(),true) -> k -- lazy - /// Symbolic regex A{0,k}B is stored as (A,B,false) -> k -- eager - /// Symbolic regex A{0,k} is stored as (A,(),false) -> k -- eager - /// - private readonly Dictionary<(SymbolicRegexNode, SymbolicRegexNode, bool), int> _loops; - - /// the union (or intersection) of all singletons in the collection if any or null if none - private readonly SymbolicRegexNode? _singleton; - - internal readonly SymbolicRegexNodeKind _kind; - - private int _hashCode; - - /// If >= 0 then the maximal length of a fixed length markers in the set - internal int _maximumLength = -1; - - private SymbolicRegexSet(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind, HashSet>? set, Dictionary<(SymbolicRegexNode, SymbolicRegexNode, bool), int>? loops, SymbolicRegexNode? singleton) - { - Debug.Assert(kind is SymbolicRegexNodeKind.And or SymbolicRegexNodeKind.Or); - Debug.Assert((set is null) == (loops is null)); - - _builder = builder; - _kind = kind; - _set = set ?? new HashSet>(); - _loops = loops ?? 
new Dictionary<(SymbolicRegexNode, SymbolicRegexNode, bool), int>(); - _singleton = singleton; - } - - /// Denotes the empty conjunction - public bool IsEverything => _kind == SymbolicRegexNodeKind.And && _set.Count == 0 && _loops.Count == 0 && _singleton == null; - - /// Denotes the empty disjunction - public bool IsNothing => _kind == SymbolicRegexNodeKind.Or && _set.Count == 0 && _loops.Count == 0 && _singleton == null; - - /// How many elements are there in this set - public int Count => _set.Count + _loops.Count + (_singleton == null ? 0 : 1); - - /// True iff the set is a singleton - public bool IsSingleton => Count == 1; - - internal static SymbolicRegexSet CreateFull(SymbolicRegexBuilder builder) => new SymbolicRegexSet(builder, SymbolicRegexNodeKind.And, null, null, null); - - internal static SymbolicRegexSet CreateEmpty(SymbolicRegexBuilder builder) => new SymbolicRegexSet(builder, SymbolicRegexNodeKind.Or, null, null, null); - - internal static SymbolicRegexSet CreateMulti(SymbolicRegexBuilder builder, IEnumerable> elems, SymbolicRegexNodeKind kind) - { - // Loops contains the actual multi-set part of the collection - var loops = new Dictionary<(SymbolicRegexNode, SymbolicRegexNode, bool), int>(); - - // Other represents a normal set - var other = new HashSet>(); - - // Combination of singletons (when not null) - SymbolicRegexNode? 
singleton = null; - - int fixedLength = -1; - - foreach (SymbolicRegexNode elem in elems) - { - // Keep track of the maximal fixed length if this is a disjunction - // this means for example if the regex is abc(3)|bc(2) and - // the input is xxxabcyyy then two fixed length markers will occur (3) and (2) - // after reading c and the maximal one is taken - // in a conjuctive setting this is undefined and the fixed length remains -1 - if (kind == SymbolicRegexNodeKind.Or && - elem._kind == SymbolicRegexNodeKind.FixedLengthMarker && elem._lower > fixedLength) - { - fixedLength = elem._lower; - } - - #region start foreach - if (elem == builder._anyStar) - { - // .* is the absorbing element for disjunction - if (kind == SymbolicRegexNodeKind.Or) - { - return builder.FullSet; - } - } - else if (elem == builder._nothing) - { - // [] is the absorbing element for conjunction - if (kind == SymbolicRegexNodeKind.And) - { - return builder.EmptySet; - } - } - else - { - switch (elem._kind) - { - case SymbolicRegexNodeKind.And: - case SymbolicRegexNodeKind.Or: - Debug.Assert(elem._alts is not null); - if (kind == elem._kind) - { - // Flatten the inner set - foreach (SymbolicRegexNode alt in elem._alts) - { - if (alt._kind == SymbolicRegexNodeKind.Loop && alt._lower == 0) - { - AddLoopElement(builder, loops, other, alt, builder.Epsilon, kind); - } - else - { - if (alt._kind == SymbolicRegexNodeKind.Concat && alt._left!._kind == SymbolicRegexNodeKind.Loop && alt._left._lower == 0) - { - Debug.Assert(alt._right is not null); - AddLoopElement(builder, loops, other, alt._left, alt._right, kind); - } - else - { - if (alt._kind == SymbolicRegexNodeKind.Singleton) - { - Debug.Assert(alt._set is not null); - if (singleton is null) - { - singleton = alt; - } - else - { - Debug.Assert(singleton._kind == SymbolicRegexNodeKind.Singleton && singleton._set is not null); - // Join the sets either by Intersecting or Unioning - // which at the character set level translates to conjunction or 
disjunction in the underlying character solver - TSet set = kind == SymbolicRegexNodeKind.Or ? builder._solver.Or(singleton._set, alt._set) : builder._solver.And(singleton._set, alt._set); - singleton = SymbolicRegexNode.CreateSingleton(builder, set); - } - } - else - { - other.Add(alt); - } - } - } - } - } - else - { - other.Add(elem); - } - break; - - case SymbolicRegexNodeKind.Loop: - if (elem._lower == 0) - { - AddLoopElement(builder, loops, other, elem, builder.Epsilon, kind); - } - else - { - other.Add(elem); - } - break; - - case SymbolicRegexNodeKind.Concat: - Debug.Assert(elem._left is not null && elem._right is not null); - if (elem._kind == SymbolicRegexNodeKind.Concat && elem._left._kind == SymbolicRegexNodeKind.Loop && elem._left._lower == 0) - { - AddLoopElement(builder, loops, other, elem._left, elem._right, kind); - } - else - { - other.Add(elem); - } - break; - - case SymbolicRegexNodeKind.Singleton: - Debug.Assert(elem._set is not null); - if (singleton is null) - { - singleton = elem; - } - else - { - Debug.Assert(singleton._kind == SymbolicRegexNodeKind.Singleton && singleton._set is not null); - // Join the sets either by Intersecting or Unioning - // which at the character set level translates to conjunction or disjunction in the underlying character solver - TSet set = kind == SymbolicRegexNodeKind.Or ? 
builder._solver.Or(singleton._set, elem._set) : builder._solver.And(singleton._set, elem._set); - singleton = SymbolicRegexNode.CreateSingleton(builder, set); - } - break; - - default: - other.Add(elem); - break; - } - } - #endregion - } - - // This optimization is only valid for a conjunction/intersection - if (kind == SymbolicRegexNodeKind.And && singleton is not null && singleton.Equals(builder._solver.Empty)) - { - return builder.EmptySet; - } - - // The following is only valid for a disjunction/union - if (kind == SymbolicRegexNodeKind.Or) - { - // If any element of other is covered in loops then omit it - var others1 = new HashSet>(); - foreach (SymbolicRegexNode sr in other) - { - // If there is an element A{0,m} then A is not needed because - // it is included by the loop due to the upper bound m > 0 - if (loops.ContainsKey((sr, builder.Epsilon, false))) - { - others1.Add(sr); - } - } - - foreach (KeyValuePair<(SymbolicRegexNode, SymbolicRegexNode, bool), int> pair in loops) - { - // If there is an element A{0,m}B then B is not needed because - // it is included by the concatenation due to the lower bound 0 - if (other.Contains(pair.Key.Item2)) - { - others1.Add(pair.Key.Item2); - } - } - - other.ExceptWith(others1); - } - - return - other.Count != 0 || loops.Count != 0 || singleton is not null ? new SymbolicRegexSet(builder, kind, other, loops, singleton) { _maximumLength = fixedLength } : - kind == SymbolicRegexNodeKind.Or ? 
builder.EmptySet : - builder.FullSet; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static void AddLoopElement( - SymbolicRegexBuilder builder, - Dictionary<(SymbolicRegexNode, SymbolicRegexNode, bool), int> loops, - HashSet> other, SymbolicRegexNode loop, - SymbolicRegexNode rest, - SymbolicRegexNodeKind kind) - { - if (loop._upper == 0 && rest.IsEpsilon) - { - // In a set treat a loop with upper=lower=0 and no rest (no continuation after the loop) - // as () independent of whether it is lazy or eager - other.Add(builder.Epsilon); - } - else - { - Debug.Assert(loop._left is not null); - (SymbolicRegexNode, SymbolicRegexNode, bool) key = (loop._left, rest, loop.IsLazy); - if (!loops.TryGetValue(key, out int count) || - (kind == SymbolicRegexNodeKind.Or ? count < loop._upper : count > loop._upper)) // If disjunction then map to the maximum of the upper bounds else to the minimum - { - loops[key] = loop._upper; - } - } - } - } - - internal bool IsNullableFor(uint context) - { - Enumerator e = GetEnumerator(); - - if (_kind == SymbolicRegexNodeKind.Or) - { - // Some element must be nullable - while (e.MoveNext()) - { - if (e.Current.IsNullableFor(context)) - { - return true; - } - } - - return false; - } - else - { - Debug.Assert(_kind == SymbolicRegexNodeKind.And); - - // All elements must be nullable - while (e.MoveNext()) - { - if (!e.Current.IsNullableFor(context)) - { - return false; - } - } - - return true; - } - } - - public override int GetHashCode() - { - if (_hashCode == 0) - { - int hashCode = _kind.GetHashCode(); - - if (_singleton is not null) - { - hashCode ^= _singleton.GetHashCode(); - } - - foreach (SymbolicRegexNode n in _set) - { - hashCode ^= n.GetHashCode(); - } - - foreach (KeyValuePair<(SymbolicRegexNode, SymbolicRegexNode, bool), int> entry in _loops) - { - hashCode ^= entry.Key.GetHashCode() + entry.Value.GetHashCode(); - } - - _hashCode = hashCode; - } - - return _hashCode; - } - - public override bool Equals([NotNullWhen(true)] 
object? obj) - { - // This function is mutually recursive with the one in SymbolicRegexNode, which has stack overflow avoidance - if (obj is not SymbolicRegexSet that || - _kind != that._kind || - _singleton is null && that._singleton is not null || - _singleton is not null && !_singleton.Equals(that._singleton) || - _set.Count != that._set.Count || - _loops.Count != that._loops.Count || - (_set.Count > 0 && !_set.SetEquals(that._set))) - { - return false; - } - - foreach (KeyValuePair<(SymbolicRegexNode, SymbolicRegexNode, bool), int> c in _loops) - { - if (!that._loops.TryGetValue(c.Key, out int count) || !count.Equals(c.Value)) - { - return false; - } - } - - return true; - } -#if DEBUG - public void ToStringHelper(StringBuilder sb) - { - // This function is mutually recursive with the one in SymbolicRegexNode, which has stack overflow avoidance - if (IsNothing) - { - sb.Append(SymbolicRegexNode.EmptyCharClass); - } - else if (IsEverything) - { - sb.Append(".*"); - } - else - { - Enumerator enumerator = GetEnumerator(); - bool nonempty = enumerator.MoveNext(); - Debug.Assert(nonempty, "Collection must be nonempty because IsNothing is false and IsEverything is false"); - SymbolicRegexNode node = enumerator.Current; - if (!enumerator.MoveNext()) - { - // The collection only has one element - node.ToStringHelper(sb); - } - else - { - // Union of two or more elements - sb.Append('('); - // Append the first two elements - node.ToStringHelper(sb); - // Using the operator & for intersection - char op = _kind == SymbolicRegexNodeKind.Or ? 
'|' : '&'; - sb.Append(op); - enumerator.Current.ToStringHelper(sb); - while (enumerator.MoveNext()) - { - // Append all the remaining elements - sb.Append(op); - enumerator.Current.ToStringHelper(sb); - } - sb.Append(')'); - } - } - } - -#endif - internal SymbolicRegexSet Transform(SymbolicRegexBuilder builderT, Func, TSet, TNewSet> setTransformer) - where TNewSet : IComparable, IEquatable - { - // This function is mutually recursive with the one in SymbolicRegexBuilder, which has stack overflow avoidance - return SymbolicRegexSet.CreateMulti(builderT, TransformElements(builderT, setTransformer), _kind); - - IEnumerable> TransformElements(SymbolicRegexBuilder builderT, Func, TSet, TNewSet> setTransformer) - { - foreach (SymbolicRegexNode sr in this) - { - yield return _builder.Transform(sr, builderT, setTransformer); - } - } - } - - internal SymbolicRegexNode GetSingletonElement() - { - Debug.Assert(IsSingleton); - - Enumerator e = GetEnumerator(); - bool success = e.MoveNext(); - Debug.Assert(success); - return e.Current; - } - - internal SymbolicRegexSet Reverse() - { - // This function is mutually recursive with the one in SymbolicRegexNode, which has stack overflow avoidance - return CreateMulti(_builder, ReverseElements(), _kind); - - IEnumerable> ReverseElements() - { - foreach (SymbolicRegexNode n in this) - { - yield return n.Reverse(); - } - } - } - - internal bool StartsWithLoop(int upperBoundLowestValue) - { - // This function is mutually recursive with the one in SymbolicRegexNode, which has stack overflow avoidance - foreach (SymbolicRegexNode n in this) - { - if (n.StartsWithLoop(upperBoundLowestValue)) - { - return true; - } - } - - return false; - } - - public Enumerator GetEnumerator() => new Enumerator(this); - - IEnumerator> IEnumerable>.GetEnumerator() => new Enumerator(this); - - IEnumerator IEnumerable.GetEnumerator() => new Enumerator(this); - - internal int GetFixedLength() - { - // This function is mutually recursive with the one in 
SymbolicRegexNode, which has stack overflow avoidance - if (_loops.Count > 0) - { - return -1; - } - - int length = -1; - foreach (SymbolicRegexNode node in _set) - { - int nodeLength = node.GetFixedLength(); - - if (nodeLength == -1) - { - return -1; - } - else if (length == -1) - { - length = nodeLength; - } - else if (length != nodeLength) - { - return -1; - } - } - - if (_singleton is not null && length != 1) - { - if (length == -1) - { - length = 1; - } - else - { - length = -1; - } - } - - return length; - } - - /// Enumerates all symbolic regexes in the set - internal struct Enumerator : IEnumerator> - { - private readonly SymbolicRegexSet _set; - private int _state; // 0 = return singleton, 1 == iterate set, 2 == iterate loops, 3 == done - private SymbolicRegexNode? _current; - private HashSet>.Enumerator _setEnumerator; - private Dictionary<(SymbolicRegexNode, SymbolicRegexNode, bool), int>.Enumerator _loopsEnumerator; - - internal Enumerator(SymbolicRegexSet symbolicRegexSet) - { - _state = symbolicRegexSet._singleton is null ? 
1 : 0; - _set = symbolicRegexSet; - _setEnumerator = symbolicRegexSet._set.GetEnumerator(); - _loopsEnumerator = symbolicRegexSet._loops.GetEnumerator(); - _current = null; - } - - public SymbolicRegexNode Current => _current!; - - object IEnumerator.Current => Current; - - public void Dispose() - { - _state = 3; - _setEnumerator.Dispose(); - _loopsEnumerator.Dispose(); - } - - public bool MoveNext() - { - switch (_state) - { - case 0: - Debug.Assert(_set._singleton is not null); - _current = _set._singleton; - _state = 1; - return true; - - case 1: - if (_setEnumerator.MoveNext()) - { - _current = _setEnumerator.Current; - return true; - } - _state = 2; - goto case 2; - - case 2: - if (_loopsEnumerator.MoveNext()) - { - // Recreate the symbolic regex from (body,rest)->k to body{0,k}rest - (SymbolicRegexNode body, SymbolicRegexNode rest, bool isLazy) = _loopsEnumerator.Current.Key; - int upper = _loopsEnumerator.Current.Value; - _current = _set._builder.CreateConcat(_set._builder.CreateLoop(body, isLazy, 0, upper), rest); - return true; - } - _state = 3; - goto default; - - default: - _current = null!; - return false; - } - } - - public void Reset() => throw new NotSupportedException(); - } - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/TransitionRegex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/TransitionRegex.cs deleted file mode 100644 index e977c76c9694ca..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/TransitionRegex.cs +++ /dev/null @@ -1,364 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
- -#if DEBUG -using System.Collections.Generic; -using System.Diagnostics; -using System.Runtime.InteropServices; -using System.Threading; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Represents a symbolic derivative created from a symbolic regex without using minterms - internal sealed class TransitionRegex where TSet : IComparable, IEquatable - { - public readonly SymbolicRegexBuilder _builder; - public readonly TransitionRegexKind _kind; - public readonly TSet? _test; - public readonly TransitionRegex? _first; - public readonly TransitionRegex? _second; - public readonly SymbolicRegexNode? _node; - public readonly DerivativeEffect? _effect; - - private TransitionRegex(SymbolicRegexBuilder builder, TransitionRegexKind kind, TSet? test, TransitionRegex? first, TransitionRegex? second, SymbolicRegexNode? node, DerivativeEffect? effect) - { - Debug.Assert(builder is not null); - Debug.Assert( - (kind is TransitionRegexKind.Leaf && node is not null && Equals(test, default(TSet)) && first is null && second is null && effect is null) || - (kind is TransitionRegexKind.Conditional && test is not null && first is not null && second is not null && node is null && effect is null) || - (kind is TransitionRegexKind.Union && Equals(test, default(TSet)) && first is not null && second is not null && node is null && effect is null) || - (kind is TransitionRegexKind.Lookaround && Equals(test, default(TSet)) && first is not null && second is not null && node is not null && effect is null) || - (kind is TransitionRegexKind.Effect && Equals(test, default(TSet)) && first is not null && second is null && node is null && effect is not null)); - - _builder = builder; - _kind = kind; - _test = test; - _first = first; - _second = second; - _node = node; - _effect = effect; - } - - private static TransitionRegex GetOrCreate(SymbolicRegexBuilder builder, TransitionRegexKind kind, TSet? test, TransitionRegex? one, TransitionRegex? two, SymbolicRegexNode? 
node, DerivativeEffect? effect = null) - { - // Keep transition regexes internalized using the builder - ref TransitionRegex? tr = ref CollectionsMarshal.GetValueRefOrAddDefault(builder._trCache, (kind, test, one, two, node, effect), out _); - return tr ??= new TransitionRegex(builder, kind, test, one, two, node, effect); - } - - public bool IsNothing - { - get - { - if (_kind == TransitionRegexKind.Leaf) - { - Debug.Assert(_node != null); - return _node.IsNothing; - } - - return false; - } - } - - public bool IsAnyStar - { - get - { - if (_kind == TransitionRegexKind.Leaf) - { - Debug.Assert(_node != null); - return _node.IsAnyStar; - } - - return false; - } - } - - /// Complement of transition regex - public TransitionRegex Complement() - { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(Complement); - } - - switch (_kind) - { - case TransitionRegexKind.Leaf: - // Complement is propagated to the leaf - Debug.Assert(_node is not null); - return GetOrCreate(_builder, _kind, default(TSet), null, null, _node._builder.Not(_node)); - - case TransitionRegexKind.Union: - // Apply deMorgan's laws - Debug.Assert(_first is not null && _second is not null); - return Intersect(_first.Complement(), _second.Complement()); - - default: - // Both Conditional and Nullability obey the same laws of propagation of complement - Debug.Assert(_first is not null && _second is not null); - return GetOrCreate(_builder, _kind, _test, _first.Complement(), _second.Complement(), _node); - } - } - - public static TransitionRegex Leaf(SymbolicRegexNode node) => - GetOrCreate(node._builder, TransitionRegexKind.Leaf, default(TSet), null, null, node); - - /// Concatenate a node at the end of this transition regex - public TransitionRegex Concat(SymbolicRegexNode node) - { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(Concat, node); - } - - switch (_kind) - { - case TransitionRegexKind.Leaf: - 
Debug.Assert(_node is not null); - return GetOrCreate(_builder, _kind, default(TSet), null, null, _node._builder.CreateConcat(_node, node)); - - case TransitionRegexKind.Effect: - Debug.Assert(_first is not null); - return GetOrCreate(_builder, _kind, default(TSet), _first.Concat(node), null, null, _effect); - - default: - // All other three cases are disjunctive and obey the same laws of propagation of complement - Debug.Assert(_first is not null && _second is not null); - return GetOrCreate(_builder, _kind, _test, _first.Concat(node), _second.Concat(node), _node); - } - } - - /// Intersection of transition regexes - public static TransitionRegex Intersect(TransitionRegex one, TransitionRegex two) - { - // Apply standard simplifications - - // [] & t = [], t & .* = t - if (one.IsNothing || two.IsAnyStar || one == two) - { - return one; - } - - // t & [] = [], .* & t = t - if (two.IsNothing || one.IsAnyStar) - { - return two; - } - - return one.IntersectWith(two, one._builder._solver.Full); - } - - private TransitionRegex IntersectWith(TransitionRegex that, TSet pathIn) - { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(IntersectWith, that, pathIn); - } - - Debug.Assert(!_builder._solver.IsEmpty(pathIn)); - -#region Conditional - // Intersect when this is a Conditional - if (_kind == TransitionRegexKind.Conditional) - { - Debug.Assert(_test is not null && _first is not null && _second is not null); - TSet thenPath = _builder._solver.And(pathIn, _test); - TSet elsePath = _builder._solver.And(pathIn, _builder._solver.Not(_test)); - - if (_builder._solver.IsEmpty(thenPath)) - { - // then case being infeasible implies that elsePath must be satisfiable - return _second.IntersectWith(that, elsePath); - } - - if (_builder._solver.IsEmpty(elsePath)) - { - // else case is infeasible - return _first.IntersectWith(that, thenPath); - } - - TransitionRegex thencase = _first.IntersectWith(that, thenPath); - TransitionRegex 
elsecase = _second.IntersectWith(that, elsePath); - if (thencase == elsecase) - { - // Both branches result in the same thing, so the test can be omitted - return thencase; - } - - return GetOrCreate(_builder, TransitionRegexKind.Conditional, _test, thencase, elsecase, null); - } - - // Swap the order of this and that if that is a Conditional - if (that._kind == TransitionRegexKind.Conditional) - { - return that.IntersectWith(this, pathIn); - } -#endregion - -#region Union - // Intersect when this is a Union - // Use the following law of distributivity: (A|B)&C = A&C|B&C - if (_kind == TransitionRegexKind.Union) - { - Debug.Assert(_first is not null && _second is not null); - return Union(_first.IntersectWith(that, pathIn), _second.IntersectWith(that, pathIn)); - } - - // Swap the order of this and that if that is a Union - if (that._kind == TransitionRegexKind.Union) - { - return that.IntersectWith(this, pathIn); - } -#endregion - -#region Nullability - if (_kind == TransitionRegexKind.Lookaround) - { - Debug.Assert(_node is not null && _first is not null && _second is not null); - return Lookaround(_node, _first.IntersectWith(that, pathIn), _second.IntersectWith(that, pathIn)); - } - - if (that._kind == TransitionRegexKind.Lookaround) - { - Debug.Assert(that._node is not null && that._first is not null && that._second is not null); - return Lookaround(that._node, that._first.IntersectWith(this, pathIn), that._second.IntersectWith(this, pathIn)); - } -#endregion - - // Propagate intersection to the leaves - Debug.Assert(_kind is TransitionRegexKind.Leaf && that._kind is TransitionRegexKind.Leaf && _node is not null && that._node is not null); - return Leaf(_builder.And(_node, that._node)); - } - - /// Union of transition regexes - public static TransitionRegex Union(TransitionRegex one, TransitionRegex two) - { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - return StackHelper.CallOnEmptyStack(Union, one, two); - } - - // Apply common 
simplifications, always trying to push the operations into the leaves or to eliminate redundant branches - if (one.IsNothing || two.IsAnyStar || one == two) - { - return two; - } - - if (two.IsNothing || one.IsAnyStar) - { - return one; - } - - if (one._kind == TransitionRegexKind.Conditional && two._kind == TransitionRegexKind.Conditional) - { - Debug.Assert(one._test is not null && one._first is not null && one._second is not null); - Debug.Assert(two._test is not null && two._first is not null && two._second is not null); - - // if (psi, t1, t2) | if(psi, s1, s2) = if(psi, t1|s1, t2|s2) - if (one._test.Equals(two._test)) - { - return Conditional(one._test, Union(one._first, two._first), Union(one._second, two._second)); - } - - // if (psi, t, []) | if(phi, t, []) = if(psi or phi, t, []) - if (one._second.IsNothing && two._second.IsNothing && one._first.Equals(two._first)) - { - return Conditional(one._builder._solver.Or(one._test, two._test), one._first, one._second); - } - } - - return GetOrCreate(one._builder, TransitionRegexKind.Union, default(TSet), one, two, null); - } - - public static TransitionRegex Conditional(TSet test, TransitionRegex thencase, TransitionRegex elsecase) => - (thencase == elsecase || thencase._builder._solver.Full.Equals(test)) ? thencase : - thencase._builder._solver.Empty.Equals(test) ? elsecase : - GetOrCreate(thencase._builder, TransitionRegexKind.Conditional, test, thencase, elsecase, null); - - public static TransitionRegex Lookaround(SymbolicRegexNode nullabilityTest, TransitionRegex thencase, TransitionRegex elsecase) => - (thencase == elsecase) ? thencase : GetOrCreate(thencase._builder, TransitionRegexKind.Lookaround, default(TSet), thencase, elsecase, nullabilityTest); - - public static TransitionRegex Effect(TransitionRegex child, DerivativeEffect effect) => - child.IsNothing ? 
child : - GetOrCreate(child._builder, TransitionRegexKind.Effect, default(TSet), child, null, null, effect); - - public override string ToString() => - _kind switch - { - TransitionRegexKind.Leaf => $"{_node}", - TransitionRegexKind.Union => $"{_first} | {_second}", - TransitionRegexKind.Conditional => $"if({_test}, {_first}, {_second})", - TransitionRegexKind.Effect => _effect?.Kind switch - { - DerivativeEffectKind.CaptureStart => $"captureStart({_effect?.CaptureNumber}, {_first})", - _ => $"captureEnd({_effect?.CaptureNumber}, {_first})", - }, - _ => $"if (IsNull({_node}), {_first}, {_second})", - }; - - /// Enumerates all the paths in this transition regex excluding dead-end paths - public IEnumerable<(TSet, SymbolicRegexNode?, SymbolicRegexNode)> EnumeratePaths(TSet pathCondition) - { - switch (_kind) - { - case TransitionRegexKind.Leaf: - Debug.Assert(_node is not null); - // Omit any path that leads to a deadend - if (!_node.IsNothing) - { - yield return (pathCondition, null, _node); - } - break; - - case TransitionRegexKind.Union: - Debug.Assert(_first is not null && _second is not null); - foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in _first.EnumeratePaths(pathCondition)) - { - yield return path; - } - foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in _second.EnumeratePaths(pathCondition)) - { - yield return path; - } - break; - - case TransitionRegexKind.Conditional: - Debug.Assert(_test is not null && _first is not null && _second is not null); - foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in _first.EnumeratePaths(_builder._solver.And(pathCondition, _test))) - { - yield return path; - } - foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in _second.EnumeratePaths(_builder._solver.And(pathCondition, _builder._solver.Not(_test)))) - { - yield return path; - } - break; - - default: - Debug.Assert(_kind is TransitionRegexKind.Lookaround && _node is not null && _first is not null && _second is not null); 
- foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in _first.EnumeratePaths(pathCondition)) - { - SymbolicRegexNode nullabilityTest = _node; - if (path.Item2 is not null) - { - nullabilityTest = _builder.And(path.Item2, nullabilityTest); - } - yield return (path.Item1, nullabilityTest, path.Item3); - } - foreach ((TSet, SymbolicRegexNode?, SymbolicRegexNode) path in _second.EnumeratePaths(pathCondition)) - { - // Complement the nullability test - SymbolicRegexNode nullabilityTest = _builder.Not(_node); - if (path.Item2 is not null) - { - nullabilityTest = _builder.And(path.Item2, nullabilityTest); - } - yield return (path.Item1, nullabilityTest, path.Item3); - } - break; - } - } - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/TransitionRegexKind.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/TransitionRegexKind.cs deleted file mode 100644 index efda78cbd4bfda..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/TransitionRegexKind.cs +++ /dev/null @@ -1,17 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -#if DEBUG -namespace System.Text.RegularExpressions.Symbolic -{ - /// Kinds of transition regexes. Transition regexes maintain a DNF form that pushes all intersections and complements to the leaves. 
- internal enum TransitionRegexKind - { - Leaf, - Conditional, - Union, - Lookaround, - Effect - } -} -#endif diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs index aa5f1932118b82..2268f6aed6e1b4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexExperiment.cs @@ -70,25 +70,6 @@ private static long MeasureMatchTime(Regex re, string input, out Match match) } } - /// - /// Creates a regex that in the NonBacktracking engine in DEBUG mode represents intersection of regexes - /// - private static string And(params string[] regexes) - { - string conj = $"(?:{regexes[regexes.Length - 1]})"; - for (int i = regexes.Length - 2; i >= 0; i--) - { - conj = $"(?({regexes[i]}){conj}|[0-[0]])"; - } - - return conj; - } - - /// - /// Creates a regex that in the NonBacktracking engine in DEBUG mode represents complement of regex - /// - private static string Not(string regex) => $"(?({regex})[0-[0]]|.*)"; - /// /// When is set to return true, outputs DGML diagrams for the specified pattern. /// This is useful for understanding what graphs the NonBacktracking engine creates for the specified pattern. 
@@ -136,15 +117,15 @@ static void ViewDGML(string pattern, string name, bool nfa = false, bool addDotS } [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - [InlineData(".*a+", -1, new string[] { ".*a+" }, false, false)] - [InlineData("ann", -1, new string[] { "nna" }, true, false)] - [InlineData("(something|otherstuff)+", 10, new string[] { "Unexplored", "some" }, false, true)] - [InlineData("(something|otherstuff)+", 10, new string[] { "Unexplored", "ffut" }, true, true)] - public void TestDGMLGeneration(string pattern, int explorationbound, string[] expectedDgmlFragments, bool exploreInReverse, bool exploreAsNFA) + [InlineData(".*a+", new string[] { ".*a+" }, false)] + [InlineData("ann", new string[] { "nna" }, true)] + public void TestDGMLGeneration(string pattern, string[] expectedDgmlFragments, bool exploreAsNFA) { StringWriter sw = new StringWriter(); var re = new Regex(pattern, RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - if (TrySaveDGML(re, sw, exploreAsNFA, addDotStar: false, exploreInReverse, explorationbound, maxLabelLength: -1)) + if (TryExplore(re, exploreAsNFA)) + { + if (TrySaveDGML(re, sw, maxLabelLength: -1)) { string str = sw.ToString(); Assert.StartsWith("", str); @@ -154,287 +135,47 @@ public void TestDGMLGeneration(string pattern, int explorationbound, string[] ex Assert.Contains(fragment, str); } } + } - static bool TrySaveDGML(Regex regex, TextWriter writer, bool nfa, bool addDotStar, bool reverse, int maxStates, int maxLabelLength) + static bool TryExplore(Regex regex, bool exploreAsNFA) { - MethodInfo saveDgml = regex.GetType().GetMethod("SaveDGML", BindingFlags.NonPublic | BindingFlags.Instance); + MethodInfo saveDgml = regex.GetType().GetMethod("Explore", BindingFlags.NonPublic | BindingFlags.Instance); if (saveDgml is not null) { - saveDgml.Invoke(regex, new object[] { writer, nfa, addDotStar, reverse, maxStates, maxLabelLength }); + saveDgml.Invoke(regex, new object[] { true, 
true, true, !exploreAsNFA, exploreAsNFA}); return true; } return false; } - } - - #region Tests involving Intersection and Complement - // Currently only run in DEBUG mode in the NonBacktracking engine - //[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - private void SRMTest_ConjuctionIsMatch() - { - try - { - var re = new Regex(And(".*a.*", ".*b.*"), RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline | RegexOptions.IgnoreCase); - bool ok = re.IsMatch("xxaaxxBxaa"); - Assert.True(ok); - bool fail = re.IsMatch("xxaaxxcxaa"); - Assert.False(fail); - } - catch (NotSupportedException e) - { - // In Release build (?( test-pattern ) yes-pattern | no-pattern ) is not supported - Assert.Contains("conditional", e.Message); - } - } - - //[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - private void SRMTest_ConjuctionFindMatch() - { - try - { - // contains lower, upper, and a digit, and is between 2 and 4 characters long - var re = new Regex(And(".*[a-z].*", ".*[A-Z].*", ".*[0-9].*", ".{2,4}"), RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - var match = re.Match("xxaac\n5Bxaa"); - Assert.True(match.Success); - Assert.Equal(4, match.Index); - Assert.Equal(4, match.Length); - } - catch (NotSupportedException e) - { - // In Release build (?( test-pattern ) yes-pattern | no-pattern ) is not supported - Assert.Contains("conditional", e.Message); - } - } - - //[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - private void SRMTest_ComplementFindMatch() - { - try - { - // contains lower, upper, and a digit, and is between 4 and 8 characters long, does not contain 2 consequtive digits - var re = new Regex(And(".*[a-z].*", ".*[A-Z].*", ".*[0-9].*", ".{4,8}", - Not(".*(?:01|12|23|34|45|56|67|78|89).*")), RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - var match = re.Match("xxaac12Bxaas3455"); - Assert.True(match.Success); - 
Assert.Equal(6, match.Index); - Assert.Equal(7, match.Length); - } - catch (NotSupportedException e) - { - // In Release build (?( test-pattern ) yes-pattern | no-pattern ) is not supported - Assert.Contains("conditional", e.Message); - } - } - - //[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - private void PasswordSearch() - { - try - { - string twoLower = ".*[a-z].*[a-z].*"; - string twoUpper = ".*[A-Z].*[A-Z].*"; - string threeDigits = ".*[0-9].*[0-9].*[0-9].*"; - string oneSpecial = @".*[\x21-\x2F\x3A-\x40\x5B-x60\x7B-\x7E].*"; - string Not_countUp = Not(".*(?:012|123|234|345|456|567|678|789).*"); - string Not_countDown = Not(".*(?:987|876|765|654|543|432|321|210).*"); - // Observe that the space character (immediately before '!' in ASCII) is excluded - string length = "[!-~]{8,12}"; - - // Just to make the chance that the randomly generated part actually has a match - // be astronomically unlikely require 'X' and 'r' to be present also, - // although this constraint is really bogus from password constraints point of view - string contains_first_P_and_then_r = ".*X.*r.*"; - - // Conjunction of all the above constraints - string all = And(twoLower, twoUpper, threeDigits, oneSpecial, Not_countUp, Not_countDown, length, contains_first_P_and_then_r); - - // search for the password in a context surrounded by word boundaries - Regex re = new Regex($@"\b{all}\b", RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - - // Does not qualify because of 123 and connot end between 2 and 3 because of \b - string almost1 = "X@ssW0rd123"; - // Does not have at least two uppercase - string almost2 = "X@55w0rd"; - - // These two qualify - string matching1 = "X@55W0rd"; - string matching2 = "Xa5$w00rD"; - - foreach (int k in new int[] { 500, 1000, 5000, 10000, 50000, 100000 }) - { - Random random = new(k); - byte[] buffer1 = new byte[k]; - byte[] buffer2 = new byte[k]; - byte[] buffer3 = new byte[k]; - 
random.NextBytes(buffer1); - random.NextBytes(buffer2); - random.NextBytes(buffer3); - string part1 = new string(Array.ConvertAll(buffer1, b => (char)b)); - string part2 = new string(Array.ConvertAll(buffer2, b => (char)b)); - string part3 = new string(Array.ConvertAll(buffer3, b => (char)b)); - - string input = $"{part1} {almost1} {part2} {matching1} {part3} {matching2}, finally this {almost2} does not qualify either"; - - int expextedMatch1Index = (2 * k) + almost1.Length + 3; - int expextedMatch1Length = matching1.Length; - - int expextedMatch2Index = (3 * k) + almost1.Length + matching1.Length + 5; - int expextedMatch2Length = matching2.Length; - - // Random text hiding almostPassw and password - int t = System.Environment.TickCount; - Match match1 = re.Match(input); - Match match2 = match1.NextMatch(); - Match match3 = match2.NextMatch(); - t = System.Environment.TickCount - t; - - _output.WriteLine($@"k={k}, t={t}ms"); - - Assert.True(match1.Success); - Assert.Equal(expextedMatch1Index, match1.Index); - Assert.Equal(expextedMatch1Length, match1.Length); - Assert.Equal(matching1, match1.Value); - - Assert.True(match2.Success); - Assert.Equal(expextedMatch2Index, match2.Index); - Assert.Equal(expextedMatch2Length, match2.Length); - Assert.Equal(matching2, match2.Value); - - Assert.False(match3.Success); - } - } - catch (NotSupportedException e) - { - // In Release build (?( test-pattern ) yes-pattern | no-pattern ) is not supported - Assert.Contains("conditional", e.Message); - } - } - //[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - private void PasswordSearchDual() - { - try + static bool TrySaveDGML(Regex regex, TextWriter writer, int maxLabelLength) { - string Not_twoLower = Not(".*[a-z].*[a-z].*"); - string Not_twoUpper = Not(".*[A-Z].*[A-Z].*"); - string Not_threeDigits = Not(".*[0-9].*[0-9].*[0-9].*"); - string Not_oneSpecial = Not(@".*[\x21-\x2F\x3A-\x40\x5B-x60\x7B-\x7E].*"); - string countUp = 
".*(?:012|123|234|345|456|567|678|789).*"; - string countDown = ".*(?:987|876|765|654|543|432|321|210).*"; - // Observe that the space character (immediately before '!' in ASCII) is excluded - string Not_length = Not("[!-~]{8,12}"); - - // Just to make the chance that the randomly generated part actually has a match - // be astronomically unlikely require 'P' and 'r' to be present also, - // although this constraint is really bogus from password constraints point of view - string Not_contains_first_P_and_then_r = Not(".*X.*r.*"); - - // Negated disjunction of all the above constraints - // By deMorgan's laws we know that ~(A|B|...|C) = ~A&~B&...&~C and ~~A = A - // So Not(Not_twoLower|...) is equivalent to twoLower&~(...) - string all = Not($"{Not_twoLower}|{Not_twoUpper}|{Not_threeDigits}|{Not_oneSpecial}|{countUp}|{countDown}|{Not_length}|{Not_contains_first_P_and_then_r}"); - - // search for the password in a context surrounded by word boundaries - Regex re = new Regex($@"\b{all}\b", RegexHelpers.RegexOptionNonBacktracking | RegexOptions.Singleline); - - // Does not qualify because of 123 and connot end between 2 and 3 because of \b - string almost1 = "X@ssW0rd123"; - // Does not have at least two uppercase - string almost2 = "X@55w0rd"; - - // These two qualify - string matching1 = "X@55W0rd"; - string matching2 = "Xa5$w00rD"; - - foreach (int k in new int[] { 500, 1000, 5000, 10000, 50000, 100000 }) + MethodInfo saveDgml = regex.GetType().GetMethod("SaveDGML", BindingFlags.NonPublic | BindingFlags.Instance); + if (saveDgml is not null) { - Random random = new(k); - byte[] buffer1 = new byte[k]; - byte[] buffer2 = new byte[k]; - byte[] buffer3 = new byte[k]; - random.NextBytes(buffer1); - random.NextBytes(buffer2); - random.NextBytes(buffer3); - string part1 = new string(Array.ConvertAll(buffer1, b => (char)b)); - string part2 = new string(Array.ConvertAll(buffer2, b => (char)b)); - string part3 = new string(Array.ConvertAll(buffer3, b => (char)b)); - - string 
input = $"{part1} {almost1} {part2} {matching1} {part3} {matching2}, finally this {almost2} does not qualify either"; - - int expectedMatch1Index = (2 * k) + almost1.Length + 3; - int expectedMatch1Length = matching1.Length; - - int expectedMatch2Index = (3 * k) + almost1.Length + matching1.Length + 5; - int expectedMatch2Length = matching2.Length; - - // Random text hiding almost and matching strings - int t = System.Environment.TickCount; - Match match1 = re.Match(input); - Match match2 = match1.NextMatch(); - Match match3 = match2.NextMatch(); - t = System.Environment.TickCount - t; - - _output.WriteLine($@"k={k}, t={t}ms"); - - Assert.True(match1.Success); - Assert.Equal(expectedMatch1Index, match1.Index); - Assert.Equal(expectedMatch1Length, match1.Length); - Assert.Equal(matching1, match1.Value); - - Assert.True(match2.Success); - Assert.Equal(expectedMatch2Index, match2.Index); - Assert.Equal(expectedMatch2Length, match2.Length); - Assert.Equal(matching2, match2.Value); - - Assert.False(match3.Success); + saveDgml.Invoke(regex, new object[] { writer, maxLabelLength }); + return true; } - } - catch (NotSupportedException e) - { - // In Release build (?( test-pattern ) yes-pattern | no-pattern ) is not supported - Assert.Contains("conditional", e.Message); - } - } - //[ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - //[InlineData("[abc]{0,10}", "a[abc]{0,3}", "xxxabbbbbbbyyy", true, "abbb")] - //[InlineData("[abc]{0,10}?", "a[abc]{0,3}?", "xxxabbbbbbbyyy", true, "a")] - private void TestConjunctionOverCounting(string conjunct1, string conjunct2, string input, bool success, string match) - { - try - { - string pattern = And(conjunct1, conjunct2); - Regex re = new Regex(pattern, RegexHelpers.RegexOptionNonBacktracking); - Match m = re.Match(input); - Assert.Equal(success, m.Success); - Assert.Equal(match, m.Value); - } - catch (NotSupportedException e) - { - // In Release build (?( test-pattern ) yes-pattern | no-pattern ) is 
not supported - Assert.Contains("conditional", e.Message); + return false; } } - #endregion #region Random input generation tests - public static IEnumerable GenerateRandomMembers_TestData() + public static IEnumerable SampledMatchesMatchAsExpected_TestData() { string[] patterns = new string[] { @"pa[5\$s]{2}w[o0]rd$", @"\w\d+", @"\d{10}" }; foreach (string pattern in patterns) { Regex re = new Regex(pattern, RegexHelpers.RegexOptionNonBacktracking); - foreach (bool negative in new bool[] { false, true }) + // Generate 3 inputs + List inputs = new(SampleMatchesViaReflection(re, 3, pattern.GetHashCode())); + foreach (RegexEngine engine in RegexHelpers.AvailableEngines) { - // Generate 3 positive and 3 negative inputs - List inputs = new(GenerateRandomMembersViaReflection(re, 3, 123, negative)); - foreach (RegexEngine engine in RegexHelpers.AvailableEngines) + foreach (string input in inputs) { - foreach (string input in inputs) - { - yield return new object[] { engine, pattern, input, !negative }; - } + yield return new object[] { engine, pattern, input }; } } } @@ -442,19 +183,19 @@ public static IEnumerable GenerateRandomMembers_TestData() /// Test random input generation correctness [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsNetCore))] - [MemberData(nameof(GenerateRandomMembers_TestData))] - public async Task GenerateRandomMembers(RegexEngine engine, string pattern, string input, bool isMatch) + [MemberData(nameof(SampledMatchesMatchAsExpected_TestData))] + public async Task SampledMatchesMatchAsExpected(RegexEngine engine, string pattern, string input) { Regex regex = await RegexHelpers.GetRegexAsync(engine, pattern); - Assert.Equal(isMatch, regex.IsMatch(input)); + Assert.True(regex.IsMatch(input)); } - private static IEnumerable GenerateRandomMembersViaReflection(Regex regex, int how_many_inputs, int randomseed, bool negative) + private static IEnumerable SampleMatchesViaReflection(Regex regex, int how_many_inputs, int randomseed) 
{ - MethodInfo? gen = regex.GetType().GetMethod("GenerateRandomMembers", BindingFlags.NonPublic | BindingFlags.Instance); + MethodInfo? gen = regex.GetType().GetMethod("SampleMatches", BindingFlags.NonPublic | BindingFlags.Instance); if (gen is not null) { - return (IEnumerable)gen.Invoke(regex, new object[] { how_many_inputs, randomseed, negative }); + return (IEnumerable)gen.Invoke(regex, new object[] { how_many_inputs, randomseed }); } else {