Use IndexOf for .* in RegexInterpreter/Compiler (#31930)

* Clean up RegexInterpreter

  Almost entirely style. A few substantive but small changes:
  - Store the TextInfo rather than storing the CultureInfo and accessing the TextInfo virtual property on each call.
  - Remove an unnecessary resx string that should have been an assert.
  - Coalesce duplicate case blocks.

* Use IndexOf for Notoneloop{atomic} in RegexInterpreter/Compiler

  This is primarily to improve the performance of .* loops. We'll now use Span.IndexOf to search for the target character (e.g. \n), rather than the open-coded loop we currently have.

* Address PR feedback
2025-06-08 03:27:04 +09:00 · 2020-02-08 08:52:12 -05:00 · 2020-02-08 08:52:12 -05:00 · 850b4a23e5
commit 850b4a23e5
parent 76f525d94e
8 changed files with 543 additions and 532 deletions
--- a/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx
+++ b/src/libraries/System.Text.RegularExpressions/src/Resources/Strings.resx
@ -183,9 +183,6 @@
  <data name="UnexpectedOpcode" xml:space="preserve">
    <value>Unexpected opcode in regular expression generation: {0}.</value>
  </data>
-  <data name="UnimplementedState" xml:space="preserve">
-    <value>Unimplemented state.</value>
-  </data>
  <data name="UnknownProperty" xml:space="preserve">
    <value>Unknown property '{0}'.</value>
  </data>
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
@ -47,7 +47,7 @@ namespace System.Text.RegularExpressions
        public const int Bol = 14;                //                          ^
        public const int Eol = 15;                //                          $
        public const int Boundary = 16;           //                          \b
-        public const int Nonboundary = 17;        //                          \B
+        public const int NonBoundary = 17;        //                          \B
        public const int Beginning = 18;          //                          \A
        public const int Start = 19;              //                          \G
        public const int EndZ = 20;               //                          \Z
@ -170,7 +170,7 @@ namespace System.Text.RegularExpressions
                case Bol:
                case Eol:
                case Boundary:
-                case Nonboundary:
+                case NonBoundary:
                case ECMABoundary:
                case NonECMABoundary:
                case Beginning:
@ -245,7 +245,7 @@ namespace System.Text.RegularExpressions
                Bol => nameof(Bol),
                Eol => nameof(Eol),
                Boundary => nameof(Boundary),
-                Nonboundary => nameof(Nonboundary),
+                NonBoundary => nameof(NonBoundary),
                Beginning => nameof(Beginning),
                Start => nameof(Start),
                EndZ => nameof(EndZ),
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@ -1769,7 +1769,7 @@ namespace System.Text.RegularExpressions
                        case RegexNode.Multi:
                        // Boundaries are like set checks and don't involve repetition, either.
                        case RegexNode.Boundary:
-                        case RegexNode.Nonboundary:
+                        case RegexNode.NonBoundary:
                        case RegexNode.ECMABoundary:
                        case RegexNode.NonECMABoundary:
                        // Anchors are also trivial.
@ -2259,7 +2259,7 @@ namespace System.Text.RegularExpressions
                        break;

                    case RegexNode.Boundary:
-                    case RegexNode.Nonboundary:
+                    case RegexNode.NonBoundary:
                    case RegexNode.ECMABoundary:
                    case RegexNode.NonECMABoundary:
                        EmitBoundary(node);
@ -2418,7 +2418,7 @@ namespace System.Text.RegularExpressions
                        BrfalseFar(doneLabel);
                        break;

-                    case RegexNode.Nonboundary:
+                    case RegexNode.NonBoundary:
                        Callvirt(s_isBoundaryMethod);
                        BrtrueFar(doneLabel);
                        break;
@ -3891,7 +3891,7 @@ namespace System.Text.RegularExpressions
                    }

                case RegexCode.Boundary:
-                case RegexCode.Nonboundary:
+                case RegexCode.NonBoundary:
                    //: if (!IsBoundary(Textpos(), _textbeg, _textend))
                    //:     break Backward;
                    Ldthis();
@ -4355,12 +4355,12 @@ namespace System.Text.RegularExpressions
                case RegexCode.Oneloopatomic | RegexCode.Ci | RegexCode.Rtl:
                case RegexCode.Notoneloopatomic | RegexCode.Ci | RegexCode.Rtl:
                case RegexCode.Setloopatomic | RegexCode.Ci | RegexCode.Rtl:
-                    //: int c = Operand(1);
-                    //: if (c > Rightchars())
-                    //:     c = Rightchars();
+                    //: int len = Operand(1);
+                    //: if (len > Rightchars())
+                    //:     len = Rightchars();
                    //: char ch = (char)Operand(0);
                    //: int i;
-                    //: for (i = c; i > 0; i--)
+                    //: for (i = len; i > 0; i--)
                    //: {
                    //:     if (Rightcharnext() != ch)
                    //:     {
@ -4368,14 +4368,13 @@ namespace System.Text.RegularExpressions
                    //:         break;
                    //:     }
                    //: }
-                    //: if (c > i)
-                    //:     Track(c - i - 1, Textpos() - 1);
+                    //: if (len > i)
+                    //:     Track(len - i - 1, Textpos() - 1);
                    {
-                        LocalBuilder cLocal = _temp1Local!;
                        LocalBuilder lenLocal = _temp2Local!;
+                        LocalBuilder iLocal = _temp1Local!;
                        charInClassLocal = _temp3Local!;
-                        Label l1 = DefineLabel();
-                        Label l2 = DefineLabel();
+                        Label loopEnd = DefineLabel();

                        int c = Operand(1);
                        if (c == 0)
@ -4404,78 +4403,137 @@ namespace System.Text.RegularExpressions
                            Ldc(c);
                            MarkLabel(l4);
                        }
-                        Dup();
-                        Stloc(lenLocal);
-                        Ldc(1);
-                        Add();
-                        Stloc(cLocal);

-                        MarkLabel(l1);
-                        Ldloc(cLocal);
-                        Ldc(1);
-                        Sub();
-                        Dup();
-                        Stloc(cLocal);
-                        Ldc(0);
-                        if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
+                        // If this is a notoneloop{atomic} and we're left-to-right and case-sensitive,
+                        // we can use the vectorized IndexOf to search for the target character.
+                        if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) &&
+                            !IsRightToLeft() &&
+                            !IsCaseInsensitive())
                        {
-                            BleFar(l2);
-                        }
-                        else
-                        {
-                            Ble(l2);
-                        }
-
-                        if (IsRightToLeft())
-                        {
-                            Leftcharnext();
-                        }
-                        else
-                        {
-                            Rightcharnext();
-                        }
-
-                        if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
-                        {
-                            EmitTimeoutCheck();
-                            EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
-                            BrtrueFar(l1);
-                        }
-                        else
-                        {
-                            if (IsCaseInsensitive())
-                            {
-                                CallToLower();
-                            }
+                            Stloc(lenLocal);

+                            // i = runtext.AsSpan(runtextpos, len).IndexOf(ch);
+                            Ldloc(_runtextLocal!);
+                            Ldloc(_runtextposLocal!);
+                            Ldloc(lenLocal);
+                            Call(s_stringAsSpanIntIntMethod);
                            Ldc(Operand(0));
-                            if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
+                            Call(s_spanIndexOf);
+                            Stloc(iLocal);
+
+                            Label charFound = DefineLabel();
+
+                            // if (i != -1) goto charFound;
+                            Ldloc(iLocal);
+                            Ldc(-1);
+                            Bne(charFound);
+
+                            // runtextpos += len;
+                            // i = 0;
+                            // goto loopEnd;
+                            Ldloc(_runtextposLocal!);
+                            Ldloc(lenLocal);
+                            Add();
+                            Stloc(_runtextposLocal!);
+                            Ldc(0);
+                            Stloc(iLocal);
+                            BrFar(loopEnd);
+
+                            // charFound:
+                            // runtextpos += i;
+                            // i = len - i;
+                            // goto loopEnd;
+                            MarkLabel(charFound);
+                            Ldloc(_runtextposLocal!);
+                            Ldloc(iLocal);
+                            Add();
+                            Stloc(_runtextposLocal!);
+                            Ldloc(lenLocal);
+                            Ldloc(iLocal);
+                            Sub();
+                            Stloc(iLocal);
+                            BrFar(loopEnd);
+                        }
+                        else
+                        {
+                            // Otherwise, we emit the open-coded loop.
+
+                            Dup();
+                            Stloc(lenLocal);
+                            Ldc(1);
+                            Add();
+                            Stloc(iLocal);
+
+                            Label loopCondition = DefineLabel();
+                            MarkLabel(loopCondition);
+                            Ldloc(iLocal);
+                            Ldc(1);
+                            Sub();
+                            Dup();
+                            Stloc(iLocal);
+                            Ldc(0);
+                            if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
                            {
-                                Beq(l1);
+                                BleFar(loopEnd);
                            }
                            else
                            {
-                                Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
-                                Bne(l1);
+                                Ble(loopEnd);
                            }
+
+                            if (IsRightToLeft())
+                            {
+                                Leftcharnext();
+                            }
+                            else
+                            {
+                                Rightcharnext();
+                            }
+
+                            if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
+                            {
+                                EmitTimeoutCheck();
+                                EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
+                                BrtrueFar(loopCondition);
+                            }
+                            else
+                            {
+                                if (IsCaseInsensitive())
+                                {
+                                    CallToLower();
+                                }
+
+                                Ldc(Operand(0));
+                                if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
+                                {
+                                    Beq(loopCondition);
+                                }
+                                else
+                                {
+                                    Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
+                                    Bne(loopCondition);
+                                }
+                            }
+
+                            Ldloc(_runtextposLocal!);
+                            Ldc(1);
+                            Sub(IsRightToLeft());
+                            Stloc(_runtextposLocal!);
                        }

-                        Ldloc(_runtextposLocal!);
-                        Ldc(1);
-                        Sub(IsRightToLeft());
-                        Stloc(_runtextposLocal!);
-
-                        MarkLabel(l2);
-
+                        // loopEnd:
+                        MarkLabel(loopEnd);
                        if (Code() != RegexCode.Oneloopatomic && Code() != RegexCode.Notoneloopatomic && Code() != RegexCode.Setloopatomic)
                        {
+                            // if (len <= i) goto advance;
                            Ldloc(lenLocal);
-                            Ldloc(cLocal);
+                            Ldloc(iLocal);
                            Ble(AdvanceLabel());

+                            // TrackPush(len - i - 1, runtextpos - Bump())
                            ReadyPushTrack();
                            Ldloc(lenLocal);
-                            Ldloc(cLocal);
+                            Ldloc(iLocal);
                            Sub();
                            Ldc(1);
                            Sub();
@ -4667,7 +4725,8 @@ namespace System.Text.RegularExpressions
                    break;

                default:
-                    throw new NotImplementedException(SR.UnimplementedState);
+                    Debug.Fail($"Unimplemented state: {_regexopcode:X8}");
+                    break;
            }
        }

--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@ -69,7 +69,7 @@ namespace System.Text.RegularExpressions
        public const int Bol = RegexCode.Bol;                         //          ^
        public const int Eol = RegexCode.Eol;                         //          $
        public const int Boundary = RegexCode.Boundary;               //          \b
-        public const int Nonboundary = RegexCode.Nonboundary;         //          \B
+        public const int NonBoundary = RegexCode.NonBoundary;         //          \B
        public const int ECMABoundary = RegexCode.ECMABoundary;       // \b
        public const int NonECMABoundary = RegexCode.NonECMABoundary; // \B
        public const int Beginning = RegexCode.Beginning;             //          \A
@ -218,7 +218,7 @@ namespace System.Text.RegularExpressions
                    case EndZ:
                    case Eol:
                    case Multi:
-                    case Nonboundary:
+                    case NonBoundary:
                    case NonECMABoundary:
                    case Nothing:
                    case Notone:
@ -1514,7 +1514,7 @@ namespace System.Text.RegularExpressions
                        case EndZ when node.Ch != '\n':
                        case Eol when node.Ch != '\n':
                        case Boundary when RegexCharClass.IsWordChar(node.Ch):
-                        case Nonboundary when !RegexCharClass.IsWordChar(node.Ch):
+                        case NonBoundary when !RegexCharClass.IsWordChar(node.Ch):
                        case ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
                        case NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
                            return true;
@ -1554,7 +1554,7 @@ namespace System.Text.RegularExpressions
                        case EndZ when !RegexCharClass.CharInClass('\n', node.Str!):
                        case Eol when !RegexCharClass.CharInClass('\n', node.Str!):
                        case Boundary when node.Str == RegexCharClass.WordClass || node.Str == RegexCharClass.DigitClass: // TODO: Expand these with a more inclusive overlap check that considers categories
-                        case Nonboundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
+                        case NonBoundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
                        case ECMABoundary when node.Str == RegexCharClass.ECMAWordClass || node.Str == RegexCharClass.ECMADigitClass:
                        case NonECMABoundary when node.Str == RegexCharClass.NotECMAWordClass || node.Str == RegexCharClass.NotDigitClass:
                            return true;
@ -1655,7 +1655,7 @@ namespace System.Text.RegularExpressions
                    case End:
                    case EndZ:
                    case Eol:
-                    case Nonboundary:
+                    case NonBoundary:
                    case NonECMABoundary:
                    case Start:
                    // Difficult to glean anything meaningful from boundaries or results only known at run time.
@ -1782,7 +1782,7 @@ namespace System.Text.RegularExpressions
                Bol => nameof(Bol),
                Eol => nameof(Eol),
                Boundary => nameof(Boundary),
-                Nonboundary => nameof(Nonboundary),
+                NonBoundary => nameof(NonBoundary),
                ECMABoundary => nameof(ECMABoundary),
                NonECMABoundary => nameof(NonECMABoundary),
                Beginning => nameof(Beginning),
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs
@ -1750,7 +1750,7 @@ namespace System.Text.RegularExpressions
            ch switch
            {
                'b' => UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary,
-                'B' => UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary,
+                'B' => UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.NonBoundary,
                'A' => RegexNode.Beginning,
                'G' => RegexNode.Start,
                'Z' => RegexNode.EndZ,
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@ -598,7 +598,7 @@ namespace System.Text.RegularExpressions
                case RegexNode.Bol:
                case RegexNode.Eol:
                case RegexNode.Boundary:
-                case RegexNode.Nonboundary:
+                case RegexNode.NonBoundary:
                case RegexNode.ECMABoundary:
                case RegexNode.NonECMABoundary:
                case RegexNode.Beginning:
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
@ -511,7 +511,7 @@ namespace System.Text.RegularExpressions
                case RegexNode.Bol:
                case RegexNode.Eol:
                case RegexNode.Boundary:
-                case RegexNode.Nonboundary:
+                case RegexNode.NonBoundary:
                case RegexNode.ECMABoundary:
                case RegexNode.NonECMABoundary:
                case RegexNode.Beginning: