1
0
Fork 0
mirror of https://github.com/VSadov/Satori.git synced 2025-06-08 03:27:04 +09:00

Use IndexOf for .* in RegexInterpreter/Compiler (#31930)

* Clean up RegexInterpreter

Almost entirely stylistic, with a few small but substantive changes:
- Store the TextInfo rather than storing the CultureInfo and accessing the TextInfo virtual property on each call.
- Removed the unnecessary UnimplementedState resx string; the unreachable default case that threw with it is now a Debug.Fail assert.
- Coalesced duplicate case blocks

* Use IndexOf for Notoneloop{atomic} in RegexInterpreter/Compiler

This is primarily to improve the performance of .* loops.  We'll now use Span.IndexOf to search for the target character (e.g. \n), rather than the open-coded loop we currently have.

* Address PR feedback
This commit is contained in:
Stephen Toub 2020-02-08 08:52:12 -05:00 committed by GitHub
parent 76f525d94e
commit 850b4a23e5
Signed by: github
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 543 additions and 532 deletions

View file

@ -183,9 +183,6 @@
<data name="UnexpectedOpcode" xml:space="preserve">
<value>Unexpected opcode in regular expression generation: {0}.</value>
</data>
<data name="UnimplementedState" xml:space="preserve">
<value>Unimplemented state.</value>
</data>
<data name="UnknownProperty" xml:space="preserve">
<value>Unknown property '{0}'.</value>
</data>

View file

@ -47,7 +47,7 @@ namespace System.Text.RegularExpressions
public const int Bol = 14; // ^
public const int Eol = 15; // $
public const int Boundary = 16; // \b
public const int Nonboundary = 17; // \B
public const int NonBoundary = 17; // \B
public const int Beginning = 18; // \A
public const int Start = 19; // \G
public const int EndZ = 20; // \Z
@ -170,7 +170,7 @@ namespace System.Text.RegularExpressions
case Bol:
case Eol:
case Boundary:
case Nonboundary:
case NonBoundary:
case ECMABoundary:
case NonECMABoundary:
case Beginning:
@ -245,7 +245,7 @@ namespace System.Text.RegularExpressions
Bol => nameof(Bol),
Eol => nameof(Eol),
Boundary => nameof(Boundary),
Nonboundary => nameof(Nonboundary),
NonBoundary => nameof(NonBoundary),
Beginning => nameof(Beginning),
Start => nameof(Start),
EndZ => nameof(EndZ),

View file

@ -1769,7 +1769,7 @@ namespace System.Text.RegularExpressions
case RegexNode.Multi:
// Boundaries are like set checks and don't involve repetition, either.
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.NonBoundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
// Anchors are also trivial.
@ -2259,7 +2259,7 @@ namespace System.Text.RegularExpressions
break;
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.NonBoundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
EmitBoundary(node);
@ -2418,7 +2418,7 @@ namespace System.Text.RegularExpressions
BrfalseFar(doneLabel);
break;
case RegexNode.Nonboundary:
case RegexNode.NonBoundary:
Callvirt(s_isBoundaryMethod);
BrtrueFar(doneLabel);
break;
@ -3891,7 +3891,7 @@ namespace System.Text.RegularExpressions
}
case RegexCode.Boundary:
case RegexCode.Nonboundary:
case RegexCode.NonBoundary:
//: if (!IsBoundary(Textpos(), _textbeg, _textend))
//: break Backward;
Ldthis();
@ -4355,12 +4355,12 @@ namespace System.Text.RegularExpressions
case RegexCode.Oneloopatomic | RegexCode.Ci | RegexCode.Rtl:
case RegexCode.Notoneloopatomic | RegexCode.Ci | RegexCode.Rtl:
case RegexCode.Setloopatomic | RegexCode.Ci | RegexCode.Rtl:
//: int c = Operand(1);
//: if (c > Rightchars())
//: c = Rightchars();
//: int len = Operand(1);
//: if (len > Rightchars())
//: len = Rightchars();
//: char ch = (char)Operand(0);
//: int i;
//: for (i = c; i > 0; i--)
//: for (i = len; i > 0; i--)
//: {
//: if (Rightcharnext() != ch)
//: {
@ -4368,14 +4368,13 @@ namespace System.Text.RegularExpressions
//: break;
//: }
//: }
//: if (c > i)
//: Track(c - i - 1, Textpos() - 1);
//: if (len > i)
//: Track(len - i - 1, Textpos() - 1);
{
LocalBuilder cLocal = _temp1Local!;
LocalBuilder lenLocal = _temp2Local!;
LocalBuilder iLocal = _temp1Local!;
charInClassLocal = _temp3Local!;
Label l1 = DefineLabel();
Label l2 = DefineLabel();
Label loopEnd = DefineLabel();
int c = Operand(1);
if (c == 0)
@ -4404,78 +4403,137 @@ namespace System.Text.RegularExpressions
Ldc(c);
MarkLabel(l4);
}
Dup();
Stloc(lenLocal);
Ldc(1);
Add();
Stloc(cLocal);
MarkLabel(l1);
Ldloc(cLocal);
Ldc(1);
Sub();
Dup();
Stloc(cLocal);
Ldc(0);
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
// If this is a notoneloop{atomic} and we're left-to-right and case-sensitive,
// we can use the vectorized IndexOf to search for the target character.
if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) &&
!IsRightToLeft() &&
!IsCaseInsensitive())
{
BleFar(l2);
}
else
{
Ble(l2);
}
if (IsRightToLeft())
{
Leftcharnext();
}
else
{
Rightcharnext();
}
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
{
EmitTimeoutCheck();
EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
BrtrueFar(l1);
}
else
{
if (IsCaseInsensitive())
{
CallToLower();
}
Stloc(lenLocal);
// i = runtext.AsSpan(runtextpos, len).IndexOf(ch);
Ldloc(_runtextLocal!);
Ldloc(_runtextposLocal!);
Ldloc(lenLocal);
Call(s_stringAsSpanIntIntMethod);
Ldc(Operand(0));
if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
Call(s_spanIndexOf);
Stloc(iLocal);
Label charFound = DefineLabel();
// if (i != -1) goto charFound;
Ldloc(iLocal);
Ldc(-1);
Bne(charFound);
// runtextpos += len;
// i = 0;
// goto loopEnd;
Ldloc(_runtextposLocal!);
Ldloc(lenLocal);
Add();
Stloc(_runtextposLocal!);
Ldc(0);
Stloc(iLocal);
BrFar(loopEnd);
// charFound:
// runtextpos += i;
// i = len - i;
// goto loopEnd;
MarkLabel(charFound);
Ldloc(_runtextposLocal!);
Ldloc(iLocal);
Add();
Stloc(_runtextposLocal!);
Ldloc(lenLocal);
Ldloc(iLocal);
Sub();
Stloc(iLocal);
BrFar(loopEnd);
}
else
{
// Otherwise, we emit the open-coded loop.
Dup();
Stloc(lenLocal);
Ldc(1);
Add();
Stloc(iLocal);
Label loopCondition = DefineLabel();
MarkLabel(loopCondition);
Ldloc(iLocal);
Ldc(1);
Sub();
Dup();
Stloc(iLocal);
Ldc(0);
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
{
Beq(l1);
BleFar(loopEnd);
}
else
{
Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
Bne(l1);
Ble(loopEnd);
}
if (IsRightToLeft())
{
Leftcharnext();
}
else
{
Rightcharnext();
}
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
{
EmitTimeoutCheck();
EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
BrtrueFar(loopCondition);
}
else
{
if (IsCaseInsensitive())
{
CallToLower();
}
Ldc(Operand(0));
if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
{
Beq(loopCondition);
}
else
{
Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
Bne(loopCondition);
}
}
Ldloc(_runtextposLocal!);
Ldc(1);
Sub(IsRightToLeft());
Stloc(_runtextposLocal!);
}
Ldloc(_runtextposLocal!);
Ldc(1);
Sub(IsRightToLeft());
Stloc(_runtextposLocal!);
MarkLabel(l2);
// loopEnd:
MarkLabel(loopEnd);
if (Code() != RegexCode.Oneloopatomic && Code() != RegexCode.Notoneloopatomic && Code() != RegexCode.Setloopatomic)
{
// if (len <= i) goto advance;
Ldloc(lenLocal);
Ldloc(cLocal);
Ldloc(iLocal);
Ble(AdvanceLabel());
// TrackPush(len - i - 1, runtextpos - Bump())
ReadyPushTrack();
Ldloc(lenLocal);
Ldloc(cLocal);
Ldloc(iLocal);
Sub();
Ldc(1);
Sub();
@ -4667,7 +4725,8 @@ namespace System.Text.RegularExpressions
break;
default:
throw new NotImplementedException(SR.UnimplementedState);
Debug.Fail($"Unimplemented state: {_regexopcode:X8}");
break;
}
}

View file

@ -69,7 +69,7 @@ namespace System.Text.RegularExpressions
public const int Bol = RegexCode.Bol; // ^
public const int Eol = RegexCode.Eol; // $
public const int Boundary = RegexCode.Boundary; // \b
public const int Nonboundary = RegexCode.Nonboundary; // \B
public const int NonBoundary = RegexCode.NonBoundary; // \B
public const int ECMABoundary = RegexCode.ECMABoundary; // \b
public const int NonECMABoundary = RegexCode.NonECMABoundary; // \B
public const int Beginning = RegexCode.Beginning; // \A
@ -218,7 +218,7 @@ namespace System.Text.RegularExpressions
case EndZ:
case Eol:
case Multi:
case Nonboundary:
case NonBoundary:
case NonECMABoundary:
case Nothing:
case Notone:
@ -1514,7 +1514,7 @@ namespace System.Text.RegularExpressions
case EndZ when node.Ch != '\n':
case Eol when node.Ch != '\n':
case Boundary when RegexCharClass.IsWordChar(node.Ch):
case Nonboundary when !RegexCharClass.IsWordChar(node.Ch):
case NonBoundary when !RegexCharClass.IsWordChar(node.Ch):
case ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
case NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
return true;
@ -1554,7 +1554,7 @@ namespace System.Text.RegularExpressions
case EndZ when !RegexCharClass.CharInClass('\n', node.Str!):
case Eol when !RegexCharClass.CharInClass('\n', node.Str!):
case Boundary when node.Str == RegexCharClass.WordClass || node.Str == RegexCharClass.DigitClass: // TODO: Expand these with a more inclusive overlap check that considers categories
case Nonboundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
case NonBoundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
case ECMABoundary when node.Str == RegexCharClass.ECMAWordClass || node.Str == RegexCharClass.ECMADigitClass:
case NonECMABoundary when node.Str == RegexCharClass.NotECMAWordClass || node.Str == RegexCharClass.NotDigitClass:
return true;
@ -1655,7 +1655,7 @@ namespace System.Text.RegularExpressions
case End:
case EndZ:
case Eol:
case Nonboundary:
case NonBoundary:
case NonECMABoundary:
case Start:
// Difficult to glean anything meaningful from boundaries or results only known at run time.
@ -1782,7 +1782,7 @@ namespace System.Text.RegularExpressions
Bol => nameof(Bol),
Eol => nameof(Eol),
Boundary => nameof(Boundary),
Nonboundary => nameof(Nonboundary),
NonBoundary => nameof(NonBoundary),
ECMABoundary => nameof(ECMABoundary),
NonECMABoundary => nameof(NonECMABoundary),
Beginning => nameof(Beginning),

View file

@ -1750,7 +1750,7 @@ namespace System.Text.RegularExpressions
ch switch
{
'b' => UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary,
'B' => UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary,
'B' => UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.NonBoundary,
'A' => RegexNode.Beginning,
'G' => RegexNode.Start,
'Z' => RegexNode.EndZ,

View file

@ -598,7 +598,7 @@ namespace System.Text.RegularExpressions
case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.NonBoundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
case RegexNode.Beginning:

View file

@ -511,7 +511,7 @@ namespace System.Text.RegularExpressions
case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.NonBoundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
case RegexNode.Beginning: