mirror of
https://github.com/VSadov/Satori.git
synced 2025-06-08 03:27:04 +09:00
Use IndexOf for .* in RegexInterpreter/Compiler (#31930)
* Clean up RegexInterpreter

  Almost entirely style. A few substantive but small changes:
  - Store the TextInfo rather than storing the CultureInfo and accessing the TextInfo virtual property on each call.
  - Removed unnecessary resx string that should have been an assert
  - Coalesced duplicate case blocks

* Use IndexOf for Notoneloop{atomic} in RegexInterpreter/Compiler

  This is primarily to improve the performance of .* loops. We'll now use Span.IndexOf to search for the target character (e.g. \n), rather than the open-coded loop we currently have.

* Address PR feedback
This commit is contained in:
parent
76f525d94e
commit
850b4a23e5
8 changed files with 543 additions and 532 deletions
|
@ -183,9 +183,6 @@
|
|||
<data name="UnexpectedOpcode" xml:space="preserve">
|
||||
<value>Unexpected opcode in regular expression generation: {0}.</value>
|
||||
</data>
|
||||
<data name="UnimplementedState" xml:space="preserve">
|
||||
<value>Unimplemented state.</value>
|
||||
</data>
|
||||
<data name="UnknownProperty" xml:space="preserve">
|
||||
<value>Unknown property '{0}'.</value>
|
||||
</data>
|
||||
|
|
|
@ -47,7 +47,7 @@ namespace System.Text.RegularExpressions
|
|||
public const int Bol = 14; // ^
|
||||
public const int Eol = 15; // $
|
||||
public const int Boundary = 16; // \b
|
||||
public const int Nonboundary = 17; // \B
|
||||
public const int NonBoundary = 17; // \B
|
||||
public const int Beginning = 18; // \A
|
||||
public const int Start = 19; // \G
|
||||
public const int EndZ = 20; // \Z
|
||||
|
@ -170,7 +170,7 @@ namespace System.Text.RegularExpressions
|
|||
case Bol:
|
||||
case Eol:
|
||||
case Boundary:
|
||||
case Nonboundary:
|
||||
case NonBoundary:
|
||||
case ECMABoundary:
|
||||
case NonECMABoundary:
|
||||
case Beginning:
|
||||
|
@ -245,7 +245,7 @@ namespace System.Text.RegularExpressions
|
|||
Bol => nameof(Bol),
|
||||
Eol => nameof(Eol),
|
||||
Boundary => nameof(Boundary),
|
||||
Nonboundary => nameof(Nonboundary),
|
||||
NonBoundary => nameof(NonBoundary),
|
||||
Beginning => nameof(Beginning),
|
||||
Start => nameof(Start),
|
||||
EndZ => nameof(EndZ),
|
||||
|
|
|
@ -1769,7 +1769,7 @@ namespace System.Text.RegularExpressions
|
|||
case RegexNode.Multi:
|
||||
// Boundaries are like set checks and don't involve repetition, either.
|
||||
case RegexNode.Boundary:
|
||||
case RegexNode.Nonboundary:
|
||||
case RegexNode.NonBoundary:
|
||||
case RegexNode.ECMABoundary:
|
||||
case RegexNode.NonECMABoundary:
|
||||
// Anchors are also trivial.
|
||||
|
@ -2259,7 +2259,7 @@ namespace System.Text.RegularExpressions
|
|||
break;
|
||||
|
||||
case RegexNode.Boundary:
|
||||
case RegexNode.Nonboundary:
|
||||
case RegexNode.NonBoundary:
|
||||
case RegexNode.ECMABoundary:
|
||||
case RegexNode.NonECMABoundary:
|
||||
EmitBoundary(node);
|
||||
|
@ -2418,7 +2418,7 @@ namespace System.Text.RegularExpressions
|
|||
BrfalseFar(doneLabel);
|
||||
break;
|
||||
|
||||
case RegexNode.Nonboundary:
|
||||
case RegexNode.NonBoundary:
|
||||
Callvirt(s_isBoundaryMethod);
|
||||
BrtrueFar(doneLabel);
|
||||
break;
|
||||
|
@ -3891,7 +3891,7 @@ namespace System.Text.RegularExpressions
|
|||
}
|
||||
|
||||
case RegexCode.Boundary:
|
||||
case RegexCode.Nonboundary:
|
||||
case RegexCode.NonBoundary:
|
||||
//: if (!IsBoundary(Textpos(), _textbeg, _textend))
|
||||
//: break Backward;
|
||||
Ldthis();
|
||||
|
@ -4355,12 +4355,12 @@ namespace System.Text.RegularExpressions
|
|||
case RegexCode.Oneloopatomic | RegexCode.Ci | RegexCode.Rtl:
|
||||
case RegexCode.Notoneloopatomic | RegexCode.Ci | RegexCode.Rtl:
|
||||
case RegexCode.Setloopatomic | RegexCode.Ci | RegexCode.Rtl:
|
||||
//: int c = Operand(1);
|
||||
//: if (c > Rightchars())
|
||||
//: c = Rightchars();
|
||||
//: int len = Operand(1);
|
||||
//: if (len > Rightchars())
|
||||
//: len = Rightchars();
|
||||
//: char ch = (char)Operand(0);
|
||||
//: int i;
|
||||
//: for (i = c; i > 0; i--)
|
||||
//: for (i = len; i > 0; i--)
|
||||
//: {
|
||||
//: if (Rightcharnext() != ch)
|
||||
//: {
|
||||
|
@ -4368,14 +4368,13 @@ namespace System.Text.RegularExpressions
|
|||
//: break;
|
||||
//: }
|
||||
//: }
|
||||
//: if (c > i)
|
||||
//: Track(c - i - 1, Textpos() - 1);
|
||||
//: if (len > i)
|
||||
//: Track(len - i - 1, Textpos() - 1);
|
||||
{
|
||||
LocalBuilder cLocal = _temp1Local!;
|
||||
LocalBuilder lenLocal = _temp2Local!;
|
||||
LocalBuilder iLocal = _temp1Local!;
|
||||
charInClassLocal = _temp3Local!;
|
||||
Label l1 = DefineLabel();
|
||||
Label l2 = DefineLabel();
|
||||
Label loopEnd = DefineLabel();
|
||||
|
||||
int c = Operand(1);
|
||||
if (c == 0)
|
||||
|
@ -4404,78 +4403,137 @@ namespace System.Text.RegularExpressions
|
|||
Ldc(c);
|
||||
MarkLabel(l4);
|
||||
}
|
||||
Dup();
|
||||
Stloc(lenLocal);
|
||||
Ldc(1);
|
||||
Add();
|
||||
Stloc(cLocal);
|
||||
|
||||
MarkLabel(l1);
|
||||
Ldloc(cLocal);
|
||||
Ldc(1);
|
||||
Sub();
|
||||
Dup();
|
||||
Stloc(cLocal);
|
||||
Ldc(0);
|
||||
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
|
||||
// If this is a notoneloop{atomic} and we're left-to-right and case-sensitive,
|
||||
// we can use the vectorized IndexOf to search for the target character.
|
||||
if ((Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic) &&
|
||||
!IsRightToLeft() &&
|
||||
!IsCaseInsensitive())
|
||||
{
|
||||
BleFar(l2);
|
||||
}
|
||||
else
|
||||
{
|
||||
Ble(l2);
|
||||
}
|
||||
|
||||
if (IsRightToLeft())
|
||||
{
|
||||
Leftcharnext();
|
||||
}
|
||||
else
|
||||
{
|
||||
Rightcharnext();
|
||||
}
|
||||
|
||||
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
|
||||
{
|
||||
EmitTimeoutCheck();
|
||||
EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
|
||||
BrtrueFar(l1);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (IsCaseInsensitive())
|
||||
{
|
||||
CallToLower();
|
||||
}
|
||||
Stloc(lenLocal);
|
||||
|
||||
// i = runtext.AsSpan(runtextpos, len).IndexOf(ch);
|
||||
Ldloc(_runtextLocal!);
|
||||
Ldloc(_runtextposLocal!);
|
||||
Ldloc(lenLocal);
|
||||
Call(s_stringAsSpanIntIntMethod);
|
||||
Ldc(Operand(0));
|
||||
if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
|
||||
Call(s_spanIndexOf);
|
||||
Stloc(iLocal);
|
||||
|
||||
Label charFound = DefineLabel();
|
||||
|
||||
// if (i != -1) goto charFound;
|
||||
Ldloc(iLocal);
|
||||
Ldc(-1);
|
||||
Bne(charFound);
|
||||
|
||||
// runtextpos += len;
|
||||
// i = 0;
|
||||
// goto loopEnd;
|
||||
Ldloc(_runtextposLocal!);
|
||||
Ldloc(lenLocal);
|
||||
Add();
|
||||
Stloc(_runtextposLocal!);
|
||||
Ldc(0);
|
||||
Stloc(iLocal);
|
||||
BrFar(loopEnd);
|
||||
|
||||
// charFound:
|
||||
// runtextpos += i;
|
||||
// i = len - i;
|
||||
// goto loopEnd;
|
||||
MarkLabel(charFound);
|
||||
Ldloc(_runtextposLocal!);
|
||||
Ldloc(iLocal);
|
||||
Add();
|
||||
Stloc(_runtextposLocal!);
|
||||
Ldloc(lenLocal);
|
||||
Ldloc(iLocal);
|
||||
Sub();
|
||||
Stloc(iLocal);
|
||||
BrFar(loopEnd);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Otherwise, we emit the open-coded loop.
|
||||
|
||||
Dup();
|
||||
Stloc(lenLocal);
|
||||
Ldc(1);
|
||||
Add();
|
||||
Stloc(iLocal);
|
||||
|
||||
Label loopCondition = DefineLabel();
|
||||
MarkLabel(loopCondition);
|
||||
Ldloc(iLocal);
|
||||
Ldc(1);
|
||||
Sub();
|
||||
Dup();
|
||||
Stloc(iLocal);
|
||||
Ldc(0);
|
||||
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
|
||||
{
|
||||
Beq(l1);
|
||||
BleFar(loopEnd);
|
||||
}
|
||||
else
|
||||
{
|
||||
Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
|
||||
Bne(l1);
|
||||
Ble(loopEnd);
|
||||
}
|
||||
|
||||
if (IsRightToLeft())
|
||||
{
|
||||
Leftcharnext();
|
||||
}
|
||||
else
|
||||
{
|
||||
Rightcharnext();
|
||||
}
|
||||
|
||||
if (Code() == RegexCode.Setloop || Code() == RegexCode.Setloopatomic)
|
||||
{
|
||||
EmitTimeoutCheck();
|
||||
EmitMatchCharacterClass(_strings![Operand(0)], IsCaseInsensitive(), charInClassLocal);
|
||||
BrtrueFar(loopCondition);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (IsCaseInsensitive())
|
||||
{
|
||||
CallToLower();
|
||||
}
|
||||
|
||||
Ldc(Operand(0));
|
||||
if (Code() == RegexCode.Oneloop || Code() == RegexCode.Oneloopatomic)
|
||||
{
|
||||
Beq(loopCondition);
|
||||
}
|
||||
else
|
||||
{
|
||||
Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopatomic);
|
||||
Bne(loopCondition);
|
||||
}
|
||||
}
|
||||
|
||||
Ldloc(_runtextposLocal!);
|
||||
Ldc(1);
|
||||
Sub(IsRightToLeft());
|
||||
Stloc(_runtextposLocal!);
|
||||
}
|
||||
|
||||
Ldloc(_runtextposLocal!);
|
||||
Ldc(1);
|
||||
Sub(IsRightToLeft());
|
||||
Stloc(_runtextposLocal!);
|
||||
|
||||
MarkLabel(l2);
|
||||
|
||||
// loopEnd:
|
||||
MarkLabel(loopEnd);
|
||||
if (Code() != RegexCode.Oneloopatomic && Code() != RegexCode.Notoneloopatomic && Code() != RegexCode.Setloopatomic)
|
||||
{
|
||||
// if (len <= i) goto advance;
|
||||
Ldloc(lenLocal);
|
||||
Ldloc(cLocal);
|
||||
Ldloc(iLocal);
|
||||
Ble(AdvanceLabel());
|
||||
|
||||
// TrackPush(len - i - 1, runtextpos - Bump())
|
||||
ReadyPushTrack();
|
||||
Ldloc(lenLocal);
|
||||
Ldloc(cLocal);
|
||||
Ldloc(iLocal);
|
||||
Sub();
|
||||
Ldc(1);
|
||||
Sub();
|
||||
|
@ -4667,7 +4725,8 @@ namespace System.Text.RegularExpressions
|
|||
break;
|
||||
|
||||
default:
|
||||
throw new NotImplementedException(SR.UnimplementedState);
|
||||
Debug.Fail($"Unimplemented state: {_regexopcode:X8}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -69,7 +69,7 @@ namespace System.Text.RegularExpressions
|
|||
public const int Bol = RegexCode.Bol; // ^
|
||||
public const int Eol = RegexCode.Eol; // $
|
||||
public const int Boundary = RegexCode.Boundary; // \b
|
||||
public const int Nonboundary = RegexCode.Nonboundary; // \B
|
||||
public const int NonBoundary = RegexCode.NonBoundary; // \B
|
||||
public const int ECMABoundary = RegexCode.ECMABoundary; // \b
|
||||
public const int NonECMABoundary = RegexCode.NonECMABoundary; // \B
|
||||
public const int Beginning = RegexCode.Beginning; // \A
|
||||
|
@ -218,7 +218,7 @@ namespace System.Text.RegularExpressions
|
|||
case EndZ:
|
||||
case Eol:
|
||||
case Multi:
|
||||
case Nonboundary:
|
||||
case NonBoundary:
|
||||
case NonECMABoundary:
|
||||
case Nothing:
|
||||
case Notone:
|
||||
|
@ -1514,7 +1514,7 @@ namespace System.Text.RegularExpressions
|
|||
case EndZ when node.Ch != '\n':
|
||||
case Eol when node.Ch != '\n':
|
||||
case Boundary when RegexCharClass.IsWordChar(node.Ch):
|
||||
case Nonboundary when !RegexCharClass.IsWordChar(node.Ch):
|
||||
case NonBoundary when !RegexCharClass.IsWordChar(node.Ch):
|
||||
case ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
|
||||
case NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
|
||||
return true;
|
||||
|
@ -1554,7 +1554,7 @@ namespace System.Text.RegularExpressions
|
|||
case EndZ when !RegexCharClass.CharInClass('\n', node.Str!):
|
||||
case Eol when !RegexCharClass.CharInClass('\n', node.Str!):
|
||||
case Boundary when node.Str == RegexCharClass.WordClass || node.Str == RegexCharClass.DigitClass: // TODO: Expand these with a more inclusive overlap check that considers categories
|
||||
case Nonboundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
|
||||
case NonBoundary when node.Str == RegexCharClass.NotWordClass || node.Str == RegexCharClass.NotDigitClass:
|
||||
case ECMABoundary when node.Str == RegexCharClass.ECMAWordClass || node.Str == RegexCharClass.ECMADigitClass:
|
||||
case NonECMABoundary when node.Str == RegexCharClass.NotECMAWordClass || node.Str == RegexCharClass.NotDigitClass:
|
||||
return true;
|
||||
|
@ -1655,7 +1655,7 @@ namespace System.Text.RegularExpressions
|
|||
case End:
|
||||
case EndZ:
|
||||
case Eol:
|
||||
case Nonboundary:
|
||||
case NonBoundary:
|
||||
case NonECMABoundary:
|
||||
case Start:
|
||||
// Difficult to glean anything meaningful from boundaries or results only known at run time.
|
||||
|
@ -1782,7 +1782,7 @@ namespace System.Text.RegularExpressions
|
|||
Bol => nameof(Bol),
|
||||
Eol => nameof(Eol),
|
||||
Boundary => nameof(Boundary),
|
||||
Nonboundary => nameof(Nonboundary),
|
||||
NonBoundary => nameof(NonBoundary),
|
||||
ECMABoundary => nameof(ECMABoundary),
|
||||
NonECMABoundary => nameof(NonECMABoundary),
|
||||
Beginning => nameof(Beginning),
|
||||
|
|
|
@ -1750,7 +1750,7 @@ namespace System.Text.RegularExpressions
|
|||
ch switch
|
||||
{
|
||||
'b' => UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary,
|
||||
'B' => UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary,
|
||||
'B' => UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.NonBoundary,
|
||||
'A' => RegexNode.Beginning,
|
||||
'G' => RegexNode.Start,
|
||||
'Z' => RegexNode.EndZ,
|
||||
|
|
|
@ -598,7 +598,7 @@ namespace System.Text.RegularExpressions
|
|||
case RegexNode.Bol:
|
||||
case RegexNode.Eol:
|
||||
case RegexNode.Boundary:
|
||||
case RegexNode.Nonboundary:
|
||||
case RegexNode.NonBoundary:
|
||||
case RegexNode.ECMABoundary:
|
||||
case RegexNode.NonECMABoundary:
|
||||
case RegexNode.Beginning:
|
||||
|
|
|
@ -511,7 +511,7 @@ namespace System.Text.RegularExpressions
|
|||
case RegexNode.Bol:
|
||||
case RegexNode.Eol:
|
||||
case RegexNode.Boundary:
|
||||
case RegexNode.Nonboundary:
|
||||
case RegexNode.NonBoundary:
|
||||
case RegexNode.ECMABoundary:
|
||||
case RegexNode.NonECMABoundary:
|
||||
case RegexNode.Beginning:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue