mirror of
https://github.com/VSadov/Satori.git
synced 2025-06-10 18:11:04 +09:00
Some more cleanup to regex NonBacktracking (#104766)
* Rent object[] rather than (uint,uint)[][] from the ArrayPool
* Remove unnecessary TInputReader generic from functions
* Add more comments and do some renames
* Remove unused TFindOptimizationsHandler from FindEndPositionDeltasNFA
* Fix a stray input reader
* Some more renames
* Avoid duplicated reads of input character and nullability info
* Remove initialStateId from TryFindNextStartingPosition and make initial accelerators more similar
* Remove unused initialStatePos / initialStatePosCandidate
  It's only ever written and not actually used for anything.
* Remove unnecessary generic args and remove resulting dead code
  Multiple XxDfa / XxNfa methods took a TStateHandler, but it was only ever DfaStateHandler for XxDfa or NfaStateHandler for XxNfa. We can just use the types directly in those methods, rather than generically parameterizing. Doing that revealed all but one of the members of IStateHandler weren't needed on the interface. And removing those revealed a bunch of dead code on DfaStateHandler/NfaStateHandler, which were removed, as well as arguments to some methods that weren't used.
* Put GetStateFlags back in IStateHandler and use it to avoid duplication at call sites
* Put out argument last in TryCreateNewTransition
* Store state to local in FindStartPositionDeltasDFA
* Merge IAcceleratedStateHandler into IInitialStateHandler
* Remove MintermClassifier.IntLookup
This commit is contained in:
parent
9b09bcfada
commit
b54bfdd041
5 changed files with 279 additions and 356 deletions
|
@ -47,8 +47,11 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
// in order to size the lookup array to minimize steady-state memory consumption of the potentially
|
||||
// large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
|
||||
// consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case.
|
||||
// However, when there are more than 255 minterms, we need to use int[] _intLookup.
|
||||
(uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
|
||||
// However, when there are more than 255 minterms, we need to use int[] _intLookup. We rent an object[]
|
||||
// rather than a (uint,uint)[][] to avoid the extra type pressure on the ArrayPool (object[]s are common,
|
||||
// (uint,uint)[][]s much less so).
|
||||
object[] arrayPoolArray = ArrayPool<object>.Shared.Rent(minterms.Length);
|
||||
Span<object> charRangesPerMinterm = arrayPoolArray.AsSpan(0, minterms.Length);
|
||||
|
||||
int maxChar = -1;
|
||||
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
|
||||
|
@ -70,17 +73,17 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
}
|
||||
|
||||
// Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
|
||||
Array.Clear(charRangesPerMinterm, 0, minterms.Length);
|
||||
ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
|
||||
charRangesPerMinterm.Clear();
|
||||
ArrayPool<object>.Shared.Return(arrayPoolArray);
|
||||
|
||||
// Creates the lookup array.
|
||||
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
|
||||
// Creates the lookup array. charRangesPerMinterm needs to have already been populated with (uint, uint)[] instances.
|
||||
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<object> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
|
||||
{
|
||||
T[] lookup = new T[_maxChar + 1];
|
||||
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
|
||||
{
|
||||
// Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
|
||||
foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
|
||||
foreach ((uint start, uint end) in ((uint, uint)[])charRangesPerMinterm[mintermId])
|
||||
{
|
||||
lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
|
||||
}
|
||||
|
@ -101,7 +104,9 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
}
|
||||
else
|
||||
{
|
||||
int[] lookup = _intLookup!;
|
||||
Debug.Assert(_intLookup is not null);
|
||||
|
||||
int[] lookup = _intLookup;
|
||||
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
|
||||
}
|
||||
}
|
||||
|
@ -111,12 +116,6 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
/// </summary>
|
||||
public byte[]? ByteLookup => _lookup;
|
||||
|
||||
/// <summary>
|
||||
/// Gets a mapping from char to minterm for the rare case when there are >= 255 minterms.
|
||||
/// Null in the common case where there are fewer than 255 minterms.
|
||||
/// </summary>
|
||||
public int[]? IntLookup => _intLookup;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum ordinal character for a non-0 minterm, used to conserve memory
|
||||
/// </summary>
|
||||
|
|
|
@ -120,8 +120,8 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
/// Pre-computed hot-loop version of nullability check
|
||||
/// </summary>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
private bool IsNullableWithContext(int stateId, int mintermId) =>
|
||||
(_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0;
|
||||
private bool IsNullableWithContext(byte stateNullability, int mintermId) =>
|
||||
(stateNullability & (1 << (int)GetPositionKind(mintermId))) != 0;
|
||||
|
||||
/// <summary>Returns the span from <see cref="_dfaDelta"/> that may contain transitions for the given state</summary>
|
||||
private Span<int> GetDeltasFor(MatchingState<TSet> state)
|
||||
|
@ -355,9 +355,7 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
|
||||
/// <summary>Gets or creates a new DFA transition.</summary>
|
||||
/// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
|
||||
private bool TryCreateNewTransition(
|
||||
MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState<TSet>? nextState,
|
||||
long timeoutOccursAt = 0)
|
||||
private bool TryCreateNewTransition(MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, long timeoutOccursAt, [NotNullWhen(true)] out MatchingState<TSet>? nextState)
|
||||
{
|
||||
Debug.Assert(offset < _dfaDelta.Length);
|
||||
lock (this)
|
||||
|
|
|
@ -35,16 +35,22 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
{
|
||||
// Don't dequeue yet, because a transition might fail
|
||||
MatchingState<TSet> state = toExplore.Peek();
|
||||
|
||||
// Include the special minterm for the last end-of-line if the state is sensitive to it
|
||||
int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1;
|
||||
|
||||
// Explore successor states for each minterm
|
||||
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
|
||||
{
|
||||
int offset = DeltaOffset(state.Id, mintermId);
|
||||
if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState<TSet>? nextState))
|
||||
if (!TryCreateNewTransition(state, mintermId, offset, true, 0, out MatchingState<TSet>? nextState))
|
||||
{
|
||||
goto DfaLimitReached;
|
||||
}
|
||||
|
||||
EnqueueIfUnseen(nextState, seen, toExplore);
|
||||
}
|
||||
|
||||
// Safe to dequeue now that the state has been completely handled
|
||||
toExplore.Dequeue();
|
||||
}
|
||||
|
|
|
@ -71,7 +71,7 @@ namespace System.Text.RegularExpressions.Symbolic
|
|||
NfaMatchingState states = new();
|
||||
// Here one could also consider previous characters for example for \b, \B, and ^ anchors
|
||||
// and initialize inputSoFar accordingly
|
||||
states.InitializeFrom(this, _initialStates[GetCharKind<FullInputReader>([], -1)]);
|
||||
states.InitializeFrom(this, _initialStates[GetCharKind([], -1)]);
|
||||
CurrentState statesWrapper = new(states);
|
||||
|
||||
// Used for end suffixes
|
||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue