mirror of
https://github.com/VSadov/Satori.git
synced 2025-06-09 09:34:49 +09:00
parent
ac02b66e41
commit
899461780a
37 changed files with 4372 additions and 94 deletions
|
@ -1270,3 +1270,30 @@ Licensed under the Apache License, Version 2.0.
|
|||
|
||||
Available at
|
||||
https://github.com/SixLabors/ImageSharp/blob/f4f689ce67ecbcc35cebddba5aacb603e6d1068a/LICENSE
|
||||
|
||||
License for the Teddy multi-substring searching implementation
|
||||
--------------------------------------
|
||||
|
||||
https://github.com/BurntSushi/aho-corasick
|
||||
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Andrew Gallant
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
|
|
|
@ -235,6 +235,8 @@ namespace System
|
|||
public static bool Contains(this System.ReadOnlySpan<char> span, System.ReadOnlySpan<char> value, System.StringComparison comparisonType) { throw null; }
|
||||
public static bool Contains<T>(this System.ReadOnlySpan<T> span, T value) where T : System.IEquatable<T>? { throw null; }
|
||||
public static bool Contains<T>(this System.Span<T> span, T value) where T : System.IEquatable<T>? { throw null; }
|
||||
public static bool ContainsAny(this System.ReadOnlySpan<char> span, System.Buffers.SearchValues<string> values) { throw null; }
|
||||
public static bool ContainsAny(this System.Span<char> span, System.Buffers.SearchValues<string> values) { throw null; }
|
||||
public static bool ContainsAny<T>(this System.ReadOnlySpan<T> span, System.Buffers.SearchValues<T> values) where T : System.IEquatable<T>? { throw null; }
|
||||
public static bool ContainsAny<T>(this System.ReadOnlySpan<T> span, System.ReadOnlySpan<T> values) where T : System.IEquatable<T>? { throw null; }
|
||||
public static bool ContainsAny<T>(this System.ReadOnlySpan<T> span, T value0, T value1) where T : System.IEquatable<T>? { throw null; }
|
||||
|
@ -272,6 +274,8 @@ namespace System
|
|||
public static System.Text.SpanRuneEnumerator EnumerateRunes(this System.Span<char> span) { throw null; }
|
||||
public static bool Equals(this System.ReadOnlySpan<char> span, System.ReadOnlySpan<char> other, System.StringComparison comparisonType) { throw null; }
|
||||
public static int IndexOf(this System.ReadOnlySpan<char> span, System.ReadOnlySpan<char> value, System.StringComparison comparisonType) { throw null; }
|
||||
public static int IndexOfAny(this System.ReadOnlySpan<char> span, System.Buffers.SearchValues<string> values) { throw null; }
|
||||
public static int IndexOfAny(this System.Span<char> span, System.Buffers.SearchValues<string> values) { throw null; }
|
||||
public static int IndexOfAny<T>(this System.ReadOnlySpan<T> span, System.Buffers.SearchValues<T> values) where T : System.IEquatable<T>? { throw null; }
|
||||
public static int IndexOfAny<T>(this System.ReadOnlySpan<T> span, System.ReadOnlySpan<T> values) where T : System.IEquatable<T>? { throw null; }
|
||||
public static int IndexOfAny<T>(this System.ReadOnlySpan<T> span, T value0, T value1) where T : System.IEquatable<T>? { throw null; }
|
||||
|
|
519
src/libraries/System.Memory/tests/Span/StringSearchValues.cs
Normal file
519
src/libraries/System.Memory/tests/Span/StringSearchValues.cs
Normal file
|
@ -0,0 +1,519 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Buffers;
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Runtime.ExceptionServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.DotNet.RemoteExecutor;
|
||||
using Xunit;
|
||||
|
||||
namespace System.Memory.Tests.Span
|
||||
{
|
||||
public static class StringSearchValuesTests
|
||||
{
|
||||
// Gate for tests that use RunUsingInvariantCulture, which launches the test body through RemoteExecutor.
public static bool CanTestInvariantCulture
{
    get { return RemoteExecutor.IsSupported; }
}

// Gate for tests that use RunUsingNLS: requires RemoteExecutor support and a Windows host.
public static bool CanTestNls
{
    get { return RemoteExecutor.IsSupported && OperatingSystem.IsWindows(); }
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(StringComparison.Ordinal, "a")]
|
||||
[InlineData(StringComparison.Ordinal, "A")]
|
||||
[InlineData(StringComparison.Ordinal, "a", "ab", "abc", "bc")]
|
||||
[InlineData(StringComparison.Ordinal, "A", "ab", "aBc", "Bc")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, "a")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, "A")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, "A", "a")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, "a", "Ab", "abc", "bC")]
|
||||
// Verifies that a SearchValues<string> instance also behaves as a set of whole strings:
// Contains(string) and the generic IndexOfAny / IndexOfAnyExcept / LastIndexOfAny /
// LastIndexOfAnyExcept / ContainsAny / ContainsAnyExcept overloads over spans of strings.
public static void Values_ImplementsSearchValuesBase(StringComparison comparisonType, params string[] values)
{
    const string ValueNotInSet = "Hello world";

    bool ignoreCase = comparisonType == StringComparison.OrdinalIgnoreCase;
    SearchValues<string> stringValues = SearchValues.Create(values, comparisonType);

    Assert.False(stringValues.Contains(ValueNotInSet));

    Verify(Span<string>.Empty, -1, -1, -1, -1);
    Verify(new[] { ValueNotInSet }, -1, 0, -1, 0);
    Verify(new[] { ValueNotInSet, ValueNotInSet }, -1, 0, -1, 1);

    foreach (string value in values)
    {
        // Build a variant of the value that differs only by casing: prefer the lower-case
        // form, and fall back to upper-case when the value is already lower-case.
        string lowered = value.ToLowerInvariant();
        string differentCase;
        if (value != lowered)
        {
            differentCase = lowered;
        }
        else
        {
            differentCase = value.ToUpperInvariant();
            Assert.NotEqual(value, differentCase);
        }

        Assert.True(stringValues.Contains(value));
        Assert.Equal(ignoreCase, stringValues.Contains(differentCase));

        Verify(new[] { value }, 0, -1, 0, -1);
        Verify(new[] { value, value }, 0, -1, 1, -1);
        Verify(new[] { value, ValueNotInSet }, 0, 1, 0, 1);
        Verify(new[] { value, ValueNotInSet, ValueNotInSet }, 0, 1, 0, 2);
        Verify(new[] { ValueNotInSet, value }, 1, 0, 1, 0);
        Verify(new[] { ValueNotInSet, ValueNotInSet, value }, 2, 0, 2, 1);
        Verify(new[] { ValueNotInSet, value, ValueNotInSet }, 1, 0, 1, 2);
        Verify(new[] { value, ValueNotInSet, value }, 0, 1, 2, 1);

        if (!ignoreCase)
        {
            // Case-sensitive: the re-cased value is expected to behave like any other value outside the set.
            Verify(new[] { differentCase }, -1, 0, -1, 0);
            Verify(new[] { differentCase, differentCase }, -1, 0, -1, 1);
            Verify(new[] { differentCase, ValueNotInSet }, -1, 0, -1, 1);
            Verify(new[] { ValueNotInSet, differentCase }, -1, 0, -1, 1);
            Verify(new[] { differentCase, ValueNotInSet, ValueNotInSet }, -1, 0, -1, 2);
        }
        else
        {
            // Case-insensitive: the re-cased value is expected to match exactly like the original.
            Verify(new[] { differentCase }, 0, -1, 0, -1);
            Verify(new[] { differentCase, differentCase }, 0, -1, 1, -1);
            Verify(new[] { differentCase, ValueNotInSet }, 0, 1, 0, 1);
            Verify(new[] { differentCase, ValueNotInSet, ValueNotInSet }, 0, 1, 0, 2);
            Verify(new[] { ValueNotInSet, differentCase }, 1, 0, 1, 0);
            Verify(new[] { ValueNotInSet, ValueNotInSet, differentCase }, 2, 0, 2, 1);
            Verify(new[] { ValueNotInSet, differentCase, ValueNotInSet }, 1, 0, 1, 2);
            Verify(new[] { differentCase, ValueNotInSet, differentCase }, 0, 1, 2, 1);
        }
    }

    // Checks every search API against the expected indices, through both the Span<string>
    // and the ReadOnlySpan<string> overloads.
    void Verify(Span<string> items, int firstMatch, int firstNonMatch, int lastMatch, int lastNonMatch)
    {
        ReadOnlySpan<string> readOnlyItems = items;
        bool expectMatch = firstMatch >= 0;
        bool expectNonMatch = firstNonMatch >= 0;

        // The expectations themselves must be consistent: a first hit implies a last hit.
        Assert.Equal(expectMatch, lastMatch >= 0);
        Assert.Equal(expectNonMatch, lastNonMatch >= 0);

        Assert.Equal(firstMatch, items.IndexOfAny(stringValues));
        Assert.Equal(firstMatch, readOnlyItems.IndexOfAny(stringValues));
        Assert.Equal(firstNonMatch, items.IndexOfAnyExcept(stringValues));
        Assert.Equal(firstNonMatch, readOnlyItems.IndexOfAnyExcept(stringValues));
        Assert.Equal(lastMatch, items.LastIndexOfAny(stringValues));
        Assert.Equal(lastMatch, readOnlyItems.LastIndexOfAny(stringValues));
        Assert.Equal(lastNonMatch, items.LastIndexOfAnyExcept(stringValues));
        Assert.Equal(lastNonMatch, readOnlyItems.LastIndexOfAnyExcept(stringValues));

        Assert.Equal(expectMatch, items.ContainsAny(stringValues));
        Assert.Equal(expectMatch, readOnlyItems.ContainsAny(stringValues));
        Assert.Equal(expectNonMatch, items.ContainsAnyExcept(stringValues));
        Assert.Equal(expectNonMatch, readOnlyItems.ContainsAnyExcept(stringValues));
    }
}
|
||||
|
||||
[Theory]
|
||||
// Sets with empty values
|
||||
[InlineData(StringComparison.Ordinal, 0, " ", "abc, ")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 0, " ", "abc, ")]
|
||||
[InlineData(StringComparison.Ordinal, 0, "", "")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 0, "", "abc, ")]
|
||||
// Empty sets
|
||||
[InlineData(StringComparison.Ordinal, -1, " ", null)]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, -1, " ", null)]
|
||||
[InlineData(StringComparison.Ordinal, -1, "", null)]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "", null)]
|
||||
// A few simple cases
|
||||
[InlineData(StringComparison.Ordinal, 1, "xbc", "abc, bc")]
|
||||
[InlineData(StringComparison.Ordinal, 0, "foobar", "foo, bar")]
|
||||
[InlineData(StringComparison.Ordinal, 0, "barfoo", "foo, bar")]
|
||||
[InlineData(StringComparison.Ordinal, 0, "foofoo", "foo, bar")]
|
||||
[InlineData(StringComparison.Ordinal, 0, "barbar", "foo, bar")]
|
||||
[InlineData(StringComparison.Ordinal, 4, "bafofoo", "foo, bar")]
|
||||
[InlineData(StringComparison.Ordinal, 4, "bafofoo", "bar, foo")]
|
||||
[InlineData(StringComparison.Ordinal, 4, "fobabar", "foo, bar")]
|
||||
[InlineData(StringComparison.Ordinal, 4, "fobabar", "bar, foo")]
|
||||
// Multiple potential matches - we want the first one
|
||||
[InlineData(StringComparison.Ordinal, 1, "abcd", "bc, cd")]
|
||||
// Simple case sensitivity
|
||||
[InlineData(StringComparison.Ordinal, -1, " ABC", "abc")]
|
||||
[InlineData(StringComparison.Ordinal, 1, " abc", "abc")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 1, " ABC", "abc")]
|
||||
// A few more complex cases that test the Aho-Corasick implementation
|
||||
[InlineData(StringComparison.Ordinal, 3, "RyrIGEdt2S9", "IGEdt2, G, rIGm6i")]
|
||||
[InlineData(StringComparison.Ordinal, 2, "Npww1HtmO", "NVOhQu, w, XeR")]
|
||||
[InlineData(StringComparison.Ordinal, 1, "08Qq6", "8, vx, BFA4s, aLP2, hm, lmT, y, CNTB, Q, vd")]
|
||||
[InlineData(StringComparison.Ordinal, 3, "A4sRYUhKZR1Vn8N", "F, scsx, nWBhrx, Q, 7Of, BX, huoJ, R")]
|
||||
[InlineData(StringComparison.Ordinal, 9, "40sufu3TdzcKQfK", "3MXvo26, zPd6t, zc, c5, ypUCK3A9, K, YlX")]
|
||||
[InlineData(StringComparison.Ordinal, 0, "111KtTGeWuV", "11, B51tJ, Z, j0DWudC, kuJRbcovn, 0T2vnT9")]
|
||||
[InlineData(StringComparison.Ordinal, 5, "Uykbt1zWw7wylEgC", "1zWw7, Bh, 7qDgAY, w, Z, dP, V, W, Hiols, T")]
|
||||
[InlineData(StringComparison.Ordinal, 6, "PI9yZx9AOWrUR", "4, A, MLbg, jACE, x9AZEYPbLr, 4bYTzw, W, 9AOW, O")]
|
||||
[InlineData(StringComparison.Ordinal, 7, "KV4cRyrIGEdt2S9kbXVK", "e64, 10Yw7k, IGEdt2, G, brL, rIGm6i, Z3, FHoVN, 7P2s")]
|
||||
// OrdinalIgnoreCase does not match ASCII chars with non-ASCII ones
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 4, "AAAA\u212ABKBkBBCCCC", "\u212A")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAAKB\u212ABkBBCCCC", "\u212A")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAAkB\u212ABKBBCCCC", "\u212A")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 4, "AAAA\u017FBSBsBBCCCC", "\u017F")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAASB\u017FBsBBCCCC", "\u017F")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAAsB\u017FBSBBCCCC", "\u017F")]
|
||||
// A few misc non-ASCII examples
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 2, "\0\u1226\u2C5F\0\n\0\u1226\u1242", "hh\u0012\uFE00\u26FF\0\u6C00\u2C00\0b, \u2C5F\0")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "barkbarK", "foo, bar\u212A")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 4, "bar\u212AbarK", "foo, bark")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 0, "bar\u03A3barK", "foo, bar\u03C3")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 1, "bar\u03A3barK", "foo, ar\u03C3")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 1, " foo\u0131", "foo\u0131")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, 1, " foo\u0131", "bar, foo\u0131")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "fooifooIfoo\u0130", "bar, foo\u0131")]
|
||||
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "fooifooIfoo\u0131", "bar, foo\u0130")]
|
||||
// Searches `text` for any of the needles in `values` (a ", "-separated list; null means an empty set)
// and checks the result of IndexOfAny / ContainsAny against `expected`.
public static void IndexOfAny(StringComparison comparisonType, int expected, string text, string? values)
{
    // A writable copy, so that the non-readonly Span<char> overloads get exercised too.
    Span<char> mutableText = text.ToCharArray();
    ReadOnlySpan<char> readOnlyText = text.AsSpan();

    string[] needles = values is null ? Array.Empty<string>() : values.Split(", ");
    SearchValues<string> searchValues = SearchValues.Create(needles, comparisonType);

    // The expectation itself is first validated against the reference implementation.
    Assert.Equal(expected, IndexOfAnyReferenceImpl(text, needles, comparisonType));

    Assert.Equal(expected, readOnlyText.IndexOfAny(searchValues));
    Assert.Equal(expected, mutableText.IndexOfAny(searchValues));

    bool expectMatch = expected >= 0;
    Assert.Equal(expectMatch, readOnlyText.ContainsAny(searchValues));
    Assert.Equal(expectMatch, mutableText.ContainsAny(searchValues));
}
|
||||
|
||||
[Fact]
|
||||
// Runs IndexOfAny over inputs containing unpaired surrogates.
public static void IndexOfAny_InvalidUtf16()
{
    const StringComparison Ordinal = StringComparison.Ordinal;
    const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;

    // The cases live in a local table rather than in [InlineData] to prevent Xunit from modifying the invalid strings.
    (StringComparison Comparison, int Expected, string Text, string Values)[] cases =
    {
        // These strings have a high surrogate without the full pair.
        (Ordinal, 1, " foo\uD800bar", "foo\uD800bar, bar\uD800foo"),
        (Ordinal, -1, " foo\uD801bar", "foo\uD800bar, bar\uD800foo"),
        (Ordinal, 2, " foo\uD800bar", "oo\uD800bar, bar\uD800foo"),
        (Ordinal, -1, " foo\uD801bar", "oo\uD800bar, bar\uD800foo"),
        (IgnoreCase, 1, " foo\uD800bar", "foo\uD800bar, bar\uD800foo"),
        (IgnoreCase, -1, " foo\uD801bar", "foo\uD800bar, bar\uD800foo"),
        (IgnoreCase, 2, " foo\uD800bar", "oo\uD800bar, bar\uD800foo"),
        (IgnoreCase, -1, " foo\uD801bar", "oo\uD800bar, bar\uD800foo"),
        (IgnoreCase, 1, " fOo\uD800bar", "Foo\uD800bar, bar\uD800foo"),
        (IgnoreCase, -1, " fOo\uD801bar", "Foo\uD800bar, bar\uD800foo"),
        (IgnoreCase, 2, " foo\uD800bAr", "Oo\uD800bar, bar\uD800foo"),
        (IgnoreCase, -1, " foO\uD801bar", "oo\uD800baR, bar\uD800foo"),

        // Low surrogate without the high surrogate.
        (IgnoreCase, 1, "\uD801\uDCD8\uD8FB\uDCD8", "foo, \uDCD8"),
    };

    foreach ((StringComparison comparison, int expected, string text, string needles) in cases)
    {
        IndexOfAny(comparison, expected, text, needles);
    }
}
|
||||
|
||||
[Fact]
|
||||
// The same three inputs are searched in an invariant-globalization process and in an NLS process.
// Only the expected indices differ: U+16E40 and U+16E60 are expected to match each other under
// OrdinalIgnoreCase in invariant mode, and expected not to match under NLS.
public static void IndexOfAny_CanProduceDifferentResultsUnderNls()
{
    if (CanTestInvariantCulture)
    {
        RunUsingInvariantCulture(static () =>
        {
            const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;

            IndexOfAny(IgnoreCase, 1, " \U00016E40", "\U00016E60");
            IndexOfAny(IgnoreCase, 1, " \U00016E40abc", "\U00016E60, abc");
            IndexOfAny(IgnoreCase, 1, " abc\U00016E40", "abc\U00016E60");
        });
    }

    if (!CanTestNls)
    {
        return;
    }

    RunUsingNLS(static () =>
    {
        const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;

        IndexOfAny(IgnoreCase, -1, " \U00016E40", "\U00016E60");
        IndexOfAny(IgnoreCase, 3, " \U00016E40abc", "\U00016E60, abc");
        IndexOfAny(IgnoreCase, -1, " abc\U00016E40", "abc\U00016E60");
    });
}
|
||||
|
||||
[Fact]
|
||||
// Walks every StringComparison value: the two ordinal modes must be accepted by
// SearchValues.Create, and every other mode must throw ArgumentException.
public static void Create_OnlyOrdinalComparisonIsSupported()
{
    foreach (StringComparison comparisonType in Enum.GetValues<StringComparison>())
    {
        bool isOrdinal =
            comparisonType == StringComparison.Ordinal ||
            comparisonType == StringComparison.OrdinalIgnoreCase;

        if (!isOrdinal)
        {
            Assert.Throws<ArgumentException>(() => SearchValues.Create(new[] { "abc" }, comparisonType));
            continue;
        }

        // Supported mode: creating the instance without an exception is the whole check.
        _ = SearchValues.Create(new[] { "abc" }, comparisonType);
    }
}
|
||||
|
||||
[Fact]
|
||||
// A null entry among otherwise valid values must produce an ArgumentNullException
// whose parameter name is "values".
public static void Create_ThrowsOnNullValues()
{
    Assert.Throws<ArgumentNullException>(
        "values",
        () => SearchValues.Create(new[] { "foo", null, "bar" }, StringComparison.Ordinal));
}
|
||||
|
||||
[Fact]
|
||||
// Compares SearchValues-based IndexOfAny with the reference implementation on random inputs,
// using the helper's default seed and limits.
public static void TestIndexOfAny_RandomInputs()
{
    new StringSearchValuesTestHelper(expected: IndexOfAnyReferenceImpl, searchValues: Search)
        .TestRandomInputs();

    // The implementation under test.
    static int Search(ReadOnlySpan<char> searchSpace, SearchValues<string> values)
    {
        return searchSpace.IndexOfAny(values);
    }
}
|
||||
|
||||
[ConditionalFact(nameof(CanTestInvariantCulture))]
|
||||
// Repeats the random-input test through RunUsingInvariantCulture.
public static void TestIndexOfAny_RandomInputs_InvariantCulture()
{
    RunUsingInvariantCulture(static () =>
    {
        // Confirm that the current culture really is the invariant one before testing.
        const string InvariantNativeName = "Invariant Language (Invariant Country)";
        string currentNativeName = CultureInfo.CurrentCulture.NativeName;
        Assert.Equal(InvariantNativeName, currentNativeName);

        TestIndexOfAny_RandomInputs();
    });
}
|
||||
|
||||
[ConditionalFact(nameof(CanTestNls))]
|
||||
// Repeats the random-input test through RunUsingNLS.
public static void TestIndexOfAny_RandomInputs_Nls()
{
    RunUsingNLS(static () =>
    {
        // The NLS run is not expected to be using the invariant culture.
        const string InvariantNativeName = "Invariant Language (Invariant Country)";
        string currentNativeName = CultureInfo.CurrentCulture.NativeName;
        Assert.NotEqual(InvariantNativeName, currentNativeName);

        TestIndexOfAny_RandomInputs();
    });
}
|
||||
|
||||
[Fact]
|
||||
[ActiveIssue("Manual execution only. Worth running any time SearchValues<string> logic is modified.")]
|
||||
// Long-running randomized comparison against the reference implementation. It runs once in the
// current process and then, where supported, again via RunUsingInvariantCulture and RunUsingNLS.
public static void TestIndexOfAny_RandomInputs_Stress()
{
    RunStress();

    if (CanTestInvariantCulture)
    {
        RunUsingInvariantCulture(static () => RunStress());
    }

    if (CanTestNls)
    {
        RunUsingNLS(static () => RunStress());
    }

    // Stresses every combination of needle count, needle length and haystack length for a
    // fixed time slice each, with a fresh random seed per combination.
    static void RunStress()
    {
        int[] needleCounts = { 2, 8, 20, 100 };
        int[] needleValueLengths = { 8, 40 };
        int[] haystackLengths = { 100, 1024 };

        foreach (int maxNeedleCount in needleCounts)
        {
            foreach (int maxNeedleValueLength in needleValueLengths)
            {
                foreach (int haystackLength in haystackLengths)
                {
                    var helper = new StringSearchValuesTestHelper(
                        IndexOfAnyReferenceImpl,
                        (searchSpace, values) => searchSpace.IndexOfAny(values),
                        Random.Shared.Next());

                    helper.MaxNeedleCount = maxNeedleCount;
                    helper.MaxNeedleValueLength = maxNeedleValueLength;
                    helper.MaxHaystackLength = haystackLength;
                    helper.HaystackIterationsPerNeedle = 1_000;

                    helper.StressRandomInputs(TimeSpan.FromSeconds(5));
                }
            }
        }
    }
}
|
||||
|
||||
// Reference implementation: searches for every value on its own with IndexOf and returns the
// smallest index at which any of them was found, or -1 when none of them occurs.
private static int IndexOfAnyReferenceImpl(ReadOnlySpan<char> searchSpace, ReadOnlySpan<string> values, StringComparison comparisonType)
{
    int earliest = -1;

    for (int v = 0; v < values.Length; v++)
    {
        int index = searchSpace.IndexOf(values[v], comparisonType);

        // Skip misses (negative index); otherwise keep the smallest index seen so far.
        if (index >= 0 && (earliest < 0 || index < earliest))
        {
            earliest = index;
        }
    }

    return earliest;
}
|
||||
|
||||
// Runs `action` through RemoteExecutor with DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=true as the
// only entry in the start info's environment.
private static void RunUsingInvariantCulture(Action action)
{
    Assert.True(CanTestInvariantCulture);

    // Drop everything from the environment block first, so the switch below is its sole entry.
    var startInfo = new ProcessStartInfo();
    startInfo.Environment.Clear();
    startInfo.Environment["DOTNET_SYSTEM_GLOBALIZATION_INVARIANT"] = "true";

    var options = new RemoteInvokeOptions
    {
        StartInfo = startInfo,
        TimeOut = 10 * 60 * 1000, // presumably milliseconds (i.e. 10 minutes) - confirm against RemoteInvokeOptions
    };

    // The handle is disposed as soon as this method returns.
    using var handle = RemoteExecutor.Invoke(action, options);
}
|
||||
|
||||
// Runs `action` through RemoteExecutor with DOTNET_SYSTEM_GLOBALIZATION_USENLS=true as the
// only entry in the start info's environment.
private static void RunUsingNLS(Action action)
{
    Assert.True(CanTestNls);

    // Drop everything from the environment block first, so the switch below is its sole entry.
    var startInfo = new ProcessStartInfo();
    startInfo.Environment.Clear();
    startInfo.Environment["DOTNET_SYSTEM_GLOBALIZATION_USENLS"] = "true";

    var options = new RemoteInvokeOptions
    {
        StartInfo = startInfo,
        TimeOut = 10 * 60 * 1000, // presumably milliseconds (i.e. 10 minutes) - confirm against RemoteInvokeOptions
    };

    // The handle is disposed as soon as this method returns.
    using var handle = RemoteExecutor.Invoke(action, options);
}
|
||||
|
||||
private sealed class StringSearchValuesTestHelper
|
||||
{
|
||||
public delegate int IndexOfAnySearchDelegate(ReadOnlySpan<char> searchSpace, ReadOnlySpan<string> values, StringComparison comparisonType);
|
||||
|
||||
public delegate int SearchValuesSearchDelegate(ReadOnlySpan<char> searchSpace, SearchValues<string> values);
|
||||
|
||||
public int MaxNeedleCount = 20;
|
||||
public int MaxNeedleValueLength = 10;
|
||||
public int MaxHaystackLength = 100;
|
||||
public int HaystackIterationsPerNeedle = 50;
|
||||
public int MinValueLength = 1;
|
||||
|
||||
private readonly IndexOfAnySearchDelegate _expectedDelegate;
|
||||
private readonly SearchValuesSearchDelegate _searchValuesDelegate;
|
||||
|
||||
private readonly char[] _randomAsciiChars;
|
||||
private readonly char[] _randomSimpleAsciiChars;
|
||||
private readonly char[] _randomChars;
|
||||
|
||||
// Stores the two implementations to compare and pre-generates three pools of random characters
// (full ASCII range, ASCII letters and digits only, arbitrary UTF-16 code units).
// All pools are drawn from one Random seeded with `rngSeed`, in a fixed order.
public StringSearchValuesTestHelper(IndexOfAnySearchDelegate expected, SearchValuesSearchDelegate searchValues, int rngSeed = 42)
{
    const int AsciiPoolLength = 100 * 1024;
    const int CharPoolLength = 1024 * 1024;

    _expectedDelegate = expected;
    _searchValuesDelegate = searchValues;

    _randomAsciiChars = new char[AsciiPoolLength];
    _randomSimpleAsciiChars = new char[AsciiPoolLength];
    _randomChars = new char[CharPoolLength];

    var rng = new Random(rngSeed);

    // Pool 1: any char in [0, 128).
    for (int i = 0; i < _randomAsciiChars.Length; i++)
    {
        _randomAsciiChars[i] = (char)rng.Next(0, 128);
    }

    // Pool 2: map a value in [0, 62) onto '0'-'9', then 'a'-'z', then 'A'-'Z'.
    for (int i = 0; i < _randomSimpleAsciiChars.Length; i++)
    {
        int pick = rng.Next(26 * 2 + 10);

        int offset;
        if (pick < 10)
        {
            offset = '0';
        }
        else if (pick < 36)
        {
            offset = 'a' - 10;
        }
        else
        {
            offset = 'A' - 36;
        }

        _randomSimpleAsciiChars[i] = (char)(pick + offset);
    }

    // Pool 3: fill the raw bytes of the char buffer with random data.
    Span<byte> rawCharBytes = MemoryMarshal.Cast<char, byte>(_randomChars);
    rng.NextBytes(rawCharBytes);
}
|
||||
|
||||
// Runs TestRandomInputs repeatedly on several workers in parallel until `duration` has elapsed
// or one of the workers observes a failure. The first captured failure is rethrown on the
// calling thread once all workers have stopped.
public void StressRandomInputs(TimeSpan duration)
{
    ExceptionDispatchInfo? exception = null;
    Stopwatch s = Stopwatch.StartNew();

    // One worker per processor minus one, but never fewer than one: with a single processor
    // "ProcessorCount - 1" is 0, and Parallel.For(0, 0, ...) would run nothing at all.
    int workerCount = Math.Max(1, Environment.ProcessorCount - 1);

    Parallel.For(0, workerCount, _ =>
    {
        while (s.Elapsed < duration && Volatile.Read(ref exception) is null)
        {
            try
            {
                // Each iteration uses its own unseeded Random.
                TestRandomInputs(iterationCount: 1, rng: new Random());
            }
            catch (Exception ex)
            {
                // Publish atomically and keep only the first failure; later ones on other workers are dropped.
                Interlocked.CompareExchange(ref exception, ExceptionDispatchInfo.Capture(ex), null);
            }
        }
    });

    exception?.Throw();
}
|
||||
|
||||
// Runs `iterationCount` rounds; each round tests every haystack-pool / needle-pool combination
// listed below, in order. When no `rng` is supplied, a Random seeded with 42 is used.
public void TestRandomInputs(int iterationCount = 1_000, Random? rng = null)
{
    if (rng is null)
    {
        rng = new Random(42);
    }

    // There are more interesting corner cases with ASCII needles, test those more.
    (char[] Haystack, char[] Needle)[] combinations =
    {
        (_randomSimpleAsciiChars, _randomSimpleAsciiChars),
        (_randomAsciiChars, _randomSimpleAsciiChars),
        (_randomSimpleAsciiChars, _randomAsciiChars),
        (_randomAsciiChars, _randomAsciiChars),
        (_randomChars, _randomSimpleAsciiChars),
        (_randomChars, _randomAsciiChars),

        (_randomChars, _randomChars),
    };

    for (int remaining = iterationCount; remaining > 0; remaining--)
    {
        foreach ((char[] haystackPool, char[] needlePool) in combinations)
        {
            Test(rng, haystackPool, needlePool);
        }
    }
}
|
||||
|
||||
// Builds a random set of needles from `needleRandom`, creates Ordinal and OrdinalIgnoreCase
// SearchValues for it, and checks both against HaystackIterationsPerNeedle random haystacks.
private void Test(Random rng, ReadOnlySpan<char> haystackRandom, ReadOnlySpan<char> needleRandom)
{
    // Between 1 and MaxNeedleCount needles.
    int needleCount = rng.Next(MaxNeedleCount) + 1;
    var values = new string[needleCount];

    for (int i = 0; i < needleCount; i++)
    {
        // Keep drawing slices until one reaches the minimum length.
        ReadOnlySpan<char> candidate = GetRandomSlice(rng, needleRandom, MaxNeedleValueLength);
        while (candidate.Length < MinValueLength)
        {
            candidate = GetRandomSlice(rng, needleRandom, MaxNeedleValueLength);
        }

        values[i] = candidate.ToString();
    }

    SearchValues<string> ordinalValues = SearchValues.Create(values, StringComparison.Ordinal);
    SearchValues<string> ignoreCaseValues = SearchValues.Create(values, StringComparison.OrdinalIgnoreCase);

    for (int round = 0; round < HaystackIterationsPerNeedle; round++)
    {
        Test(rng, StringComparison.Ordinal, haystackRandom, values, ordinalValues);
        Test(rng, StringComparison.OrdinalIgnoreCase, haystackRandom, values, ignoreCaseValues);
    }
}
|
||||
|
||||
// Picks one random haystack, runs both the expected and the SearchValues delegate over it,
// and reports a detailed failure when their results differ.
private void Test(Random rng, StringComparison comparisonType, ReadOnlySpan<char> haystackRandom,
    string[] needle, SearchValues<string> searchValuesInstance)
{
    ReadOnlySpan<char> haystack = GetRandomSlice(rng, haystackRandom, MaxHaystackLength);

    int expected = _expectedDelegate(haystack, needle, comparisonType);
    int actual = _searchValuesDelegate(haystack, searchValuesInstance);

    if (expected == actual)
    {
        return;
    }

    AssertionFailed(haystack, needle, searchValuesInstance, comparisonType, expected, actual);
}
|
||||
|
||||
// Returns a random contiguous piece of `span`: a random start offset in [0, span.Length] and a
// random length in [0, maxLength], truncated to whatever remains after the start offset.
private static ReadOnlySpan<T> GetRandomSlice<T>(Random rng, ReadOnlySpan<T> span, int maxLength)
{
    // The two draws happen in this order: start offset first, then requested length.
    int start = rng.Next(span.Length + 1);
    int requestedLength = rng.Next(maxLength + 1);

    int available = span.Length - start;
    int length = Math.Min(available, requestedLength);

    return span.Slice(start, length);
}
|
||||
|
||||
// Fails the test with a message naming the SearchValues implementation type, the comparison,
// and a printable rendering of the needle set and the haystack.
private static void AssertionFailed(ReadOnlySpan<char> haystack, string[] needle, SearchValues<string> searchValues, StringComparison comparisonType, int expected, int actual)
{
    Type implType = searchValues.GetType();
    var genericArgumentNames = implType.GenericTypeArguments.Select(t => t.Name);
    string impl = $"{implType.Name} [{string.Join(", ", genericArgumentNames)}]";

    string readableHaystack = Describe(haystack.ToString());
    string readableNeedle = string.Join(", ", needle.Select(Describe));

    Assert.True(false, $"Expected {expected}, got {actual} for impl='{impl}' comparison={comparisonType} needle='{readableNeedle}', haystack='{readableHaystack}'");

    // Text made only of ASCII letters and digits is shown as-is; anything else is shown as
    // the list of its UTF-16 code unit values.
    static string Describe(string value)
    {
        if (value.All(char.IsAsciiLetterOrDigit))
        {
            return value;
        }

        return $"[ {string.Join(", ", value.Select(ch => (int)ch))} ]";
    }
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -18,14 +18,13 @@
|
|||
<Compile Include="MemoryMarshal\CreateSpan.cs" />
|
||||
<Compile Include="MemoryMarshal\CreateReadOnlySpan.cs" />
|
||||
<Compile Include="MemoryMarshal\CreateReadOnlySpanFromNullTerminated.cs" />
|
||||
<Compile Include="$(CommonPath)..\tests\System\RealFormatterTestsBase.cs"
|
||||
Link="ParsersAndFormatters\Formatter\RealFormatterTestsBase.cs" />
|
||||
<Compile Include="$(CommonPath)..\tests\System\RealFormatterTestsBase.cs" Link="ParsersAndFormatters\Formatter\RealFormatterTestsBase.cs" />
|
||||
<Compile Include="ParsersAndFormatters\Formatter\RealFormatterTests.cs" />
|
||||
<Compile Include="$(CommonPath)..\tests\System\RealParserTestsBase.cs"
|
||||
Link="ParsersAndFormatters\Parser\RealParserTestsBase.cs" />
|
||||
<Compile Include="$(CommonPath)..\tests\System\RealParserTestsBase.cs" Link="ParsersAndFormatters\Parser\RealParserTestsBase.cs" />
|
||||
<Compile Include="ParsersAndFormatters\Parser\RealParserTests.cs" />
|
||||
<Compile Include="ReadOnlySpan\Contains.byte.cs" />
|
||||
<Compile Include="ReadOnlySpan\Contains.T.cs" />
|
||||
<Compile Include="Span\StringSearchValues.cs" />
|
||||
<Compile Include="Span\Reflection.cs" />
|
||||
<Compile Include="SequenceReader\Advance.cs" />
|
||||
<Compile Include="SequenceReader\BasicTests.cs" />
|
||||
|
@ -276,9 +275,7 @@
|
|||
<Compile Include="Base64\Base64ValidationUnitTests.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Compile Include="$(CommonTestPath)System\Buffers\NativeMemoryManager.cs"
|
||||
Link="Common\System\Buffers\NativeMemoryManager.cs" />
|
||||
<Compile Include="$(CommonPath)System\MutableDecimal.cs"
|
||||
Link="Common\System\MutableDecimal.cs" />
|
||||
<Compile Include="$(CommonTestPath)System\Buffers\NativeMemoryManager.cs" Link="Common\System\Buffers\NativeMemoryManager.cs" />
|
||||
<Compile Include="$(CommonPath)System\MutableDecimal.cs" Link="Common\System\MutableDecimal.cs" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
|
|
|
@ -4250,4 +4250,7 @@
|
|||
<data name="OutOfMemory_StringTooLong" xml:space="preserve">
|
||||
<value>String length exceeded supported range.</value>
|
||||
</data>
|
||||
<data name="Argument_SearchValues_UnsupportedStringComparison" xml:space="preserve">
|
||||
<value>SearchValues&lt;string&gt; supports only StringComparison.Ordinal and StringComparison.OrdinalIgnoreCase.</value>
|
||||
</data>
|
||||
</root>
|
||||
|
|
|
@ -440,6 +440,27 @@
|
|||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\SearchValuesDebugView.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\EmptySearchValues.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\ProbabilisticMap.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\AhoCorasick.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\AhoCorasickBuilder.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\AhoCorasickNode.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\CharacterFrequencyHelper.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\EightPackedReferences.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\RabinKarp.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\StringSearchValuesHelper.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\TeddyBucketizer.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\TeddyHelper.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBucketizedN2.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBucketizedN3.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyNonBucketizedN2.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyNonBucketizedN3.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBase.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\MultiStringIgnoreCaseSearchValuesFallback.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesThreeChars.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesFallback.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValues.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesBase.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesAhoCorasick.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesRabinKarp.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\IndexOutOfRangeException.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\InsufficientExecutionStackException.cs" />
|
||||
<Compile Include="$(MSBuildThisFileDirectory)System\InsufficientMemoryException.cs" />
|
||||
|
|
|
@ -16,6 +16,9 @@ namespace System.Globalization
|
|||
Debug.Assert(char.IsLowSurrogate(l));
|
||||
|
||||
UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(h, l)), out hr, out lr);
|
||||
|
||||
Debug.Assert(char.IsHighSurrogate(hr));
|
||||
Debug.Assert(char.IsLowSurrogate(lr));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
@ -25,6 +28,9 @@ namespace System.Globalization
|
|||
Debug.Assert(char.IsLowSurrogate(l));
|
||||
|
||||
UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(CharUnicodeInfo.ToLower(UnicodeUtility.GetScalarFromUtf16SurrogatePair(h, l)), out hr, out lr);
|
||||
|
||||
Debug.Assert(char.IsHighSurrogate(hr));
|
||||
Debug.Assert(char.IsLowSurrogate(lr));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
|
|
@ -190,6 +190,24 @@ namespace System.Globalization
|
|||
return dst;
|
||||
}
|
||||
|
||||
/// <summary>
/// Converts a single UTF-16 code unit to upper case, picking the casing routine from the current globalization mode.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>
/// The result of <c>InvariantModeCasing.ToUpper</c> when <c>GlobalizationMode.Invariant</c> is set; otherwise, when
/// <c>GlobalizationMode.UseNls</c> is set, the result of <c>ToUpperAsciiInvariant</c> for ASCII input or
/// <c>Invariant.ChangeCase</c> for non-ASCII input; otherwise the result of <c>OrdinalCasing.ToUpper</c>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static char ToUpperOrdinal(char c)
|
||||
{
|
||||
// Invariant globalization mode is checked first and has its own casing helper.
if (GlobalizationMode.Invariant)
|
||||
{
|
||||
return InvariantModeCasing.ToUpper(c);
|
||||
}
|
||||
|
||||
// Under NLS, ASCII characters take the ASCII-only helper; everything else goes through Invariant.ChangeCase.
if (GlobalizationMode.UseNls)
|
||||
{
|
||||
return char.IsAscii(c)
|
||||
? ToUpperAsciiInvariant(c)
|
||||
: Invariant.ChangeCase(c, toUpper: true);
|
||||
}
|
||||
|
||||
// Neither Invariant nor NLS: defer to OrdinalCasing.
return OrdinalCasing.ToUpper(c);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal void ChangeCaseToLower(ReadOnlySpan<char> source, Span<char> destination)
|
||||
{
|
||||
|
@ -436,7 +454,7 @@ namespace System.Globalization
|
|||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
private static char ToUpperAsciiInvariant(char c)
|
||||
internal static char ToUpperAsciiInvariant(char c)
|
||||
{
|
||||
if (char.IsAsciiLetterLower(c))
|
||||
{
|
||||
|
|
|
@ -416,6 +416,11 @@ namespace System
|
|||
public static bool ContainsAny<T>(this Span<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
ContainsAny((ReadOnlySpan<T>)span, values);
|
||||
|
||||
/// <inheritdoc cref="ContainsAny(ReadOnlySpan{char}, SearchValues{string})"/>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool ContainsAny(this Span<char> span, SearchValues<string> values) =>
|
||||
ContainsAny((ReadOnlySpan<char>)span, values);
|
||||
|
||||
/// <inheritdoc cref="ContainsAnyExcept{T}(ReadOnlySpan{T}, T)"/>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool ContainsAnyExcept<T>(this Span<T> span, T value) where T : IEquatable<T>? =>
|
||||
|
@ -452,7 +457,7 @@ namespace System
|
|||
ContainsAnyExceptInRange((ReadOnlySpan<T>)span, lowInclusive, highInclusive);
|
||||
|
||||
/// <summary>
|
||||
/// Searches for any occurance of the specified <paramref name="value0"/> or <paramref name="value1"/>, and returns true if found. If not found, returns false.
|
||||
/// Searches for any occurrence of the specified <paramref name="value0"/> or <paramref name="value1"/>, and returns true if found. If not found, returns false.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="value0">One of the values to search for.</param>
|
||||
|
@ -462,7 +467,7 @@ namespace System
|
|||
IndexOfAny(span, value0, value1) >= 0;
|
||||
|
||||
/// <summary>
|
||||
/// Searches for any occurance of the specified <paramref name="value0"/>, <paramref name="value1"/>, or <paramref name="value2"/>, and returns true if found. If not found, returns false.
|
||||
/// Searches for any occurrence of the specified <paramref name="value0"/>, <paramref name="value1"/>, or <paramref name="value2"/>, and returns true if found. If not found, returns false.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="value0">One of the values to search for.</param>
|
||||
|
@ -473,7 +478,7 @@ namespace System
|
|||
IndexOfAny(span, value0, value1, value2) >= 0;
|
||||
|
||||
/// <summary>
|
||||
/// Searches for any occurance of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
|
||||
/// Searches for any occurrence of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
|
@ -482,7 +487,7 @@ namespace System
|
|||
IndexOfAny(span, values) >= 0;
|
||||
|
||||
/// <summary>
|
||||
/// Searches for any occurance of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
|
||||
/// Searches for any occurrence of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
|
@ -490,6 +495,15 @@ namespace System
|
|||
public static bool ContainsAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
IndexOfAny(span, values) >= 0;
|
||||
|
||||
/// <summary>
|
||||
/// Searches for any occurrence of any of the specified substring <paramref name="values"/> and returns true if found. If not found, returns false.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool ContainsAny(this ReadOnlySpan<char> span, SearchValues<string> values) =>
|
||||
IndexOfAny(span, values) >= 0;
|
||||
|
||||
/// <summary>
|
||||
/// Searches for any value other than the specified <paramref name="value"/>.
|
||||
/// </summary>
|
||||
|
@ -1021,8 +1035,15 @@ namespace System
|
|||
/// If all of the values are in <paramref name="values"/>, returns -1.
|
||||
/// </returns>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int IndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
SearchValues<T>.IndexOfAnyExcept(span, values);
|
||||
public static int IndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.IndexOfAnyExcept(span);
|
||||
}
|
||||
|
||||
/// <summary>Searches for the last index of any value other than the specified <paramref name="value"/>.</summary>
|
||||
/// <typeparam name="T">The type of the span and values.</typeparam>
|
||||
|
@ -1324,8 +1345,15 @@ namespace System
|
|||
/// If all of the values are in <paramref name="values"/>, returns -1.
|
||||
/// </returns>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int LastIndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
SearchValues<T>.LastIndexOfAnyExcept(span, values);
|
||||
public static int LastIndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.LastIndexOfAnyExcept(span);
|
||||
}
|
||||
|
||||
/// <inheritdoc cref="IndexOfAnyInRange{T}(ReadOnlySpan{T}, T, T)"/>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
@ -1872,6 +1900,15 @@ namespace System
|
|||
public static int IndexOfAny<T>(this Span<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
IndexOfAny((ReadOnlySpan<T>)span, values);
|
||||
|
||||
/// <summary>
|
||||
/// Searches for the first index of any of the specified substring values.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int IndexOfAny(this Span<char> span, SearchValues<string> values) =>
|
||||
IndexOfAny((ReadOnlySpan<char>)span, values);
|
||||
|
||||
/// <summary>
|
||||
/// Searches for the first index of any of the specified values similar to calling IndexOf several times with the logical OR operator. If not found, returns -1.
|
||||
/// </summary>
|
||||
|
@ -2058,8 +2095,31 @@ namespace System
|
|||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int IndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
SearchValues<T>.IndexOfAny(span, values);
|
||||
public static int IndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.IndexOfAny(span);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Searches for the first index of any of the specified substring values.
|
||||
/// </summary>
|
||||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
/// <returns>The index reported by the multi-string search of <paramref name="values"/> over <paramref name="span"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="values"/> is <see langword="null"/>.</exception>
// NOTE(review): a failed search presumably yields -1, like the other IndexOfAny overloads - confirm against the
// SearchValues<string> implementations of IndexOfAnyMultiString.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int IndexOfAny(this ReadOnlySpan<char> span, SearchValues<string> values)
|
||||
{
|
||||
// Reject a null 'values' explicitly through the shared throw helper before dispatching on it.
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
// Substring searching uses the dedicated IndexOfAnyMultiString entry point rather than IndexOfAny.
return values.IndexOfAnyMultiString(span);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Searches for the last index of any of the specified values similar to calling LastIndexOf several times with the logical OR operator. If not found, returns -1.
|
||||
|
@ -2332,8 +2392,15 @@ namespace System
|
|||
/// <param name="span">The span to search.</param>
|
||||
/// <param name="values">The set of values to search for.</param>
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static int LastIndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
|
||||
SearchValues<T>.LastIndexOfAny(span, values);
|
||||
public static int LastIndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.LastIndexOfAny(span);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether two sequences are equal by comparing the elements using IEquatable{T}.Equals(T).
|
||||
|
|
|
@ -23,5 +23,8 @@ namespace System.Buffers
|
|||
|
||||
internal override int LastIndexOfAnyExcept(ReadOnlySpan<T> span) =>
|
||||
span.Length - 1;
|
||||
|
||||
// Always reports -1 (no match), regardless of the contents of 'span'.
internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span) =>
|
||||
-1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,11 @@ namespace System.Buffers
|
|||
{
|
||||
internal static bool IsVectorizationSupported => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported;
|
||||
|
||||
/// <summary>
/// Tests whether the character <paramref name="c"/> is flagged in <paramref name="bitmap"/>.
/// </summary>
/// <param name="bitmap">The bitmap to probe; the byte at index <c>c &amp; 0xF</c> is read.</param>
/// <param name="c">The character to look up. Characters above 127 always yield <see langword="false"/>.</param>
/// <returns>
/// <see langword="true"/> if <paramref name="c"/> is at most 127 and bit <c>c &gt;&gt; 4</c> is set in the byte
/// selected by the low nibble of <paramref name="c"/>; otherwise <see langword="false"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static bool BitmapContains(ref Vector256<byte> bitmap, char c) =>
|
||||
// The range check short-circuits, so the element read below only happens for c in [0, 127].
c <= 127 &&
|
||||
// Low nibble picks the byte, high nibble (0-7 for ASCII) picks the bit inside that byte.
(bitmap.GetElementUnsafe(c & 0xF) & (1 << (c >> 4))) != 0;
|
||||
|
||||
internal static unsafe void ComputeBitmap256(ReadOnlySpan<byte> values, out Vector256<byte> bitmap0, out Vector256<byte> bitmap1, out BitVector256 lookup)
|
||||
{
|
||||
// The exact format of these bitmaps differs from the other ComputeBitmap overloads as it's meant for the full [0, 255] range algorithm.
|
||||
|
@ -1022,7 +1027,7 @@ namespace System.Buffers
|
|||
{
|
||||
if (typeof(T) == typeof(short))
|
||||
{
|
||||
result = FixUpPackedVector256Result(result);
|
||||
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
|
||||
}
|
||||
|
||||
uint mask = TNegator.ExtractMask(result);
|
||||
|
@ -1038,7 +1043,7 @@ namespace System.Buffers
|
|||
{
|
||||
if (typeof(T) == typeof(short))
|
||||
{
|
||||
result = FixUpPackedVector256Result(result);
|
||||
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
|
||||
}
|
||||
|
||||
uint mask = TNegator.ExtractMask(result);
|
||||
|
@ -1060,7 +1065,7 @@ namespace System.Buffers
|
|||
{
|
||||
if (typeof(T) == typeof(short))
|
||||
{
|
||||
result = FixUpPackedVector256Result(result);
|
||||
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
|
||||
}
|
||||
|
||||
uint mask = TNegator.ExtractMask(result);
|
||||
|
@ -1076,7 +1081,7 @@ namespace System.Buffers
|
|||
{
|
||||
if (typeof(T) == typeof(short))
|
||||
{
|
||||
result = FixUpPackedVector256Result(result);
|
||||
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
|
||||
}
|
||||
|
||||
uint mask = TNegator.ExtractMask(result);
|
||||
|
@ -1091,18 +1096,6 @@ namespace System.Buffers
|
|||
return offsetInVector - Vector256<short>.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
[CompExactlyDependsOn(typeof(Avx2))]
|
||||
private static Vector256<byte> FixUpPackedVector256Result(Vector256<byte> result)
|
||||
{
|
||||
Debug.Assert(Avx2.IsSupported);
|
||||
// Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in
|
||||
// 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
|
||||
// We want to swap the X and Y bits
|
||||
// 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2
|
||||
return Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte();
|
||||
}
|
||||
|
||||
internal interface INegator
|
||||
{
|
||||
static abstract bool NegateIfNeeded(bool result);
|
||||
|
|
|
@ -365,8 +365,7 @@ namespace System.Buffers
|
|||
|
||||
if (result != Vector256<byte>.Zero)
|
||||
{
|
||||
// Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate).
|
||||
result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte();
|
||||
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
|
||||
|
||||
uint mask = result.ExtractMostSignificantBits();
|
||||
do
|
||||
|
|
|
@ -8,7 +8,8 @@ namespace System.Buffers
|
|||
{
|
||||
/// <summary>
|
||||
/// Provides an immutable, read-only set of values optimized for efficient searching.
|
||||
/// Instances are created by <see cref="SearchValues.Create(ReadOnlySpan{byte})"/> or <see cref="SearchValues.Create(ReadOnlySpan{char})"/>.
|
||||
/// Instances are created by <see cref="SearchValues.Create(ReadOnlySpan{byte})"/>, <see cref="SearchValues.Create(ReadOnlySpan{char})"/>, or
|
||||
/// <see cref="SearchValues.Create(ReadOnlySpan{string}, StringComparison)"/>.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">The type of the values to search for.</typeparam>
|
||||
/// <remarks>
|
||||
|
@ -38,49 +39,8 @@ namespace System.Buffers
|
|||
internal virtual int LastIndexOfAny(ReadOnlySpan<T> span) => throw new UnreachableException();
|
||||
internal virtual int LastIndexOfAnyExcept(ReadOnlySpan<T> span) => throw new UnreachableException();
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static int IndexOfAny(ReadOnlySpan<T> span, SearchValues<T> values)
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.IndexOfAny(span);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static int IndexOfAnyExcept(ReadOnlySpan<T> span, SearchValues<T> values)
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.IndexOfAnyExcept(span);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static int LastIndexOfAny(ReadOnlySpan<T> span, SearchValues<T> values)
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.LastIndexOfAny(span);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
internal static int LastIndexOfAnyExcept(ReadOnlySpan<T> span, SearchValues<T> values)
|
||||
{
|
||||
if (values is null)
|
||||
{
|
||||
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
|
||||
}
|
||||
|
||||
return values.LastIndexOfAnyExcept(span);
|
||||
}
|
||||
// This is only implemented and used by SearchValues<string>.
|
||||
internal virtual int IndexOfAnyMultiString(ReadOnlySpan<char> span) => throw new UnreachableException();
|
||||
|
||||
private string DebuggerDisplay
|
||||
{
|
||||
|
|
|
@ -10,8 +10,6 @@ using System.Runtime.Intrinsics.Arm;
|
|||
using System.Runtime.Intrinsics.Wasm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
#pragma warning disable 8500 // address of managed types
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
|
||||
|
@ -167,6 +165,22 @@ namespace System.Buffers
|
|||
return new ProbabilisticCharSearchValues(probabilisticValues);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates an optimized representation of <paramref name="values"/> used for efficient searching.
|
||||
/// <para>Only <see cref="StringComparison.Ordinal"/> or <see cref="StringComparison.OrdinalIgnoreCase"/> may be used.</para>
|
||||
/// </summary>
|
||||
/// <param name="values">The set of values.</param>
|
||||
/// <param name="comparisonType">Specifies whether to use <see cref="StringComparison.Ordinal"/> or <see cref="StringComparison.OrdinalIgnoreCase"/> search semantics.</param>
|
||||
/// <returns>The instance produced by <c>StringSearchValues.Create</c> for <paramref name="values"/> with the requested case sensitivity.</returns>
/// <exception cref="ArgumentException"><paramref name="comparisonType"/> is neither <see cref="StringComparison.Ordinal"/> nor <see cref="StringComparison.OrdinalIgnoreCase"/>.</exception>
public static SearchValues<string> Create(ReadOnlySpan<string> values, StringComparison comparisonType)
|
||||
{
|
||||
// Any comparison other than the two ordinal modes is rejected up front.
if (comparisonType is not (StringComparison.Ordinal or StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
throw new ArgumentException(SR.Argument_SearchValues_UnsupportedStringComparison, nameof(comparisonType));
|
||||
}
|
||||
|
||||
// After validation only two modes remain, so the choice collapses to a single ignoreCase flag.
return StringSearchValues.Create(values, ignoreCase: comparisonType == StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
private static bool TryGetSingleRange<T>(ReadOnlySpan<T> values, out T minInclusive, out T maxInclusive)
|
||||
where T : struct, INumber<T>, IMinMaxValue<T>
|
||||
{
|
||||
|
@ -211,12 +225,12 @@ namespace System.Buffers
|
|||
static abstract bool Value { get; }
|
||||
}
|
||||
|
||||
private readonly struct TrueConst : IRuntimeConst
|
||||
internal readonly struct TrueConst : IRuntimeConst
|
||||
{
|
||||
public static bool Value => true;
|
||||
}
|
||||
|
||||
private readonly struct FalseConst : IRuntimeConst
|
||||
internal readonly struct FalseConst : IRuntimeConst
|
||||
{
|
||||
public static bool Value => false;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,674 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Numerics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
using static System.Buffers.StringSearchValuesHelper;
|
||||
using static System.Buffers.TeddyHelper;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
// This is an implementation of the "Teddy" vectorized multi-substring matching algorithm.
|
||||
//
|
||||
// We have several vectorized string searching approaches implemented as part of SearchValues, among them are:
|
||||
// - 'IndexOfAnyAsciiSearcher', which can quickly find the next position of any character in a set.
|
||||
// - 'SingleStringSearchValuesThreeChars', which can determine the likely positions where a value may start.
|
||||
// The fast scan for starting positions is followed by a verification step that rules out false positives.
|
||||
// To reduce the number of false positives, the initial scan looks for multiple characters at different positions,
|
||||
// and only considers candidates where all of those match at the same time.
|
||||
//
|
||||
// Teddy combines the two to search for multiple values at the same time.
|
||||
// Similar to 'SingleStringSearchValuesThreeChars', it employs the starting positions scan and verification steps.
|
||||
// To reduce the number of values we have to check during verification, it also checks multiple characters in the initial scan.
|
||||
// We could implement that by just merging the two approaches: check for any of the value characters at position 0, 1, 2, then
|
||||
// AND those results together and verify potential matches. The issue with this approach is that we would always have to check
|
||||
// all values in the verification step, and we would be hitting many false positives as the number of values increased.
|
||||
// For example, if you are searching for "Teddy" and "Bear", position 0 could be either 'T' or 'B', position 1 could be 'e',
|
||||
// and position 2 could be 'd' or 'a'. We would do separate comparisons for each of those positions and then AND together the result.
|
||||
// Because there is no correlation between the values, we would get false positives for inputs like "Bed" and "Tea",
|
||||
// and we wouldn't know whether the match location was because of "Teddy" or "Bear", and thus which to proceed to verify.
|
||||
//
|
||||
// What is special about Teddy is how we perform that initial scan to not only determine the possible starting locations,
|
||||
// but also which values are the potential matches at each of those offsets.
|
||||
// Instead of encoding all starting characters at a given position into a bitmap that can only answer yes/no whether a given
|
||||
// character is present in the set, we want to encode both the character and the values in which it appears.
|
||||
// We only have 128* bits to work with, so we do this by encoding 8 bits of information for each nibble (half byte).
|
||||
// Those 8 bits represent a bitmask of values that contain that nibble at that location.
|
||||
// If we compare the input against two such bitmaps and AND the results together, we can determine which positions in the input
|
||||
// contained a matching character, and which of our values matched said character at that position.
|
||||
// We repeat this a few more times (checking 3 bytes or 6 nibbles for N=3) at different offsets to reduce the number of false positives.
|
||||
// See 'TeddyBucketizer.GenerateNonBucketizedFingerprint' for details around how such a bitmap is constructed.
|
||||
//
|
||||
// For example if we are searching for strings "Teddy" and "Bear", we will look for 'T' or 'B' at position 0, 'e' at position 1, ...
|
||||
// To look for 'T' (0x54) or 'B' (0x42), we will check for a high nibble of 5 or 4, and lower nibble of 4 or 2.
|
||||
// Each value's presence is indicated by 1 bit. We will use 1 (0b00000001) for the first value ("Teddy") and 2 (0b00000010) for "Bear".
|
||||
// Our bitmaps will look like so (1 is set for high 5 and low 4, 2 is set for high 4 and low 2):
|
||||
// bitmapHigh: [0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
// bitmapLow: [0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
// ^ ^ ^ ^
|
||||
//
|
||||
// To map an input nibble to its corresponding bitmask, we use 'Shuffle(bitmap, nibble)'.
|
||||
// For an input like "TeddyBearFactory", our result will be
|
||||
// input: [T, e, d, d, y, B, e, a, r, F, a, c, t, o, r, y]
|
||||
// inputHigh: [5, 6, 6, 6, 7, 4, 6, 6, 7, 4, 6, 6, 7, 6, 7, 7] (values in hex)
|
||||
// inputLow: [4, 5, 4, 4, 9, 2, 5, 1, 2, 6, 1, 3, 4, F, 2, 9] (values in hex)
|
||||
// resultHigh: [1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0]
|
||||
// resultLow: [1, 0, 1, 1, 0, 2, 0, 0, 2, 0, 0, 0, 1, 0, 2, 0]
|
||||
// result: [1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] (resultHigh & resultLow)
|
||||
// ^ ^
|
||||
// Note how we had quite a few false positives for individual nibbles that we ruled away after checking both nibbles.
|
||||
// See 'TeddyHelper.ProcessInputN3' for details about how we combine results for multiple characters at different offsets.
|
||||
//
|
||||
// The description above states that we can only encode the information about 8 values. To get around that limitation
|
||||
// we group multiple values together into buckets. Instead of looking for positions where a single value may match,
|
||||
// we look for positions where any value from a given bucket may match.
|
||||
// When creating the bitmap we don't set the bit for just one nibble value, but for each of the values in that bucket.
|
||||
// For example if "Teddy" and "Bear" were both in the same bucket, the high nibble bitmap would map both 5 and 4 to the same bucket.
|
||||
// We may see more false positives ('R' (0x52) and 'D' (0x44) would now also map to the same bucket), but we get to search for
|
||||
// many more values at the same time. Instead of 8 values, we are now capable of looking for 8 buckets of values at the same time.
|
||||
// See 'TeddyBucketizer.Bucketize' for details about how values are grouped into buckets.
|
||||
// See 'TeddyBucketizer.GenerateBucketizedFingerprint' for details around how such a bitmap is constructed.
|
||||
//
|
||||
// Teddy works in terms of bytes, but .NET chars represent UTF-16 code units.
|
||||
// We currently only use Teddy if the 2 or 3 starting characters are all ASCII. This limitation could be lifted in the future if needed.
|
||||
// Since we know that all of the characters we are looking for are ASCII, we also know that only other ASCII characters will match against them.
|
||||
// Making use of that fact, we narrow UTF-16 code units into bytes when reading the input (see 'TeddyHelper.LoadAndPack16AsciiChars').
|
||||
// While such narrowing does corrupt non-ASCII values, they are all mapped to values outside of ASCII, so they won't match anyway.
|
||||
// ASCII values remain unaffected since their high byte in UTF-16 representation is 0.
|
||||
//
|
||||
// To handle case-insensitive matching, all values are normalized to their uppercase equivalents ahead of time and the bitmaps are
|
||||
// generated as if all characters were uppercase. During the search, the input is also transformed into uppercase before being compared.
|
||||
//
|
||||
// * With wider vectors (256- and 512-bit), we have more bits available, but we currently only duplicate the original 128 bits
|
||||
// and perform the search on more characters at a time. We could instead choose to encode more information per nibble to trade
|
||||
// the number of characters we check per loop iteration for fewer false positives we then have to rule out during the verification step.
|
||||
//
|
||||
// For an alternative description of the algorithm, see
|
||||
// https://github.com/BurntSushi/aho-corasick/blob/8d735471fc12f0ca570cead8e17342274fae6331/src/packed/teddy/README.md
|
||||
// Has an O(i * m) worst-case, with the expected time closer to O(i) for good bucket distributions.
|
||||
internal abstract class AsciiStringSearchValuesTeddyBase<TBucketized, TStartCaseSensitivity, TCaseSensitivity> : StringSearchValuesRabinKarp<TCaseSensitivity>
|
||||
where TBucketized : struct, SearchValues.IRuntimeConst
|
||||
where TStartCaseSensitivity : struct, ICaseSensitivity // Refers to the characters being matched by Teddy
|
||||
where TCaseSensitivity : struct, ICaseSensitivity // Refers to the rest of the value for the verification step
|
||||
{
|
||||
// We may be using N2 or N3 mode depending on whether we're checking 2 or 3 starting bytes for each bucket.
|
||||
// The results of ProcessInputN2 and ProcessInputN3 are offset by 1 and 2 positions, respectively (MatchStartOffsetN2 and MatchStartOffsetN3).
|
||||
// See the full description of TeddyHelper.ProcessInputN3 for more details about why these constants exist.
|
||||
private const int MatchStartOffsetN2 = 1;
|
||||
private const int MatchStartOffsetN3 = 2;
|
||||
private const int CharsPerIterationVector128 = 16;
|
||||
private const int CharsPerIterationAvx2 = 32;
|
||||
private const int CharsPerIterationAvx512 = 64;
|
||||
|
||||
// We may have up to 8 buckets.
|
||||
// If we have <= 8 strings, the buckets will be the strings themselves, and TBucketized.Value will be false.
|
||||
// If we have more than 8, the buckets will be string[], and TBucketized.Value will be true.
|
||||
private readonly EightPackedReferences _buckets;
|
||||
|
||||
private readonly Vector512<byte>
|
||||
_n0Low, _n0High,
|
||||
_n1Low, _n1High,
|
||||
_n2Low, _n2High;
|
||||
|
||||
/// <summary>
/// Non-bucketized constructor: expects <c>TBucketized.Value</c> to be false (asserted in debug builds) and stores the
/// values themselves as the entries of <c>_buckets</c>.
/// </summary>
/// <param name="values">The values to search for; forwarded to the base constructor and used to build the fingerprints.</param>
/// <param name="uniqueValues">Forwarded unchanged to the base constructor.</param>
/// <param name="n">The number of starting characters covered by fingerprints; asserted to be 2 or 3.</param>
protected AsciiStringSearchValuesTeddyBase(ReadOnlySpan<string> values, HashSet<string> uniqueValues, int n) : base(values, uniqueValues)
|
||||
{
|
||||
Debug.Assert(!TBucketized.Value);
|
||||
Debug.Assert(n is 2 or 3);
|
||||
|
||||
// Reinterpret the span of strings as a span of objects (same length) so it can be handed to EightPackedReferences.
_buckets = new EightPackedReferences(MemoryMarshal.CreateReadOnlySpan(
|
||||
ref Unsafe.As<string, object>(ref MemoryMarshal.GetReference(values)),
|
||||
values.Length));
|
||||
|
||||
// Fingerprints for character offsets 0 and 1 are always generated.
(_n0Low, _n0High) = TeddyBucketizer.GenerateNonBucketizedFingerprint(values, offset: 0);
|
||||
(_n1Low, _n1High) = TeddyBucketizer.GenerateNonBucketizedFingerprint(values, offset: 1);
|
||||
|
||||
// The offset-2 fingerprint is only generated for n == 3; otherwise _n2Low/_n2High keep their default value.
if (n == 3)
|
||||
{
|
||||
(_n2Low, _n2High) = TeddyBucketizer.GenerateNonBucketizedFingerprint(values, offset: 2);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Bucketized constructor: expects <c>TBucketized.Value</c> to be true (asserted in debug builds) and stores the
/// provided string arrays as the entries of <c>_buckets</c>.
/// </summary>
/// <param name="buckets">The pre-grouped values; used for <c>_buckets</c> and to build the fingerprints.</param>
/// <param name="values">Forwarded unchanged to the base constructor.</param>
/// <param name="uniqueValues">Forwarded unchanged to the base constructor.</param>
/// <param name="n">The number of starting characters covered by fingerprints; asserted to be 2 or 3.</param>
protected AsciiStringSearchValuesTeddyBase(string[][] buckets, ReadOnlySpan<string> values, HashSet<string> uniqueValues, int n) : base(values, uniqueValues)
|
||||
{
|
||||
Debug.Assert(TBucketized.Value);
|
||||
Debug.Assert(n is 2 or 3);
|
||||
|
||||
_buckets = new EightPackedReferences(buckets);
|
||||
|
||||
// Fingerprints for character offsets 0 and 1 are always generated from the buckets.
(_n0Low, _n0High) = TeddyBucketizer.GenerateBucketizedFingerprint(buckets, offset: 0);
|
||||
(_n1Low, _n1High) = TeddyBucketizer.GenerateBucketizedFingerprint(buckets, offset: 1);
|
||||
|
||||
// The offset-2 fingerprint is only generated for n == 3; otherwise _n2Low/_n2High keep their default value.
if (n == 3)
|
||||
{
|
||||
(_n2Low, _n2High) = TeddyBucketizer.GenerateBucketizedFingerprint(buckets, offset: 2);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Entry point for the two-starting-character (N2) search: dispatches to the widest implementation that is both
/// supported by the hardware and applicable to the length of <paramref name="span"/>.
/// </summary>
/// <param name="span">The input to search.</param>
/// <returns>The value returned by the selected implementation (Avx512, Avx2 or Vector128).</returns>
[CompExactlyDependsOn(typeof(Ssse3))]
|
||||
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
|
||||
protected int IndexOfAnyN2(ReadOnlySpan<char> span)
|
||||
{
|
||||
// The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported
|
||||
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
|
||||
// 512-bit path: needs at least one full iteration (64 chars) plus the N2 match start offset (1).
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN2)
|
||||
{
|
||||
return IndexOfAnyN2Avx512(span);
|
||||
}
|
||||
|
||||
// 256-bit path: needs at least one full iteration (32 chars) plus the N2 match start offset (1).
if (Avx2.IsSupported && span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN2)
|
||||
{
|
||||
return IndexOfAnyN2Avx2(span);
|
||||
}
|
||||
|
||||
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
|
||||
|
||||
// Everything else, including inputs too short for the wider paths, goes to the 128-bit implementation.
return IndexOfAnyN2Vector128(span);
|
||||
}
|
||||
|
||||
/// <summary>
/// Entry point for the three-starting-character (N3) search: dispatches to the widest implementation that is both
/// supported by the hardware and applicable to the length of <paramref name="span"/>.
/// </summary>
/// <param name="span">The input to search.</param>
/// <returns>The value returned by the selected implementation (Avx512, Avx2 or Vector128).</returns>
[CompExactlyDependsOn(typeof(Ssse3))]
|
||||
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
|
||||
protected int IndexOfAnyN3(ReadOnlySpan<char> span)
|
||||
{
|
||||
// The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported
|
||||
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
|
||||
// 512-bit path: needs at least one full iteration (64 chars) plus the N3 match start offset (2).
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN3)
|
||||
{
|
||||
return IndexOfAnyN3Avx512(span);
|
||||
}
|
||||
|
||||
// 256-bit path: needs at least one full iteration (32 chars) plus the N3 match start offset (2).
if (Avx2.IsSupported && span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN3)
|
||||
{
|
||||
return IndexOfAnyN3Avx2(span);
|
||||
}
|
||||
|
||||
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
|
||||
|
||||
// Everything else, including inputs too short for the wider paths, goes to the 128-bit implementation.
return IndexOfAnyN3Vector128(span);
|
||||
}
|
||||
|
||||
/// <summary>
/// Two-character-prefix scan that consumes <c>CharsPerIterationVector128</c> characters per iteration.
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The offset reported by <c>TryFindMatch</c> for the first verified candidate, or -1.</returns>
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private int IndexOfAnyN2Vector128(ReadOnlySpan<char> span)
{
    // Not enough characters for a single vectorized iteration: defer to the fallback.
    if (span.Length < CharsPerIterationVector128 + MatchStartOffsetN2)
    {
        return ShortInputFallback(span);
    }

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    ref char lastChunk = ref Unsafe.Add(ref spanStart, span.Length - CharsPerIterationVector128);
    ref char cursor = ref Unsafe.Add(ref spanStart, MatchStartOffsetN2);

    // Only the lowest 128 bits of each stored fingerprint are needed on this path.
    Vector128<byte> n0Low = _n0Low._lower._lower;
    Vector128<byte> n0High = _n0High._lower._lower;
    Vector128<byte> n1Low = _n1Low._lower._lower;
    Vector128<byte> n1High = _n1High._lower._lower;

    // There is no previous iteration yet, so start from "everything matched".
    Vector128<byte> prev0 = Vector128<byte>.AllBitsSet;

    while (true)
    {
        ValidateReadPosition(span, ref cursor);
        Vector128<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack16AsciiChars(ref cursor));

        (Vector128<byte> candidates, prev0) = ProcessInputN2(input, prev0, n0Low, n0High, n1Low, n1High);

        // A non-zero lane is only a potential match; TryFindMatch verifies it.
        if (candidates != Vector128<byte>.Zero &&
            TryFindMatch(span, ref cursor, candidates, MatchStartOffsetN2, out int matchIndex))
        {
            return matchIndex;
        }

        cursor = ref Unsafe.Add(ref cursor, CharsPerIterationVector128);

        if (Unsafe.IsAddressGreaterThan(ref cursor, ref lastChunk))
        {
            // Landing exactly one chunk past 'lastChunk' means the final chunk was already processed.
            if (Unsafe.AreSame(ref cursor, ref Unsafe.Add(ref lastChunk, CharsPerIterationVector128)))
            {
                return -1;
            }

            // Step back to the final chunk. 'prev0' no longer describes the characters
            // immediately preceding the new position, so it has to be reset.
            prev0 = Vector128<byte>.AllBitsSet;
            cursor = ref lastChunk;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Two-character-prefix scan that consumes <c>CharsPerIterationAvx2</c> characters per iteration.
/// The caller must guarantee the minimum input length (asserted below).
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The offset reported by <c>TryFindMatch</c> for the first verified candidate, or -1.</returns>
[CompExactlyDependsOn(typeof(Avx2))]
private int IndexOfAnyN2Avx2(ReadOnlySpan<char> span)
{
    Debug.Assert(span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN2);

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    ref char lastChunk = ref Unsafe.Add(ref spanStart, span.Length - CharsPerIterationAvx2);
    ref char cursor = ref Unsafe.Add(ref spanStart, MatchStartOffsetN2);

    // Only the lower 256 bits of each stored fingerprint are needed on this path.
    Vector256<byte> n0Low = _n0Low._lower;
    Vector256<byte> n0High = _n0High._lower;
    Vector256<byte> n1Low = _n1Low._lower;
    Vector256<byte> n1High = _n1High._lower;

    // There is no previous iteration yet, so start from "everything matched".
    Vector256<byte> prev0 = Vector256<byte>.AllBitsSet;

    while (true)
    {
        ValidateReadPosition(span, ref cursor);
        Vector256<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack32AsciiChars(ref cursor));

        (Vector256<byte> candidates, prev0) = ProcessInputN2(input, prev0, n0Low, n0High, n1Low, n1High);

        // A non-zero lane is only a potential match; TryFindMatch verifies it.
        if (candidates != Vector256<byte>.Zero &&
            TryFindMatch(span, ref cursor, candidates, MatchStartOffsetN2, out int matchIndex))
        {
            return matchIndex;
        }

        cursor = ref Unsafe.Add(ref cursor, CharsPerIterationAvx2);

        if (Unsafe.IsAddressGreaterThan(ref cursor, ref lastChunk))
        {
            // Landing exactly one chunk past 'lastChunk' means the final chunk was already processed.
            if (Unsafe.AreSame(ref cursor, ref Unsafe.Add(ref lastChunk, CharsPerIterationAvx2)))
            {
                return -1;
            }

            // Step back to the final chunk. 'prev0' no longer describes the characters
            // immediately preceding the new position, so it has to be reset.
            prev0 = Vector256<byte>.AllBitsSet;
            cursor = ref lastChunk;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Two-character-prefix scan that consumes <c>CharsPerIterationAvx512</c> characters per iteration.
/// The caller must guarantee the minimum input length (asserted below).
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The offset reported by <c>TryFindMatch</c> for the first verified candidate, or -1.</returns>
[CompExactlyDependsOn(typeof(Avx512BW))]
private int IndexOfAnyN2Avx512(ReadOnlySpan<char> span)
{
    Debug.Assert(span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN2);

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    ref char lastChunk = ref Unsafe.Add(ref spanStart, span.Length - CharsPerIterationAvx512);
    ref char cursor = ref Unsafe.Add(ref spanStart, MatchStartOffsetN2);

    // The stored fingerprints are used at their full 512-bit width here.
    Vector512<byte> n0Low = _n0Low;
    Vector512<byte> n0High = _n0High;
    Vector512<byte> n1Low = _n1Low;
    Vector512<byte> n1High = _n1High;

    // There is no previous iteration yet, so start from "everything matched".
    Vector512<byte> prev0 = Vector512<byte>.AllBitsSet;

    while (true)
    {
        ValidateReadPosition(span, ref cursor);
        Vector512<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack64AsciiChars(ref cursor));

        (Vector512<byte> candidates, prev0) = ProcessInputN2(input, prev0, n0Low, n0High, n1Low, n1High);

        // A non-zero lane is only a potential match; TryFindMatch verifies it.
        if (candidates != Vector512<byte>.Zero &&
            TryFindMatch(span, ref cursor, candidates, MatchStartOffsetN2, out int matchIndex))
        {
            return matchIndex;
        }

        cursor = ref Unsafe.Add(ref cursor, CharsPerIterationAvx512);

        if (Unsafe.IsAddressGreaterThan(ref cursor, ref lastChunk))
        {
            // Landing exactly one chunk past 'lastChunk' means the final chunk was already processed.
            if (Unsafe.AreSame(ref cursor, ref Unsafe.Add(ref lastChunk, CharsPerIterationAvx512)))
            {
                return -1;
            }

            // Step back to the final chunk. 'prev0' no longer describes the characters
            // immediately preceding the new position, so it has to be reset.
            prev0 = Vector512<byte>.AllBitsSet;
            cursor = ref lastChunk;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Three-character-prefix scan that consumes <c>CharsPerIterationVector128</c> characters per iteration.
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The offset reported by <c>TryFindMatch</c> for the first verified candidate, or -1.</returns>
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private int IndexOfAnyN3Vector128(ReadOnlySpan<char> span)
{
    // Inputs shorter than one full iteration plus the match-start offset can't be vectorized here.
    if (span.Length < CharsPerIterationVector128 + MatchStartOffsetN3)
    {
        return ShortInputFallback(span);
    }

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    ref char lastChunk = ref Unsafe.Add(ref spanStart, span.Length - CharsPerIterationVector128);
    ref char cursor = ref Unsafe.Add(ref spanStart, MatchStartOffsetN3);

    // Each bitmap is a Vector128 of bytes repeated four times to fill a Vector512, which keeps loads cheap
    // for the routines that want the whole Vector512. Here only the first copy (._lower._lower) is read.
    Vector128<byte> n0Low = _n0Low._lower._lower;
    Vector128<byte> n0High = _n0High._lower._lower;
    Vector128<byte> n1Low = _n1Low._lower._lower;
    Vector128<byte> n1High = _n1High._lower._lower;
    Vector128<byte> n2Low = _n2Low._lower._lower;
    Vector128<byte> n2High = _n2High._lower._lower;

    // Matching is shifted by MatchStartOffsetN3 positions, so each iteration needs the partial results of
    // the one before it (see TeddyHelper.ProcessInputN3). Before the first iteration nothing has been
    // computed, so we pretend every position matched (AllBitsSet). That can surface extra false
    // positives at the very start, which TryFindMatch then rejects.
    Vector128<byte> prev0 = Vector128<byte>.AllBitsSet;
    Vector128<byte> prev1 = Vector128<byte>.AllBitsSet;

    while (true)
    {
        // Read the next chunk; when casing is ignored the characters are normalized by TransformInput.
        // The input itself may contain non-ASCII characters, but the first 3 characters of every value are ASCII.
        ValidateReadPosition(span, ref cursor);
        Vector128<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack16AsciiChars(ref cursor));

        // For every input position, compute the set of buckets whose fingerprint agrees
        // on all 3 starting characters (all 6 nibbles).
        (Vector128<byte> candidates, prev0, prev1) = ProcessInputN3(input, prev0, prev1, n0Low, n0High, n1Low, n1High, n2Low, n2High);

        // Candidates may be false positives, so each one is verified before being reported.
        if (candidates != Vector128<byte>.Zero &&
            TryFindMatch(span, ref cursor, candidates, MatchStartOffsetN3, out int matchIndex))
        {
            return matchIndex;
        }

        // Nothing confirmed in this chunk: advance and check whether the end was reached.
        cursor = ref Unsafe.Add(ref cursor, CharsPerIterationVector128);

        if (Unsafe.IsAddressGreaterThan(ref cursor, ref lastChunk))
        {
            // Landing exactly one chunk past 'lastChunk' means the final chunk was already processed.
            if (Unsafe.AreSame(ref cursor, ref Unsafe.Add(ref lastChunk, CharsPerIterationVector128)))
            {
                return -1;
            }

            // Step back to the final chunk. prev0 / prev1 no longer describe the characters immediately
            // preceding the new position, so - as for the first iteration - assume they all matched.
            prev0 = Vector128<byte>.AllBitsSet;
            prev1 = Vector128<byte>.AllBitsSet;
            cursor = ref lastChunk;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Three-character-prefix scan that consumes <c>CharsPerIterationAvx2</c> characters per iteration.
/// The caller must guarantee the minimum input length (asserted below).
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The offset reported by <c>TryFindMatch</c> for the first verified candidate, or -1.</returns>
[CompExactlyDependsOn(typeof(Avx2))]
private int IndexOfAnyN3Avx2(ReadOnlySpan<char> span)
{
    Debug.Assert(span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN3);

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    ref char lastChunk = ref Unsafe.Add(ref spanStart, span.Length - CharsPerIterationAvx2);
    ref char cursor = ref Unsafe.Add(ref spanStart, MatchStartOffsetN3);

    // Only the lower 256 bits of each stored fingerprint are needed on this path.
    Vector256<byte> n0Low = _n0Low._lower;
    Vector256<byte> n0High = _n0High._lower;
    Vector256<byte> n1Low = _n1Low._lower;
    Vector256<byte> n1High = _n1High._lower;
    Vector256<byte> n2Low = _n2Low._lower;
    Vector256<byte> n2High = _n2High._lower;

    // There is no previous iteration yet, so start from "everything matched".
    Vector256<byte> prev0 = Vector256<byte>.AllBitsSet;
    Vector256<byte> prev1 = Vector256<byte>.AllBitsSet;

    while (true)
    {
        ValidateReadPosition(span, ref cursor);
        Vector256<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack32AsciiChars(ref cursor));

        (Vector256<byte> candidates, prev0, prev1) = ProcessInputN3(input, prev0, prev1, n0Low, n0High, n1Low, n1High, n2Low, n2High);

        // A non-zero lane is only a potential match; TryFindMatch verifies it.
        if (candidates != Vector256<byte>.Zero &&
            TryFindMatch(span, ref cursor, candidates, MatchStartOffsetN3, out int matchIndex))
        {
            return matchIndex;
        }

        cursor = ref Unsafe.Add(ref cursor, CharsPerIterationAvx2);

        if (Unsafe.IsAddressGreaterThan(ref cursor, ref lastChunk))
        {
            // Landing exactly one chunk past 'lastChunk' means the final chunk was already processed.
            if (Unsafe.AreSame(ref cursor, ref Unsafe.Add(ref lastChunk, CharsPerIterationAvx2)))
            {
                return -1;
            }

            // Step back to the final chunk. prev0 / prev1 no longer describe the characters
            // immediately preceding the new position, so they have to be reset.
            prev0 = Vector256<byte>.AllBitsSet;
            prev1 = Vector256<byte>.AllBitsSet;
            cursor = ref lastChunk;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Three-character-prefix scan that consumes <c>CharsPerIterationAvx512</c> characters per iteration.
/// The caller must guarantee the minimum input length (asserted below).
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The offset reported by <c>TryFindMatch</c> for the first verified candidate, or -1.</returns>
[CompExactlyDependsOn(typeof(Avx512BW))]
private int IndexOfAnyN3Avx512(ReadOnlySpan<char> span)
{
    Debug.Assert(span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN3);

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    ref char lastChunk = ref Unsafe.Add(ref spanStart, span.Length - CharsPerIterationAvx512);
    ref char cursor = ref Unsafe.Add(ref spanStart, MatchStartOffsetN3);

    // The stored fingerprints are used at their full 512-bit width here.
    Vector512<byte> n0Low = _n0Low;
    Vector512<byte> n0High = _n0High;
    Vector512<byte> n1Low = _n1Low;
    Vector512<byte> n1High = _n1High;
    Vector512<byte> n2Low = _n2Low;
    Vector512<byte> n2High = _n2High;

    // There is no previous iteration yet, so start from "everything matched".
    Vector512<byte> prev0 = Vector512<byte>.AllBitsSet;
    Vector512<byte> prev1 = Vector512<byte>.AllBitsSet;

    while (true)
    {
        ValidateReadPosition(span, ref cursor);
        Vector512<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack64AsciiChars(ref cursor));

        (Vector512<byte> candidates, prev0, prev1) = ProcessInputN3(input, prev0, prev1, n0Low, n0High, n1Low, n1High, n2Low, n2High);

        // A non-zero lane is only a potential match; TryFindMatch verifies it.
        if (candidates != Vector512<byte>.Zero &&
            TryFindMatch(span, ref cursor, candidates, MatchStartOffsetN3, out int matchIndex))
        {
            return matchIndex;
        }

        cursor = ref Unsafe.Add(ref cursor, CharsPerIterationAvx512);

        if (Unsafe.IsAddressGreaterThan(ref cursor, ref lastChunk))
        {
            // Landing exactly one chunk past 'lastChunk' means the final chunk was already processed.
            if (Unsafe.AreSame(ref cursor, ref Unsafe.Add(ref lastChunk, CharsPerIterationAvx512)))
            {
                return -1;
            }

            // Step back to the final chunk. prev0 / prev1 no longer describe the characters
            // immediately preceding the new position, so they have to be reset.
            prev0 = Vector512<byte>.AllBitsSet;
            prev1 = Vector512<byte>.AllBitsSet;
            cursor = ref lastChunk;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Verifies the candidates reported for one 16-position chunk, in increasing input position.
/// </summary>
/// <param name="span">The full input being searched.</param>
/// <param name="searchSpace">Reference to the start of the chunk that produced <paramref name="result"/>.</param>
/// <param name="result">Per-position candidate bytes; must contain at least one non-zero element.</param>
/// <param name="matchStartOffset">How far the reported positions are shifted from the start of a match.</param>
/// <param name="offsetFromStart">On success, the character offset of the match from the start of <paramref name="span"/>; otherwise 0.</param>
/// <returns><see langword="true"/> if any candidate was confirmed by <c>StartsWith</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryFindMatch(ReadOnlySpan<char> span, ref char searchSpace, Vector128<byte> result, int matchStartOffset, out int offsetFromStart)
{
    // One bit per input position: set when at least one bucket may match there.
    // The positions are shifted by 'matchStartOffset' places.
    uint positions = (~Vector128.Equals(result, Vector128<byte>.Zero)).ExtractMostSignificantBits();

    do
    {
        int position = BitOperations.TrailingZeroCount(positions);

        // Translate the chunk-relative position into a reference / index into the input span.
        ref char candidateStart = ref Unsafe.Add(ref searchSpace, position - matchStartOffset);
        int startIndex = (int)((nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref candidateStart) / 2);
        int remaining = span.Length - startIndex;

        ValidateReadPosition(span, ref candidateStart, remaining);

        // One bit per bucket: set when that bucket may contain a match starting at 'candidateStart'.
        uint bucketBits = result.GetElementUnsafe(position);

        do
        {
            int bucketIndex = BitOperations.TrailingZeroCount(bucketBits);

            object? bucket = _buckets[bucketIndex];
            Debug.Assert(bucket is not null);

            // A bucket holds several values when bucketized, otherwise exactly one.
            bool confirmed;
            if (TBucketized.Value)
            {
                confirmed = StartsWith<TCaseSensitivity>(ref candidateStart, remaining, Unsafe.As<string[]>(bucket));
            }
            else
            {
                confirmed = StartsWith<TCaseSensitivity>(ref candidateStart, remaining, Unsafe.As<string>(bucket));
            }

            if (confirmed)
            {
                offsetFromStart = startIndex;
                return true;
            }

            bucketBits = BitOperations.ResetLowestSetBit(bucketBits);
        }
        while (bucketBits != 0);

        positions = BitOperations.ResetLowestSetBit(positions);
    }
    while (positions != 0);

    offsetFromStart = 0;
    return false;
}
|
||||
|
||||
/// <summary>
/// Verifies the candidates reported for one 32-position chunk, in increasing input position.
/// </summary>
/// <param name="span">The full input being searched.</param>
/// <param name="searchSpace">Reference to the start of the chunk that produced <paramref name="result"/>.</param>
/// <param name="result">Per-position candidate bytes; must contain at least one non-zero element.</param>
/// <param name="matchStartOffset">How far the reported positions are shifted from the start of a match.</param>
/// <param name="offsetFromStart">On success, the character offset of the match from the start of <paramref name="span"/>; otherwise 0.</param>
/// <returns><see langword="true"/> if any candidate was confirmed by <c>StartsWith</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryFindMatch(ReadOnlySpan<char> span, ref char searchSpace, Vector256<byte> result, int matchStartOffset, out int offsetFromStart)
{
    // One bit per input position: set when at least one bucket may match there.
    uint positions = (~Vector256.Equals(result, Vector256<byte>.Zero)).ExtractMostSignificantBits();

    do
    {
        int position = BitOperations.TrailingZeroCount(positions);

        // Translate the chunk-relative position into a reference / index into the input span.
        ref char candidateStart = ref Unsafe.Add(ref searchSpace, position - matchStartOffset);
        int startIndex = (int)((nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref candidateStart) / 2);
        int remaining = span.Length - startIndex;

        ValidateReadPosition(span, ref candidateStart, remaining);

        // One bit per bucket: set when that bucket may contain a match starting at 'candidateStart'.
        uint bucketBits = result.GetElementUnsafe(position);

        do
        {
            int bucketIndex = BitOperations.TrailingZeroCount(bucketBits);

            object? bucket = _buckets[bucketIndex];
            Debug.Assert(bucket is not null);

            // A bucket holds several values when bucketized, otherwise exactly one.
            bool confirmed;
            if (TBucketized.Value)
            {
                confirmed = StartsWith<TCaseSensitivity>(ref candidateStart, remaining, Unsafe.As<string[]>(bucket));
            }
            else
            {
                confirmed = StartsWith<TCaseSensitivity>(ref candidateStart, remaining, Unsafe.As<string>(bucket));
            }

            if (confirmed)
            {
                offsetFromStart = startIndex;
                return true;
            }

            bucketBits = BitOperations.ResetLowestSetBit(bucketBits);
        }
        while (bucketBits != 0);

        positions = BitOperations.ResetLowestSetBit(positions);
    }
    while (positions != 0);

    offsetFromStart = 0;
    return false;
}
|
||||
|
||||
/// <summary>
/// Verifies the candidates reported for one 64-position chunk, in increasing input position.
/// </summary>
/// <param name="span">The full input being searched.</param>
/// <param name="searchSpace">Reference to the start of the chunk that produced <paramref name="result"/>.</param>
/// <param name="result">Per-position candidate bytes; must contain at least one non-zero element.</param>
/// <param name="matchStartOffset">How far the reported positions are shifted from the start of a match.</param>
/// <param name="offsetFromStart">On success, the character offset of the match from the start of <paramref name="span"/>; otherwise 0.</param>
/// <returns><see langword="true"/> if any candidate was confirmed by <c>StartsWith</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryFindMatch(ReadOnlySpan<char> span, ref char searchSpace, Vector512<byte> result, int matchStartOffset, out int offsetFromStart)
{
    // One bit per input position (64 of them, hence ulong): set when at least one bucket may match there.
    ulong positions = (~Vector512.Equals(result, Vector512<byte>.Zero)).ExtractMostSignificantBits();

    do
    {
        int position = BitOperations.TrailingZeroCount(positions);

        // Translate the chunk-relative position into a reference / index into the input span.
        ref char candidateStart = ref Unsafe.Add(ref searchSpace, position - matchStartOffset);
        int startIndex = (int)((nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref candidateStart) / 2);
        int remaining = span.Length - startIndex;

        ValidateReadPosition(span, ref candidateStart, remaining);

        // One bit per bucket: set when that bucket may contain a match starting at 'candidateStart'.
        uint bucketBits = result.GetElementUnsafe(position);

        do
        {
            int bucketIndex = BitOperations.TrailingZeroCount(bucketBits);

            object? bucket = _buckets[bucketIndex];
            Debug.Assert(bucket is not null);

            // A bucket holds several values when bucketized, otherwise exactly one.
            bool confirmed;
            if (TBucketized.Value)
            {
                confirmed = StartsWith<TCaseSensitivity>(ref candidateStart, remaining, Unsafe.As<string[]>(bucket));
            }
            else
            {
                confirmed = StartsWith<TCaseSensitivity>(ref candidateStart, remaining, Unsafe.As<string>(bucket));
            }

            if (confirmed)
            {
                offsetFromStart = startIndex;
                return true;
            }

            bucketBits = BitOperations.ResetLowestSetBit(bucketBits);
        }
        while (bucketBits != 0);

        positions = BitOperations.ResetLowestSetBit(positions);
    }
    while (positions != 0);

    offsetFromStart = 0;
    return false;
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Teddy search values specialized for the bucketized (<c>SearchValues.TrueConst</c>) layout
/// with <c>n: 2</c>; searches are forwarded to <c>IndexOfAnyN2</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.TrueConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Passes all arguments through to the base class, fixing <c>n</c> to 2.</summary>
    public AsciiStringSearchValuesTeddyBucketizedN2(string[][] buckets, ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(buckets, values, uniqueValues, n: 2)
    {
    }

    /// <summary>Searches <paramref name="span"/> using the N2 routine of the base class.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN2(span);
    }
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Teddy search values specialized for the bucketized (<c>SearchValues.TrueConst</c>) layout
/// with <c>n: 3</c>; searches are forwarded to <c>IndexOfAnyN3</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.TrueConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Passes all arguments through to the base class, fixing <c>n</c> to 3.</summary>
    public AsciiStringSearchValuesTeddyBucketizedN3(string[][] buckets, ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(buckets, values, uniqueValues, n: 3)
    {
    }

    /// <summary>Searches <paramref name="span"/> using the N3 routine of the base class.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN3(span);
    }
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Teddy search values specialized for the non-bucketized (<c>SearchValues.FalseConst</c>) layout
/// with <c>n: 2</c>; searches are forwarded to <c>IndexOfAnyN2</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyNonBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.FalseConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Passes all arguments through to the base class, fixing <c>n</c> to 2.</summary>
    public AsciiStringSearchValuesTeddyNonBucketizedN2(ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(values, uniqueValues, n: 2)
    {
    }

    /// <summary>Searches <paramref name="span"/> using the N2 routine of the base class.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN2(span);
    }
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Teddy search values specialized for the non-bucketized (<c>SearchValues.FalseConst</c>) layout
/// with <c>n: 3</c>; searches are forwarded to <c>IndexOfAnyN3</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyNonBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.FalseConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Passes all arguments through to the base class, fixing <c>n</c> to 3.</summary>
    public AsciiStringSearchValuesTeddyNonBucketizedN3(ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(values, uniqueValues, n: 3)
    {
    }

    /// <summary>Searches <paramref name="span"/> using the N3 routine of the base class.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN3(span);
    }
}
|
||||
}
|
|
@ -0,0 +1,355 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
|
||||
/// An implementation of the Aho-Corasick algorithm we use as a fallback when we can't use Teddy
|
||||
/// (either due to missing hardware intrinsics, or due to characteristics of the values used).
|
||||
/// https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
|
||||
/// Works in O(n).
|
||||
/// </summary>
|
||||
internal readonly struct AhoCorasick
|
||||
{
|
||||
private readonly AhoCorasickNode[] _nodes;
|
||||
private readonly Vector256<byte> _startingCharsAsciiBitmap;
|
||||
|
||||
/// <summary>Stores the prebuilt node table and the bitmap of ASCII starting characters.</summary>
/// <param name="nodes">The automaton nodes; index 0 is used as the starting node by the search routines.</param>
/// <param name="startingAsciiBitmap">Bitmap handed to the vectorized ASCII scan for locating starting characters.</param>
public AhoCorasick(AhoCorasickNode[] nodes, Vector256<byte> startingAsciiBitmap)
{
    _startingCharsAsciiBitmap = startingAsciiBitmap;
    _nodes = nodes;
}
|
||||
|
||||
/// <summary>
/// Whether the vectorized scan for ASCII starting characters is expected to pay off:
/// vectorization must be supported, the starting-chars bitmap must be non-empty, and the
/// summed frequency of the starting characters must not exceed 50.
/// </summary>
public readonly bool ShouldUseAsciiFastScan
{
    get
    {
        Vector256<byte> startingChars = _startingCharsAsciiBitmap;

        if (!IndexOfAnyAsciiSearcher.IsVectorizationSupported || startingChars == default)
        {
            return false;
        }

        // With many (or very common) starting characters we tend to hit one almost immediately,
        // and the vectorized helper can then lose to a plain char-by-char walk.
        // Skip the optimization when the combined frequency of the starting chars is too high.
        //
        // Combined frequencies according to CharacterFrequencyHelper.AsciiFrequency, for reference:
        // - All digits is ~ 5 %
        // - All lowercase letters is ~ 57.2 %
        // - All uppercase letters is ~ 7.4 %
        //
        // The threshold comes from experiments with different texts and sets of values: above ~50 %,
        // calling into the vectorized helper costs more on average than checking char by char.
        const float MaxCombinedFrequency = 50f;

        float combinedFrequency = 0;

        for (int c = 0; c < 128; c++)
        {
            if (!IndexOfAnyAsciiSearcher.BitmapContains(ref startingChars, (char)c))
            {
                continue;
            }

            combinedFrequency += CharacterFrequencyHelper.AsciiFrequency[c];
        }

        return combinedFrequency <= MaxCombinedFrequency;
    }
}
|
||||
|
||||
/// <summary>
/// Searches <paramref name="span"/>, selecting the implementation from <typeparamref name="TCaseSensitivity"/>:
/// <c>CaseInsensitiveUnicode</c> uses the dedicated routine, everything else uses <c>IndexOfAnyCore</c>.
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The value returned by the selected routine.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly int IndexOfAny<TCaseSensitivity, TFastScanVariant>(ReadOnlySpan<char> span)
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TFastScanVariant : struct, IFastScan
{
    if (typeof(TCaseSensitivity) == typeof(StringSearchValuesHelper.CaseInsensitiveUnicode))
    {
        return IndexOfAnyCaseInsensitiveUnicode<TFastScanVariant>(span);
    }

    return IndexOfAnyCore<TCaseSensitivity, TFastScanVariant>(span);
}
|
||||
|
||||
/// <summary>
/// Walks the node table over <paramref name="span"/> one character at a time
/// (for every case sensitivity except <c>CaseInsensitiveUnicode</c>).
/// </summary>
/// <param name="span">The text to search.</param>
/// <returns>The start index of the match that was recorded last, or -1 if none was recorded.</returns>
private readonly int IndexOfAnyCore<TCaseSensitivity, TFastScanVariant>(ReadOnlySpan<char> span)
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TFastScanVariant : struct, IFastScan
{
    Debug.Assert(typeof(TCaseSensitivity) != typeof(StringSearchValuesHelper.CaseInsensitiveUnicode));

    ref AhoCorasickNode nodeTable = ref MemoryMarshal.GetArrayDataReference(_nodes);
    int state = 0;
    int bestStart = -1;
    int position = 0;

FastScan:
    Debug.Assert(state == 0);
    // We're at the root node, looking for the next position of any starting character.
    // When every value starts with an ASCII character, a vectorized helper can skip characters that can't begin a match.
    if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && typeof(TFastScanVariant) == typeof(IndexOfAnyAsciiFastScan))
    {
        int charsLeft = span.Length - position;

        if (charsLeft >= Vector128<ushort>.Count)
        {
            // With '\0' among the starting chars on Ssse3 hardware this may report false positives.
            // That's acceptable since they get ruled out below. Flowing the Ssse3AndWasmHandleZeroInNeedle
            // generic through would avoid them, but such values should be rare enough not to justify the extra code.
            int skipped = IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<IndexOfAnyAsciiSearcher.DontNegate, IndexOfAnyAsciiSearcher.Default>(
                ref Unsafe.As<char, short>(ref Unsafe.Add(ref MemoryMarshal.GetReference(span), position)),
                charsLeft,
                ref Unsafe.AsRef(in _startingCharsAsciiBitmap));

            if (skipped < 0)
            {
                return bestStart;
            }

            position += skipped;
            goto ReadChar;
        }
    }

CheckBounds:
    if ((uint)position >= (uint)span.Length)
    {
        return bestStart;
    }

ReadChar:
    // Consume the next input character: either extend a potential match prefix or fall back towards the root.
    Debug.Assert((uint)position < (uint)span.Length);
    char c = TCaseSensitivity.TransformInput(Unsafe.Add(ref MemoryMarshal.GetReference(span), position));

    while (true)
    {
        Debug.Assert((uint)state < (uint)_nodes.Length);
        ref AhoCorasickNode node = ref Unsafe.Add(ref nodeTable, (uint)state);

        if (!node.TryGetChild(c, out int child))
        {
            if (state == 0)
            {
                // At the root, and no value starts with the current character.
                if (bestStart >= 0)
                {
                    // A match was already recorded and an earlier one is no longer possible.
                    return bestStart;
                }

                // Resume looking for the next possible starting character.
                position++;
                goto FastScan;
            }

            // Follow the suffix link and retry the same character from there.
            state = node.SuffixLink;

            if (state < 0)
            {
                // A suffix link of -1 marks a match, see AhoCorasickBuilder.AddSuffixLinks.
                Debug.Assert(state == -1);
                Debug.Assert(bestStart >= 0);
                return bestStart;
            }

            continue;
        }

        // The current prefix was extended. Record the match if the new node carries one.
        state = child;

        Debug.Assert((uint)state < (uint)_nodes.Length);
        int matchLength = Unsafe.Add(ref nodeTable, (uint)state).MatchLength;
        if (matchLength != 0)
        {
            // From here on a result can only move lower (a longer match that starts closer to the beginning of the input).
            Debug.Assert(bestStart == -1 || bestStart >= position + 1 - matchLength);
            bestStart = position + 1 - matchLength;
        }

        position++;
        goto CheckBounds;
    }
}
|
||||
|
||||
// Mostly a copy of IndexOfAnyCore, but we may read two characters at a time in the case of surrogate pairs.
|
||||
private readonly int IndexOfAnyCaseInsensitiveUnicode<TFastScanVariant>(ReadOnlySpan<char> span)
    where TFastScanVariant : struct, IFastScan
{
    // Walks the automaton over 'span' and returns the start index of the first match, or -1.
    // Input characters are upper-cased (Ordinal) before each lookup. A valid surrogate pair is
    // upper-cased as a unit: the high surrogate is processed immediately and the upper-cased
    // low surrogate is parked in 'pendingLowSurrogate' for the following iteration.
    const char NoPendingLowSurrogate = '\0';

    ref AhoCorasickNode trie = ref MemoryMarshal.GetArrayDataReference(_nodes);
    ref char input = ref MemoryMarshal.GetReference(span);

    int current = 0;        // Index of the node we are currently at; 0 is the root.
    int matchStart = -1;    // Start offset of the best match found so far.
    int pos = 0;            // Index of the input character being processed.
    char pendingLowSurrogate = NoPendingLowSurrogate;

FastScan:
    // We are at the root and need the next position where any value could start.
    // When the variant allows it, let the vectorized ASCII helper skip everything that can't start a match.
    if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && typeof(TFastScanVariant) == typeof(IndexOfAnyAsciiFastScan))
    {
        if (pendingLowSurrogate != NoPendingLowSurrogate)
        {
            // The previous iteration consumed the high surrogate of a pair.
            // The stored low surrogate has to be processed before we may skip ahead.
            goto LoopWithoutRangeCheck;
        }

        int remaining = span.Length - pos;

        if (remaining >= Vector128<ushort>.Count)
        {
            int skipped = IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<IndexOfAnyAsciiSearcher.DontNegate, IndexOfAnyAsciiSearcher.Default>(
                ref Unsafe.As<char, short>(ref Unsafe.Add(ref input, pos)),
                remaining,
                ref Unsafe.AsRef(in _startingCharsAsciiBitmap));

            if (skipped < 0)
            {
                // No possible starting character in the rest of the input.
                return matchStart;
            }

            pos += skipped;
            goto LoopWithoutRangeCheck;
        }
    }

Loop:
    if ((uint)pos >= (uint)span.Length)
    {
        return matchStart;
    }

LoopWithoutRangeCheck:
    // Fetch the next (upper-cased) character to feed into the automaton.
    Debug.Assert((uint)pos < (uint)span.Length);
    char c;

    if (pendingLowSurrogate == NoPendingLowSurrogate)
    {
        // Read a fresh character. If it starts a valid surrogate pair, upper-case both halves together.
        c = Unsafe.Add(ref input, pos);
        char lowSurrogate;

        if (char.IsHighSurrogate(c) &&
            (uint)(pos + 1) < (uint)span.Length &&
            char.IsLowSurrogate(lowSurrogate = Unsafe.Add(ref input, pos + 1)))
        {
            if (GlobalizationMode.UseNls)
            {
                SurrogateToUpperNLS(c, lowSurrogate, out c, out pendingLowSurrogate);
            }
            else
            {
                SurrogateCasing.ToUpper(c, lowSurrogate, out c, out pendingLowSurrogate);
            }

            Debug.Assert(pendingLowSurrogate != NoPendingLowSurrogate);
        }
        else
        {
            c = TextInfo.ToUpperOrdinal(c);
        }

#if DEBUG
        // Cross-check: the casing performed above has to agree with Ordinal.ToUpperOrdinal.
        Span<char> destination = new char[2]; // Heap array on purpose: no stackalloc inside a loop
        Ordinal.ToUpperOrdinal(span.Slice(pos, pos + 1 == span.Length ? 1 : 2), destination);
        Debug.Assert(c == destination[0]);
        Debug.Assert(pendingLowSurrogate == NoPendingLowSurrogate || pendingLowSurrogate == destination[1]);
#endif
    }
    else
    {
        // The high surrogate was handled last time around; now consume its stored low surrogate.
        c = pendingLowSurrogate;
        pendingLowSurrogate = NoPendingLowSurrogate;
    }

    while (true)
    {
        Debug.Assert((uint)current < (uint)_nodes.Length);
        ref AhoCorasickNode node = ref Unsafe.Add(ref trie, (uint)current);

        if (node.TryGetChild(c, out int child))
        {
            // The current candidate can be extended by 'c'. Record a match if the child carries one.
            current = child;

            Debug.Assert((uint)current < (uint)_nodes.Length);
            int matchLength = Unsafe.Add(ref trie, (uint)current).MatchLength;
            if (matchLength != 0)
            {
                // Later results can only move the start closer to the beginning of the input.
                Debug.Assert(matchStart == -1 || matchStart >= pos + 1 - matchLength);
                matchStart = pos + 1 - matchLength;
            }

            pos++;
            goto Loop;
        }

        if (current == 0)
        {
            // At the root, and no value starts with 'c'.
            if (matchStart >= 0)
            {
                // A match was already recorded and nothing earlier can show up anymore.
                return matchStart;
            }

            // Resume looking for the next possible starting character.
            pos++;
            goto FastScan;
        }

        // Fall back to the suffix link and retry the same character there.
        current = node.SuffixLink;

        if (current < 0)
        {
            // A suffix link of -1 marks a match node, see AhoCorasickBuilder.AddSuffixLinks.
            Debug.Assert(current == -1);
            Debug.Assert(matchStart >= 0);
            return matchStart;
        }
    }
}
|
||||
|
||||
private static void SurrogateToUpperNLS(char h, char l, out char hr, out char lr)
{
    // Upper-cases the surrogate pair (h, l) through Ordinal.ToUpperOrdinal and
    // hands back the resulting high/low surrogates in hr/lr.
    Debug.Assert(char.IsHighSurrogate(h));
    Debug.Assert(char.IsLowSurrogate(l));

    Span<char> pair = stackalloc char[] { h, l };
    Span<char> upper = stackalloc char[2];

    int charsWritten = Ordinal.ToUpperOrdinal(pair, upper);
    Debug.Assert(charsWritten == 2);

    (hr, lr) = (upper[0], upper[1]);

    Debug.Assert(char.IsHighSurrogate(hr));
    Debug.Assert(char.IsLowSurrogate(lr));
}
|
||||
|
||||
/// <summary>
/// Marker interface for the <c>TFastScanVariant</c> type argument of the search methods,
/// which compare it against a concrete marker type via <c>typeof</c>.
/// </summary>
public interface IFastScan { }
|
||||
|
||||
/// <summary>
/// Fast-scan variant that lets the search skip ahead with the vectorized
/// <c>IndexOfAnyAsciiSearcher</c> helper while at the root node (only when vectorization is supported).
/// </summary>
public readonly struct IndexOfAnyAsciiFastScan : IFastScan { }
|
||||
|
||||
/// <summary>
/// Fast-scan variant for which the search does not take the vectorized skip-ahead path
/// (that path is only entered for <see cref="IndexOfAnyAsciiFastScan"/>).
/// </summary>
public readonly struct NoFastScan : IFastScan { }
|
||||
}
|
||||
}
|
|
@ -0,0 +1,224 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.Intrinsics;
|
||||
using System.Text;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Builds the node table for <see cref="AhoCorasick"/>. The constructor only creates the trie; suffix links,
/// child-lookup optimization and the starting-character bitmap are computed by <see cref="Build"/>,
/// so those costs are not paid if the caller decides not to build the full thing.
/// </summary>
internal ref struct AhoCorasickBuilder
{
    private readonly ReadOnlySpan<string> _values;
    private readonly bool _ignoreCase;
    private ValueListBuilder<AhoCorasickNode> _nodes;
    private ValueListBuilder<int> _parents;
    private Vector256<byte> _startingCharsAsciiBitmap;

    /// <summary>Stores the inputs and builds the trie for <paramref name="values"/>.</summary>
    /// <param name="values">Non-empty set of values; expected to be sorted by length (asserted in DEBUG builds).</param>
    /// <param name="ignoreCase">Whether the starting-character bitmap should cover both casings.</param>
    /// <param name="unreachableValues">Receives (lazily allocated) the values that have an earlier value as an exact prefix.</param>
    public AhoCorasickBuilder(ReadOnlySpan<string> values, bool ignoreCase, ref HashSet<string>? unreachableValues)
    {
        Debug.Assert(!values.IsEmpty);
        Debug.Assert(!string.IsNullOrEmpty(values[0]));

#if DEBUG
        // Callers are expected to hand us the values ordered by length.
        for (int index = 1; index < values.Length; index++)
        {
            Debug.Assert(values[index - 1].Length <= values[index].Length);
        }
#endif

        _values = values;
        _ignoreCase = ignoreCase;
        BuildTrie(ref unreachableValues);
    }

    /// <summary>Finishes the automaton (suffix links, optimized child lookups, ASCII bitmap) and returns it.</summary>
    public AhoCorasick Build()
    {
        AddSuffixLinks();

        Debug.Assert(_nodes[0].MatchLength == 0, "The root node shouldn't have a match.");

        for (int nodeIndex = 0; nodeIndex < _nodes.Length; nodeIndex++)
        {
            _nodes[nodeIndex].OptimizeChildren();
        }

        if (IndexOfAnyAsciiSearcher.IsVectorizationSupported)
        {
            GenerateStartingAsciiCharsBitmap();
        }

        return new AhoCorasick(_nodes.AsSpan().ToArray(), _startingCharsAsciiBitmap);
    }

    /// <summary>Disposes the two list builders owned by this instance.</summary>
    public void Dispose()
    {
        _nodes.Dispose();
        _parents.Dispose();
    }

    // Inserts every value into the trie, one node per character, and records each node's parent index.
    private void BuildTrie(ref HashSet<string>? unreachableValues)
    {
        // Node 0 is the root; it is recorded as its own parent.
        _nodes.Append(new AhoCorasickNode());
        _parents.Append(0);

        foreach (string value in _values)
        {
            int currentIndex = 0;
            ref AhoCorasickNode current = ref _nodes[currentIndex];

            for (int charIndex = 0; charIndex < value.Length; charIndex++)
            {
                char c = value[charIndex];

                if (!current.TryGetChild(c, out int nextIndex))
                {
                    // Register the child on the current node before appending:
                    // 'current' is a reference into the list that is only re-fetched below.
                    nextIndex = _nodes.Length;
                    current.AddChild(c, nextIndex);
                    _nodes.Append(new AhoCorasickNode());
                    _parents.Append(currentIndex);
                }

                current = ref _nodes[nextIndex];
                currentIndex = nextIndex;

                if (current.MatchLength != 0)
                {
                    // An earlier value is an exact prefix of this one. We only report the index of the
                    // first match (not necessarily the longest), so this value can never be the answer.
                    // The values were normalized beforehand, hence the ordinal comparer.
                    unreachableValues ??= new HashSet<string>(StringComparer.Ordinal);
                    unreachableValues.Add(value);
                    break;
                }

                if (charIndex == value.Length - 1)
                {
                    // Last character of the value: this node is the end of a match.
                    current.MatchLength = value.Length;
                    break;
                }
            }
        }
    }

    private void AddSuffixLinks()
    {
        // In addition to its children (which extend the current value), every node gets a suffix link:
        // the node representing the longest suffix of the current node's string that is also in the trie.
        // When the search can't extend the current string with the next character, it follows suffix
        // links to find the longest string that still matches up to the current position.
        //
        // Example with the values "DOTNET" and "OTTER":
        //   - the 'O' in "DOTNET" links to the 'O' in "OTTER",
        //   - the 'T' in "DOTNET" (after "DO") links to the first 'T' in "OTTER" (after "O").
        // Searching the text "DOTTER" character by character, we reach "DOT" and read the next 'T'.
        // "DOTNET" can't be continued, so we follow the suffix link to "OT" in "OTTER" and carry on from there.
        //
        // We also record when a node's suffix link lands on the end of another value, which makes the
        // node itself a match. With a third value "POTTERY", the nodes for O, T, T, E, R link to the
        // corresponding nodes of "OTTER", and the 'R' node is additionally marked as a length=5 match.

        // Nodes are visited in queue (first-in, first-out) order starting from the root.
        var pending = new Queue<(char Char, int Index)>();
        pending.Enqueue(((char)0, 0));

        while (pending.TryDequeue(out (char Char, int Index) entry))
        {
            ref AhoCorasickNode node = ref _nodes[entry.Index];
            int parentIndex = _parents[entry.Index];
            int link = _nodes[parentIndex].SuffixLink;

            // Only nodes that don't directly follow the root (i.e. aren't the first character of a value)
            // may end up with a non-zero suffix link.
            if (parentIndex != 0)
            {
                while (link >= 0)
                {
                    ref AhoCorasickNode linkNode = ref _nodes[link];

                    if (linkNode.TryGetChild(entry.Char, out int linkChild))
                    {
                        link = linkChild;
                        break;
                    }

                    if (link == 0)
                    {
                        break;
                    }

                    link = linkNode.SuffixLink;
                }
            }

            if (node.MatchLength != 0)
            {
                // This node is the end of a match; flag it in a way the search can recognize.
                // Its children don't need suffix links: if the search fails to extend past this point
                // and would consult a suffix link, it has already seen this match node, which is
                // definitely the earliest possible match.
                node.SuffixLink = -1;
                continue;
            }

            node.SuffixLink = link;

            if (link >= 0)
            {
                // Remember if the suffix link points at a node that is itself a match.
                node.MatchLength = _nodes[link].MatchLength;
            }

            node.AddChildrenToQueue(pending);
        }
    }

    // If every value starts with an ASCII character, IndexOfAnyAsciiSearcher can be used
    // to quickly skip to the next possible starting location in the input.
    private void GenerateStartingAsciiCharsBitmap()
    {
        scoped ValueListBuilder<char> firstChars = new ValueListBuilder<char>(stackalloc char[128]);

        foreach (string value in _values)
        {
            char first = value[0];

            if (!_ignoreCase)
            {
                firstChars.Append(first);
            }
            else
            {
                // Either casing of the first character may start a match.
                firstChars.Append(char.ToLowerInvariant(first));
                firstChars.Append(char.ToUpperInvariant(first));
            }
        }

        // The bitmap is left at its default value unless all collected characters are ASCII.
        if (Ascii.IsValid(firstChars.AsSpan()))
        {
            IndexOfAnyAsciiSearcher.ComputeBitmap(firstChars.AsSpan(), out _startingCharsAsciiBitmap, out _);
        }

        firstChars.Dispose();
    }
}
|
||||
}
|
|
@ -0,0 +1,192 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// A single trie node: its suffix link, the length of the match ending here (0 if none),
/// and a mapping from child characters to child node indexes.
/// </summary>
internal struct AhoCorasickNode
{
    // Shared placeholder stored in _children while the node has no more than one child.
    private static object EmptyChildrenSentinel => Array.Empty<int>();

    public int SuffixLink;
    public int MatchLength;

    // This is not a radix tree, so many nodes are very sparse (a single child).
    // One child is stored inline so that such nodes need no separate collection.
    private int _firstChildChar;
    private int _firstChildIndex;
    private object _children; // Either int[] or Dictionary<char, int>

    public AhoCorasickNode()
    {
        _firstChildChar = -1; // -1 == no inline child yet
        _children = EmptyChildrenSentinel;
    }

    /// <summary>Looks up the child reached via <paramref name="c"/>.</summary>
    /// <returns><see langword="true"/> and the child's node index if such a child exists; otherwise <see langword="false"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly bool TryGetChild(char c, out int index)
    {
        // The inline child is always checked first.
        if (_firstChildChar == c)
        {
            index = _firstChildIndex;
            return true;
        }

        object children = _children;
        Debug.Assert(children is int[] || children is Dictionary<char, int>);

        if (children.GetType() != typeof(int[]))
        {
            return Unsafe.As<Dictionary<char, int>>(children).TryGetValue(c, out index);
        }

        // Jump table indexed by character; negative entries mean "no child".
        int[] jumpTable = Unsafe.As<int[]>(children);

        if (c < (uint)jumpTable.Length)
        {
            int candidate = jumpTable[c];

            if (candidate >= 0)
            {
                index = candidate;
                return true;
            }
        }

        index = 0;
        return false;
    }

    /// <summary>Registers the node at <paramref name="index"/> as the child for <paramref name="c"/>.</summary>
    public void AddChild(char c, int index)
    {
        if (_firstChildChar < 0)
        {
            // First child: keep it inline.
            _firstChildChar = c;
            _firstChildIndex = index;
            return;
        }

        // Every further child goes into a dictionary that is allocated on first use.
        if (ReferenceEquals(_children, EmptyChildrenSentinel))
        {
            _children = new Dictionary<char, int>();
        }

        ((Dictionary<char, int>)_children).Add(c, index);
    }

    /// <summary>Enqueues all children (inline child first) as (character, node index) pairs.</summary>
    public readonly void AddChildrenToQueue(Queue<(char Char, int Index)> queue)
    {
        if (_firstChildChar < 0)
        {
            // No children at all.
            return;
        }

        queue.Enqueue(((char)_firstChildChar, _firstChildIndex));

        if (_children is Dictionary<char, int> others)
        {
            foreach ((char childChar, int childIndex) in others)
            {
                queue.Enqueue((childChar, childIndex));
            }
        }
        else
        {
            Debug.Assert(ReferenceEquals(_children, EmptyChildrenSentinel));
        }
    }

    /// <summary>
    /// For nodes with more than one child: moves the child with the highest ASCII frequency into the
    /// inline slot and, if the memory estimate allows it, replaces the dictionary with a jump table.
    /// </summary>
    public void OptimizeChildren()
    {
        if (_children is not Dictionary<char, int> children)
        {
            return;
        }

        // Temporarily put the inline child into the dictionary so that all children compete equally.
        children.Add((char)_firstChildChar, _firstChildIndex);

        // The inline slot is always checked first, so give it to the child character with the highest
        // frequency. Non-ASCII characters score -1; the initial -2 guarantees that some child is picked.
        float bestFrequency = -2;

        foreach ((char childChar, int childIndex) in children)
        {
            float childFrequency = char.IsAscii(childChar) ? CharacterFrequencyHelper.AsciiFrequency[childChar] : -1;

            if (childFrequency > bestFrequency)
            {
                bestFrequency = childFrequency;
                _firstChildChar = childChar;
                _firstChildIndex = childIndex;
            }
        }

        children.Remove((char)_firstChildChar);

        if (TryCreateJumpTable(children, out int[]? jumpTable))
        {
            _children = jumpTable;
        }

        static bool TryCreateJumpTable(Dictionary<char, int> children, [NotNullWhen(true)] out int[]? table)
        {
            // Child characters can be mapped to node indexes with either a Dictionary<char, int> or an int[].
            // The int[] is generally faster but costs more memory when character values are high.
            // We currently accept up to ~2x the estimated dictionary footprint in exchange for faster lookups.
            const int AcceptableSizeMultiplier = 2;

            Debug.Assert(children.Count > 0);

            int highestChar = -1;

            foreach (char childChar in children.Keys)
            {
                highestChar = Math.Max(highestChar, childChar);
            }

            int tableBytes = TableMemoryFootprintBytesEstimate(highestChar);
            int dictionaryBytes = DictionaryMemoryFootprintBytesEstimate(children.Count);

            if (tableBytes > dictionaryBytes * AcceptableSizeMultiplier)
            {
                // Too many entries would stay empty; not worth the memory.
                table = null;
                return false;
            }

            int[] entries = new int[highestChar + 1];
            Array.Fill(entries, -1);

            foreach (KeyValuePair<char, int> child in children)
            {
                entries[child.Key] = child.Value;
            }

            table = entries;
            return true;

            // Rough number of bytes used by an int[] table for the given highest character.
            // Heuristic only, the numbers don't have to be exact.
            static int TableMemoryFootprintBytesEstimate(int maxValue) =>
                32 + (maxValue * sizeof(int));

            // Rough number of bytes used by a Dictionary<char, int> with the given entry count.
            // Heuristic only, the numbers don't have to be exact.
            static int DictionaryMemoryFootprintBytesEstimate(int childCount) =>
                childCount switch
                {
                    < 4 => 192,
                    < 8 => 272,
                    < 12 => 352,
                    _ => childCount * 25
                };
        }
    }
}
|
||||
}
|
|
@ -0,0 +1,127 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// ASCII character frequency table plus a helper that uses it to pick which characters
/// of a value (besides the first one) are the rarest.
/// </summary>
internal static class CharacterFrequencyHelper
{
    // Same as RegexPrefixAnalyzer.Frequency.
    // https://github.com/dotnet/runtime/blob/a355d5f7db162714ee19533ca55074aa2cbd8a8c/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs#L956C43-L956C53
    public static ReadOnlySpan<float> AsciiFrequency => new float[]
    {
        0.000f /* '\x00' */, 0.000f /* '\x01' */, 0.000f /* '\x02' */, 0.000f /* '\x03' */, 0.000f /* '\x04' */, 0.000f /* '\x05' */, 0.000f /* '\x06' */, 0.000f /* '\x07' */,
        0.000f /* '\x08' */, 0.001f /* '\x09' */, 0.000f /* '\x0A' */, 0.000f /* '\x0B' */, 0.000f /* '\x0C' */, 0.000f /* '\x0D' */, 0.000f /* '\x0E' */, 0.000f /* '\x0F' */,
        0.000f /* '\x10' */, 0.000f /* '\x11' */, 0.000f /* '\x12' */, 0.000f /* '\x13' */, 0.003f /* '\x14' */, 0.000f /* '\x15' */, 0.000f /* '\x16' */, 0.000f /* '\x17' */,
        0.000f /* '\x18' */, 0.004f /* '\x19' */, 0.000f /* '\x1A' */, 0.000f /* '\x1B' */, 0.006f /* '\x1C' */, 0.006f /* '\x1D' */, 0.000f /* '\x1E' */, 0.000f /* '\x1F' */,
        8.952f /* ' ' */, 0.065f /* '!' */, 0.420f /* '"' */, 0.010f /* '#' */, 0.011f /* '$' */, 0.005f /* '%' */, 0.070f /* '&' */, 0.050f /* ''' */,
        3.911f /* '(' */, 3.910f /* ')' */, 0.356f /* '*' */, 2.775f /* '+' */, 1.411f /* ',' */, 0.173f /* '-' */, 2.054f /* '.' */, 0.677f /* '/' */,
        1.199f /* '0' */, 0.870f /* '1' */, 0.729f /* '2' */, 0.491f /* '3' */, 0.335f /* '4' */, 0.269f /* '5' */, 0.435f /* '6' */, 0.240f /* '7' */,
        0.234f /* '8' */, 0.196f /* '9' */, 0.144f /* ':' */, 0.983f /* ';' */, 0.357f /* '<' */, 0.661f /* '=' */, 0.371f /* '>' */, 0.088f /* '?' */,
        0.007f /* '@' */, 0.763f /* 'A' */, 0.229f /* 'B' */, 0.551f /* 'C' */, 0.306f /* 'D' */, 0.449f /* 'E' */, 0.337f /* 'F' */, 0.162f /* 'G' */,
        0.131f /* 'H' */, 0.489f /* 'I' */, 0.031f /* 'J' */, 0.035f /* 'K' */, 0.301f /* 'L' */, 0.205f /* 'M' */, 0.253f /* 'N' */, 0.228f /* 'O' */,
        0.288f /* 'P' */, 0.034f /* 'Q' */, 0.380f /* 'R' */, 0.730f /* 'S' */, 0.675f /* 'T' */, 0.265f /* 'U' */, 0.309f /* 'V' */, 0.137f /* 'W' */,
        0.084f /* 'X' */, 0.023f /* 'Y' */, 0.023f /* 'Z' */, 0.591f /* '[' */, 0.085f /* '\' */, 0.590f /* ']' */, 0.013f /* '^' */, 0.797f /* '_' */,
        0.001f /* '`' */, 4.596f /* 'a' */, 1.296f /* 'b' */, 2.081f /* 'c' */, 2.005f /* 'd' */, 6.903f /* 'e' */, 1.494f /* 'f' */, 1.019f /* 'g' */,
        1.024f /* 'h' */, 3.750f /* 'i' */, 0.286f /* 'j' */, 0.439f /* 'k' */, 2.913f /* 'l' */, 1.459f /* 'm' */, 3.908f /* 'n' */, 3.230f /* 'o' */,
        1.444f /* 'p' */, 0.231f /* 'q' */, 4.220f /* 'r' */, 3.924f /* 's' */, 5.312f /* 't' */, 2.112f /* 'u' */, 0.737f /* 'v' */, 0.573f /* 'w' */,
        0.992f /* 'x' */, 1.067f /* 'y' */, 0.181f /* 'z' */, 0.391f /* '{' */, 0.056f /* '|' */, 0.391f /* '}' */, 0.002f /* '~' */, 0.000f /* '\x7F' */,
    };

    /// <summary>
    /// Picks the offsets of a second and (for values longer than 2) third character of
    /// <paramref name="value"/>, preferring ASCII characters with the lowest frequency.
    /// </summary>
    /// <param name="value">The value to analyze; must be longer than one character.</param>
    /// <param name="ignoreCase">When set, both casings of a character count towards its frequency.</param>
    /// <param name="ch2Offset">Receives the offset of the second character; never 0.</param>
    /// <param name="ch3Offset">Receives the offset of the third character, or 0 if none was picked.</param>
    public static void GetSingleStringMultiCharacterOffsets(string value, bool ignoreCase, out int ch2Offset, out int ch3Offset)
    {
        Debug.Assert(value.Length > 1);
        Debug.Assert(!ignoreCase || char.IsAscii(value[0]));

        int second = IndexOfAsciiCharWithLowestFrequency(value, ignoreCase);
        int third = 0;

        if (second < 0)
        {
            // Fewer than 2 ASCII chars in the value.
            Debug.Assert(!ignoreCase);

            // There is no frequency table for non-ASCII characters, so just take the last one.
            second = value.Length - 1;
        }

        if (value.Length > 2)
        {
            third = IndexOfAsciiCharWithLowestFrequency(value, ignoreCase, excludeIndex: second);

            if (third < 0)
            {
                // Fewer than 3 ASCII chars in the value.
                if (ignoreCase)
                {
                    // N=2 is still usable.
                    third = 0;
                }
                else
                {
                    // No frequency data for non-ASCII characters: take the last one and,
                    // if that collides with the second character, move the second one down.
                    third = value.Length - 1;

                    if (second == third)
                    {
                        second--;
                    }
                }
            }
        }

        Debug.Assert(second != 0);
        Debug.Assert(second != third);

        // Report the offsets in ascending order whenever a third character was picked.
        if (third > 0 && third < second)
        {
            (second, third) = (third, second);
        }

        ch2Offset = second;
        ch3Offset = third;
    }

    // Returns the index (>= 1) of the ASCII character with the lowest weighted frequency,
    // skipping 'excludeIndex', or -1 if no ASCII character qualifies.
    private static int IndexOfAsciiCharWithLowestFrequency(ReadOnlySpan<char> span, bool ignoreCase, int excludeIndex = -1)
    {
        float lowest = float.MaxValue;
        int lowestIndex = -1;

        // Index 0 is skipped: the first character is always used already.
        for (int i = 1; i < span.Length; i++)
        {
            if (i == excludeIndex)
            {
                continue;
            }

            char c = span[i];

            // No frequency data exists for non-ASCII characters, so they are ignored.
            if (!char.IsAscii(c))
            {
                continue;
            }

            float weighted = AsciiFrequency[c];

            if (ignoreCase)
            {
                // The other casing of the character matches as well.
                weighted += AsciiFrequency[c ^ 0x20];
            }

            // Avoiding characters from the front of the value for the 2nd and 3rd character
            // results in 18 % fewer false positive 3-char matches on "The Adventures of Sherlock Holmes".
            if (i <= 2)
            {
                weighted *= 1.5f;
            }

            // '<=' so that ties go to the later position.
            if (weighted <= lowest)
            {
                lowest = weighted;
                lowestIndex = i;
            }
        }

        return lowestIndex;
    }
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Inline array with room for eight object references.
/// </summary>
[InlineArray(8)]
internal struct EightPackedReferences
{
    // Element field of the inline array; only accessed through the span view of the struct.
#pragma warning disable CA1823 // Unused field -- https://github.com/dotnet/roslyn-analyzers/issues/6788
    private object? _ref0;
#pragma warning restore CA1823

    /// <summary>Copies <paramref name="values"/> (1 to 8 references) into the leading slots.</summary>
    public EightPackedReferences(ReadOnlySpan<object> values)
    {
        Debug.Assert(
            values.Length is > 0 and <= 8,
            $"Got {values.Length} values");

        // The inline array is used as the destination span; '!' only silences the nullability mismatch.
        values.CopyTo(this!);
    }
}
|
||||
}
|
|
@ -0,0 +1,175 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using static System.Buffers.StringSearchValuesHelper;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
|
||||
/// An implementation of the Rabin-Karp algorithm we use as a fallback for
|
||||
/// short inputs that we can't handle with Teddy.
|
||||
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
|
||||
/// Has an O(i * m) worst-case, but we will only use it for very short inputs.
|
||||
/// </summary>
|
||||
internal readonly struct RabinKarp
|
||||
{
|
||||
// The number of values we'll accept before falling back to Aho-Corasick.
|
||||
// This also affects when Teddy may be used.
|
||||
public const int MaxValues = 80;
|
||||
|
||||
// This is a tradeoff between memory consumption and the number of false positives
|
||||
// we have to rule out during the verification step.
|
||||
private const nuint BucketCount = 64;
|
||||
|
||||
// 18 = Vector128<byte>.Count + 2 (MatchStartOffset for N=3)
|
||||
// The logic in this class is not safe from overflows, but we avoid any issues by
|
||||
// only calling into it for inputs that are too short for Teddy to handle.
|
||||
private const int MaxInputLength = 18 - 1;
|
||||
|
||||
// We're using nuint as the rolling hash, so we can spread the hash over more bits on 64bit.
|
||||
private static int HashShiftPerElement => IntPtr.Size == 8 ? 2 : 1;
|
||||
|
||||
private readonly string[]?[] _buckets;
|
||||
private readonly int _hashLength;
|
||||
private readonly nuint _hashUpdateMultiplier;
|
||||
|
||||
public RabinKarp(ReadOnlySpan<string> values)
|
||||
{
|
||||
Debug.Assert(values.Length <= MaxValues);
|
||||
|
||||
int minimumLength = int.MaxValue;
|
||||
foreach (string value in values)
|
||||
{
|
||||
minimumLength = Math.Min(minimumLength, value.Length);
|
||||
}
|
||||
|
||||
Debug.Assert(minimumLength > 1);
|
||||
|
||||
_hashLength = minimumLength;
|
||||
_hashUpdateMultiplier = (nuint)1 << ((minimumLength - 1) * HashShiftPerElement);
|
||||
|
||||
if (minimumLength > MaxInputLength)
|
||||
{
|
||||
// All the values are long. They'll either be handled by Teddy or won't match at all.
|
||||
// There's no point in allocating the buckets as they will never be accessed.
|
||||
_buckets = null!;
|
||||
return;
|
||||
}
|
||||
|
||||
string[]?[] buckets = _buckets = new string[BucketCount][];
|
||||
|
||||
foreach (string value in values)
|
||||
{
|
||||
nuint hash = 0;
|
||||
for (int i = 0; i < minimumLength; i++)
|
||||
{
|
||||
hash = (hash << HashShiftPerElement) + value[i];
|
||||
}
|
||||
|
||||
nuint bucket = hash % BucketCount;
|
||||
string[] newBucket;
|
||||
|
||||
// Start with a bucket containing 1 element and reallocate larger ones if needed.
|
||||
// As MaxValues is similar to BucketCount, we will have 1 value per bucket on average.
|
||||
if (buckets[bucket] is string[] existingBucket)
|
||||
{
|
||||
newBucket = new string[existingBucket.Length + 1];
|
||||
existingBucket.AsSpan().CopyTo(newBucket);
|
||||
}
|
||||
else
|
||||
{
|
||||
newBucket = new string[1];
|
||||
}
|
||||
|
||||
newBucket[^1] = value;
|
||||
buckets[bucket] = newBucket;
|
||||
}
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public readonly int IndexOfAny<TCaseSensitivity>(ReadOnlySpan<char> span)
|
||||
where TCaseSensitivity : struct, ICaseSensitivity
|
||||
{
|
||||
return typeof(TCaseSensitivity) == typeof(CaseInsensitiveUnicode)
|
||||
? IndexOfAnyCaseInsensitiveUnicode(span)
|
||||
: IndexOfAnyCore<TCaseSensitivity>(span);
|
||||
}
|
||||
|
||||
// Scans 'span' with a rolling hash over windows of '_hashLength' characters and verifies the
// candidates stored in the bucket selected by each window's hash.
// Returns the offset of the first verified match, or -1 if there is none.
// Must not be instantiated with CaseInsensitiveUnicode (see IndexOfAnyCaseInsensitiveUnicode).
private readonly int IndexOfAnyCore<TCaseSensitivity>(ReadOnlySpan<char> span)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    Debug.Assert(typeof(TCaseSensitivity) != typeof(CaseInsensitiveUnicode));
    // This path only deals with inputs up to MaxInputLength; anything longer is Teddy's job.
    Debug.Assert(span.Length <= MaxInputLength, "Teddy should have handled long inputs.");

    ref char current = ref MemoryMarshal.GetReference(span);

    int hashLength = _hashLength;

    if (span.Length >= hashLength)
    {
        // 'end' is the last position at which a full window of 'hashLength' characters still fits.
        ref char end = ref Unsafe.Add(ref MemoryMarshal.GetReference(span), (uint)(span.Length - hashLength));

        // Hash of the first window.
        nuint hash = 0;
        for (uint i = 0; i < hashLength; i++)
        {
            hash = (hash << HashShiftPerElement) + TCaseSensitivity.TransformInput(Unsafe.Add(ref current, i));
        }

        Debug.Assert(_buckets is not null);
        ref string[]? bucketsRef = ref MemoryMarshal.GetArrayDataReference(_buckets);

        while (true)
        {
            ValidateReadPosition(span, ref current);

            // A non-null bucket means some value may share this window's hash; verify each candidate.
            if (Unsafe.Add(ref bucketsRef, hash % BucketCount) is string[] bucket)
            {
                int startOffset = (int)((nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref current) / sizeof(char));

                if (StartsWith<TCaseSensitivity>(ref current, span.Length - startOffset, bucket))
                {
                    return startOffset;
                }
            }

            // Stop once the window starting at 'end' has been checked; there is no further window to roll into.
            if (!Unsafe.IsAddressLessThan(ref current, ref end))
            {
                break;
            }

            char previous = TCaseSensitivity.TransformInput(current);
            char next = TCaseSensitivity.TransformInput(Unsafe.Add(ref current, (uint)hashLength));

            // Update the hash by removing the previous character and adding the next one.
            hash = ((hash - (previous * _hashUpdateMultiplier)) << HashShiftPerElement) + next;
            current = ref Unsafe.Add(ref current, 1);
        }
    }

    return -1;
}
|
||||
|
||||
// Search path for the CaseInsensitiveUnicode mode: the whole input is converted with
// Ordinal.ToUpperOrdinal into a stack buffer, then searched by the case-sensitive core routine.
private readonly int IndexOfAnyCaseInsensitiveUnicode(ReadOnlySpan<char> span)
{
    Debug.Assert(span.Length <= MaxInputLength, "Teddy should have handled long inputs.");

    // Can't possibly match, all the values are longer than our input span.
    if (span.Length < _hashLength)
    {
        return -1;
    }

    Span<char> buffer = stackalloc char[MaxInputLength];
    Span<char> upperCased = buffer.Slice(0, span.Length);

    int written = Ordinal.ToUpperOrdinal(span, upperCased);
    Debug.Assert(written == upperCased.Length);

    // The case conversion has already happened, so search with CaseSensitive rather than CaseInsensitiveUnicode.
    return IndexOfAnyCore<CaseSensitive>(upperCased);
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,191 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
// Provides implementations for helpers shared across multiple SearchValues<string> implementations,
|
||||
// such as normalizing and matching values under different case sensitivity rules.
|
||||
internal static class StringSearchValuesHelper
|
||||
{
|
||||
// Debug-only sanity check: wraps the raw (start, length) pair in a span and defers to the span-based overload.
[Conditional("DEBUG")]
public static void ValidateReadPosition(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, int offset = 0)
{
    Debug.Assert(searchSpaceLength >= 0);

    ReadOnlySpan<char> wholeSearchSpace = MemoryMarshal.CreateReadOnlySpan(ref searchSpaceStart, searchSpaceLength);
    ValidateReadPosition(wholeSearchSpace, ref searchSpace, offset);
}
|
||||
|
||||
// Debug-only sanity check that 'searchSpace' points at a char-aligned position at or after the start of
// 'span', and that 'offset' does not exceed the number of characters remaining from that position.
[Conditional("DEBUG")]
public static void ValidateReadPosition(ReadOnlySpan<char> span, ref char searchSpace, int offset = 0)
{
    Debug.Assert(offset >= 0);

    // Distance in bytes from the first element of the span to the position being validated.
    nint byteDistance = Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref searchSpace);
    Debug.Assert(byteDistance >= 0);
    Debug.Assert((byteDistance & 1) == 0);

    int charsBeforePosition = (int)(byteDistance / 2);
    int charsRemaining = span.Length - charsBeforePosition;
    Debug.Assert(offset <= charsRemaining);
}
|
||||
|
||||
// Returns true as soon as any of 'candidates' matches at 'matchStart'; false if none does.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool StartsWith<TCaseSensitivity>(ref char matchStart, int lengthRemaining, string[] candidates)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    for (int i = 0; i < candidates.Length; i++)
    {
        if (StartsWith<TCaseSensitivity>(ref matchStart, lengthRemaining, candidates[i]))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
// Returns whether 'candidate' matches the input beginning at 'matchStart',
// where 'lengthRemaining' is the number of input characters available from that position.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool StartsWith<TCaseSensitivity>(ref char matchStart, int lengthRemaining, string candidate)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    Debug.Assert(lengthRemaining > 0);

    // Only compare when the whole candidate fits into the remaining input.
    return lengthRemaining >= candidate.Length
        && TCaseSensitivity.Equals(ref matchStart, candidate);
}
|
||||
|
||||
// Character-by-character comparison of the input at 'matchStart' against 'candidate'.
// Only the input side goes through TransformInput; the candidate is compared as stored.
// Reads candidate.Length characters from 'matchStart' without any bounds checks.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ScalarEquals<TCaseSensitivity>(ref char matchStart, string candidate)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    for (int index = 0; index < candidate.Length; index++)
    {
        char transformed = TCaseSensitivity.TransformInput(Unsafe.Add(ref matchStart, index));

        if (transformed != candidate[index])
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
// Describes one case-sensitivity mode through static abstract members, so that search code can be
// written once and specialized per mode via a generic type parameter.
public interface ICaseSensitivity
{
    // Transforms a single input character before it is compared or hashed.
    static abstract char TransformInput(char input);

    // Vectorized counterparts of the above, operating on 16, 32 and 64 bytes of input at a time.
    static abstract Vector128<byte> TransformInput(Vector128<byte> input);
    static abstract Vector256<byte> TransformInput(Vector256<byte> input);
    static abstract Vector512<byte> TransformInput(Vector512<byte> input);

    // Returns whether the input starting at 'matchStart' equals 'candidate' under this mode.
    static abstract bool Equals(ref char matchStart, string candidate);
}
|
||||
|
||||
// Case-sensitive mode: every TransformInput overload returns its argument unchanged.
public readonly struct CaseSensitive : ICaseSensitivity
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static char TransformInput(char input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return ScalarEquals<CaseSensitive>(ref matchStart, candidate);
    }
}
|
||||
|
||||
// Upper-cases inputs by unconditionally clearing the 0x20 bit, which is only correct when every
// input character is an ASCII letter. Other characters may be transformed incorrectly,
// and callers are responsible for accounting for that.
public readonly struct CaseInsensitiveAsciiLetters : ICaseSensitivity
{
    // Byte mask with every bit set except 0x20, the bit that differs between 'a'-'z' and 'A'-'Z'.
    private const byte ClearCaseBitMask = 0xDF;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static char TransformInput(char input)
    {
        // Same mask widened to 16 bits so the upper byte of the char is preserved.
        return (char)(input & 0xFFDF);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        return input & Vector128.Create(ClearCaseBitMask);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        return input & Vector256.Create(ClearCaseBitMask);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        return input & Vector512.Create(ClearCaseBitMask);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return ScalarEquals<CaseInsensitiveAsciiLetters>(ref matchStart, candidate);
    }
}
|
||||
|
||||
// Upper-cases inputs under the assumption that they are all ASCII: only 'a'-'z' are changed.
// Non-ASCII inputs may be transformed incorrectly, and callers are responsible for accounting for that.
public readonly struct CaseInsensitiveAscii : ICaseSensitivity
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static char TransformInput(char input)
    {
        return TextInfo.ToUpperAsciiInvariant(input);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        // Subtracting (128 + 'a') moves 'a'..'z' onto the 26 lowest signed byte values (-128..-103),
        // so one signed comparison against (128 + 26) (i.e. -102) identifies the lowercase letters.
        Vector128<sbyte> shifted = (input - Vector128.Create((byte)(128 + 'a'))).AsSByte();
        Vector128<sbyte> upperBound = Vector128.Create((byte)(128 + 26)).AsSByte();
        Vector128<byte> isLowercase = Vector128.LessThan(shifted, upperBound).AsByte();

        // Flip the 0x20 case bit only in the lanes that hold a lowercase letter.
        return input ^ (isLowercase & Vector128.Create((byte)0x20));
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        // Same range-check trick as the Vector128<byte> overload, 32 bytes at a time.
        Vector256<sbyte> shifted = (input - Vector256.Create((byte)(128 + 'a'))).AsSByte();
        Vector256<sbyte> upperBound = Vector256.Create((byte)(128 + 26)).AsSByte();
        Vector256<byte> isLowercase = Vector256.LessThan(shifted, upperBound).AsByte();

        return input ^ (isLowercase & Vector256.Create((byte)0x20));
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        // Same range-check trick as the Vector128<byte> overload, 64 bytes at a time.
        Vector512<sbyte> shifted = (input - Vector512.Create((byte)(128 + 'a'))).AsSByte();
        Vector512<sbyte> upperBound = Vector512.Create((byte)(128 + 26)).AsSByte();
        Vector512<byte> isLowercase = Vector512.LessThan(shifted, upperBound).AsByte();

        return input ^ (isLowercase & Vector512.Create((byte)0x20));
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return ScalarEquals<CaseInsensitiveAscii>(ref matchStart, candidate);
    }
}
|
||||
|
||||
// Non-ASCII inputs can't be mapped efficiently to their Ordinal uppercase variants,
// so this mode only supports verifying a whole candidate through Equals.
// The TransformInput overloads are not meant to be called and throw UnreachableException.
public readonly struct CaseInsensitiveUnicode : ICaseSensitivity
{
    public static char TransformInput(char input)
    {
        throw new UnreachableException();
    }

    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        throw new UnreachableException();
    }

    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        throw new UnreachableException();
    }

    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        throw new UnreachableException();
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        ref char candidateStart = ref candidate.GetRawStringData();
        return Ordinal.EqualsIgnoreCase(ref matchStart, ref candidateStart, candidate.Length);
    }
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,142 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Numerics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
internal static class TeddyBucketizer
|
||||
{
|
||||
// Builds the low/high nibble bitmaps for the character at 'offset' of each value.
// Same idea as GenerateBucketizedFingerprint, except that value 'i' is its own bucket (bit 'i').
public static (Vector512<byte> Low, Vector512<byte> High) GenerateNonBucketizedFingerprint(ReadOnlySpan<string> values, int offset)
{
    Debug.Assert(values.Length <= 8);

    Vector128<byte> lowBitmap = default;
    Vector128<byte> highBitmap = default;

    for (int valueIndex = 0; valueIndex < values.Length; valueIndex++)
    {
        char c = values[valueIndex][offset];
        Debug.Assert(char.IsAscii(c));

        int valueBit = 1 << valueIndex;
        int lowNibble = c & 0xF;
        int highNibble = c >> 4;

        // Record that this value can match a byte with these two nibbles.
        byte updatedLow = (byte)(lowBitmap.GetElementUnsafe(lowNibble) | valueBit);
        byte updatedHigh = (byte)(highBitmap.GetElementUnsafe(highNibble) | valueBit);

        lowBitmap.SetElementUnsafe(lowNibble, updatedLow);
        highBitmap.SetElementUnsafe(highNibble, updatedHigh);
    }

    // The 16-byte bitmaps are repeated across the full 512-bit vectors.
    return (DuplicateTo512(lowBitmap), DuplicateTo512(highBitmap));
}
|
||||
|
||||
// Builds the low/high nibble bitmaps for the character at 'offset' of every value in every bucket.
// There are at most 8 buckets, each identified by one bit. A bitmap maps each of the 16 possible
// nibble values to an 8-bit set of buckets.
// Example: bucket 0 contains ["foo", "bar"], so bit 0 is set as follows:
//   offset 0: 'f' = 0x66, 'b' = 0x62 -> n0Low at indexes 2 and 6,  n0High at index 6.
//   offset 1: 'o' = 0x6F, 'a' = 0x61 -> n1Low at indexes 1 and 15, n1High at index 6.
//   offset 2: 'o' = 0x6F, 'r' = 0x72 -> n2Low at indexes 2 and 15, n2High at indexes 6 and 7.
// Doing this for every bucket and ORing the per-bucket bitmaps yields a single bitmap per nibble.
public static (Vector512<byte> Low, Vector512<byte> High) GenerateBucketizedFingerprint(string[][] valueBuckets, int offset)
{
    Debug.Assert(valueBuckets.Length <= 8);

    Vector128<byte> lowBitmap = default;
    Vector128<byte> highBitmap = default;

    for (int bucketIndex = 0; bucketIndex < valueBuckets.Length; bucketIndex++)
    {
        int bucketBit = 1 << bucketIndex;
        string[] bucketValues = valueBuckets[bucketIndex];

        for (int v = 0; v < bucketValues.Length; v++)
        {
            char c = bucketValues[v][offset];
            Debug.Assert(char.IsAscii(c));

            int lowNibble = c & 0xF;
            int highNibble = c >> 4;

            byte updatedLow = (byte)(lowBitmap.GetElementUnsafe(lowNibble) | bucketBit);
            byte updatedHigh = (byte)(highBitmap.GetElementUnsafe(highNibble) | bucketBit);

            lowBitmap.SetElementUnsafe(lowNibble, updatedLow);
            highBitmap.SetElementUnsafe(highNibble, updatedHigh);
        }
    }

    // The 16-byte bitmaps are repeated across the full 512-bit vectors.
    return (DuplicateTo512(lowBitmap), DuplicateTo512(highBitmap));
}
|
||||
|
||||
// Repeats the 128-bit 'vector' four times to fill a 512-bit vector.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<byte> DuplicateTo512(Vector128<byte> vector)
{
    Vector256<byte> doubled = Vector256.Create(vector, vector);

    return Vector512.Create(doubled, doubled);
}
|
||||
|
||||
// Distributes 'values' over 'bucketCount' buckets such that values sharing the same first 'n'
// characters always end up in the same bucket. Returns one string[] per bucket.
public static string[][] Bucketize(ReadOnlySpan<string> values, int bucketCount, int n)
{
    Debug.Assert(bucketCount == 8, "This may change if we end up supporting the 'fat Teddy' variant.");
    Debug.Assert(values.Length > bucketCount, "Should be using a non-bucketized implementation.");
    Debug.Assert(values.Length <= RabinKarp.MaxValues);

    // assignments[i] is the index of the bucket that values[i] belongs to.
    // Recording the assignment first avoids allocating temporary lists while building each bucket.
    Span<int> assignments = stackalloc int[RabinKarp.MaxValues].Slice(0, values.Length);

    // Patterns with the same prefix share a bucket to avoid wasting time during verification steps.
    Dictionary<int, int> bucketByPrefix = new(bucketCount);

    int distinctPrefixes = 0;

    for (int i = 0; i < values.Length; i++)
    {
        string value = values[i];

        // Pack the first 'n' characters into a single integer key, 8 bits per (ASCII) character.
        int prefixKey = 0;
        for (int j = 0; j < n; j++)
        {
            char c = value[j];
            Debug.Assert(char.IsAscii(c));
            prefixKey = (prefixKey << 8) | c;
        }

        if (!bucketByPrefix.TryGetValue(prefixKey, out int assignedBucket))
        {
            // Potential optimization: values with different prefixes are currently merged into buckets
            // round-robin. A more sophisticated strategy could try to minimize the number of values in
            // each bucket, or minimize the PopCount of the final merged fingerprints.
            // Example of the latter: https://gist.github.com/MihaZupan/831324d1d646b69ae0ba4b54e3446a49

            assignedBucket = distinctPrefixes % bucketCount;
            distinctPrefixes++;
            bucketByPrefix.Add(prefixKey, assignedBucket);
        }

        assignments[i] = assignedBucket;
    }

    string[][] buckets = new string[bucketCount][];

    for (int bucket = 0; bucket < buckets.Length; bucket++)
    {
        // Size the bucket exactly, then fill it in the original order of 'values'.
        string[] members = new string[assignments.Count(bucket)];
        int written = 0;

        for (int i = 0; i < assignments.Length; i++)
        {
            if (assignments[i] == bucket)
            {
                members[written++] = values[i];
            }
        }

        Debug.Assert(written == members.Length);
        buckets[bucket] = members;
    }

    return buckets;
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,436 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
|
||||
/// Contains the implementation of core vectorized Teddy matching operations.
|
||||
/// They determine which buckets contain potential matches for each input position.
|
||||
/// </summary>
|
||||
internal static class TeddyHelper
|
||||
{
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
public static (Vector128<byte> Result, Vector128<byte> Prev0) ProcessInputN2(
    Vector128<byte> input,
    Vector128<byte> prev0,
    Vector128<byte> n0Low, Vector128<byte> n0High,
    Vector128<byte> n1Low, Vector128<byte> n1High)
{
    // Two-byte variant of ProcessInputN3 (which carries the full description of the technique):
    // 4 nibble bitmaps instead of 6, and a single carried-over vector from the previous iteration instead of 2.
    (Vector128<byte> lowNibbles, Vector128<byte> highNibbles) = GetNibbles(input);

    // Look up both nibbles of every input byte in the bitmaps for value byte 0 and value byte 1.
    Vector128<byte> match0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector128<byte> match1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);

    // Move the byte-0 matches one place to the right, shifting in 1 byte from the previous iteration,
    // so that they line up with the byte-1 matches.
    Vector128<byte> aligned0 = RightShift1(prev0, match0);

    // Only buckets that match at all 4 nibble positions remain set.
    // 'match0' is handed back so the caller can pass it in as 'prev0' on the next iteration.
    return (aligned0 & match1, match0);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
public static (Vector256<byte> Result, Vector256<byte> Prev0) ProcessInputN2(
    Vector256<byte> input,
    Vector256<byte> prev0,
    Vector256<byte> n0Low, Vector256<byte> n0High,
    Vector256<byte> n1Low, Vector256<byte> n1High)
{
    // Same steps as the Vector128<byte> overload of ProcessInputN2, applied to 32 input characters at a time.
    (Vector256<byte> lowNibbles, Vector256<byte> highNibbles) = GetNibbles(input);

    Vector256<byte> match0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector256<byte> match1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);

    // Line up the byte-0 matches with the byte-1 matches, carrying one byte over from 'prev0'.
    Vector256<byte> aligned0 = RightShift1(prev0, match0);

    return (aligned0 & match1, match0);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
public static (Vector512<byte> Result, Vector512<byte> Prev0) ProcessInputN2(
    Vector512<byte> input,
    Vector512<byte> prev0,
    Vector512<byte> n0Low, Vector512<byte> n0High,
    Vector512<byte> n1Low, Vector512<byte> n1High)
{
    // Same steps as the Vector128<byte> overload of ProcessInputN2, applied to 64 input characters at a time.
    (Vector512<byte> lowNibbles, Vector512<byte> highNibbles) = GetNibbles(input);

    Vector512<byte> match0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector512<byte> match1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);

    // Line up the byte-0 matches with the byte-1 matches, carrying one byte over from 'prev0'.
    Vector512<byte> aligned0 = RightShift1(prev0, match0);

    return (aligned0 & match1, match0);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
public static (Vector128<byte> Result, Vector128<byte> Prev0, Vector128<byte> Prev1) ProcessInputN3(
    Vector128<byte> input,
    Vector128<byte> prev0, Vector128<byte> prev1,
    Vector128<byte> n0Low, Vector128<byte> n0High,
    Vector128<byte> n1Low, Vector128<byte> n1High,
    Vector128<byte> n2Low, Vector128<byte> n2High)
{
    // Core step of the Teddy algorithm: determine which buckets may contain a match at each input position.
    // Every bitmap argument (n0Low, n0High, ...) maps each of the 16 possible nibble values to an 8-bit bitmap
    // of buckets. Each input nibble is tested against these bitmaps, and the outcomes are ANDed together so
    // that only buckets matching at all 6 nibble positions remain. Each byte of the result is an 8-bit
    // bitmask of the buckets that may match at that position.
    (Vector128<byte> lowNibbles, Vector128<byte> highNibbles) = GetNibbles(input);

    // matchK holds, per input position, the buckets that may have the byte at that position as byte K of a value.
    Vector128<byte> match0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector128<byte> match1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);
    Vector128<byte> match2 = Shuffle(n2Low, n2High, lowNibbles, highNibbles);

    // Example with a single bucket containing the single string "ABC" and the input "BC text ABC text":
    //   input:  [B, C,  , t, e, x, t,  , A, B, C,  , t, e, x, t]
    //   match0: [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]   positions of 'A'
    //   match1: [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]   positions of 'B'
    //   match2: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]   positions of 'C'
    // The input contains "ABC", but the three rows are not aligned, so they can't simply be ANDed together.
    // Shifting match0 right by 2 places and match1 right by 1 place lines them up with match2.
    // The leading positions vacated by the shift are filled with the last bytes of the previous iteration's
    // matches (prev0 and prev1), so that a value starting at the end of the previous input is not missed.
    // If the previous input was "Random strings A", prev0 ends in [..., 0, 1] and we get:
    //   aligned0: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    //   aligned1: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    //   match2:   [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    Vector128<byte> aligned0 = RightShift2(prev0, match0);
    Vector128<byte> aligned1 = RightShift1(prev1, match1);

    // AND the aligned rows to keep only buckets that match at all 6 nibble positions.
    //   result:   [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
    // The hit at index 1 belongs to a match that started 2 bytes earlier, at the end of the previous iteration.
    // The caller must account for that when verifying potential matches,
    // see 'MatchStartOffsetN3 = 2' in 'AsciiStringSearchValuesTeddyBase'.
    Vector128<byte> result = aligned0 & aligned1 & match2;

    // 'match0' and 'match1' are handed back so the caller can pass them in as 'prev0' and 'prev1'
    // on the next loop iteration.
    return (result, match0, match1);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
public static (Vector256<byte> Result, Vector256<byte> Prev0, Vector256<byte> Prev1) ProcessInputN3(
    Vector256<byte> input,
    Vector256<byte> prev0, Vector256<byte> prev1,
    Vector256<byte> n0Low, Vector256<byte> n0High,
    Vector256<byte> n1Low, Vector256<byte> n1High,
    Vector256<byte> n2Low, Vector256<byte> n2High)
{
    // Same steps as the Vector128<byte> overload of ProcessInputN3, applied to 32 input characters at a time.
    (Vector256<byte> lowNibbles, Vector256<byte> highNibbles) = GetNibbles(input);

    Vector256<byte> match0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector256<byte> match1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);
    Vector256<byte> match2 = Shuffle(n2Low, n2High, lowNibbles, highNibbles);

    // Line up the byte-0 and byte-1 matches with the byte-2 matches, carrying bytes over from the previous iteration.
    Vector256<byte> aligned0 = RightShift2(prev0, match0);
    Vector256<byte> aligned1 = RightShift1(prev1, match1);

    return (aligned0 & aligned1 & match2, match0, match1);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
public static (Vector512<byte> Result, Vector512<byte> Prev0, Vector512<byte> Prev1) ProcessInputN3(
    Vector512<byte> input,
    Vector512<byte> prev0, Vector512<byte> prev1,
    Vector512<byte> n0Low, Vector512<byte> n0High,
    Vector512<byte> n1Low, Vector512<byte> n1High,
    Vector512<byte> n2Low, Vector512<byte> n2High)
{
    // Same steps as the Vector128<byte> overload of ProcessInputN3, applied to 64 input characters at a time.
    (Vector512<byte> lowNibbles, Vector512<byte> highNibbles) = GetNibbles(input);

    Vector512<byte> match0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector512<byte> match1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);
    Vector512<byte> match2 = Shuffle(n2Low, n2High, lowNibbles, highNibbles);

    // Line up the byte-0 and byte-1 matches with the byte-2 matches, carrying bytes over from the previous iteration.
    Vector512<byte> aligned0 = RightShift2(prev0, match0);
    Vector512<byte> aligned1 = RightShift1(prev1, match1);

    return (aligned0 & aligned1 & match2, match0, match1);
}
|
||||
|
||||
// Reads two Vector128<ushort> (16 chars) starting at 'source' and narrows them into a single Vector128<byte>.
// On X86, characters above 32767 are turned into 0, but we account for that by not using Teddy
// if any of the string values contain a 0.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Sse2))]
[CompExactlyDependsOn(typeof(AdvSimd))]
public static Vector128<byte> LoadAndPack16AsciiChars(ref char source)
{
    Vector128<ushort> firstHalf = Vector128.LoadUnsafe(ref source);
    Vector128<ushort> secondHalf = Vector128.LoadUnsafe(ref source, (nuint)Vector128<ushort>.Count);

    if (Sse2.IsSupported)
    {
        return Sse2.PackUnsignedSaturate(firstHalf.AsInt16(), secondHalf.AsInt16());
    }

    // Arm: narrow the first half into the lower 8 bytes, then the second half into the upper 8 bytes.
    return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(firstHalf), secondHalf);
}
|
||||
|
||||
// Reads two Vector256<ushort> (32 chars) starting at 'source' and narrows them into a single Vector256<byte>.
// Characters above 32767 are turned into 0, but we account for that by not using Teddy
// if any of the string values contain a 0.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
public static Vector256<byte> LoadAndPack32AsciiChars(ref char source)
{
    Vector256<ushort> firstHalf = Vector256.LoadUnsafe(ref source);
    Vector256<ushort> secondHalf = Vector256.LoadUnsafe(ref source, (nuint)Vector256<ushort>.Count);

    // NOTE(review): FixUpPackedVector256Result presumably restores the element order after the
    // pack instruction - confirm against PackedSpanHelpers.
    return PackedSpanHelpers.FixUpPackedVector256Result(
        Avx2.PackUnsignedSaturate(firstHalf.AsInt16(), secondHalf.AsInt16()));
}
|
||||
|
||||
// Reads two Vector512<ushort> (64 chars) starting at 'source' and narrows them into a single Vector512<byte>.
// Characters above 32767 are turned into 0, but we account for that by not using Teddy
// if any of the string values contain a 0.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
public static Vector512<byte> LoadAndPack64AsciiChars(ref char source)
{
    Vector512<ushort> firstHalf = Vector512.LoadUnsafe(ref source);
    Vector512<ushort> secondHalf = Vector512.LoadUnsafe(ref source, (nuint)Vector512<ushort>.Count);

    // NOTE(review): FixUpPackedVector512Result presumably restores the element order after the
    // pack instruction - confirm against PackedSpanHelpers.
    return PackedSpanHelpers.FixUpPackedVector512Result(
        Avx512BW.PackUnsignedSaturate(firstHalf.AsInt16(), secondHalf.AsInt16()));
}
|
||||
|
||||
// Splits every input byte into the two values used as shuffle indexes: 'Low' and 'High' (input >>> 4).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd))]
private static (Vector128<byte> Low, Vector128<byte> High) GetNibbles(Vector128<byte> input)
{
    // With Ssse3 the input is returned as 'Low' unmasked. That is not strictly the low nibble,
    // but Ssse3.Shuffle does an implicit 'AND 0xF', so the explicit AND would be redundant.
    Vector128<byte> low = input;

    if (!Ssse3.IsSupported)
    {
        low &= Vector128.Create((byte)0xF);
    }

    Vector128<byte> high = input >>> 4;

    return (low, high);
}
|
||||
|
||||
// Splits every input byte into the two values used as shuffle indexes: 'Low' and 'High' (input >>> 4).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector256<byte> Low, Vector256<byte> High) GetNibbles(Vector256<byte> input)
{
    // 'Low' is the unmasked input. That is not strictly the low nibble, but Avx2.Shuffle
    // does an implicit 'AND 0xF', so the explicit AND would be redundant.
    return (input, input >>> 4);
}
|
||||
|
||||
// Splits every input byte into the two values used as shuffle indexes: 'Low' and 'High' (input >>> 4).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector512<byte> Low, Vector512<byte> High) GetNibbles(Vector512<byte> input)
{
    // 'Low' is the unmasked input. That is not strictly the low nibble, but Avx512BW.Shuffle
    // does an implicit 'AND 0xF', so the explicit AND would be redundant.
    return (input, input >>> 4);
}
|
||||
|
||||
// Looks up 'low' in 'maskLow' and 'high' in 'maskHigh', then keeps only the bits set in both lookups.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static Vector128<byte> Shuffle(Vector128<byte> maskLow, Vector128<byte> maskHigh, Vector128<byte> low, Vector128<byte> high)
{
    Vector128<byte> lowLookup = Vector128.ShuffleUnsafe(maskLow, low);
    Vector128<byte> highLookup = Vector128.ShuffleUnsafe(maskHigh, high);

    return lowLookup & highLookup;
}
|
||||
|
||||
// 256-bit variant: looks up 'low' in 'maskLow' and 'high' in 'maskHigh' with Avx2.Shuffle,
// then ANDs the two lookup results together.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> Shuffle(Vector256<byte> maskLow, Vector256<byte> maskHigh, Vector256<byte> low, Vector256<byte> high)
{
    Vector256<byte> lowLookup = Avx2.Shuffle(maskLow, low);
    Vector256<byte> highLookup = Avx2.Shuffle(maskHigh, high);

    return lowLookup & highLookup;
}
|
||||
|
||||
// 512-bit variant: looks up 'low' in 'maskLow' and 'high' in 'maskHigh' with Avx512BW.Shuffle,
// then ANDs the two lookup results together.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> Shuffle(Vector512<byte> maskLow, Vector512<byte> maskHigh, Vector512<byte> low, Vector512<byte> high)
{
    Vector512<byte> lowLookup = Avx512BW.Shuffle(maskLow, low);
    Vector512<byte> highLookup = Avx512BW.Shuffle(maskHigh, high);

    return lowLookup & highLookup;
}
|
||||
|
||||
// Treats 'left' followed by 'right' as one 32-byte sequence and returns the 16 bytes that start
// at the last byte of 'left':
//   left:   [ 0,  1,  2, ..., 14, 15]
//   right:  [16, 17, 18, ..., 30, 31]
//   result: [15, 16, 17, ..., 29, 30]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static Vector128<byte> RightShift1(Vector128<byte> left, Vector128<byte> right)
{
    if (Ssse3.IsSupported)
    {
        return Ssse3.AlignRight(right, left, 15);
    }

    // Non-Ssse3 path (TBX):
    // 'carry' holds the 16th element of 'left' in its 1st position.
    // The table lookup then moves every element of 'right' one place further along, and the 0xFF index
    // in the first position makes the lookup keep the value already present in 'carry' there.
    Vector128<byte> carry = Vector128.Shuffle(left, Vector128.Create(15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte());
    Vector128<byte> indices = Vector128.Create(0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);

    return AdvSimd.Arm64.VectorTableLookupExtension(carry, right, indices);
}
|
||||
|
||||
// Treats 'left' followed by 'right' as one 32-byte sequence and returns the 16 bytes that start
// at the second-to-last byte of 'left':
//   left:   [ 0,  1,  2, ..., 14, 15]
//   right:  [16, 17, 18, ..., 30, 31]
//   result: [14, 15, 16, ..., 28, 29]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static Vector128<byte> RightShift2(Vector128<byte> left, Vector128<byte> right)
{
    if (Ssse3.IsSupported)
    {
        return Ssse3.AlignRight(right, left, 14);
    }

    // Non-Ssse3 path (TBX):
    // 'carry' holds the 15th and 16th element of 'left' in its 1st and 2nd position.
    // The table lookup then moves every element of 'right' two places further along, and the 0xFF indices
    // in the first two positions make the lookup keep the values already present in 'carry' there.
    Vector128<byte> carry = Vector128.Shuffle(left, Vector128.Create(14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte());
    Vector128<byte> indices = Vector128.Create(0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);

    return AdvSimd.Arm64.VectorTableLookupExtension(carry, right, indices);
}
|
||||
|
||||
// Treats 'left' followed by 'right' as one 64-byte sequence and returns the 32 bytes that start
// at the last byte of 'left':
//   left:    0,  1,  2, ..., 30, [31]
//   right:  32, 33, 34, ..., 62,  63
//   result: [31], 32, 33, ..., 61, 62
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> RightShift1(Vector256<byte> left, Vector256<byte> right)
{
    // Avx2.AlignRight behaves like two independent Ssse3.AlignRight calls, one per 128-bit half.
    // Avx2.AlignRight(right, left, 15) would therefore produce
    //   lower: [15], 32, 33, ..., 46
    //   upper: [31], 48, 49, ..., 62
    // where the first element of each half is taken 16 places too early:
    // we need 31 instead of 15 and 47 instead of 31.
    //
    // To fix that, Permute2x128 first combines the upper half of 'left' (control 1 << 0)
    // with the lower half of 'right' (control 2 << 4):
    //   carry: 16, 17, ..., 30, [31], 32, 33, ..., 46, [47]
    // so the last element of each half of 'carry' is exactly the value AlignRight has to pull in.
    const byte UpperOfLeftThenLowerOfRight = (1 << 0) + (2 << 4);

    Vector256<byte> carry = Avx2.Permute2x128(left, right, UpperOfLeftThenLowerOfRight);

    return Avx2.AlignRight(right, carry, 15);
}
|
||||
|
||||
// Treats 'left' followed by 'right' as one 64-byte sequence and returns the 32 bytes that start
// at the second-to-last byte of 'left' (result: 30, 31, 32, ..., 60, 61 for left = 0..31, right = 32..63).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> RightShift2(Vector256<byte> left, Vector256<byte> right)
{
    // Avx2.AlignRight works on each 128-bit half separately, so the bytes it pulls in from its second
    // operand must sit at the end of each half. Permute2x128 arranges that by combining the upper half
    // of 'left' (control 1 << 0) with the lower half of 'right' (control 2 << 4).
    const byte UpperOfLeftThenLowerOfRight = (1 << 0) + (2 << 4);

    Vector256<byte> carry = Avx2.Permute2x128(left, right, UpperOfLeftThenLowerOfRight);

    return Avx2.AlignRight(right, carry, 14);
}
|
||||
|
||||
// Treats 'left' followed by 'right' as one 128-byte sequence and returns the 64 bytes that start
// at the last byte of 'left':
//   left:     0,  1,  2, ..., 62, [63]
//   right:   64, 65, 66, ..., 126, 127
//   result: [63], 64, 65, ..., 125, 126
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> RightShift1(Vector512<byte> left, Vector512<byte> right)
{
    // Avx512BW.AlignRight behaves like four independent Ssse3.AlignRight calls, one per 128-bit block
    // of the source operands. Avx512BW.AlignRight(right, left, 15) would therefore produce
    //   [15], 64, ..., 78, [31], 80, ..., 94, [47], 96, ..., 110, [63], 112, ..., 126
    // where the first element of each block is taken 48 places too early:
    // we need 63 instead of 15, 79 instead of 31, 95 instead of 47 and 111 instead of 63.
    //
    // PermuteVar8x64x2 builds a temporary in which those values sit at the end of each 128-bit block
    // by moving 8-byte elements of the left/right pair 6 places along.
    // The indices could be [6, 7, 8, 9, 10, 11, 12, 13], but only the last byte of each 128-bit block
    // is consumed below, so the positions holding index 0 do not affect the result.
    Vector512<long> carry = Avx512F.PermuteVar8x64x2(left.AsInt64(), Vector512.Create(0, 7, 0, 9, 0, 11, 0, 13), right.AsInt64());

    return Avx512BW.AlignRight(right, carry.AsByte(), 15);
}
|
||||
|
||||
// Treats 'left' followed by 'right' as one 128-byte sequence and returns the 64 bytes that start
// at the second-to-last byte of 'left' (result: 62, 63, 64, ..., 124, 125 for left = 0..63, right = 64..127).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> RightShift2(Vector512<byte> left, Vector512<byte> right)
{
    // Avx512BW.AlignRight works on each 128-bit block separately, so the bytes it pulls in from its
    // second operand must sit at the end of each block. PermuteVar8x64x2 arranges that; only the last
    // 8-byte element of each block matters, which is why the remaining indices are simply 0.
    Vector512<long> carry = Avx512F.PermuteVar8x64x2(left.AsInt64(), Vector512.Create(0, 7, 0, 9, 0, 11, 0, 13), right.AsInt64());

    return Avx512BW.AlignRight(right, carry.AsByte(), 14);
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Searches for any of several strings using <see cref="StringComparison.OrdinalIgnoreCase"/>,
/// by testing every value at every position of the input.
/// </summary>
internal sealed class MultiStringIgnoreCaseSearchValuesFallback : StringSearchValuesBase
{
    // Snapshot of the unique values, taken once in the constructor.
    private readonly string[] _values;

    public MultiStringIgnoreCaseSearchValuesFallback(HashSet<string> uniqueValues) : base(uniqueValues)
    {
        var snapshot = new string[uniqueValues.Count];
        uniqueValues.CopyTo(snapshot, 0);
        _values = snapshot;
    }

    /// <summary>
    /// Returns the first position in <paramref name="span"/> at which any of the values starts
    /// (compared with <see cref="StringComparison.OrdinalIgnoreCase"/>), or -1 if there is none.
    /// The haystack positions are deliberately checked one at a time;
    /// see the description in <see cref="SpanHelpers.IndexOfAny{T}(ref T, int, ref T, int)"/>.
    /// </summary>
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        string[] candidates = _values;

        for (int start = 0; start < span.Length; start++)
        {
            ReadOnlySpan<char> tail = span.Slice(start);

            for (int c = 0; c < candidates.Length; c++)
            {
                if (tail.StartsWith(candidates[c], StringComparison.OrdinalIgnoreCase))
                {
                    return start;
                }
            }
        }

        return -1;
    }
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Searches for a single string. <c>TIgnoreCase.Value</c> selects between an ordinal ignore-case search
/// and a plain <c>IndexOf</c> over the characters.
/// </summary>
internal sealed class SingleStringSearchValuesFallback<TIgnoreCase> : StringSearchValuesBase
    where TIgnoreCase : struct, SearchValues.IRuntimeConst
{
    // The one value that is actually searched for.
    private readonly string _value;

    public SingleStringSearchValuesFallback(string value, HashSet<string> uniqueValues) : base(uniqueValues) =>
        _value = value;

    /// <summary>
    /// Returns the index of the first occurrence of the value in <paramref name="span"/>,
    /// as reported by the underlying search routine for the selected case sensitivity.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        if (TIgnoreCase.Value)
        {
            return Ordinal.IndexOfOrdinalIgnoreCase(span, _value);
        }

        return span.IndexOf(_value);
    }
}
|
||||
}
|
|
@ -0,0 +1,416 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Numerics;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Intrinsics;
|
||||
using static System.Buffers.StringSearchValuesHelper;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
// Based on SpanHelpers.IndexOf(ref char, int, ref char, int)
|
||||
// This implementation uses 3 precomputed anchor points when searching.
|
||||
// This implementation may also be used for length=2 values, in which case two anchors point at the same position.
|
||||
// Has an O(n * m) worst-case (n = input length, m = value length), with the expected time closer to O(n) for most inputs.
|
||||
internal sealed class SingleStringSearchValuesThreeChars<TCaseSensitivity> : SearchValues<string>
|
||||
where TCaseSensitivity : struct, ICaseSensitivity
|
||||
{
|
||||
private const ushort CaseConversionMask = unchecked((ushort)~0x20);
|
||||
|
||||
private readonly string _value;
|
||||
private readonly nint _minusValueTailLength;
|
||||
private readonly nuint _ch2ByteOffset;
|
||||
private readonly nuint _ch3ByteOffset;
|
||||
private readonly ushort _ch1;
|
||||
private readonly ushort _ch2;
|
||||
private readonly ushort _ch3;
|
||||
|
||||
private static bool IgnoreCase => typeof(TCaseSensitivity) != typeof(CaseSensitive);
|
||||
|
||||
// Precomputes everything the search loops need for 'value':
// the three anchor characters, the byte distance of the 2nd and 3rd anchor from the start of the value,
// and the (negative) length of the value without its first character.
public SingleStringSearchValuesThreeChars(string value)
{
    // A second anchor character is read below, so single-character values are not expected here.
    Debug.Assert(value.Length > 1);

    CharacterFrequencyHelper.GetSingleStringMultiCharacterOffsets(value, IgnoreCase, out int secondOffset, out int thirdOffset);

    Debug.Assert(thirdOffset == 0 || thirdOffset > secondOffset);

    // When ignoring case the anchors are stored with the 0x20 bit cleared;
    // otherwise the mask keeps every bit and the characters are stored as-is.
    ushort anchorMask = IgnoreCase ? CaseConversionMask : ushort.MaxValue;

    _ch1 = (ushort)(value[0] & anchorMask);
    _ch2 = (ushort)(value[secondOffset] & anchorMask);
    _ch3 = (ushort)(value[thirdOffset] & anchorMask);

    // Offsets are kept in bytes (2 bytes per char) because the vector loads add them to a byte reference.
    _ch2ByteOffset = (nuint)secondOffset * 2;
    _ch3ByteOffset = (nuint)thirdOffset * 2;

    _value = value;
    _minusValueTailLength = 1 - value.Length;
}
|
||||
|
||||
// Entry point used for span searches: forwards the span's first element and length to IndexOf.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
{
    ref char first = ref MemoryMarshal.GetReference(span);

    return IndexOf(ref first, span.Length);
}
|
||||
|
||||
// Returns the index of the first occurrence of '_value' within the 'searchSpaceLength' characters that
// start at 'searchSpace', or -1 if there is none.
//
// Inputs with enough room are scanned a vector at a time: GetComparisonResult flags the positions where
// all three anchor characters (likely) match and TryMatch verifies them. Everything else is handled by
// the scalar loop at the 'ShortInput' label.
private int IndexOf(ref char searchSpace, int searchSpaceLength)
{
    ref char origin = ref searchSpace;

    // Number of positions at which the whole value still fits into the input (zero or negative if it never fits).
    nint startPositions = searchSpaceLength + _minusValueTailLength;

    if (!Vector128.IsHardwareAccelerated || startPositions < Vector128<ushort>.Count)
    {
        goto ShortInput;
    }

    nuint secondAnchorByteOffset = _ch2ByteOffset;
    nuint thirdAnchorByteOffset = _ch3ByteOffset;

    if (Vector512.IsHardwareAccelerated && startPositions >= Vector512<ushort>.Count)
    {
        Vector512<ushort> anchor1 = Vector512.Create(_ch1);
        Vector512<ushort> anchor2 = Vector512.Create(_ch2);
        Vector512<ushort> anchor3 = Vector512.Create(_ch3);

        // Start of the last full vector of candidate positions.
        ref char finalBlock = ref Unsafe.Add(ref searchSpace, startPositions - Vector512<ushort>.Count);

        while (true)
        {
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count);
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count + (int)(_ch2ByteOffset / 2));
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count + (int)(_ch3ByteOffset / 2));

            // Non-zero bytes mark starting positions where all 3 anchor characters likely match.
            Vector512<byte> candidates = GetComparisonResult(ref searchSpace, secondAnchorByteOffset, thirdAnchorByteOffset, anchor1, anchor2, anchor3);

            if (candidates != Vector512<byte>.Zero)
            {
                goto VerifyCandidates;
            }

        NextBlock:
            // Nothing found in this block. Move on and check whether the end has been reached.
            searchSpace = ref Unsafe.Add(ref searchSpace, Vector512<ushort>.Count);

            if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref finalBlock))
            {
                if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref finalBlock, Vector512<ushort>.Count)))
                {
                    // The block just processed was the final one.
                    return -1;
                }

                // Fewer than 32 positions remain: step back so that one last (overlapping) block covers them.
                searchSpace = ref finalBlock;
            }

            continue;

        VerifyCandidates:
            // Candidates may be false positives, so each one is verified against the full value.
            if (TryMatch(ref origin, searchSpaceLength, ref searchSpace, candidates.ExtractMostSignificantBits(), out int matchIndex))
            {
                return matchIndex;
            }

            goto NextBlock;
        }
    }
    else if (Vector256.IsHardwareAccelerated && startPositions >= Vector256<ushort>.Count)
    {
        Vector256<ushort> anchor1 = Vector256.Create(_ch1);
        Vector256<ushort> anchor2 = Vector256.Create(_ch2);
        Vector256<ushort> anchor3 = Vector256.Create(_ch3);

        // Start of the last full vector of candidate positions.
        ref char finalBlock = ref Unsafe.Add(ref searchSpace, startPositions - Vector256<ushort>.Count);

        while (true)
        {
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector256<ushort>.Count);
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector256<ushort>.Count + (int)(_ch2ByteOffset / 2));
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector256<ushort>.Count + (int)(_ch3ByteOffset / 2));

            // Non-zero bytes mark starting positions where all 3 anchor characters likely match.
            Vector256<byte> candidates = GetComparisonResult(ref searchSpace, secondAnchorByteOffset, thirdAnchorByteOffset, anchor1, anchor2, anchor3);

            if (candidates != Vector256<byte>.Zero)
            {
                goto VerifyCandidates;
            }

        NextBlock:
            // Nothing found in this block. Move on and check whether the end has been reached.
            searchSpace = ref Unsafe.Add(ref searchSpace, Vector256<ushort>.Count);

            if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref finalBlock))
            {
                if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref finalBlock, Vector256<ushort>.Count)))
                {
                    // The block just processed was the final one.
                    return -1;
                }

                // Fewer than 16 positions remain: step back so that one last (overlapping) block covers them.
                searchSpace = ref finalBlock;
            }

            continue;

        VerifyCandidates:
            // Candidates may be false positives, so each one is verified against the full value.
            if (TryMatch(ref origin, searchSpaceLength, ref searchSpace, candidates.ExtractMostSignificantBits(), out int matchIndex))
            {
                return matchIndex;
            }

            goto NextBlock;
        }
    }
    else
    {
        Vector128<ushort> anchor1 = Vector128.Create(_ch1);
        Vector128<ushort> anchor2 = Vector128.Create(_ch2);
        Vector128<ushort> anchor3 = Vector128.Create(_ch3);

        // Start of the last full vector of candidate positions.
        ref char finalBlock = ref Unsafe.Add(ref searchSpace, startPositions - Vector128<ushort>.Count);

        while (true)
        {
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector128<ushort>.Count);
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector128<ushort>.Count + (int)(_ch2ByteOffset / 2));
            ValidateReadPosition(ref origin, searchSpaceLength, ref searchSpace, Vector128<ushort>.Count + (int)(_ch3ByteOffset / 2));

            // Non-zero bytes mark starting positions where all 3 anchor characters likely match.
            Vector128<byte> candidates = GetComparisonResult(ref searchSpace, secondAnchorByteOffset, thirdAnchorByteOffset, anchor1, anchor2, anchor3);

            if (candidates != Vector128<byte>.Zero)
            {
                goto VerifyCandidates;
            }

        NextBlock:
            // Nothing found in this block. Move on and check whether the end has been reached.
            searchSpace = ref Unsafe.Add(ref searchSpace, Vector128<ushort>.Count);

            if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref finalBlock))
            {
                if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref finalBlock, Vector128<ushort>.Count)))
                {
                    // The block just processed was the final one.
                    return -1;
                }

                // Fewer than 8 positions remain: step back so that one last (overlapping) block covers them.
                searchSpace = ref finalBlock;
            }

            continue;

        VerifyCandidates:
            // Candidates may be false positives, so each one is verified against the full value.
            if (TryMatch(ref origin, searchSpaceLength, ref searchSpace, candidates.ExtractMostSignificantBits(), out int matchIndex))
            {
                return matchIndex;
            }

            goto NextBlock;
        }
    }

ShortInput:
    string needle = _value;
    char needleHead = needle.GetRawStringData();

    for (nint position = 0; position < startPositions; position++)
    {
        ref char current = ref Unsafe.Add(ref searchSpace, position);

        // CaseInsensitiveUnicode doesn't support single-character transformations,
        // so for it the quick first-character check is skipped and the full comparison always runs.
        bool headMayMatch = typeof(TCaseSensitivity) == typeof(CaseInsensitiveUnicode)
            || TCaseSensitivity.TransformInput(current) == needleHead;

        if (headMayMatch && TCaseSensitivity.Equals(ref current, needle))
        {
            return (int)position;
        }
    }

    return -1;
}
|
||||
|
||||
// Loads 3 vectors from the input - one at 'searchSpace' and two at the byte offsets of the 2nd and 3rd
// anchor character - and compares each against the corresponding broadcast anchor.
// The result is non-zero in the bytes of every position where all 3 comparisons succeeded.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector128<ushort> ch1, Vector128<ushort> ch2, Vector128<ushort> ch3)
{
    ref byte searchSpaceBytes = ref Unsafe.As<char, byte>(ref searchSpace);

    Vector128<ushort> input1 = Vector128.LoadUnsafe(ref searchSpace);
    Vector128<ushort> input2 = Vector128.LoadUnsafe(ref searchSpaceBytes, ch2ByteOffset).AsUInt16();
    Vector128<ushort> input3 = Vector128.LoadUnsafe(ref searchSpaceBytes, ch3ByteOffset).AsUInt16();

    if (typeof(TCaseSensitivity) != typeof(CaseSensitive))
    {
        // AND every input character with ~0x20 so that ASCII letters are uppercased.
        // For characters that aren't ASCII letters this may produce wrong results, but only false positives;
        // those are rejected by the verification step if the other anchors also indicate a possible match.
        Vector128<ushort> caseConversion = Vector128.Create(CaseConversionMask);

        input1 &= caseConversion;
        input2 &= caseConversion;
        input3 &= caseConversion;
    }

    Vector128<ushort> matches1 = Vector128.Equals(ch1, input1);
    Vector128<ushort> matches2 = Vector128.Equals(ch2, input2);
    Vector128<ushort> matches3 = Vector128.Equals(ch3, input3);

    // Only positions where all 3 anchors (likely) match survive the AND.
    return (matches1 & matches2 & matches3).AsByte();
}
|
||||
|
||||
// 256-bit variant: compares 16 candidate positions (one Vector256<ushort>) at a time against the three
// broadcast anchors, loading the 2nd and 3rd input vector at their anchor's byte offset.
// The result is non-zero in the bytes of every position where all 3 comparisons succeeded.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector256<ushort> ch1, Vector256<ushort> ch2, Vector256<ushort> ch3)
{
    ref byte searchSpaceBytes = ref Unsafe.As<char, byte>(ref searchSpace);

    Vector256<ushort> input1 = Vector256.LoadUnsafe(ref searchSpace);
    Vector256<ushort> input2 = Vector256.LoadUnsafe(ref searchSpaceBytes, ch2ByteOffset).AsUInt16();
    Vector256<ushort> input3 = Vector256.LoadUnsafe(ref searchSpaceBytes, ch3ByteOffset).AsUInt16();

    if (typeof(TCaseSensitivity) != typeof(CaseSensitive))
    {
        // Clear the 0x20 bit of every input character (uppercases ASCII letters).
        // This can only introduce false positives, which the later verification step rejects.
        Vector256<ushort> caseConversion = Vector256.Create(CaseConversionMask);

        input1 &= caseConversion;
        input2 &= caseConversion;
        input3 &= caseConversion;
    }

    Vector256<ushort> matches1 = Vector256.Equals(ch1, input1);
    Vector256<ushort> matches2 = Vector256.Equals(ch2, input2);
    Vector256<ushort> matches3 = Vector256.Equals(ch3, input3);

    return (matches1 & matches2 & matches3).AsByte();
}
|
||||
|
||||
// 512-bit variant: compares 32 candidate positions (one Vector512<ushort>) at a time against the three
// broadcast anchors, loading the 2nd and 3rd input vector at their anchor's byte offset.
// The result is non-zero in the bytes of every position where all 3 comparisons succeeded.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector512<ushort> ch1, Vector512<ushort> ch2, Vector512<ushort> ch3)
{
    ref byte searchSpaceBytes = ref Unsafe.As<char, byte>(ref searchSpace);

    Vector512<ushort> input1 = Vector512.LoadUnsafe(ref searchSpace);
    Vector512<ushort> input2 = Vector512.LoadUnsafe(ref searchSpaceBytes, ch2ByteOffset).AsUInt16();
    Vector512<ushort> input3 = Vector512.LoadUnsafe(ref searchSpaceBytes, ch3ByteOffset).AsUInt16();

    if (typeof(TCaseSensitivity) != typeof(CaseSensitive))
    {
        // Clear the 0x20 bit of every input character (uppercases ASCII letters).
        // This can only introduce false positives, which the later verification step rejects.
        Vector512<ushort> caseConversion = Vector512.Create(CaseConversionMask);

        input1 &= caseConversion;
        input2 &= caseConversion;
        input3 &= caseConversion;
    }

    Vector512<ushort> matches1 = Vector512.Equals(ch1, input1);
    Vector512<ushort> matches2 = Vector512.Equals(ch2, input2);
    Vector512<ushort> matches3 = Vector512.Equals(ch3, input3);

    return (matches1 & matches2 & matches3).AsByte();
}
|
||||
|
||||
// Verifies the candidate positions encoded in 'mask' (32-bit variant), lowest position first.
// Every candidate character contributes two adjacent bits, so the index of the lowest set bit is the
// candidate's byte offset from 'searchSpace'. 'mask' must be non-zero on entry.
// On success 'offsetFromStart' is the match position in characters relative to 'searchSpaceStart';
// otherwise it is set to 0 and the method returns false.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, uint mask, out int offsetFromStart)
{
    while (true)
    {
        int byteOffset = BitOperations.TrailingZeroCount(mask);
        Debug.Assert(byteOffset % 2 == 0);

        ref char candidate = ref Unsafe.AddByteOffset(ref searchSpace, byteOffset);

        ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref candidate, _value.Length);

        if (TCaseSensitivity.Equals(ref candidate, _value))
        {
            offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref candidate) / 2);
            return true;
        }

        // Drop both bits that belong to the candidate that was just rejected.
        mask = BitOperations.ResetLowestSetBit(mask);
        mask = BitOperations.ResetLowestSetBit(mask);

        if (mask == 0)
        {
            offsetFromStart = 0;
            return false;
        }
    }
}
|
||||
|
||||
// Verifies the candidate positions encoded in 'mask' (64-bit variant), lowest position first.
// Every candidate character contributes two adjacent bits, so the index of the lowest set bit is the
// candidate's byte offset from 'searchSpace'. 'mask' must be non-zero on entry.
// On success 'offsetFromStart' is the match position in characters relative to 'searchSpaceStart';
// otherwise it is set to 0 and the method returns false.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, ulong mask, out int offsetFromStart)
{
    while (true)
    {
        int byteOffset = BitOperations.TrailingZeroCount(mask);
        Debug.Assert(byteOffset % 2 == 0);

        ref char candidate = ref Unsafe.AddByteOffset(ref searchSpace, byteOffset);

        ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref candidate, _value.Length);

        if (TCaseSensitivity.Equals(ref candidate, _value))
        {
            offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref candidate) / 2);
            return true;
        }

        // Drop both bits that belong to the candidate that was just rejected.
        mask = BitOperations.ResetLowestSetBit(mask);
        mask = BitOperations.ResetLowestSetBit(mask);

        if (mask == 0)
        {
            offsetFromStart = 0;
            return false;
        }
    }
}
|
||||
|
||||
|
||||
// Returns true when 'value' equals the single stored value, using OrdinalIgnoreCase when this
// instance ignores case and Ordinal otherwise.
internal override bool ContainsCore(string value)
{
    StringComparison comparison = IgnoreCase
        ? StringComparison.OrdinalIgnoreCase
        : StringComparison.Ordinal;

    return _value.Equals(value, comparison);
}
|
||||
|
||||
// Returns a new single-element array containing the stored value.
internal override string[] GetValues()
{
    return new[] { _value };
}
|
||||
|
||||
// Index of the first element of 'span' that is contained in this set, or -1.
internal override int IndexOfAny(ReadOnlySpan<string> span)
{
    return IndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);
}
|
||||
|
||||
// Index of the first element of 'span' that is NOT contained in this set, or -1.
internal override int IndexOfAnyExcept(ReadOnlySpan<string> span)
{
    return IndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);
}
|
||||
|
||||
// Index of the last element of 'span' that is contained in this set, or -1.
internal override int LastIndexOfAny(ReadOnlySpan<string> span)
{
    return LastIndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);
}
|
||||
|
||||
// Index of the last element of 'span' that is NOT contained in this set, or -1.
internal override int LastIndexOfAnyExcept(ReadOnlySpan<string> span)
{
    return LastIndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);
}
|
||||
|
||||
// Scans 'span' front to back and returns the index of the first element for which
// TNegator.NegateIfNeeded(ContainsCore(element)) is true, or -1 if there is no such element.
private int IndexOfAny<TNegator>(ReadOnlySpan<string> span)
    where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
{
    int index = 0;

    while (index < span.Length)
    {
        bool contained = ContainsCore(span[index]);

        if (TNegator.NegateIfNeeded(contained))
        {
            return index;
        }

        index++;
    }

    return -1;
}
|
||||
|
||||
// Scans 'span' back to front and returns the index of the last element for which
// TNegator.NegateIfNeeded(ContainsCore(element)) is true, or -1 if there is no such element.
private int LastIndexOfAny<TNegator>(ReadOnlySpan<string> span)
    where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
{
    int index = span.Length;

    while (--index >= 0)
    {
        bool contained = ContainsCore(span[index]);

        if (TNegator.NegateIfNeeded(contained))
        {
            return index;
        }
    }

    return -1;
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,414 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Runtime.Intrinsics;
|
||||
using System.Runtime.Intrinsics.Arm;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
using System.Text;
|
||||
using static System.Buffers.StringSearchValuesHelper;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
internal static class StringSearchValues
|
||||
{
|
||||
private static readonly SearchValues<char> s_asciiLetters =
|
||||
SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||
|
||||
private static readonly SearchValues<char> s_allAsciiExceptLowercase =
|
||||
SearchValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`{|}~\u007F");
|
||||
|
||||
/// <summary>
/// Builds a <see cref="SearchValues{T}"/> of strings for the given set of values.
/// </summary>
/// <param name="values">The strings to search for. No element may be <see langword="null"/>.</param>
/// <param name="ignoreCase">Whether matching should ignore case (ordinal semantics).</param>
/// <returns>An implementation picked according to the number and shape of the values.</returns>
/// <exception cref="ArgumentNullException">An element of <paramref name="values"/> is <see langword="null"/>.</exception>
public static SearchValues<string> Create(ReadOnlySpan<string> values, bool ignoreCase)
{
    switch (values.Length)
    {
        case 0:
            return new EmptySearchValues<string>();

        case 1:
        {
            // Single-value inputs skip the de-duplication and trie construction below.
            string onlyValue = values[0];
            ArgumentNullException.ThrowIfNull(onlyValue, nameof(values));
            string normalizedOnlyValue = NormalizeIfNeeded(onlyValue, ignoreCase);

            AnalyzeValues(new ReadOnlySpan<string>(ref normalizedOnlyValue), ref ignoreCase, out bool onlyValueIsAscii, out bool onlyValueIsLettersOnly, out _, out _);
            return CreateForSingleValue(normalizedOnlyValue, uniqueValues: null, ignoreCase, onlyValueIsAscii, onlyValueIsLettersOnly);
        }
    }

    StringComparer comparer = ignoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal;
    var uniqueValues = new HashSet<string>(values.Length, comparer);

    for (int index = 0; index < values.Length; index++)
    {
        string candidate = values[index];
        ArgumentNullException.ThrowIfNull(candidate, nameof(values));

        uniqueValues.Add(candidate);
    }

    // An empty string is handled by the single-string fallback constructed with the empty value.
    if (uniqueValues.Contains(string.Empty))
    {
        return new SingleStringSearchValuesFallback<SearchValues.FalseConst>(string.Empty, uniqueValues);
    }

    string[] normalizedArray = new string[uniqueValues.Count];
    int written = 0;
    foreach (string unique in uniqueValues)
    {
        normalizedArray[written] = NormalizeIfNeeded(unique, ignoreCase);
        written++;
    }
    Debug.Assert(written == normalizedArray.Length);

    Span<string> normalizedValues = normalizedArray;

    // The Aho-Corasick builder's constructor expects its input ordered by length.
    normalizedValues.Sort(static (left, right) => left.Length.CompareTo(right.Length));

    // Even if Aho-Corasick is not the implementation we end up with, constructing the trie
    // reports every unreachable value, so it is built up-front.
    HashSet<string>? unreachableValues = null;
    var ahoCorasickBuilder = new AhoCorasickBuilder(normalizedValues, ignoreCase, ref unreachableValues);

    if (unreachableValues is not null)
    {
        // Some values are exact prefixes of other values.
        // Dropping them now means fewer buckets and cheaper verification while searching.
        normalizedValues = RemoveUnreachableValues(normalizedValues, unreachableValues);
    }

    SearchValues<string> result = CreateFromNormalizedValues(normalizedValues, uniqueValues, ignoreCase, ref ahoCorasickBuilder);
    ahoCorasickBuilder.Dispose();
    return result;

    // Returns an ordinal upper-cased copy when ignoring case and the value contains
    // anything outside of s_allAsciiExceptLowercase; otherwise returns the value itself.
    static string NormalizeIfNeeded(string value, bool ignoreCase)
    {
        if (!ignoreCase || !value.AsSpan().ContainsAnyExcept(s_allAsciiExceptLowercase))
        {
            return value;
        }

        string upperCase = string.FastAllocateString(value.Length);
        int charsWritten = Ordinal.ToUpperOrdinal(value, new Span<char>(ref upperCase.GetRawStringData(), upperCase.Length));
        Debug.Assert(charsWritten == upperCase.Length);
        return upperCase;
    }

    // Compacts the span in place, keeping only values absent from unreachableValues,
    // and returns the leading slice holding the kept values.
    static Span<string> RemoveUnreachableValues(Span<string> values, HashSet<string> unreachableValues)
    {
        int kept = 0;
        for (int index = 0; index < values.Length; index++)
        {
            string candidate = values[index];
            if (unreachableValues.Contains(candidate))
            {
                continue;
            }

            values[kept] = candidate;
            kept++;
        }

        Debug.Assert(kept <= values.Length - unreachableValues.Count);
        Debug.Assert(kept > 0);

        return values.Slice(0, kept);
    }
}
|
||||
|
||||
/// <summary>
/// Picks the multi-value implementation (Teddy, Aho-Corasick or a fallback) for an already
/// normalized set of values.
/// </summary>
/// <param name="values">The normalized values to search for.</param>
/// <param name="uniqueValues">The set of values handed to the created instance.</param>
/// <param name="ignoreCase">Whether matching ignores case; may be relaxed by <c>AnalyzeValues</c>.</param>
/// <param name="ahoCorasickBuilder">The builder used if Aho-Corasick ends up being needed.</param>
private static SearchValues<string> CreateFromNormalizedValues(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    bool ignoreCase,
    ref AhoCorasickBuilder ahoCorasickBuilder)
{
    AnalyzeValues(values, ref ignoreCase, out bool allAscii, out bool asciiLettersOnly, out bool nonAsciiAffectedByCaseConversion, out int minLength);

    if (values.Length == 1)
    {
        // Removing unreachable values can leave just one value behind.
        return CreateForSingleValue(values[0], uniqueValues, ignoreCase, allAscii, asciiLettersOnly);
    }

    if (Ssse3.IsSupported || AdvSimd.Arm64.IsSupported)
    {
        SearchValues<string>? teddy = TryGetTeddyAcceleratedValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion, minLength);
        if (teddy is not null)
        {
            return teddy;
        }
    }

    // Every remaining multi-value set goes through Aho-Corasick (or the slow fallback below).
    AhoCorasick automaton = ahoCorasickBuilder.Build();

    if (ignoreCase)
    {
        if (nonAsciiAffectedByCaseConversion)
        {
            if (ContainsIncompleteSurrogatePairs(values))
            {
                // Aho-Corasick can't deal with the matching semantics of standalone surrogate code units.
                // Use the slow but correct O(n * m) fallback implementation instead.
                return new MultiStringIgnoreCaseSearchValuesFallback(uniqueValues);
            }

            return PickAhoCorasickImplementation<CaseInsensitiveUnicode>(automaton, uniqueValues);
        }

        if (asciiLettersOnly)
        {
            return PickAhoCorasickImplementation<CaseInsensitiveAsciiLetters>(automaton, uniqueValues);
        }

        return PickAhoCorasickImplementation<CaseInsensitiveAscii>(automaton, uniqueValues);
    }

    return PickAhoCorasickImplementation<CaseSensitive>(automaton, uniqueValues);

    // Chooses the fast-scan variant according to the automaton's ShouldUseAsciiFastScan flag.
    static SearchValues<string> PickAhoCorasickImplementation<TCaseSensitivity>(AhoCorasick ahoCorasick, HashSet<string> uniqueValues)
        where TCaseSensitivity : struct, ICaseSensitivity
    {
        if (ahoCorasick.ShouldUseAsciiFastScan)
        {
            return new StringSearchValuesAhoCorasick<TCaseSensitivity, AhoCorasick.IndexOfAnyAsciiFastScan>(ahoCorasick, uniqueValues);
        }

        return new StringSearchValuesAhoCorasick<TCaseSensitivity, AhoCorasick.NoFastScan>(ahoCorasick, uniqueValues);
    }
}
|
||||
|
||||
/// <summary>
/// Attempts to create a Teddy-based implementation for the given values.
/// </summary>
/// <returns>
/// The Teddy-based instance, or <see langword="null"/> when the values are not eligible
/// (a value of length 1, too many values, or unsuitable leading characters).
/// </returns>
private static SearchValues<string>? TryGetTeddyAcceleratedValues(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    bool ignoreCase,
    bool allAscii,
    bool asciiLettersOnly,
    bool nonAsciiAffectedByCaseConversion,
    int minLength)
{
    // minLength == 1: an 'N=1' implementation is possible, but callers should consider using
    // SearchValues<char> instead in such cases. It can be added if Regex ends up running into this case.
    //
    // Too many values: the more values we have, the higher the chance of hash/fingerprint collisions.
    // To avoid spending too much time in verification steps, fall back to Aho-Corasick which guarantees O(n).
    // If this limit turns out to be commonly exceeded, the number of buckets in the implementation
    // can be tweaked, or different variants can be used depending on input.
    if (minLength == 1 || values.Length > RabinKarp.MaxValues)
    {
        return null;
    }

    // Number of leading characters taken from each value.
    int n = 3;
    if (minLength == 2)
    {
        n = 2;
    }

    if (Ssse3.IsSupported)
    {
        for (int index = 0; index < values.Length; index++)
        {
            if (values[index].AsSpan(0, n).Contains('\0'))
            {
                // Teddy would still work correctly if null chars were let through, but it would
                // hit more false positives for the verification step to rule out.
                // A generic flag like Ssse3AndWasmHandleZeroInNeedle could be flowed through instead,
                // but such values are expected to be rare enough that the extra code is not worth it.
                return null;
            }
        }
    }

    // Values containing non-ASCII chars can still use Teddy as long as their first N characters are ASCII.
    if (!allAscii)
    {
        for (int index = 0; index < values.Length; index++)
        {
            if (!Ascii.IsValid(values[index].AsSpan(0, n)))
            {
                // A vectorized implementation for non-ASCII values is possible.
                // It can be added if it turns out to be a common enough scenario.
                return null;
            }
        }
    }

    if (!ignoreCase)
    {
        return PickTeddyImplementation<CaseSensitive, CaseSensitive>(values, uniqueValues, n);
    }

    if (asciiLettersOnly)
    {
        return PickTeddyImplementation<CaseInsensitiveAsciiLetters, CaseInsensitiveAsciiLetters>(values, uniqueValues, n);
    }

    // Even when whole values aren't ASCII letters only, the vectorized part can use a faster
    // approach as long as the first N characters of every value are.
    bool asciiStartLettersOnly = true;
    bool asciiStartUnaffectedByCaseConversion = true;

    foreach (string value in values)
    {
        ReadOnlySpan<char> prefix = value.AsSpan(0, n);

        if (asciiStartLettersOnly && prefix.ContainsAnyExcept(s_asciiLetters))
        {
            asciiStartLettersOnly = false;
        }

        if (asciiStartUnaffectedByCaseConversion && prefix.ContainsAny(s_asciiLetters))
        {
            asciiStartUnaffectedByCaseConversion = false;
        }
    }

    Debug.Assert(!(asciiStartLettersOnly && asciiStartUnaffectedByCaseConversion));

    // Tuple order: (start unaffected by casing, start is letters only, non-ASCII affected by casing).
    return (asciiStartUnaffectedByCaseConversion, asciiStartLettersOnly, nonAsciiAffectedByCaseConversion) switch
    {
        (true, _, true) => PickTeddyImplementation<CaseSensitive, CaseInsensitiveUnicode>(values, uniqueValues, n),
        (true, _, false) => PickTeddyImplementation<CaseSensitive, CaseInsensitiveAscii>(values, uniqueValues, n),
        (false, true, true) => PickTeddyImplementation<CaseInsensitiveAsciiLetters, CaseInsensitiveUnicode>(values, uniqueValues, n),
        (false, false, true) => PickTeddyImplementation<CaseInsensitiveAscii, CaseInsensitiveUnicode>(values, uniqueValues, n),
        (false, true, false) => PickTeddyImplementation<CaseInsensitiveAsciiLetters, CaseInsensitiveAscii>(values, uniqueValues, n),
        (false, false, false) => PickTeddyImplementation<CaseInsensitiveAscii, CaseInsensitiveAscii>(values, uniqueValues, n),
    };
}
|
||||
|
||||
/// <summary>
/// Instantiates the Teddy variant matching the number of values and the prefix length.
/// </summary>
/// <typeparam name="TStartCaseSensitivity">Case sensitivity applied to the first <paramref name="n"/> characters.</typeparam>
/// <typeparam name="TCaseSensitivity">Case sensitivity applied to whole values.</typeparam>
/// <param name="values">The values to search for; more than one is expected.</param>
/// <param name="uniqueValues">The set of values handed to the created instance.</param>
/// <param name="n">The prefix length, 2 or 3.</param>
private static SearchValues<string> PickTeddyImplementation<TStartCaseSensitivity, TCaseSensitivity>(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    int n)
    where TStartCaseSensitivity : struct, ICaseSensitivity
    where TCaseSensitivity : struct, ICaseSensitivity
{
    Debug.Assert(typeof(TStartCaseSensitivity) != typeof(CaseInsensitiveUnicode));
    Debug.Assert(values.Length > 1);
    Debug.Assert(n is 2 or 3);

    // Up to 8 values use the non-bucketized variants.
    if (values.Length <= 8)
    {
        if (n == 2)
        {
            return new AsciiStringSearchValuesTeddyNonBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>(values, uniqueValues);
        }

        return new AsciiStringSearchValuesTeddyNonBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>(values, uniqueValues);
    }

    string[][] buckets = TeddyBucketizer.Bucketize(values, bucketCount: 8, n);

    // Potential optimization: We don't have to pick the first N characters for the fingerprint.
    // Different offset selection can noticeably improve throughput (e.g. 2x).

    if (n == 2)
    {
        return new AsciiStringSearchValuesTeddyBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>(buckets, values, uniqueValues);
    }

    return new AsciiStringSearchValuesTeddyBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>(buckets, values, uniqueValues);
}
|
||||
|
||||
/// <summary>
/// Creates the implementation used when there is exactly one value to search for.
/// </summary>
/// <param name="value">The single value.</param>
/// <param name="uniqueValues">The set containing the value; created on demand for the fallback when <see langword="null"/>.</param>
/// <param name="ignoreCase">Whether matching ignores case.</param>
/// <param name="allAscii">Whether the value consists of ASCII characters only.</param>
/// <param name="asciiLettersOnly">Whether the value consists of ASCII letters only.</param>
private static SearchValues<string> CreateForSingleValue(
    string value,
    HashSet<string>? uniqueValues,
    bool ignoreCase,
    bool allAscii,
    bool asciiLettersOnly)
{
    // We make use of optimizations that may overflow on 32bit systems for long values.
    int maxLength = int.MaxValue;
    if (IntPtr.Size == 4)
    {
        maxLength = 1_000_000_000;
    }

    if (Vector128.IsHardwareAccelerated && value.Length > 1 && value.Length <= maxLength)
    {
        if (!ignoreCase)
        {
            return new SingleStringSearchValuesThreeChars<CaseSensitive>(value);
        }
        else if (asciiLettersOnly)
        {
            return new SingleStringSearchValuesThreeChars<CaseInsensitiveAsciiLetters>(value);
        }
        else if (allAscii)
        {
            return new SingleStringSearchValuesThreeChars<CaseInsensitiveAscii>(value);
        }
        else if (char.IsAscii(value[0]) && value.AsSpan().LastIndexOfAnyInRange((char)0, (char)127) > 0)
        {
            // When ignoring casing, all anchor chars we search for must be ASCII:
            // the first char is, and so is at least one later char.
            return new SingleStringSearchValuesThreeChars<CaseInsensitiveUnicode>(value);
        }
    }

    if (uniqueValues is null)
    {
        StringComparer comparer = ignoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal;
        uniqueValues = new HashSet<string>(1, comparer);
        uniqueValues.Add(value);
    }

    if (ignoreCase)
    {
        return new SingleStringSearchValuesFallback<SearchValues.TrueConst>(value, uniqueValues);
    }

    return new SingleStringSearchValuesFallback<SearchValues.FalseConst>(value, uniqueValues);
}
|
||||
|
||||
/// <summary>
/// Computes the properties of <paramref name="values"/> that drive implementation selection.
/// </summary>
/// <param name="values">The values to inspect.</param>
/// <param name="ignoreCase">
/// On input, whether the caller asked to ignore case. Reset to <see langword="false"/> when all values
/// are ASCII and none of them contains an ASCII letter.
/// </param>
/// <param name="allAscii">Whether every value is valid ASCII.</param>
/// <param name="asciiLettersOnly">Whether every value consists of ASCII letters only.</param>
/// <param name="nonAsciiAffectedByCaseConversion">Whether case is ignored and some value is not ASCII.</param>
/// <param name="minLength">The length of the shortest value (<see cref="int.MaxValue"/> for no values).</param>
private static void AnalyzeValues(
    ReadOnlySpan<string> values,
    ref bool ignoreCase,
    out bool allAscii,
    out bool asciiLettersOnly,
    out bool nonAsciiAffectedByCaseConversion,
    out int minLength)
{
    allAscii = true;
    asciiLettersOnly = true;
    minLength = int.MaxValue;

    for (int index = 0; index < values.Length; index++)
    {
        string current = values[index];

        if (allAscii && !Ascii.IsValid(current))
        {
            allAscii = false;
        }

        if (asciiLettersOnly && current.AsSpan().ContainsAnyExcept(s_asciiLetters))
        {
            asciiLettersOnly = false;
        }

        if (current.Length < minLength)
        {
            minLength = current.Length;
        }
    }

    // Potential optimization: Not all characters participate in Unicode case conversion.
    // If we can determine that none of the non-ASCII characters do, we can make searching faster
    // by using the same paths as we do for ASCII-only values.
    nonAsciiAffectedByCaseConversion = ignoreCase && !allAscii;

    if (!ignoreCase || nonAsciiAffectedByCaseConversion || asciiLettersOnly)
    {
        return;
    }

    // If no character in the values is affected by casing, the ignoreCase overhead can be avoided.
    bool anyAsciiLetter = false;
    foreach (string value in values)
    {
        if (value.AsSpan().ContainsAny(s_asciiLetters))
        {
            anyAsciiLetter = true;
            break;
        }
    }

    ignoreCase = anyAsciiLetter;
}
|
||||
|
||||
/// <summary>
/// Determines whether any value contains a surrogate code unit that is not part of a
/// well-formed high/low surrogate pair.
/// </summary>
/// <param name="values">The values to inspect.</param>
/// <returns><see langword="true"/> if an unpaired high or low surrogate is found; otherwise <see langword="false"/>.</returns>
private static bool ContainsIncompleteSurrogatePairs(ReadOnlySpan<string> values)
{
    foreach (string value in values)
    {
        ReadOnlySpan<char> chars = value.AsSpan();

        // Jump straight to the first surrogate; -1 fails the unsigned bounds check and skips the loop.
        int position = chars.IndexOfAnyInRange(CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);

        while ((uint)position < (uint)chars.Length)
        {
            char current = chars[position];

            if (char.IsLowSurrogate(current))
            {
                // Low surrogate not preceded by a high surrogate.
                return true;
            }

            if (char.IsHighSurrogate(current))
            {
                int next = position + 1;
                if ((uint)next >= (uint)chars.Length || !char.IsLowSurrogate(chars[next]))
                {
                    // High surrogate not followed by a low surrogate.
                    return true;
                }

                // Step over the low surrogate that completes the pair.
                position = next;
            }

            position++;
        }
    }

    return false;
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// String <see cref="SearchValues{T}"/> implementation that forwards multi-string searches
/// to an <see cref="AhoCorasick"/> instance.
/// </summary>
/// <typeparam name="TCaseSensitivity">The case sensitivity passed to the automaton's search.</typeparam>
/// <typeparam name="TFastScanVariant">The fast-scan variant passed to the automaton's search.</typeparam>
internal sealed class StringSearchValuesAhoCorasick<TCaseSensitivity, TFastScanVariant> : StringSearchValuesBase
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TFastScanVariant : struct, AhoCorasick.IFastScan
{
    private readonly AhoCorasick _ahoCorasick;

    /// <summary>Stores the automaton and passes the value set on to the base class.</summary>
    public StringSearchValuesAhoCorasick(AhoCorasick ahoCorasick, HashSet<string> uniqueValues)
        : base(uniqueValues)
    {
        _ahoCorasick = ahoCorasick;
    }

    /// <summary>Returns the result of the automaton's <c>IndexOfAny</c> for <paramref name="span"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return _ahoCorasick.IndexOfAny<TCaseSensitivity, TFastScanVariant>(span);
    }
}
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Implements the base <see cref="SearchValues{T}"/> {Last}IndexOfAny{Except} operations.
/// While these operations are exposed such that you can call string[].IndexOfAny(searchValues),
/// they are not expected to be used in performance-critical paths.
/// <see cref="MemoryExtensions.IndexOfAny(ReadOnlySpan{char}, SearchValues{string})"/> is the main
/// reason why someone would create an instance of <see cref="string"/> <see cref="SearchValues{T}"/>.
/// </summary>
internal abstract class StringSearchValuesBase : SearchValues<string>
{
    // Backs Contains/GetValues and the span-of-strings operations below.
    private readonly HashSet<string> _uniqueValues;

    /// <summary>Initializes the base with the set of values used for membership checks.</summary>
    /// <param name="uniqueValues">The set of values; its comparer determines matching semantics.</param>
    // Abstract types can only be constructed through derived types, so the constructor is protected.
    protected StringSearchValuesBase(HashSet<string> uniqueValues) =>
        _uniqueValues = uniqueValues;

    /// <summary>Returns whether <paramref name="value"/> is in the value set.</summary>
    internal sealed override bool ContainsCore(string value) =>
        _uniqueValues.Contains(value);

    /// <summary>Returns a new array holding a copy of the value set.</summary>
    internal sealed override string[] GetValues()
    {
        string[] values = new string[_uniqueValues.Count];
        _uniqueValues.CopyTo(values);
        return values;
    }

    /// <summary>Returns the index of the first element of <paramref name="span"/> that is in the value set, or -1.</summary>
    internal sealed override int IndexOfAny(ReadOnlySpan<string> span) =>
        IndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);

    /// <summary>Same as <see cref="IndexOfAny(ReadOnlySpan{string})"/>, with the membership test passed through the <c>Negate</c> negator.</summary>
    internal sealed override int IndexOfAnyExcept(ReadOnlySpan<string> span) =>
        IndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);

    /// <summary>Returns the index of the last element of <paramref name="span"/> that is in the value set, or -1.</summary>
    internal sealed override int LastIndexOfAny(ReadOnlySpan<string> span) =>
        LastIndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);

    /// <summary>Same as <see cref="LastIndexOfAny(ReadOnlySpan{string})"/>, with the membership test passed through the <c>Negate</c> negator.</summary>
    internal sealed override int LastIndexOfAnyExcept(ReadOnlySpan<string> span) =>
        LastIndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);

    // Forward scan: returns the first index whose (possibly negated) membership test succeeds, or -1.
    private int IndexOfAny<TNegator>(ReadOnlySpan<string> span)
        where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
    {
        for (int i = 0; i < span.Length; i++)
        {
            if (TNegator.NegateIfNeeded(_uniqueValues.Contains(span[i])))
            {
                return i;
            }
        }

        return -1;
    }

    // Backward scan: returns the last index whose (possibly negated) membership test succeeds, or -1.
    private int LastIndexOfAny<TNegator>(ReadOnlySpan<string> span)
        where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
    {
        for (int i = span.Length - 1; i >= 0; i--)
        {
            if (TNegator.NegateIfNeeded(_uniqueValues.Contains(span[i])))
            {
                return i;
            }
        }

        return -1;
    }
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace System.Buffers
|
||||
{
|
||||
/// <summary>
/// Base class for string <see cref="SearchValues{T}"/> implementations that keep a
/// <see cref="RabinKarp"/> instance available as a fallback search.
/// </summary>
/// <typeparam name="TCaseSensitivity">The case sensitivity passed to the Rabin-Karp search.</typeparam>
internal abstract class StringSearchValuesRabinKarp<TCaseSensitivity> : StringSearchValuesBase
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    private readonly RabinKarp _rabinKarp;

    /// <summary>Builds the Rabin-Karp state from <paramref name="values"/> and forwards the value set to the base class.</summary>
    /// <param name="values">The values to search for.</param>
    /// <param name="uniqueValues">The set of values used by the base class for membership checks.</param>
    // Abstract types can only be constructed through derived types, so the constructor is protected.
    protected StringSearchValuesRabinKarp(ReadOnlySpan<string> values, HashSet<string> uniqueValues) : base(uniqueValues) =>
        _rabinKarp = new RabinKarp(values);

    /// <summary>Returns the result of the Rabin-Karp <c>IndexOfAny</c> for <paramref name="span"/>.</summary>
    // NOTE(review): the name suggests this is meant for inputs too short for a vectorized path - confirm against derived types.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    protected int ShortInputFallback(ReadOnlySpan<char> span) =>
        _rabinKarp.IndexOfAny<TCaseSensitivity>(span);
}
|
||||
}
|
|
@ -1264,7 +1264,7 @@ namespace System
|
|||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
[CompExactlyDependsOn(typeof(Avx2))]
|
||||
private static Vector256<byte> FixUpPackedVector256Result(Vector256<byte> result)
|
||||
internal static Vector256<byte> FixUpPackedVector256Result(Vector256<byte> result)
|
||||
{
|
||||
Debug.Assert(Avx2.IsSupported);
|
||||
// Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in
|
||||
|
@ -1276,14 +1276,12 @@ namespace System
|
|||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
[CompExactlyDependsOn(typeof(Avx512F))]
|
||||
private static Vector512<byte> FixUpPackedVector512Result(Vector512<byte> result)
|
||||
internal static Vector512<byte> FixUpPackedVector512Result(Vector512<byte> result)
|
||||
{
|
||||
Debug.Assert(Avx512F.IsSupported);
|
||||
// Avx512BW.PackUnsignedSaturate(Vector512.Create((short)1), Vector512.Create((short)2)) will result in
|
||||
// 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
|
||||
// We want to swap the X and Y bits
|
||||
// 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2
|
||||
return Avx512F.PermuteVar8x64(result.AsInt64(), Vector512.Create((long)0, 2, 4, 6, 1, 3, 5, 7)).AsByte();
|
||||
// Avx512BW.PackUnsignedSaturate will interleave the inputs in 8-byte blocks.
|
||||
// We want to preserve the order of the two input vectors, so we deinterleave the packed value.
|
||||
return Avx512F.PermuteVar8x64(result.AsInt64(), Vector512.Create(0, 2, 4, 6, 1, 3, 5, 7)).AsByte();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7392,6 +7392,7 @@ namespace System.Buffers
|
|||
{
|
||||
public static System.Buffers.SearchValues<byte> Create(System.ReadOnlySpan<byte> values) { throw null; }
|
||||
public static System.Buffers.SearchValues<char> Create(System.ReadOnlySpan<char> values) { throw null; }
|
||||
public static System.Buffers.SearchValues<string> Create(System.ReadOnlySpan<string> values, System.StringComparison comparisonType) { throw null; }
|
||||
}
|
||||
public partial interface IPinnable
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue