1
0
Fork 0
mirror of https://github.com/VSadov/Satori.git synced 2025-06-09 09:34:49 +09:00

Add SearchValues<string> (#88394)

* Add SearchValues<string>
This commit is contained in:
Miha Zupan 2023-08-20 19:37:20 +02:00 committed by GitHub
parent ac02b66e41
commit 899461780a
Signed by: github
GPG key ID: 4AEE18F83AFDEB23
37 changed files with 4372 additions and 94 deletions

View file

@ -1270,3 +1270,30 @@ Licensed under the Apache License, Version 2.0.
Available at
https://github.com/SixLabors/ImageSharp/blob/f4f689ce67ecbcc35cebddba5aacb603e6d1068a/LICENSE
License for the Teddy multi-substring searching implementation
--------------------------------------
https://github.com/BurntSushi/aho-corasick
The MIT License (MIT)
Copyright (c) 2015 Andrew Gallant
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -235,6 +235,8 @@ namespace System
public static bool Contains(this System.ReadOnlySpan<char> span, System.ReadOnlySpan<char> value, System.StringComparison comparisonType) { throw null; }
public static bool Contains<T>(this System.ReadOnlySpan<T> span, T value) where T : System.IEquatable<T>? { throw null; }
public static bool Contains<T>(this System.Span<T> span, T value) where T : System.IEquatable<T>? { throw null; }
public static bool ContainsAny(this System.ReadOnlySpan<char> span, System.Buffers.SearchValues<string> values) { throw null; }
public static bool ContainsAny(this System.Span<char> span, System.Buffers.SearchValues<string> values) { throw null; }
public static bool ContainsAny<T>(this System.ReadOnlySpan<T> span, System.Buffers.SearchValues<T> values) where T : System.IEquatable<T>? { throw null; }
public static bool ContainsAny<T>(this System.ReadOnlySpan<T> span, System.ReadOnlySpan<T> values) where T : System.IEquatable<T>? { throw null; }
public static bool ContainsAny<T>(this System.ReadOnlySpan<T> span, T value0, T value1) where T : System.IEquatable<T>? { throw null; }
@ -272,6 +274,8 @@ namespace System
public static System.Text.SpanRuneEnumerator EnumerateRunes(this System.Span<char> span) { throw null; }
public static bool Equals(this System.ReadOnlySpan<char> span, System.ReadOnlySpan<char> other, System.StringComparison comparisonType) { throw null; }
public static int IndexOf(this System.ReadOnlySpan<char> span, System.ReadOnlySpan<char> value, System.StringComparison comparisonType) { throw null; }
public static int IndexOfAny(this System.ReadOnlySpan<char> span, System.Buffers.SearchValues<string> values) { throw null; }
public static int IndexOfAny(this System.Span<char> span, System.Buffers.SearchValues<string> values) { throw null; }
public static int IndexOfAny<T>(this System.ReadOnlySpan<T> span, System.Buffers.SearchValues<T> values) where T : System.IEquatable<T>? { throw null; }
public static int IndexOfAny<T>(this System.ReadOnlySpan<T> span, System.ReadOnlySpan<T> values) where T : System.IEquatable<T>? { throw null; }
public static int IndexOfAny<T>(this System.ReadOnlySpan<T> span, T value0, T value1) where T : System.IEquatable<T>? { throw null; }

View file

@ -0,0 +1,519 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Buffers;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Runtime.ExceptionServices;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.DotNet.RemoteExecutor;
using Xunit;
namespace System.Memory.Tests.Span
{
public static class StringSearchValuesTests
{
// True when RemoteExecutor is supported; RunUsingInvariantCulture asserts this before launching.
public static bool CanTestInvariantCulture
{
    get { return RemoteExecutor.IsSupported; }
}

// True when RemoteExecutor is supported and the OS is Windows; RunUsingNLS asserts this before launching.
public static bool CanTestNls
{
    get { return RemoteExecutor.IsSupported && OperatingSystem.IsWindows(); }
}
[Theory]
[InlineData(StringComparison.Ordinal, "a")]
[InlineData(StringComparison.Ordinal, "A")]
[InlineData(StringComparison.Ordinal, "a", "ab", "abc", "bc")]
[InlineData(StringComparison.Ordinal, "A", "ab", "aBc", "Bc")]
[InlineData(StringComparison.OrdinalIgnoreCase, "a")]
[InlineData(StringComparison.OrdinalIgnoreCase, "A")]
[InlineData(StringComparison.OrdinalIgnoreCase, "A", "a")]
[InlineData(StringComparison.OrdinalIgnoreCase, "a", "Ab", "abc", "bC")]
public static void Values_ImplementsSearchValuesBase(StringComparison comparisonType, params string[] values)
{
    // Treats the SearchValues<string> instance as a set of whole strings and exercises
    // Contains plus the IndexOfAny / IndexOfAnyExcept / LastIndexOfAny / LastIndexOfAnyExcept /
    // ContainsAny / ContainsAnyExcept extensions over spans of strings.
    const string Absent = "Hello world";
    bool ignoreCase = comparisonType == StringComparison.OrdinalIgnoreCase;
    SearchValues<string> set = SearchValues.Create(values, comparisonType);

    Assert.False(set.Contains(Absent));
    Check(Span<string>.Empty, -1, -1, -1, -1);
    Check(new string[] { Absent }, -1, 0, -1, 0);
    Check(new string[] { Absent, Absent }, -1, 0, -1, 1);

    foreach (string value in values)
    {
        // Produce a variant of the value that differs only by casing.
        string swapped = value.ToLowerInvariant();
        if (swapped == value)
        {
            swapped = value.ToUpperInvariant();
            Assert.NotEqual(value, swapped);
        }

        Assert.True(set.Contains(value));
        Assert.Equal(ignoreCase, set.Contains(swapped));

        CheckAsMember(value);

        if (!ignoreCase)
        {
            // With ordinal comparison the re-cased value must behave like any other absent string.
            Check(new string[] { swapped }, -1, 0, -1, 0);
            Check(new string[] { swapped, swapped }, -1, 0, -1, 1);
            Check(new string[] { swapped, Absent }, -1, 0, -1, 1);
            Check(new string[] { Absent, swapped }, -1, 0, -1, 1);
            Check(new string[] { swapped, Absent, Absent }, -1, 0, -1, 2);
        }
        else
        {
            // When ignoring case the re-cased value must behave exactly like the original.
            CheckAsMember(swapped);
        }
    }

    // Runs the eight haystack layouts that combine one string that is in the set with the absent string.
    void CheckAsMember(string member)
    {
        Check(new string[] { member }, 0, -1, 0, -1);
        Check(new string[] { member, member }, 0, -1, 1, -1);
        Check(new string[] { member, Absent }, 0, 1, 0, 1);
        Check(new string[] { member, Absent, Absent }, 0, 1, 0, 2);
        Check(new string[] { Absent, member }, 1, 0, 1, 0);
        Check(new string[] { Absent, Absent, member }, 2, 0, 2, 1);
        Check(new string[] { Absent, member, Absent }, 1, 0, 1, 2);
        Check(new string[] { member, Absent, member }, 0, 1, 2, 1);
    }

    // Asserts every search flavour against both the Span and the ReadOnlySpan overloads.
    void Check(Span<string> haystack, int any, int anyExcept, int last, int lastExcept)
    {
        ReadOnlySpan<string> readOnlyHaystack = haystack;
        bool expectAny = any >= 0;
        bool expectAnyExcept = anyExcept >= 0;

        // The expectations themselves must be self-consistent.
        Assert.Equal(expectAny, last >= 0);
        Assert.Equal(expectAnyExcept, lastExcept >= 0);

        Assert.Equal(any, haystack.IndexOfAny(set));
        Assert.Equal(any, readOnlyHaystack.IndexOfAny(set));
        Assert.Equal(anyExcept, haystack.IndexOfAnyExcept(set));
        Assert.Equal(anyExcept, readOnlyHaystack.IndexOfAnyExcept(set));
        Assert.Equal(last, haystack.LastIndexOfAny(set));
        Assert.Equal(last, readOnlyHaystack.LastIndexOfAny(set));
        Assert.Equal(lastExcept, haystack.LastIndexOfAnyExcept(set));
        Assert.Equal(lastExcept, readOnlyHaystack.LastIndexOfAnyExcept(set));
        Assert.Equal(expectAny, haystack.ContainsAny(set));
        Assert.Equal(expectAny, readOnlyHaystack.ContainsAny(set));
        Assert.Equal(expectAnyExcept, haystack.ContainsAnyExcept(set));
        Assert.Equal(expectAnyExcept, readOnlyHaystack.ContainsAnyExcept(set));
    }
}
[Theory]
// Sets with empty values
[InlineData(StringComparison.Ordinal, 0, " ", "abc, ")]
[InlineData(StringComparison.OrdinalIgnoreCase, 0, " ", "abc, ")]
[InlineData(StringComparison.Ordinal, 0, "", "")]
[InlineData(StringComparison.OrdinalIgnoreCase, 0, "", "abc, ")]
// Empty sets
[InlineData(StringComparison.Ordinal, -1, " ", null)]
[InlineData(StringComparison.OrdinalIgnoreCase, -1, " ", null)]
[InlineData(StringComparison.Ordinal, -1, "", null)]
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "", null)]
// A few simple cases
[InlineData(StringComparison.Ordinal, 1, "xbc", "abc, bc")]
[InlineData(StringComparison.Ordinal, 0, "foobar", "foo, bar")]
[InlineData(StringComparison.Ordinal, 0, "barfoo", "foo, bar")]
[InlineData(StringComparison.Ordinal, 0, "foofoo", "foo, bar")]
[InlineData(StringComparison.Ordinal, 0, "barbar", "foo, bar")]
[InlineData(StringComparison.Ordinal, 4, "bafofoo", "foo, bar")]
[InlineData(StringComparison.Ordinal, 4, "bafofoo", "bar, foo")]
[InlineData(StringComparison.Ordinal, 4, "fobabar", "foo, bar")]
[InlineData(StringComparison.Ordinal, 4, "fobabar", "bar, foo")]
// Multiple potential matches - we want the first one
[InlineData(StringComparison.Ordinal, 1, "abcd", "bc, cd")]
// Simple case sensitivity
[InlineData(StringComparison.Ordinal, -1, " ABC", "abc")]
[InlineData(StringComparison.Ordinal, 1, " abc", "abc")]
[InlineData(StringComparison.OrdinalIgnoreCase, 1, " ABC", "abc")]
// A few more complex cases that test the Aho-Corasick implementation
[InlineData(StringComparison.Ordinal, 3, "RyrIGEdt2S9", "IGEdt2, G, rIGm6i")]
[InlineData(StringComparison.Ordinal, 2, "Npww1HtmO", "NVOhQu, w, XeR")]
[InlineData(StringComparison.Ordinal, 1, "08Qq6", "8, vx, BFA4s, aLP2, hm, lmT, y, CNTB, Q, vd")]
[InlineData(StringComparison.Ordinal, 3, "A4sRYUhKZR1Vn8N", "F, scsx, nWBhrx, Q, 7Of, BX, huoJ, R")]
[InlineData(StringComparison.Ordinal, 9, "40sufu3TdzcKQfK", "3MXvo26, zPd6t, zc, c5, ypUCK3A9, K, YlX")]
[InlineData(StringComparison.Ordinal, 0, "111KtTGeWuV", "11, B51tJ, Z, j0DWudC, kuJRbcovn, 0T2vnT9")]
[InlineData(StringComparison.Ordinal, 5, "Uykbt1zWw7wylEgC", "1zWw7, Bh, 7qDgAY, w, Z, dP, V, W, Hiols, T")]
[InlineData(StringComparison.Ordinal, 6, "PI9yZx9AOWrUR", "4, A, MLbg, jACE, x9AZEYPbLr, 4bYTzw, W, 9AOW, O")]
[InlineData(StringComparison.Ordinal, 7, "KV4cRyrIGEdt2S9kbXVK", "e64, 10Yw7k, IGEdt2, G, brL, rIGm6i, Z3, FHoVN, 7P2s")]
// OrdinalIgnoreCase does not match ASCII chars with non-ASCII ones
[InlineData(StringComparison.OrdinalIgnoreCase, 4, "AAAA\u212ABKBkBBCCCC", "\u212A")]
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAAKB\u212ABkBBCCCC", "\u212A")]
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAAkB\u212ABKBBCCCC", "\u212A")]
[InlineData(StringComparison.OrdinalIgnoreCase, 4, "AAAA\u017FBSBsBBCCCC", "\u017F")]
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAASB\u017FBsBBCCCC", "\u017F")]
[InlineData(StringComparison.OrdinalIgnoreCase, 6, "AAAAsB\u017FBSBBCCCC", "\u017F")]
// A few misc non-ASCII examples
[InlineData(StringComparison.OrdinalIgnoreCase, 2, "\0\u1226\u2C5F\0\n\0\u1226\u1242", "hh\u0012\uFE00\u26FF\0\u6C00\u2C00\0b, \u2C5F\0")]
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "barkbarK", "foo, bar\u212A")]
[InlineData(StringComparison.OrdinalIgnoreCase, 4, "bar\u212AbarK", "foo, bark")]
[InlineData(StringComparison.OrdinalIgnoreCase, 0, "bar\u03A3barK", "foo, bar\u03C3")]
[InlineData(StringComparison.OrdinalIgnoreCase, 1, "bar\u03A3barK", "foo, ar\u03C3")]
[InlineData(StringComparison.OrdinalIgnoreCase, 1, " foo\u0131", "foo\u0131")]
[InlineData(StringComparison.OrdinalIgnoreCase, 1, " foo\u0131", "bar, foo\u0131")]
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "fooifooIfoo\u0130", "bar, foo\u0131")]
[InlineData(StringComparison.OrdinalIgnoreCase, -1, "fooifooIfoo\u0131", "bar, foo\u0130")]
public static void IndexOfAny(StringComparison comparisonType, int expected, string text, string? values)
{
    // A mutable copy of the text so that the Span<char> overloads get exercised too.
    Span<char> mutableText = text.ToArray();

    // The needles arrive as one ", "-separated string; null stands for an empty set.
    string[] needles = values?.Split(", ") ?? Array.Empty<string>();
    SearchValues<string> searchValues = SearchValues.Create(needles, comparisonType);
    bool expectMatch = expected >= 0;

    // First make sure the expectation agrees with the straightforward reference implementation.
    Assert.Equal(expected, IndexOfAnyReferenceImpl(text, needles, comparisonType));

    Assert.Equal(expected, text.AsSpan().IndexOfAny(searchValues));
    Assert.Equal(expected, mutableText.IndexOfAny(searchValues));

    Assert.Equal(expectMatch, text.AsSpan().ContainsAny(searchValues));
    Assert.Equal(expectMatch, mutableText.ContainsAny(searchValues));
}
[Fact]
public static void IndexOfAny_InvalidUtf16()
{
    // These inputs are deliberately kept out of [InlineData] so that Xunit cannot alter the
    // malformed strings. Every case but the last contains a high surrogate that is not followed
    // by a low surrogate; the last case searches for a low surrogate on its own.
    (StringComparison Comparison, int Expected, string Text, string Values)[] cases =
    {
        (StringComparison.Ordinal, 1, " foo\uD800bar", "foo\uD800bar, bar\uD800foo"),
        (StringComparison.Ordinal, -1, " foo\uD801bar", "foo\uD800bar, bar\uD800foo"),
        (StringComparison.Ordinal, 2, " foo\uD800bar", "oo\uD800bar, bar\uD800foo"),
        (StringComparison.Ordinal, -1, " foo\uD801bar", "oo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, 1, " foo\uD800bar", "foo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, -1, " foo\uD801bar", "foo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, 2, " foo\uD800bar", "oo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, -1, " foo\uD801bar", "oo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, 1, " fOo\uD800bar", "Foo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, -1, " fOo\uD801bar", "Foo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, 2, " foo\uD800bAr", "Oo\uD800bar, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, -1, " foO\uD801bar", "oo\uD800baR, bar\uD800foo"),
        (StringComparison.OrdinalIgnoreCase, 1, "\uD801\uDCD8\uD8FB\uDCD8", "foo, \uDCD8"),
    };

    foreach ((StringComparison comparison, int expected, string text, string needles) in cases)
    {
        IndexOfAny(comparison, expected, text, needles);
    }
}
[Fact]
public static void IndexOfAny_CanProduceDifferentResultsUnderNls()
{
// Pins the OrdinalIgnoreCase expectations for U+16E40 vs. U+16E60 under two globalization
// configurations. Each half is skipped when the corresponding capability flag is false.
if (CanTestInvariantCulture)
{
// Launched with DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=true: the two code points are
// expected to match each other when ignoring case.
RunUsingInvariantCulture(static () =>
{
IndexOfAny(StringComparison.OrdinalIgnoreCase, 1, " \U00016E40", "\U00016E60");
IndexOfAny(StringComparison.OrdinalIgnoreCase, 1, " \U00016E40abc", "\U00016E60, abc");
IndexOfAny(StringComparison.OrdinalIgnoreCase, 1, " abc\U00016E40", "abc\U00016E60");
});
}
if (CanTestNls)
{
// Launched with DOTNET_SYSTEM_GLOBALIZATION_USENLS=true: the same code points are
// expected NOT to match, so only the plain "abc" needle can be found.
RunUsingNLS(static () =>
{
IndexOfAny(StringComparison.OrdinalIgnoreCase, -1, " \U00016E40", "\U00016E60");
IndexOfAny(StringComparison.OrdinalIgnoreCase, 3, " \U00016E40abc", "\U00016E60, abc");
IndexOfAny(StringComparison.OrdinalIgnoreCase, -1, " abc\U00016E40", "abc\U00016E60");
});
}
}
[Fact]
public static void Create_OnlyOrdinalComparisonIsSupported()
{
    // Walks every StringComparison member: the two ordinal modes must be accepted,
    // every other mode must be rejected with an ArgumentException.
    foreach (StringComparison comparisonType in Enum.GetValues<StringComparison>())
    {
        switch (comparisonType)
        {
            case StringComparison.Ordinal:
            case StringComparison.OrdinalIgnoreCase:
                _ = SearchValues.Create(new[] { "abc" }, comparisonType);
                break;

            default:
                Assert.Throws<ArgumentException>(() => SearchValues.Create(new[] { "abc" }, comparisonType));
                break;
        }
    }
}
[Fact]
public static void Create_ThrowsOnNullValues()
{
    // A null entry anywhere in the set must be reported against the "values" parameter.
    var valuesWithNull = new[] { "foo", null, "bar" };

    Assert.Throws<ArgumentNullException>(
        "values",
        () => SearchValues.Create(valuesWithNull, StringComparison.Ordinal));
}
[Fact]
public static void TestIndexOfAny_RandomInputs()
{
    // Compares SearchValues-based IndexOfAny with the reference implementation on
    // pseudo-random inputs generated by the helper with its default seed.
    new StringSearchValuesTestHelper(
        IndexOfAnyReferenceImpl,
        (haystack, needles) => haystack.IndexOfAny(needles))
        .TestRandomInputs();
}
[ConditionalFact(nameof(CanTestInvariantCulture))]
public static void TestIndexOfAny_RandomInputs_InvariantCulture()
{
// Re-runs the random-input test through RunUsingInvariantCulture, which launches it with
// DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=true.
RunUsingInvariantCulture(static () =>
{
// Sanity check that the invariant mode actually took effect before testing.
Assert.Equal("Invariant Language (Invariant Country)", CultureInfo.CurrentCulture.NativeName);
TestIndexOfAny_RandomInputs();
});
}
[ConditionalFact(nameof(CanTestNls))]
public static void TestIndexOfAny_RandomInputs_Nls()
{
// Re-runs the random-input test through RunUsingNLS, which launches it with
// DOTNET_SYSTEM_GLOBALIZATION_USENLS=true.
RunUsingNLS(static () =>
{
// Sanity check that the process is not running in invariant mode.
Assert.NotEqual("Invariant Language (Invariant Country)", CultureInfo.CurrentCulture.NativeName);
TestIndexOfAny_RandomInputs();
});
}
[Fact]
[ActiveIssue("Manual execution only. Worth running any time SearchValues<string> logic is modified.")]
public static void TestIndexOfAny_RandomInputs_Stress()
{
    // Runs the stress matrix in-process first, then again under each globalization
    // configuration that the current environment is able to test.
    RunStress();
    if (CanTestInvariantCulture)
    {
        RunUsingInvariantCulture(static () => RunStress());
    }
    if (CanTestNls)
    {
        RunUsingNLS(static () => RunStress());
    }

    // Stresses every combination of needle count, needle length and haystack length
    // for five seconds each, using a freshly seeded helper per combination.
    static void RunStress()
    {
        int[] needleCounts = { 2, 8, 20, 100 };
        int[] needleValueLengths = { 8, 40 };
        int[] haystackLengths = { 100, 1024 };

        for (int c = 0; c < needleCounts.Length; c++)
        {
            for (int v = 0; v < needleValueLengths.Length; v++)
            {
                for (int h = 0; h < haystackLengths.Length; h++)
                {
                    var helper = new StringSearchValuesTestHelper(
                        IndexOfAnyReferenceImpl,
                        (searchSpace, values) => searchSpace.IndexOfAny(values),
                        Random.Shared.Next());

                    helper.MaxNeedleCount = needleCounts[c];
                    helper.MaxNeedleValueLength = needleValueLengths[v];
                    helper.MaxHaystackLength = haystackLengths[h];
                    helper.HaystackIterationsPerNeedle = 1_000;

                    helper.StressRandomInputs(TimeSpan.FromSeconds(5));
                }
            }
        }
    }
}
// Reference implementation used to validate SearchValues<string>: searches for each value
// independently and returns the smallest match index, or -1 when no value occurs.
private static int IndexOfAnyReferenceImpl(ReadOnlySpan<char> searchSpace, ReadOnlySpan<string> values, StringComparison comparisonType)
{
    int earliest = -1;

    for (int n = 0; n < values.Length; n++)
    {
        int found = searchSpace.IndexOf(values[n], comparisonType);

        // Ignore misses; otherwise keep the match closest to the start.
        if (found >= 0 && (earliest < 0 || found < earliest))
        {
            earliest = found;
        }
    }

    return earliest;
}
// Runs the action through RemoteExecutor with an environment that contains only
// DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=true, using a 10 minute timeout.
private static void RunUsingInvariantCulture(Action action)
{
    Assert.True(CanTestInvariantCulture);

    var startInfo = new ProcessStartInfo();
    startInfo.Environment.Clear();
    startInfo.Environment["DOTNET_SYSTEM_GLOBALIZATION_INVARIANT"] = "true";

    var options = new RemoteInvokeOptions
    {
        StartInfo = startInfo,
        TimeOut = 10 * 60 * 1000,
    };

    RemoteExecutor.Invoke(action, options).Dispose();
}
// Runs the action through RemoteExecutor with an environment that contains only
// DOTNET_SYSTEM_GLOBALIZATION_USENLS=true, using a 10 minute timeout.
private static void RunUsingNLS(Action action)
{
    Assert.True(CanTestNls);

    var startInfo = new ProcessStartInfo();
    startInfo.Environment.Clear();
    startInfo.Environment["DOTNET_SYSTEM_GLOBALIZATION_USENLS"] = "true";

    var options = new RemoteInvokeOptions
    {
        StartInfo = startInfo,
        TimeOut = 10 * 60 * 1000,
    };

    RemoteExecutor.Invoke(action, options).Dispose();
}
private sealed class StringSearchValuesTestHelper
{
// Signature of the reference search: haystack, raw needle strings and comparison mode in, index out.
public delegate int IndexOfAnySearchDelegate(ReadOnlySpan<char> searchSpace, ReadOnlySpan<string> values, StringComparison comparisonType);
// Signature of the search under test: haystack and a prebuilt SearchValues<string> in, index out.
public delegate int SearchValuesSearchDelegate(ReadOnlySpan<char> searchSpace, SearchValues<string> values);
// Upper bound on the number of needles per generated set (each set holds between 1 and this many).
public int MaxNeedleCount = 20;
// Upper bound on the length requested for each random needle slice.
public int MaxNeedleValueLength = 10;
// Upper bound on the length requested for each random haystack slice.
public int MaxHaystackLength = 100;
// How many random haystacks are tried against each generated needle set (once per comparison mode).
public int HaystackIterationsPerNeedle = 50;
// Needle slices shorter than this are discarded and drawn again.
public int MinValueLength = 1;
// The reference implementation and the implementation under test.
private readonly IndexOfAnySearchDelegate _expectedDelegate;
private readonly SearchValuesSearchDelegate _searchValuesDelegate;
// Pre-generated pools of random characters (filled by the constructor) from which
// haystacks and needles are sliced.
private readonly char[] _randomAsciiChars;
private readonly char[] _randomSimpleAsciiChars;
private readonly char[] _randomChars;
// Stores the two search delegates and fills the three random character pools from a
// Random seeded with rngSeed: all ASCII values (0-127), ASCII letters and digits only,
// and arbitrary 16-bit values.
public StringSearchValuesTestHelper(IndexOfAnySearchDelegate expected, SearchValuesSearchDelegate searchValues, int rngSeed = 42)
{
    // Digits first, then lowercase, then uppercase: 10 + 26 + 26 = 62 entries.
    const string SimpleAsciiAlphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    _expectedDelegate = expected;
    _searchValuesDelegate = searchValues;

    _randomAsciiChars = new char[100 * 1024];
    _randomSimpleAsciiChars = new char[100 * 1024];
    _randomChars = new char[1024 * 1024];

    var rng = new Random(rngSeed);

    // The pools are filled in a fixed order so that a given seed always yields the same data.
    foreach (ref char slot in _randomAsciiChars.AsSpan())
    {
        slot = (char)rng.Next(0, 128);
    }

    foreach (ref char slot in _randomSimpleAsciiChars.AsSpan())
    {
        slot = SimpleAsciiAlphabet[rng.Next(26 * 2 + 10)];
    }

    rng.NextBytes(MemoryMarshal.AsBytes(_randomChars.AsSpan()));
}
// Repeatedly runs single-iteration random tests on several workers until the given duration
// elapses or a worker fails, then rethrows the first captured failure (if any) with its
// original stack trace.
public void StressRandomInputs(TimeSpan duration)
{
    ExceptionDispatchInfo? firstFailure = null;
    Stopwatch stopwatch = Stopwatch.StartNew();

    // Leave one core free, but always schedule at least one worker: on a single-core machine
    // "ProcessorCount - 1" is 0, Parallel.For would run no iterations, and the stress test
    // would pass without testing anything.
    int workerCount = Math.Max(1, Environment.ProcessorCount - 1);

    Parallel.For(0, workerCount, _ =>
    {
        while (stopwatch.Elapsed < duration && Volatile.Read(ref firstFailure) is null)
        {
            try
            {
                TestRandomInputs(iterationCount: 1, rng: new Random());
            }
            catch (Exception ex)
            {
                // Publish atomically and keep only the first failure; a later failure on
                // another worker must not overwrite it.
                Interlocked.CompareExchange(ref firstFailure, ExceptionDispatchInfo.Capture(ex), null);
            }
        }
    });

    firstFailure?.Throw();
}
// Runs iterationCount rounds; each round tests every haystack-pool / needle-pool pairing
// listed below. When no Random is supplied a fixed seed (42) is used.
public void TestRandomInputs(int iterationCount = 1_000, Random? rng = null)
{
    rng ??= new Random(42);

    // ASCII needles have more interesting corner cases, so pairings that involve them dominate.
    (char[] Haystack, char[] Needle)[] pairings =
    {
        (_randomSimpleAsciiChars, _randomSimpleAsciiChars),
        (_randomAsciiChars, _randomSimpleAsciiChars),
        (_randomSimpleAsciiChars, _randomAsciiChars),
        (_randomAsciiChars, _randomAsciiChars),
        (_randomChars, _randomSimpleAsciiChars),
        (_randomChars, _randomAsciiChars),
        (_randomChars, _randomChars),
    };

    for (int round = 0; round < iterationCount; round++)
    {
        foreach ((char[] haystackPool, char[] needlePool) in pairings)
        {
            Test(rng, haystackPool, needlePool);
        }
    }
}
// Builds one random needle set from needleRandom, creates the Ordinal and OrdinalIgnoreCase
// SearchValues for it, and checks both against HaystackIterationsPerNeedle random haystacks.
private void Test(Random rng, ReadOnlySpan<char> haystackRandom, ReadOnlySpan<char> needleRandom)
{
    int needleCount = rng.Next(MaxNeedleCount) + 1;
    var needles = new string[needleCount];

    for (int n = 0; n < needles.Length; n++)
    {
        // Keep drawing slices until one satisfies the minimum length.
        ReadOnlySpan<char> candidate = GetRandomSlice(rng, needleRandom, MaxNeedleValueLength);
        while (candidate.Length < MinValueLength)
        {
            candidate = GetRandomSlice(rng, needleRandom, MaxNeedleValueLength);
        }

        needles[n] = candidate.ToString();
    }

    SearchValues<string> ordinal = SearchValues.Create(needles, StringComparison.Ordinal);
    SearchValues<string> ordinalIgnoreCase = SearchValues.Create(needles, StringComparison.OrdinalIgnoreCase);

    int remaining = HaystackIterationsPerNeedle;
    while (remaining-- > 0)
    {
        Test(rng, StringComparison.Ordinal, haystackRandom, needles, ordinal);
        Test(rng, StringComparison.OrdinalIgnoreCase, haystackRandom, needles, ordinalIgnoreCase);
    }
}
// Draws one random haystack and reports a failure when the implementation under test
// disagrees with the reference implementation.
private void Test(Random rng, StringComparison comparisonType, ReadOnlySpan<char> haystackRandom,
    string[] needle, SearchValues<string> searchValuesInstance)
{
    ReadOnlySpan<char> haystack = GetRandomSlice(rng, haystackRandom, MaxHaystackLength);

    int expected = _expectedDelegate.Invoke(haystack, needle, comparisonType);
    int actual = _searchValuesDelegate.Invoke(haystack, searchValuesInstance);

    if (expected == actual)
    {
        return;
    }

    AssertionFailed(haystack, needle, searchValuesInstance, comparisonType, expected, actual);
}
// Returns a random window of the span: a random start in [0, span.Length] followed by a random
// length in [0, maxLength], clamped to what is left after the start.
private static ReadOnlySpan<T> GetRandomSlice<T>(Random rng, ReadOnlySpan<T> span, int maxLength)
{
    // The start is drawn before the length; the order matters for seeded generators.
    int start = rng.Next(span.Length + 1);
    int available = span.Length - start;
    int length = Math.Min(available, rng.Next(maxLength + 1));

    return span.Slice(start, length);
}
// Fails the test with a message describing the SearchValues implementation type, the
// comparison mode, the needles and the haystack that produced the mismatch.
private static void AssertionFailed(ReadOnlySpan<char> haystack, string[] needle, SearchValues<string> searchValues, StringComparison comparisonType, int expected, int actual)
{
    Type implType = searchValues.GetType();
    string genericArguments = string.Join(", ", implType.GenericTypeArguments.Select(t => t.Name));
    string impl = $"{implType.Name} [{genericArguments}]";

    string readableHaystack = Describe(haystack.ToString());
    string readableNeedle = string.Join(", ", needle.Select(Describe));

    string message = $"Expected {expected}, got {actual} for impl='{impl}' comparison={comparisonType} needle='{readableNeedle}', haystack='{readableHaystack}'";
    Assert.True(false, message);

    // Purely alphanumeric ASCII text is shown as-is; anything else is rendered as the list of
    // its UTF-16 code unit values.
    static string Describe(string value)
    {
        if (value.All(char.IsAsciiLetterOrDigit))
        {
            return value;
        }

        return $"[ {string.Join(", ", value.Select(unit => int.CreateChecked(unit)))} ]";
    }
}
}
}
}

View file

@ -18,14 +18,13 @@
<Compile Include="MemoryMarshal\CreateSpan.cs" />
<Compile Include="MemoryMarshal\CreateReadOnlySpan.cs" />
<Compile Include="MemoryMarshal\CreateReadOnlySpanFromNullTerminated.cs" />
<Compile Include="$(CommonPath)..\tests\System\RealFormatterTestsBase.cs"
Link="ParsersAndFormatters\Formatter\RealFormatterTestsBase.cs" />
<Compile Include="$(CommonPath)..\tests\System\RealFormatterTestsBase.cs" Link="ParsersAndFormatters\Formatter\RealFormatterTestsBase.cs" />
<Compile Include="ParsersAndFormatters\Formatter\RealFormatterTests.cs" />
<Compile Include="$(CommonPath)..\tests\System\RealParserTestsBase.cs"
Link="ParsersAndFormatters\Parser\RealParserTestsBase.cs" />
<Compile Include="$(CommonPath)..\tests\System\RealParserTestsBase.cs" Link="ParsersAndFormatters\Parser\RealParserTestsBase.cs" />
<Compile Include="ParsersAndFormatters\Parser\RealParserTests.cs" />
<Compile Include="ReadOnlySpan\Contains.byte.cs" />
<Compile Include="ReadOnlySpan\Contains.T.cs" />
<Compile Include="Span\StringSearchValues.cs" />
<Compile Include="Span\Reflection.cs" />
<Compile Include="SequenceReader\Advance.cs" />
<Compile Include="SequenceReader\BasicTests.cs" />
@ -276,9 +275,7 @@
<Compile Include="Base64\Base64ValidationUnitTests.cs" />
</ItemGroup>
<ItemGroup>
<Compile Include="$(CommonTestPath)System\Buffers\NativeMemoryManager.cs"
Link="Common\System\Buffers\NativeMemoryManager.cs" />
<Compile Include="$(CommonPath)System\MutableDecimal.cs"
Link="Common\System\MutableDecimal.cs" />
<Compile Include="$(CommonTestPath)System\Buffers\NativeMemoryManager.cs" Link="Common\System\Buffers\NativeMemoryManager.cs" />
<Compile Include="$(CommonPath)System\MutableDecimal.cs" Link="Common\System\MutableDecimal.cs" />
</ItemGroup>
</Project>

View file

@ -4250,4 +4250,7 @@
<data name="OutOfMemory_StringTooLong" xml:space="preserve">
<value>String length exceeded supported range.</value>
</data>
<data name="Argument_SearchValues_UnsupportedStringComparison" xml:space="preserve">
<value>SearchValues&lt;string&gt; supports only StringComparison.Ordinal and StringComparison.OrdinalIgnoreCase.</value>
</data>
</root>

View file

@ -440,6 +440,27 @@
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\SearchValuesDebugView.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\EmptySearchValues.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\ProbabilisticMap.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\AhoCorasick.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\AhoCorasickBuilder.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\AhoCorasickNode.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\CharacterFrequencyHelper.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\EightPackedReferences.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\RabinKarp.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\StringSearchValuesHelper.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\TeddyBucketizer.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\Helpers\TeddyHelper.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBucketizedN2.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBucketizedN3.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyNonBucketizedN2.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyNonBucketizedN3.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\AsciiStringSearchValuesTeddyBase.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\MultiStringIgnoreCaseSearchValuesFallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesThreeChars.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\SingleStringSearchValuesFallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValues.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesBase.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesAhoCorasick.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SearchValues\Strings\StringSearchValuesRabinKarp.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\IndexOutOfRangeException.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\InsufficientExecutionStackException.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\InsufficientMemoryException.cs" />

View file

@ -16,6 +16,9 @@ namespace System.Globalization
Debug.Assert(char.IsLowSurrogate(l));
UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(h, l)), out hr, out lr);
Debug.Assert(char.IsHighSurrogate(hr));
Debug.Assert(char.IsLowSurrogate(lr));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@ -25,6 +28,9 @@ namespace System.Globalization
Debug.Assert(char.IsLowSurrogate(l));
UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(CharUnicodeInfo.ToLower(UnicodeUtility.GetScalarFromUtf16SurrogatePair(h, l)), out hr, out lr);
Debug.Assert(char.IsHighSurrogate(hr));
Debug.Assert(char.IsLowSurrogate(lr));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]

View file

@ -190,6 +190,24 @@ namespace System.Globalization
return dst;
}
/// <summary>
/// Upper-cases a single character, choosing the casing routine from the active globalization mode:
/// <c>InvariantModeCasing</c> in invariant mode, <c>OrdinalCasing</c> when NLS is not in use, and
/// otherwise the ASCII fast path or <c>Invariant.ChangeCase</c> depending on whether the character is ASCII.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToUpperOrdinal(char c)
{
    if (GlobalizationMode.Invariant)
    {
        return InvariantModeCasing.ToUpper(c);
    }

    if (!GlobalizationMode.UseNls)
    {
        return OrdinalCasing.ToUpper(c);
    }

    // NLS: ASCII characters take the ASCII-only path, everything else goes through Invariant.
    if (char.IsAscii(c))
    {
        return ToUpperAsciiInvariant(c);
    }

    return Invariant.ChangeCase(c, toUpper: true);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void ChangeCaseToLower(ReadOnlySpan<char> source, Span<char> destination)
{
@ -436,7 +454,7 @@ namespace System.Globalization
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static char ToUpperAsciiInvariant(char c)
internal static char ToUpperAsciiInvariant(char c)
{
if (char.IsAsciiLetterLower(c))
{

View file

@ -416,6 +416,11 @@ namespace System
public static bool ContainsAny<T>(this Span<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
ContainsAny((ReadOnlySpan<T>)span, values);
/// <inheritdoc cref="ContainsAny(ReadOnlySpan{char}, SearchValues{string})"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool ContainsAny(this Span<char> span, SearchValues<string> values)
{
    // Forward to the read-only overload.
    ReadOnlySpan<char> readOnlySpan = span;
    return ContainsAny(readOnlySpan, values);
}
/// <inheritdoc cref="ContainsAnyExcept{T}(ReadOnlySpan{T}, T)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool ContainsAnyExcept<T>(this Span<T> span, T value) where T : IEquatable<T>? =>
@ -452,7 +457,7 @@ namespace System
ContainsAnyExceptInRange((ReadOnlySpan<T>)span, lowInclusive, highInclusive);
/// <summary>
/// Searches for any occurance of the specified <paramref name="value0"/> or <paramref name="value1"/>, and returns true if found. If not found, returns false.
/// Searches for any occurrence of the specified <paramref name="value0"/> or <paramref name="value1"/>, and returns true if found. If not found, returns false.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="value0">One of the values to search for.</param>
@ -462,7 +467,7 @@ namespace System
IndexOfAny(span, value0, value1) >= 0;
/// <summary>
/// Searches for any occurance of the specified <paramref name="value0"/>, <paramref name="value1"/>, or <paramref name="value2"/>, and returns true if found. If not found, returns false.
/// Searches for any occurrence of the specified <paramref name="value0"/>, <paramref name="value1"/>, or <paramref name="value2"/>, and returns true if found. If not found, returns false.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="value0">One of the values to search for.</param>
@ -473,7 +478,7 @@ namespace System
IndexOfAny(span, value0, value1, value2) >= 0;
/// <summary>
/// Searches for any occurance of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
/// Searches for any occurrence of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="values">The set of values to search for.</param>
@ -482,7 +487,7 @@ namespace System
IndexOfAny(span, values) >= 0;
/// <summary>
/// Searches for any occurance of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
/// Searches for any occurrence of any of the specified <paramref name="values"/> and returns true if found. If not found, returns false.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="values">The set of values to search for.</param>
@ -490,6 +495,15 @@ namespace System
public static bool ContainsAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
IndexOfAny(span, values) >= 0;
/// <summary>
/// Determines whether <paramref name="span"/> contains an occurrence of any of the specified substring <paramref name="values"/>.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="values">The set of substrings to search for.</param>
/// <returns><see langword="true"/> if any of the values is found; otherwise, <see langword="false"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool ContainsAny(this ReadOnlySpan<char> span, SearchValues<string> values)
{
    // A non-negative index means that at least one value was located.
    int firstMatchIndex = IndexOfAny(span, values);
    return firstMatchIndex >= 0;
}
/// <summary>
/// Searches for any value other than the specified <paramref name="value"/>.
/// </summary>
@ -1021,8 +1035,15 @@ namespace System
/// If all of the values are in <paramref name="values"/>, returns -1.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int IndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
SearchValues<T>.IndexOfAnyExcept(span, values);
public static int IndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
{
// Validate the argument in this public entry point before touching the instance.
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
// The actual search is performed by the SearchValues<T> instance.
return values.IndexOfAnyExcept(span);
}
/// <summary>Searches for the last index of any value other than the specified <paramref name="value"/>.</summary>
/// <typeparam name="T">The type of the span and values.</typeparam>
@ -1324,8 +1345,15 @@ namespace System
/// If all of the values are in <paramref name="values"/>, returns -1.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int LastIndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
SearchValues<T>.LastIndexOfAnyExcept(span, values);
public static int LastIndexOfAnyExcept<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
{
// Validate the argument in this public entry point before touching the instance.
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
// The actual search is performed by the SearchValues<T> instance.
return values.LastIndexOfAnyExcept(span);
}
/// <inheritdoc cref="IndexOfAnyInRange{T}(ReadOnlySpan{T}, T, T)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@ -1872,6 +1900,15 @@ namespace System
public static int IndexOfAny<T>(this Span<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
IndexOfAny((ReadOnlySpan<T>)span, values);
/// <summary>
/// Searches <paramref name="span"/> for the first index at which any of the specified substring values occurs.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="values">The set of substrings to search for.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int IndexOfAny(this Span<char> span, SearchValues<string> values)
{
    // View the writable span as read-only and defer to the ReadOnlySpan<char> overload.
    ReadOnlySpan<char> readOnlyView = span;
    return IndexOfAny(readOnlyView, values);
}
/// <summary>
/// Searches for the first index of any of the specified values similar to calling IndexOf several times with the logical OR operator. If not found, returns -1.
/// </summary>
@ -2058,8 +2095,31 @@ namespace System
/// <param name="span">The span to search.</param>
/// <param name="values">The set of values to search for.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int IndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
SearchValues<T>.IndexOfAny(span, values);
public static int IndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
{
// Validate the argument in this public entry point before touching the instance.
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
// The actual search is performed by the SearchValues<T> instance.
return values.IndexOfAny(span);
}
/// <summary>
/// Searches for the first index of any of the specified substring values.
/// </summary>
/// <param name="span">The span to search.</param>
/// <param name="values">The set of values to search for.</param>
/// <returns>
/// The index in <paramref name="span"/> at which the first match was found, or -1 if none of the values was found.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="values"/> is <see langword="null"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int IndexOfAny(this ReadOnlySpan<char> span, SearchValues<string> values)
{
// Validate the argument in this public entry point before touching the instance.
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
// Substring searching goes through the dedicated multi-string method rather than IndexOfAny(ReadOnlySpan<T>),
// because the searched span is made of chars while the values are strings.
return values.IndexOfAnyMultiString(span);
}
/// <summary>
/// Searches for the last index of any of the specified values similar to calling LastIndexOf several times with the logical OR operator. If not found, returns -1.
@ -2332,8 +2392,15 @@ namespace System
/// <param name="span">The span to search.</param>
/// <param name="values">The set of values to search for.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int LastIndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>? =>
SearchValues<T>.LastIndexOfAny(span, values);
public static int LastIndexOfAny<T>(this ReadOnlySpan<T> span, SearchValues<T> values) where T : IEquatable<T>?
{
// Validate the argument in this public entry point before touching the instance.
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
// The actual search is performed by the SearchValues<T> instance.
return values.LastIndexOfAny(span);
}
/// <summary>
/// Determines whether two sequences are equal by comparing the elements using IEquatable{T}.Equals(T).

View file

@ -23,5 +23,8 @@ namespace System.Buffers
internal override int LastIndexOfAnyExcept(ReadOnlySpan<T> span) =>
span.Length - 1;
// Unconditionally reports "not found" (-1) without inspecting the span.
internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span) =>
-1;
}
}

View file

@ -19,6 +19,11 @@ namespace System.Buffers
{
internal static bool IsVectorizationSupported => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported;
// Tests whether the ASCII character 'c' is marked in 'bitmap'.
// The bitmap is indexed by the character's low nibble; within the selected byte,
// the bit at the position given by the character's high nibble (0-7) marks membership.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool BitmapContains(ref Vector256<byte> bitmap, char c)
{
    // Characters above 127 are never reported as contained.
    if (c > 127)
    {
        return false;
    }

    int lowNibble = c & 0xF;
    int highNibbleBit = 1 << (c >> 4);
    return (bitmap.GetElementUnsafe(lowNibble) & highNibbleBit) != 0;
}
internal static unsafe void ComputeBitmap256(ReadOnlySpan<byte> values, out Vector256<byte> bitmap0, out Vector256<byte> bitmap1, out BitVector256 lookup)
{
// The exact format of these bitmaps differs from the other ComputeBitmap overloads as it's meant for the full [0, 255] range algorithm.
@ -1022,7 +1027,7 @@ namespace System.Buffers
{
if (typeof(T) == typeof(short))
{
result = FixUpPackedVector256Result(result);
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
}
uint mask = TNegator.ExtractMask(result);
@ -1038,7 +1043,7 @@ namespace System.Buffers
{
if (typeof(T) == typeof(short))
{
result = FixUpPackedVector256Result(result);
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
}
uint mask = TNegator.ExtractMask(result);
@ -1060,7 +1065,7 @@ namespace System.Buffers
{
if (typeof(T) == typeof(short))
{
result = FixUpPackedVector256Result(result);
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
}
uint mask = TNegator.ExtractMask(result);
@ -1076,7 +1081,7 @@ namespace System.Buffers
{
if (typeof(T) == typeof(short))
{
result = FixUpPackedVector256Result(result);
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
}
uint mask = TNegator.ExtractMask(result);
@ -1091,18 +1096,6 @@ namespace System.Buffers
return offsetInVector - Vector256<short>.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> FixUpPackedVector256Result(Vector256<byte> result)
{
    Debug.Assert(Avx2.IsSupported);

    // Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) interleaves its
    // two sources in 64-bit groups, producing
    // 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
    // Exchanging the two middle 64-bit groups (X and Y below) restores the source order:
    // 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2
    const byte SwapMiddleQuadwords = 0b_11_01_10_00;

    Vector256<long> quadwords = result.AsInt64();
    Vector256<long> reordered = Avx2.Permute4x64(quadwords, SwapMiddleQuadwords);
    return reordered.AsByte();
}
internal interface INegator
{
static abstract bool NegateIfNeeded(bool result);

View file

@ -365,8 +365,7 @@ namespace System.Buffers
if (result != Vector256<byte>.Zero)
{
// Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate).
result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte();
result = PackedSpanHelpers.FixUpPackedVector256Result(result);
uint mask = result.ExtractMostSignificantBits();
do

View file

@ -8,7 +8,8 @@ namespace System.Buffers
{
/// <summary>
/// Provides an immutable, read-only set of values optimized for efficient searching.
/// Instances are created by <see cref="SearchValues.Create(ReadOnlySpan{byte})"/> or <see cref="SearchValues.Create(ReadOnlySpan{char})"/>.
/// Instances are created by <see cref="SearchValues.Create(ReadOnlySpan{byte})"/>, <see cref="SearchValues.Create(ReadOnlySpan{char})"/>, or
/// <see cref="SearchValues.Create(ReadOnlySpan{string}, StringComparison)"/>.
/// </summary>
/// <typeparam name="T">The type of the values to search for.</typeparam>
/// <remarks>
@ -38,49 +39,8 @@ namespace System.Buffers
internal virtual int LastIndexOfAny(ReadOnlySpan<T> span) => throw new UnreachableException();
internal virtual int LastIndexOfAnyExcept(ReadOnlySpan<T> span) => throw new UnreachableException();
// Static entry point: rejects a null 'values' and then forwards to the instance method of the same name.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int IndexOfAny(ReadOnlySpan<T> span, SearchValues<T> values)
{
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
return values.IndexOfAny(span);
}
// Static entry point: rejects a null 'values' and then forwards to the instance method of the same name.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int IndexOfAnyExcept(ReadOnlySpan<T> span, SearchValues<T> values)
{
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
return values.IndexOfAnyExcept(span);
}
// Static entry point: rejects a null 'values' and then forwards to the instance method of the same name.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int LastIndexOfAny(ReadOnlySpan<T> span, SearchValues<T> values)
{
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
return values.LastIndexOfAny(span);
}
// Static entry point: rejects a null 'values' and then forwards to the instance method of the same name.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int LastIndexOfAnyExcept(ReadOnlySpan<T> span, SearchValues<T> values)
{
if (values is null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.values);
}
return values.LastIndexOfAnyExcept(span);
}
// This is only implemented and used by SearchValues<string>.
// The base implementation throws UnreachableException, so it must be overridden by any type on which it is called.
internal virtual int IndexOfAnyMultiString(ReadOnlySpan<char> span) => throw new UnreachableException();
private string DebuggerDisplay
{

View file

@ -10,8 +10,6 @@ using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86;
#pragma warning disable 8500 // address of managed types
namespace System.Buffers
{
/// <summary>
@ -167,6 +165,22 @@ namespace System.Buffers
return new ProbabilisticCharSearchValues(probabilisticValues);
}
/// <summary>
/// Builds an optimized representation of the substring <paramref name="values"/> that can be used for efficient searching.
/// <para>The only supported comparison types are <see cref="StringComparison.Ordinal"/> and <see cref="StringComparison.OrdinalIgnoreCase"/>.</para>
/// </summary>
/// <param name="values">The set of values.</param>
/// <param name="comparisonType">Selects <see cref="StringComparison.Ordinal"/> (case-sensitive) or <see cref="StringComparison.OrdinalIgnoreCase"/> (case-insensitive) search semantics.</param>
/// <exception cref="ArgumentException"><paramref name="comparisonType"/> is neither <see cref="StringComparison.Ordinal"/> nor <see cref="StringComparison.OrdinalIgnoreCase"/>.</exception>
public static SearchValues<string> Create(ReadOnlySpan<string> values, StringComparison comparisonType)
{
    // Map the two supported comparison types onto a flag and reject everything else.
    bool caseInsensitive = comparisonType switch
    {
        StringComparison.Ordinal => false,
        StringComparison.OrdinalIgnoreCase => true,
        _ => throw new ArgumentException(SR.Argument_SearchValues_UnsupportedStringComparison, nameof(comparisonType)),
    };

    return StringSearchValues.Create(values, ignoreCase: caseInsensitive);
}
private static bool TryGetSingleRange<T>(ReadOnlySpan<T> values, out T minInclusive, out T maxInclusive)
where T : struct, INumber<T>, IMinMaxValue<T>
{
@ -211,12 +225,12 @@ namespace System.Buffers
static abstract bool Value { get; }
}
private readonly struct TrueConst : IRuntimeConst
internal readonly struct TrueConst : IRuntimeConst
{
public static bool Value => true;
}
private readonly struct FalseConst : IRuntimeConst
internal readonly struct FalseConst : IRuntimeConst
{
public static bool Value => false;
}

View file

@ -0,0 +1,674 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using static System.Buffers.StringSearchValuesHelper;
using static System.Buffers.TeddyHelper;
namespace System.Buffers
{
// This is an implementation of the "Teddy" vectorized multi-substring matching algorithm.
//
// We have several vectorized string searching approaches implemented as part of SearchValues, among them are:
// - 'IndexOfAnyAsciiSearcher', which can quickly find the next position of any character in a set.
// - 'SingleStringSearchValuesThreeChars', which can determine the likely positions where a value may start.
// The fast scan for starting positions is followed by a verification step that rules out false positives.
// To reduce the number of false positives, the initial scan looks for multiple characters at different positions,
// and only considers candidates where all of those match at the same time.
//
// Teddy combines the two to search for multiple values at the same time.
// Similar to 'SingleStringSearchValuesThreeChars', it employs the starting positions scan and verification steps.
// To reduce the number of values we have to check during verification, it also checks multiple characters in the initial scan.
// We could implement that by just merging the two approaches: check for any of the value characters at position 0, 1, 2, then
// AND those results together and verify potential matches. The issue with this approach is that we would always have to check
// all values in the verification step, and we would be hitting many false positives as the number of values increased.
// For example, if you are searching for "Teddy" and "Bear", position 0 could be either 'T' or 'B', position 1 could be 'e',
// and position 2 could be 'd' or 'a'. We would do separate comparisons for each of those positions and then AND together the result.
// Because there is no correlation between the values, we would get false positives for inputs like "Bed" and "Tea",
// and we wouldn't know whether the match location was because of "Teddy" or "Bear", and thus which to proceed to verify.
//
// What is special about Teddy is how we perform that initial scan to not only determine the possible starting locations,
// but also which values are the potential matches at each of those offsets.
// Instead of encoding all starting characters at a given position into a bitmap that can only answer yes/no whether a given
// character is present in the set, we want to encode both the character and the values in which it appears.
// We only have 128* bits to work with, so we do this by encoding 8 bits of information for each nibble (half byte).
// Those 8 bits represent a bitmask of values that contain that nibble at that location.
// If we compare the input against two such bitmaps and AND the results together, we can determine which positions in the input
// contained a matching character, and which of our values matched said character at that position.
// We repeat this a few more times (checking 3 bytes or 6 nibbles for N=3) at different offsets to reduce the number of false positives.
// See 'TeddyBucketizer.GenerateNonBucketizedFingerprint' for details around how such a bitmap is constructed.
//
// For example if we are searching for strings "Teddy" and "Bear", we will look for 'T' or 'B' at position 0, 'e' at position 1, ...
// To look for 'T' (0x54) or 'B' (0x42), we will check for a high nibble of 5 or 4, and lower nibble of 4 or 2.
// Each value's presence is indicated by 1 bit. We will use 1 (0b00000001) for the first value ("Teddy") and 2 (0b00000010) for "Bear".
// Our bitmaps will look like so (1 is set for high 5 and low 4, 2 is set for high 4 and low 2):
// bitmapHigh: [0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
// bitmapLow: [0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
// ^ ^ ^ ^
//
// To map an input nibble to its corresponding bitmask, we use 'Shuffle(bitmap, nibble)'.
// For an input like "TeddyBearFactory", our result will be
// input: [T, e, d, d, y, B, e, a, r, F, a, c, t, o, r, y]
// inputHigh: [5, 6, 6, 6, 7, 4, 6, 6, 7, 4, 6, 6, 7, 6, 7, 7] (values in hex)
// inputLow: [4, 5, 4, 4, 9, 2, 5, 1, 2, 6, 1, 3, 4, F, 2, 9] (values in hex)
// resultHigh: [1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0]
// resultLow: [1, 0, 1, 1, 0, 2, 0, 0, 2, 0, 0, 0, 1, 0, 2, 0]
// result: [1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] (resultHigh & resultLow)
// ^ ^
// Note how we had quite a few false positives for individual nibbles that we ruled out after checking both nibbles.
// See 'TeddyHelper.ProcessInputN3' for details about how we combine results for multiple characters at different offsets.
//
// The description above states that we can only encode the information about 8 values. To get around that limitation
// we group multiple values together into buckets. Instead of looking for positions where a single value may match,
// we look for positions where any value from a given bucket may match.
// When creating the bitmap we don't set the bit for just one nibble value, but for each of the values in that bucket.
// For example if "Teddy" and "Bear" were both in the same bucket, the high nibble bitmap would map both 5 and 4 to the same bucket.
// We may see more false positives ('R' (0x52) and 'D' (0x44) would now also map to the same bucket), but we get to search for
// many more values at the same time. Instead of 8 values, we are now capable of looking for 8 buckets of values at the same time.
// See 'TeddyBucketizer.Bucketize' for details about how values are grouped into buckets.
// See 'TeddyBucketizer.GenerateBucketizedFingerprint' for details around how such a bitmap is constructed.
//
// Teddy works in terms of bytes, but .NET chars represent UTF-16 code units.
// We currently only use Teddy if the 2 or 3 starting characters are all ASCII. This limitation could be lifted in the future if needed.
// Since we know that all of the characters we are looking for are ASCII, we also know that only other ASCII characters will match against them.
// Making use of that fact, we narrow UTF-16 code units into bytes when reading the input (see 'TeddyHelper.LoadAndPack16AsciiChars').
// While such narrowing does corrupt non-ASCII values, they are all mapped to values outside of ASCII, so they won't match anyway.
// ASCII values remain unaffected since their high byte in UTF-16 representation is 0.
//
// To handle case-insensitive matching, all values are normalized to their uppercase equivalents ahead of time and the bitmaps are
// generated as if all characters were uppercase. During the search, the input is also transformed into uppercase before being compared.
//
// * With wider vectors (256- and 512-bit), we have more bits available, but we currently only duplicate the original 128 bits
// and perform the search on more characters at a time. We could instead choose to encode more information per nibble to trade
// the number of characters we check per loop iteration for fewer false positives we then have to rule out during the verification step.
//
// For an alternative description of the algorithm, see
// https://github.com/BurntSushi/aho-corasick/blob/8d735471fc12f0ca570cead8e17342274fae6331/src/packed/teddy/README.md
// Has an O(i * m) worst-case, with the expected time closer to O(i) for good bucket distributions.
internal abstract class AsciiStringSearchValuesTeddyBase<TBucketized, TStartCaseSensitivity, TCaseSensitivity> : StringSearchValuesRabinKarp<TCaseSensitivity>
where TBucketized : struct, SearchValues.IRuntimeConst
where TStartCaseSensitivity : struct, ICaseSensitivity // Refers to the characters being matched by Teddy
where TCaseSensitivity : struct, ICaseSensitivity // Refers to the rest of the value for the verification step
{
// We may be using N2 or N3 mode depending on whether we're checking 2 or 3 starting bytes for each bucket.
// The result of ProcessInputN2 and ProcessInputN3 are offset by 1 and 2 positions respectively (MatchStartOffsetN2 and MatchStartOffsetN3).
// See the full description of TeddyHelper.ProcessInputN3 for more details about why these constants exist.
private const int MatchStartOffsetN2 = 1;
private const int MatchStartOffsetN3 = 2;
private const int CharsPerIterationVector128 = 16;
private const int CharsPerIterationAvx2 = 32;
private const int CharsPerIterationAvx512 = 64;
// We may have up to 8 buckets.
// If we have <= 8 strings, the buckets will be the strings themselves, and TBucketized.Value will be false.
// If we have more than 8, the buckets will be string[], and TBucketized.Value will be true.
private readonly EightPackedReferences _buckets;
private readonly Vector512<byte>
_n0Low, _n0High,
_n1Low, _n1High,
_n2Low, _n2High;
// Non-bucketized constructor: each of the values is its own bucket.
// 'n' selects whether 2 or 3 starting characters are fingerprinted.
protected AsciiStringSearchValuesTeddyBase(ReadOnlySpan<string> values, HashSet<string> uniqueValues, int n)
    : base(values, uniqueValues)
{
    Debug.Assert(!TBucketized.Value);
    Debug.Assert(n is 2 or 3);

    // Reinterpret the span of strings as a span of objects so that it can be stored in the packed references.
    ref object firstValue = ref Unsafe.As<string, object>(ref MemoryMarshal.GetReference(values));
    ReadOnlySpan<object> valuesAsObjects = MemoryMarshal.CreateReadOnlySpan(ref firstValue, values.Length);
    _buckets = new EightPackedReferences(valuesAsObjects);

    // One low/high nibble fingerprint pair per starting character position.
    (_n0Low, _n0High) = TeddyBucketizer.GenerateNonBucketizedFingerprint(values, offset: 0);
    (_n1Low, _n1High) = TeddyBucketizer.GenerateNonBucketizedFingerprint(values, offset: 1);

    // The third position is only fingerprinted in N3 mode; the fields keep their default value otherwise.
    if (n is 3)
    {
        (_n2Low, _n2High) = TeddyBucketizer.GenerateNonBucketizedFingerprint(values, offset: 2);
    }
}
// Bucketized constructor: the values have already been grouped into 'buckets' (arrays of strings).
// 'n' selects whether 2 or 3 starting characters are fingerprinted.
protected AsciiStringSearchValuesTeddyBase(string[][] buckets, ReadOnlySpan<string> values, HashSet<string> uniqueValues, int n)
    : base(values, uniqueValues)
{
    Debug.Assert(TBucketized.Value);
    Debug.Assert(n is 2 or 3);

    _buckets = new EightPackedReferences(buckets);

    // One low/high nibble fingerprint pair per starting character position.
    (_n0Low, _n0High) = TeddyBucketizer.GenerateBucketizedFingerprint(buckets, offset: 0);
    (_n1Low, _n1High) = TeddyBucketizer.GenerateBucketizedFingerprint(buckets, offset: 1);

    // The third position is only fingerprinted in N3 mode; the fields keep their default value otherwise.
    if (n is 3)
    {
        (_n2Low, _n2High) = TeddyBucketizer.GenerateBucketizedFingerprint(buckets, offset: 2);
    }
}
// Entry point for the N2 mode (2 starting characters checked per candidate).
// Dispatches to the widest implementation (64, 32 or 16 characters per iteration) for which the hardware
// support check passes and the input is at least one full iteration plus MatchStartOffsetN2 characters long.
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
protected int IndexOfAnyN2(ReadOnlySpan<char> span)
{
// The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN2)
{
return IndexOfAnyN2Avx512(span);
}
if (Avx2.IsSupported && span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN2)
{
return IndexOfAnyN2Avx2(span);
}
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
// The Vector128 routine performs its own length check and falls back for inputs that are too short.
return IndexOfAnyN2Vector128(span);
}
// Entry point for the N3 mode (3 starting characters checked per candidate).
// Dispatches to the widest implementation (64, 32 or 16 characters per iteration) for which the hardware
// support check passes and the input is at least one full iteration plus MatchStartOffsetN3 characters long.
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
protected int IndexOfAnyN3(ReadOnlySpan<char> span)
{
// The behavior of the rest of the function remains the same if Avx2 or Avx512BW aren't supported
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN3)
{
return IndexOfAnyN3Avx512(span);
}
if (Avx2.IsSupported && span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN3)
{
return IndexOfAnyN3Avx2(span);
}
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
// The Vector128 routine performs its own length check and falls back for inputs that are too short.
return IndexOfAnyN3Vector128(span);
}
// N2 search over 16 input characters per iteration using 128-bit vectors.
// Returns the offset reported by TryFindMatch for the first candidate it accepts, or -1 once the whole span was processed.
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private int IndexOfAnyN2Vector128(ReadOnlySpan<char> span)
{
// See comments in 'IndexOfAnyN3Vector128' below.
// This method is the same, but compares 2 starting chars instead of 3.
// Inputs shorter than 17 characters (16 + MatchStartOffsetN2) can't fill a single iteration.
if (span.Length < CharsPerIterationVector128 + MatchStartOffsetN2)
{
return ShortInputFallback(span);
}
ref char searchSpace = ref MemoryMarshal.GetReference(span);
// Start of the final 16-character chunk of the span; reads never begin past this position.
ref char lastSearchSpaceStart = ref Unsafe.Add(ref searchSpace, span.Length - CharsPerIterationVector128);
// The first read begins MatchStartOffsetN2 characters into the span.
searchSpace = ref Unsafe.Add(ref searchSpace, MatchStartOffsetN2);
// Only the lowest 128 bits of each stored Vector512 bitmap are used here.
Vector128<byte> n0Low = _n0Low._lower._lower, n0High = _n0High._lower._lower;
Vector128<byte> n1Low = _n1Low._lower._lower, n1High = _n1High._lower._lower;
// There is no previous iteration yet, so start from AllBitsSet.
Vector128<byte> prev0 = Vector128<byte>.AllBitsSet;
Loop:
ValidateReadPosition(span, ref searchSpace);
Vector128<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack16AsciiChars(ref searchSpace));
(Vector128<byte> result, prev0) = ProcessInputN2(input, prev0, n0Low, n0High, n1Low, n1High);
// Any non-zero result marks at least one candidate position that must be verified.
if (result != Vector128<byte>.Zero)
{
goto CandidateFound;
}
ContinueLoop:
searchSpace = ref Unsafe.Add(ref searchSpace, CharsPerIterationVector128);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpaceStart))
{
// Landing exactly on the end of the span means the final chunk has already been processed.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpaceStart, CharsPerIterationVector128)))
{
return -1;
}
// We're switching which characters we will process in the next iteration.
// prev0 no longer points to the characters just before the current input, so we must reset it.
prev0 = Vector128<byte>.AllBitsSet;
// Back up so that the last iteration covers exactly the final 16 characters of the span.
searchSpace = ref lastSearchSpaceStart;
}
goto Loop;
CandidateFound:
// Candidates may be false positives; continue scanning if none of them is confirmed.
if (TryFindMatch(span, ref searchSpace, result, MatchStartOffsetN2, out int offset))
{
return offset;
}
goto ContinueLoop;
}
// N2 search over 32 input characters per iteration using 256-bit vectors.
// The caller must guarantee that the span holds at least 32 + MatchStartOffsetN2 characters (asserted below).
// Returns the offset reported by TryFindMatch for the first candidate it accepts, or -1 once the whole span was processed.
[CompExactlyDependsOn(typeof(Avx2))]
private int IndexOfAnyN2Avx2(ReadOnlySpan<char> span)
{
// See comments in 'IndexOfAnyN3Vector128' below.
// This method is the same, but operates on 32 input characters at a time and compares 2 starting chars instead of 3.
Debug.Assert(span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN2);
ref char searchSpace = ref MemoryMarshal.GetReference(span);
// Start of the final 32-character chunk of the span; reads never begin past this position.
ref char lastSearchSpaceStart = ref Unsafe.Add(ref searchSpace, span.Length - CharsPerIterationAvx2);
// The first read begins MatchStartOffsetN2 characters into the span.
searchSpace = ref Unsafe.Add(ref searchSpace, MatchStartOffsetN2);
// Only the lower 256 bits of each stored Vector512 bitmap are used here.
Vector256<byte> n0Low = _n0Low._lower, n0High = _n0High._lower;
Vector256<byte> n1Low = _n1Low._lower, n1High = _n1High._lower;
// There is no previous iteration yet, so start from AllBitsSet.
Vector256<byte> prev0 = Vector256<byte>.AllBitsSet;
Loop:
ValidateReadPosition(span, ref searchSpace);
Vector256<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack32AsciiChars(ref searchSpace));
(Vector256<byte> result, prev0) = ProcessInputN2(input, prev0, n0Low, n0High, n1Low, n1High);
// Any non-zero result marks at least one candidate position that must be verified.
if (result != Vector256<byte>.Zero)
{
goto CandidateFound;
}
ContinueLoop:
searchSpace = ref Unsafe.Add(ref searchSpace, CharsPerIterationAvx2);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpaceStart))
{
// Landing exactly on the end of the span means the final chunk has already been processed.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpaceStart, CharsPerIterationAvx2)))
{
return -1;
}
// We're switching which characters we will process in the next iteration.
// prev0 no longer points to the characters just before the current input, so we must reset it.
prev0 = Vector256<byte>.AllBitsSet;
// Back up so that the last iteration covers exactly the final 32 characters of the span.
searchSpace = ref lastSearchSpaceStart;
}
goto Loop;
CandidateFound:
// Candidates may be false positives; continue scanning if none of them is confirmed.
if (TryFindMatch(span, ref searchSpace, result, MatchStartOffsetN2, out int offset))
{
return offset;
}
goto ContinueLoop;
}
// N2 search over 64 input characters per iteration using 512-bit vectors.
// The caller must guarantee that the span holds at least 64 + MatchStartOffsetN2 characters (asserted below).
// Returns the offset reported by TryFindMatch for the first candidate it accepts, or -1 once the whole span was processed.
[CompExactlyDependsOn(typeof(Avx512BW))]
private int IndexOfAnyN2Avx512(ReadOnlySpan<char> span)
{
// See comments in 'IndexOfAnyN3Vector128' below.
// This method is the same, but operates on 64 input characters at a time and compares 2 starting chars instead of 3.
Debug.Assert(span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN2);
ref char searchSpace = ref MemoryMarshal.GetReference(span);
// Start of the final 64-character chunk of the span; reads never begin past this position.
ref char lastSearchSpaceStart = ref Unsafe.Add(ref searchSpace, span.Length - CharsPerIterationAvx512);
// The first read begins MatchStartOffsetN2 characters into the span.
searchSpace = ref Unsafe.Add(ref searchSpace, MatchStartOffsetN2);
// The stored bitmaps are used at their full Vector512 width.
Vector512<byte> n0Low = _n0Low, n0High = _n0High;
Vector512<byte> n1Low = _n1Low, n1High = _n1High;
// There is no previous iteration yet, so start from AllBitsSet.
Vector512<byte> prev0 = Vector512<byte>.AllBitsSet;
Loop:
ValidateReadPosition(span, ref searchSpace);
Vector512<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack64AsciiChars(ref searchSpace));
(Vector512<byte> result, prev0) = ProcessInputN2(input, prev0, n0Low, n0High, n1Low, n1High);
// Any non-zero result marks at least one candidate position that must be verified.
if (result != Vector512<byte>.Zero)
{
goto CandidateFound;
}
ContinueLoop:
searchSpace = ref Unsafe.Add(ref searchSpace, CharsPerIterationAvx512);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpaceStart))
{
// Landing exactly on the end of the span means the final chunk has already been processed.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpaceStart, CharsPerIterationAvx512)))
{
return -1;
}
// We're switching which characters we will process in the next iteration.
// prev0 no longer points to the characters just before the current input, so we must reset it.
prev0 = Vector512<byte>.AllBitsSet;
// Back up so that the last iteration covers exactly the final 64 characters of the span.
searchSpace = ref lastSearchSpaceStart;
}
goto Loop;
CandidateFound:
// Candidates may be false positives; continue scanning if none of them is confirmed.
if (TryFindMatch(span, ref searchSpace, result, MatchStartOffsetN2, out int offset))
{
return offset;
}
goto ContinueLoop;
}
// N3 search over 16 input characters per iteration using 128-bit vectors.
// Returns the offset reported by TryFindMatch for the first candidate it accepts, or -1 once the whole span was processed.
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private int IndexOfAnyN3Vector128(ReadOnlySpan<char> span)
{
// We can't process inputs shorter than 18 characters in a vectorized manner here.
if (span.Length < CharsPerIterationVector128 + MatchStartOffsetN3)
{
return ShortInputFallback(span);
}
ref char searchSpace = ref MemoryMarshal.GetReference(span);
// Start of the final 16-character chunk of the span; reads never begin past this position.
ref char lastSearchSpaceStart = ref Unsafe.Add(ref searchSpace, span.Length - CharsPerIterationVector128);
// The first read begins MatchStartOffsetN3 characters into the span.
searchSpace = ref Unsafe.Add(ref searchSpace, MatchStartOffsetN3);
// All the input bitmaps are Vector128<byte>, duplicated 4 times up to Vector512<byte>.
// They are stored as Vector512 to lower the overhead of routines that do load the full Vector512<byte>.
// When using the Vector128 routine, we just load the first of those duplicates (._lower._lower).
Vector128<byte> n0Low = _n0Low._lower._lower, n0High = _n0High._lower._lower;
Vector128<byte> n1Low = _n1Low._lower._lower, n1High = _n1High._lower._lower;
Vector128<byte> n2Low = _n2Low._lower._lower, n2High = _n2High._lower._lower;
// As matching is offset by 2 positions (MatchStartOffsetN3), we must remember the result of the previous loop iteration.
// See the full description of TeddyHelper.ProcessInputN3 for more details about why these exist.
// When doing the first loop iteration, there is no previous iteration, so we have to assume that the input did match (AllBitsSet)
// for those positions. This makes it more likely to hit a false-positive at the very beginning, but TryFindMatch will discard them.
Vector128<byte> prev0 = Vector128<byte>.AllBitsSet;
Vector128<byte> prev1 = Vector128<byte>.AllBitsSet;
Loop:
// Load the input characters and normalize them to their uppercase variant if we're ignoring casing.
// These characters may not be ASCII, but we know that the starting 3 characters of each value are.
ValidateReadPosition(span, ref searchSpace);
Vector128<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack16AsciiChars(ref searchSpace));
// Find which buckets contain potential matches for each input position.
// For a bucket to be marked as a potential match, its fingerprint must match for all 3 starting characters (all 6 nibbles).
(Vector128<byte> result, prev0, prev1) = ProcessInputN3(input, prev0, prev1, n0Low, n0High, n1Low, n1High, n2Low, n2High);
if (result != Vector128<byte>.Zero)
{
goto CandidateFound;
}
ContinueLoop:
// We haven't found a match. Update the input position and check if we've reached the end.
searchSpace = ref Unsafe.Add(ref searchSpace, CharsPerIterationVector128);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpaceStart))
{
// Landing exactly on the end of the span means the final chunk has already been processed.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpaceStart, CharsPerIterationVector128)))
{
return -1;
}
// We're switching which characters we will process in the next iteration.
// prev0 and prev1 no longer point to the characters just before the current input, so we must reset them.
// Just like with the first iteration, we must assume that these positions did match (AllBitsSet).
prev0 = Vector128<byte>.AllBitsSet;
prev1 = Vector128<byte>.AllBitsSet;
// Back up so that the last iteration covers exactly the final 16 characters of the span.
searchSpace = ref lastSearchSpaceStart;
}
goto Loop;
CandidateFound:
// We found potential matches, but they may be false-positives, so we must verify each one.
if (TryFindMatch(span, ref searchSpace, result, MatchStartOffsetN3, out int offset))
{
return offset;
}
// None of the candidates was confirmed; resume scanning with the next chunk.
goto ContinueLoop;
}
/// <summary>
/// Vector256 counterpart of <c>IndexOfAnyN3Vector128</c>: examines <c>CharsPerIterationAvx2</c> input characters per loop iteration.
/// Unlike the Vector128 routine there is no short-input fallback here; the minimum length is only checked with a debug assertion.
/// </summary>
/// <param name="span">The input to search.</param>
/// <returns>The offset produced by <c>TryFindMatch</c> for a verified candidate, or -1 if the end of the input is reached without one.</returns>
[CompExactlyDependsOn(typeof(Avx2))]
private int IndexOfAnyN3Avx2(ReadOnlySpan<char> span)
{
// See comments in 'IndexOfAnyN3Vector128' above.
// This method is the same, but operates on 32 input characters at a time.
Debug.Assert(span.Length >= CharsPerIterationAvx2 + MatchStartOffsetN3);
ref char searchSpace = ref MemoryMarshal.GetReference(span);
// The last position from which a full vector of characters can still be read without leaving the span.
ref char lastSearchSpaceStart = ref Unsafe.Add(ref searchSpace, span.Length - CharsPerIterationAvx2);
searchSpace = ref Unsafe.Add(ref searchSpace, MatchStartOffsetN3);
// Use the lower 256 bits of each stored bitmap.
Vector256<byte> n0Low = _n0Low._lower, n0High = _n0High._lower;
Vector256<byte> n1Low = _n1Low._lower, n1High = _n1High._lower;
Vector256<byte> n2Low = _n2Low._lower, n2High = _n2High._lower;
// There is no previous iteration yet, so start from AllBitsSet.
Vector256<byte> prev0 = Vector256<byte>.AllBitsSet;
Vector256<byte> prev1 = Vector256<byte>.AllBitsSet;
Loop:
ValidateReadPosition(span, ref searchSpace);
Vector256<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack32AsciiChars(ref searchSpace));
(Vector256<byte> result, prev0, prev1) = ProcessInputN3(input, prev0, prev1, n0Low, n0High, n1Low, n1High, n2Low, n2High);
if (result != Vector256<byte>.Zero)
{
goto CandidateFound;
}
ContinueLoop:
// Advance by one full vector and check whether we have moved past the last valid read position.
searchSpace = ref Unsafe.Add(ref searchSpace, CharsPerIterationAvx2);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpaceStart))
{
// Exactly one full iteration past 'lastSearchSpaceStart' means the previous iteration ended at the end of the span.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpaceStart, CharsPerIterationAvx2)))
{
return -1;
}
// We're switching which characters we will process in the next iteration.
// prev0 and prev1 no longer point to the characters just before the current input, so we must reset them.
prev0 = Vector256<byte>.AllBitsSet;
prev1 = Vector256<byte>.AllBitsSet;
// Process the final full vector of the span; it overlaps with characters that were already examined.
searchSpace = ref lastSearchSpaceStart;
}
goto Loop;
CandidateFound:
// Candidates may be false-positives; TryFindMatch verifies them.
if (TryFindMatch(span, ref searchSpace, result, MatchStartOffsetN3, out int offset))
{
return offset;
}
goto ContinueLoop;
}
/// <summary>
/// Vector512 counterpart of <c>IndexOfAnyN3Vector128</c>: examines <c>CharsPerIterationAvx512</c> input characters per loop iteration.
/// Unlike the Vector128 routine there is no short-input fallback here; the minimum length is only checked with a debug assertion.
/// </summary>
/// <param name="span">The input to search.</param>
/// <returns>The offset produced by <c>TryFindMatch</c> for a verified candidate, or -1 if the end of the input is reached without one.</returns>
[CompExactlyDependsOn(typeof(Avx512BW))]
private int IndexOfAnyN3Avx512(ReadOnlySpan<char> span)
{
// See comments in 'IndexOfAnyN3Vector128' above.
// This method is the same, but operates on 64 input characters at a time.
Debug.Assert(span.Length >= CharsPerIterationAvx512 + MatchStartOffsetN3);
ref char searchSpace = ref MemoryMarshal.GetReference(span);
// The last position from which a full vector of characters can still be read without leaving the span.
ref char lastSearchSpaceStart = ref Unsafe.Add(ref searchSpace, span.Length - CharsPerIterationAvx512);
searchSpace = ref Unsafe.Add(ref searchSpace, MatchStartOffsetN3);
// The stored bitmaps are already Vector512<byte>, so they are used as-is.
Vector512<byte> n0Low = _n0Low, n0High = _n0High;
Vector512<byte> n1Low = _n1Low, n1High = _n1High;
Vector512<byte> n2Low = _n2Low, n2High = _n2High;
// There is no previous iteration yet, so start from AllBitsSet.
Vector512<byte> prev0 = Vector512<byte>.AllBitsSet;
Vector512<byte> prev1 = Vector512<byte>.AllBitsSet;
Loop:
ValidateReadPosition(span, ref searchSpace);
Vector512<byte> input = TStartCaseSensitivity.TransformInput(LoadAndPack64AsciiChars(ref searchSpace));
(Vector512<byte> result, prev0, prev1) = ProcessInputN3(input, prev0, prev1, n0Low, n0High, n1Low, n1High, n2Low, n2High);
if (result != Vector512<byte>.Zero)
{
goto CandidateFound;
}
ContinueLoop:
// Advance by one full vector and check whether we have moved past the last valid read position.
searchSpace = ref Unsafe.Add(ref searchSpace, CharsPerIterationAvx512);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpaceStart))
{
// Exactly one full iteration past 'lastSearchSpaceStart' means the previous iteration ended at the end of the span.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpaceStart, CharsPerIterationAvx512)))
{
return -1;
}
// We're switching which characters we will process in the next iteration.
// prev0 and prev1 no longer point to the characters just before the current input, so we must reset them.
prev0 = Vector512<byte>.AllBitsSet;
prev1 = Vector512<byte>.AllBitsSet;
// Process the final full vector of the span; it overlaps with characters that were already examined.
searchSpace = ref lastSearchSpaceStart;
}
goto Loop;
CandidateFound:
// Candidates may be false-positives; TryFindMatch verifies them.
if (TryFindMatch(span, ref searchSpace, result, MatchStartOffsetN3, out int offset))
{
return offset;
}
goto ContinueLoop;
}
/// <summary>
/// Verifies the candidate positions flagged in <paramref name="result"/> (16 input positions), in ascending position order.
/// </summary>
/// <param name="span">The whole input being searched.</param>
/// <param name="searchSpace">The position the vector that produced <paramref name="result"/> was loaded from.</param>
/// <param name="result">One byte per input position; each set bit selects a bucket to verify at that position.</param>
/// <param name="matchStartOffset">How far the flagged positions are shifted from where a value would actually begin.</param>
/// <param name="offsetFromStart">On success, the index into <paramref name="span"/> where the match begins; otherwise 0.</param>
/// <returns><see langword="true"/> if any flagged bucket matched at its flagged position.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryFindMatch(ReadOnlySpan<char> span, ref char searchSpace, Vector128<byte> result, int matchStartOffset, out int offsetFromStart)
{
    // One bit per input position: set when the corresponding byte of 'result' is non-zero,
    // i.e. at least one bucket was flagged there.
    uint pendingPositions = (~Vector128.Equals(result, Vector128<byte>.Zero)).ExtractMostSignificantBits();

    do
    {
        int position = BitOperations.TrailingZeroCount(pendingPositions);

        // Translate the flagged position back to where the candidate begins in the input.
        ref char candidateStart = ref Unsafe.Add(ref searchSpace, position - matchStartOffset);
        nuint byteOffset = (nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref candidateStart);
        offsetFromStart = (int)(byteOffset / 2);
        int charsAvailable = span.Length - offsetFromStart;

        ValidateReadPosition(span, ref candidateStart, charsAvailable);

        // One bit per bucket that may contain a value starting at 'candidateStart'.
        uint pendingBuckets = result.GetElementUnsafe(position);

        do
        {
            int bucketIndex = BitOperations.TrailingZeroCount(pendingBuckets);
            object? bucket = _buckets[bucketIndex];
            Debug.Assert(bucket is not null);

            // A bucket holds either several values (bucketized) or exactly one.
            bool isMatch;
            if (TBucketized.Value)
            {
                isMatch = StartsWith<TCaseSensitivity>(ref candidateStart, charsAvailable, Unsafe.As<string[]>(bucket));
            }
            else
            {
                isMatch = StartsWith<TCaseSensitivity>(ref candidateStart, charsAvailable, Unsafe.As<string>(bucket));
            }

            if (isMatch)
            {
                return true;
            }

            pendingBuckets = BitOperations.ResetLowestSetBit(pendingBuckets);
        }
        while (pendingBuckets != 0);

        pendingPositions = BitOperations.ResetLowestSetBit(pendingPositions);
    }
    while (pendingPositions != 0);

    offsetFromStart = 0;
    return false;
}
/// <summary>
/// Verifies the candidate positions flagged in <paramref name="result"/> (32 input positions), in ascending position order.
/// </summary>
/// <param name="span">The whole input being searched.</param>
/// <param name="searchSpace">The position the vector that produced <paramref name="result"/> was loaded from.</param>
/// <param name="result">One byte per input position; each set bit selects a bucket to verify at that position.</param>
/// <param name="matchStartOffset">How far the flagged positions are shifted from where a value would actually begin.</param>
/// <param name="offsetFromStart">On success, the index into <paramref name="span"/> where the match begins; otherwise 0.</param>
/// <returns><see langword="true"/> if any flagged bucket matched at its flagged position.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryFindMatch(ReadOnlySpan<char> span, ref char searchSpace, Vector256<byte> result, int matchStartOffset, out int offsetFromStart)
{
    // One bit per input position: set when the corresponding byte of 'result' is non-zero,
    // i.e. at least one bucket was flagged there.
    uint pendingPositions = (~Vector256.Equals(result, Vector256<byte>.Zero)).ExtractMostSignificantBits();

    do
    {
        int position = BitOperations.TrailingZeroCount(pendingPositions);

        // Translate the flagged position back to where the candidate begins in the input.
        ref char candidateStart = ref Unsafe.Add(ref searchSpace, position - matchStartOffset);
        nuint byteOffset = (nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref candidateStart);
        offsetFromStart = (int)(byteOffset / 2);
        int charsAvailable = span.Length - offsetFromStart;

        ValidateReadPosition(span, ref candidateStart, charsAvailable);

        // One bit per bucket that may contain a value starting at 'candidateStart'.
        uint pendingBuckets = result.GetElementUnsafe(position);

        do
        {
            int bucketIndex = BitOperations.TrailingZeroCount(pendingBuckets);
            object? bucket = _buckets[bucketIndex];
            Debug.Assert(bucket is not null);

            // A bucket holds either several values (bucketized) or exactly one.
            bool isMatch;
            if (TBucketized.Value)
            {
                isMatch = StartsWith<TCaseSensitivity>(ref candidateStart, charsAvailable, Unsafe.As<string[]>(bucket));
            }
            else
            {
                isMatch = StartsWith<TCaseSensitivity>(ref candidateStart, charsAvailable, Unsafe.As<string>(bucket));
            }

            if (isMatch)
            {
                return true;
            }

            pendingBuckets = BitOperations.ResetLowestSetBit(pendingBuckets);
        }
        while (pendingBuckets != 0);

        pendingPositions = BitOperations.ResetLowestSetBit(pendingPositions);
    }
    while (pendingPositions != 0);

    offsetFromStart = 0;
    return false;
}
/// <summary>
/// Verifies the candidate positions flagged in <paramref name="result"/> (64 input positions), in ascending position order.
/// </summary>
/// <param name="span">The whole input being searched.</param>
/// <param name="searchSpace">The position the vector that produced <paramref name="result"/> was loaded from.</param>
/// <param name="result">One byte per input position; each set bit selects a bucket to verify at that position.</param>
/// <param name="matchStartOffset">How far the flagged positions are shifted from where a value would actually begin.</param>
/// <param name="offsetFromStart">On success, the index into <paramref name="span"/> where the match begins; otherwise 0.</param>
/// <returns><see langword="true"/> if any flagged bucket matched at its flagged position.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryFindMatch(ReadOnlySpan<char> span, ref char searchSpace, Vector512<byte> result, int matchStartOffset, out int offsetFromStart)
{
    // One bit per input position (64 of them, hence the ulong): set when the corresponding byte of 'result'
    // is non-zero, i.e. at least one bucket was flagged there.
    ulong pendingPositions = (~Vector512.Equals(result, Vector512<byte>.Zero)).ExtractMostSignificantBits();

    do
    {
        int position = BitOperations.TrailingZeroCount(pendingPositions);

        // Translate the flagged position back to where the candidate begins in the input.
        ref char candidateStart = ref Unsafe.Add(ref searchSpace, position - matchStartOffset);
        nuint byteOffset = (nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref candidateStart);
        offsetFromStart = (int)(byteOffset / 2);
        int charsAvailable = span.Length - offsetFromStart;

        ValidateReadPosition(span, ref candidateStart, charsAvailable);

        // One bit per bucket that may contain a value starting at 'candidateStart'.
        uint pendingBuckets = result.GetElementUnsafe(position);

        do
        {
            int bucketIndex = BitOperations.TrailingZeroCount(pendingBuckets);
            object? bucket = _buckets[bucketIndex];
            Debug.Assert(bucket is not null);

            // A bucket holds either several values (bucketized) or exactly one.
            bool isMatch;
            if (TBucketized.Value)
            {
                isMatch = StartsWith<TCaseSensitivity>(ref candidateStart, charsAvailable, Unsafe.As<string[]>(bucket));
            }
            else
            {
                isMatch = StartsWith<TCaseSensitivity>(ref candidateStart, charsAvailable, Unsafe.As<string>(bucket));
            }

            if (isMatch)
            {
                return true;
            }

            pendingBuckets = BitOperations.ResetLowestSetBit(pendingBuckets);
        }
        while (pendingBuckets != 0);

        pendingPositions = BitOperations.ResetLowestSetBit(pendingPositions);
    }
    while (pendingPositions != 0);

    offsetFromStart = 0;
    return false;
}
}
}

View file

@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace System.Buffers
{
/// <summary>
/// Teddy searcher that instantiates the base class with <c>SearchValues.TrueConst</c>,
/// passes <c>n: 2</c> to the base constructor and searches through <c>IndexOfAnyN2</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.TrueConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Forwards the buckets and values to the base class unchanged.</summary>
    public AsciiStringSearchValuesTeddyBucketizedN2(string[][] buckets, ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(buckets, values, uniqueValues, n: 2)
    {
    }

    /// <summary>Delegates the search to the base class's <c>IndexOfAnyN2</c> routine.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN2(span);
    }
}
}

View file

@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace System.Buffers
{
/// <summary>
/// Teddy searcher that instantiates the base class with <c>SearchValues.TrueConst</c>,
/// passes <c>n: 3</c> to the base constructor and searches through <c>IndexOfAnyN3</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.TrueConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Forwards the buckets and values to the base class unchanged.</summary>
    public AsciiStringSearchValuesTeddyBucketizedN3(string[][] buckets, ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(buckets, values, uniqueValues, n: 3)
    {
    }

    /// <summary>Delegates the search to the base class's <c>IndexOfAnyN3</c> routine.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN3(span);
    }
}
}

View file

@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace System.Buffers
{
/// <summary>
/// Teddy searcher that instantiates the base class with <c>SearchValues.FalseConst</c>,
/// passes <c>n: 2</c> to the base constructor and searches through <c>IndexOfAnyN2</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyNonBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.FalseConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Forwards the values to the base class unchanged.</summary>
    public AsciiStringSearchValuesTeddyNonBucketizedN2(ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(values, uniqueValues, n: 2)
    {
    }

    /// <summary>Delegates the search to the base class's <c>IndexOfAnyN2</c> routine.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN2(span);
    }
}
}

View file

@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace System.Buffers
{
/// <summary>
/// Teddy searcher that instantiates the base class with <c>SearchValues.FalseConst</c>,
/// passes <c>n: 3</c> to the base constructor and searches through <c>IndexOfAnyN3</c>.
/// </summary>
internal sealed class AsciiStringSearchValuesTeddyNonBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>
    : AsciiStringSearchValuesTeddyBase<SearchValues.FalseConst, TStartCaseSensitivity, TCaseSensitivity>
    where TStartCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    /// <summary>Forwards the values to the base class unchanged.</summary>
    public AsciiStringSearchValuesTeddyNonBucketizedN3(ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(values, uniqueValues, n: 3)
    {
    }

    /// <summary>Delegates the search to the base class's <c>IndexOfAnyN3</c> routine.</summary>
    [CompExactlyDependsOn(typeof(Ssse3))]
    [CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return IndexOfAnyN3(span);
    }
}
}

View file

@ -0,0 +1,355 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace System.Buffers
{
/// <summary>
/// An implementation of the Aho-Corasick algorithm we use as a fallback when we can't use Teddy
/// (either due to missing hardware intrinsics, or due to characteristics of the values used).
/// https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
/// Works in O(n).
/// </summary>
internal readonly struct AhoCorasick
{
/// <summary>The node table walked by the search routines; index 0 is the root node.</summary>
private readonly AhoCorasickNode[] _nodes;

/// <summary>
/// Bitmap handed to the vectorized starting-character scan.
/// A default (all-zero) value means the fast scan must not be used (see <see cref="ShouldUseAsciiFastScan"/>).
/// </summary>
private readonly Vector256<byte> _startingCharsAsciiBitmap;

/// <summary>Wraps an already-built node table together with its starting-character bitmap.</summary>
/// <param name="nodes">The node table; stored by reference, not copied.</param>
/// <param name="startingAsciiBitmap">The bitmap of starting characters, or default if none was computed.</param>
public AhoCorasick(AhoCorasickNode[] nodes, Vector256<byte> startingAsciiBitmap)
{
    (_nodes, _startingCharsAsciiBitmap) = (nodes, startingAsciiBitmap);
}
/// <summary>
/// Whether the vectorized starting-character scan is expected to pay off for this set of values.
/// It is <see langword="false"/> when vectorization is unsupported, when no starting-character bitmap was computed,
/// or when the starting characters are, combined, too frequent.
/// </summary>
public readonly bool ShouldUseAsciiFastScan
{
    get
    {
        Vector256<byte> startingChars = _startingCharsAsciiBitmap;

        if (!IndexOfAnyAsciiSearcher.IsVectorizationSupported || startingChars == default)
        {
            return false;
        }

        // With many (or very common) starting characters we tend to find one almost immediately,
        // at which point calling into the vectorized helper costs more than checking one character at a time.
        // Only use the fast scan when the combined frequency of the starting characters is low enough.
        //
        // Reference points, based on CharacterFrequencyHelper.AsciiFrequency:
        // - all digits combined:            ~  5 %
        // - all lowercase letters combined: ~ 57.2 %
        // - all uppercase letters combined: ~  7.4 %
        //
        // The limit comes from experimentation with different texts and sets of values:
        // above ~50 %, the vectorized helper is on average slower than the char-by-char check.
        const float MaxCombinedFrequency = 50f;

        float combinedFrequency = 0;

        for (int asciiChar = 0; asciiChar < 128; asciiChar++)
        {
            if (IndexOfAnyAsciiSearcher.BitmapContains(ref startingChars, (char)asciiChar))
            {
                combinedFrequency += CharacterFrequencyHelper.AsciiFrequency[asciiChar];
            }
        }

        return combinedFrequency <= MaxCombinedFrequency;
    }
}
/// <summary>
/// Entry point for searching: dispatches to the surrogate-aware routine when <typeparamref name="TCaseSensitivity"/>
/// is <c>CaseInsensitiveUnicode</c>, and to the general routine otherwise.
/// </summary>
/// <param name="span">The input to search.</param>
/// <returns>Whatever the selected routine returns: the index of the match, or -1 if there is none.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly int IndexOfAny<TCaseSensitivity, TFastScanVariant>(ReadOnlySpan<char> span)
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TFastScanVariant : struct, IFastScan
{
    if (typeof(TCaseSensitivity) == typeof(StringSearchValuesHelper.CaseInsensitiveUnicode))
    {
        return IndexOfAnyCaseInsensitiveUnicode<TFastScanVariant>(span);
    }

    return IndexOfAnyCore<TCaseSensitivity, TFastScanVariant>(span);
}
/// <summary>
/// Walks <paramref name="span"/> one character at a time through the node table, following child and suffix links,
/// and returns the starting index of the match it settles on, or -1 when no match is found.
/// </summary>
/// <typeparam name="TCaseSensitivity">
/// Supplies <c>TransformInput</c>, which is applied to every input character before the node lookup.
/// Must not be <c>CaseInsensitiveUnicode</c> (asserted below).
/// </typeparam>
/// <typeparam name="TFastScanVariant">
/// When this is <see cref="IndexOfAnyAsciiFastScan"/> and vectorization is supported, a vectorized scan is used
/// to skip ahead to the next possible starting character whenever the search is at the root node.
/// </typeparam>
/// <param name="span">The input to search.</param>
private readonly int IndexOfAnyCore<TCaseSensitivity, TFastScanVariant>(ReadOnlySpan<char> span)
where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
where TFastScanVariant : struct, IFastScan
{
Debug.Assert(typeof(TCaseSensitivity) != typeof(StringSearchValuesHelper.CaseInsensitiveUnicode));
ref AhoCorasickNode nodes = ref MemoryMarshal.GetArrayDataReference(_nodes);
// Index of the node we are currently at (0 is the root).
int nodeIndex = 0;
// Start index of the best match seen so far, or -1 if there is none yet.
int result = -1;
// Index of the next input character to process.
int i = 0;
FastScan:
Debug.Assert(nodeIndex == 0);
// We are currently in the root node and trying to find the next position of any starting character.
// If all the values start with an ASCII character, use a vectorized helper to quickly skip over characters that can't start a match.
if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && typeof(TFastScanVariant) == typeof(IndexOfAnyAsciiFastScan))
{
int remainingLength = span.Length - i;
// Shorter remainders are handled by the regular char-by-char loop below.
if (remainingLength >= Vector128<ushort>.Count)
{
// If '\0' is one of the starting chars and we're running on Ssse3 hardware, this may return false-positives.
// False-positives here are okay, we'll just rule them out below. While we could flow the Ssse3AndWasmHandleZeroInNeedle
// generic through, we expect such values to be rare enough that introducing more code is not worth it.
int offset = IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<IndexOfAnyAsciiSearcher.DontNegate, IndexOfAnyAsciiSearcher.Default>(
ref Unsafe.As<char, short>(ref Unsafe.Add(ref MemoryMarshal.GetReference(span), i)),
remainingLength,
ref Unsafe.AsRef(in _startingCharsAsciiBitmap));
if (offset < 0)
{
// No starting character anywhere in the rest of the input.
goto Return;
}
// 'offset' is relative to 'i' and within 'remainingLength', so the new 'i' is in range.
i += offset;
goto LoopWithoutRangeCheck;
}
}
Loop:
if ((uint)i >= (uint)span.Length)
{
goto Return;
}
LoopWithoutRangeCheck:
// Read the next input character and either find the next potential match prefix or transition back to the root node.
Debug.Assert((uint)i < (uint)span.Length);
char c = TCaseSensitivity.TransformInput(Unsafe.Add(ref MemoryMarshal.GetReference(span), i));
while (true)
{
Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
ref AhoCorasickNode node = ref Unsafe.Add(ref nodes, (uint)nodeIndex);
if (node.TryGetChild(c, out int childIndex))
{
// We were able to extend the current match. If this node contains a potential match, remember that.
nodeIndex = childIndex;
Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
int matchLength = Unsafe.Add(ref nodes, (uint)nodeIndex).MatchLength;
if (matchLength != 0)
{
// Any result we find from here on out may only be lower (longer match with a start closer to the beginning of the input).
Debug.Assert(result == -1 || result >= i + 1 - matchLength);
// The match ends at index 'i' (inclusive), so it starts 'matchLength - 1' characters earlier.
result = i + 1 - matchLength;
}
i++;
goto Loop;
}
if (nodeIndex == 0)
{
// We are back at the root node and none of the values start with the current character.
if (result >= 0)
{
// If we've already found a match, we can't find an earlier one anymore. This is the result.
goto Return;
}
// Go back to searching for the next possible starting character.
i++;
goto FastScan;
}
// Follow the next suffix link.
nodeIndex = node.SuffixLink;
if (nodeIndex < 0)
{
// A node with a suffix link of -1 indicates a match, see AhoCorasickBuilder.AddSuffixLinks.
Debug.Assert(nodeIndex == -1);
Debug.Assert(result >= 0);
goto Return;
}
// Try to match the current character again at the suffix link node.
}
Return:
return result;
}
/// <summary>
/// Case-insensitive search for values that may contain non-ASCII characters.
/// Every input character is converted to its Ordinal uppercase form before the node lookup; a valid surrogate pair
/// is uppercased as a unit and then fed through the node table as two consecutive characters.
/// </summary>
/// <typeparam name="TFastScanVariant">
/// When this is <see cref="IndexOfAnyAsciiFastScan"/> and vectorization is supported, a vectorized scan is used
/// to skip ahead to the next possible starting character whenever the search restarts from the root.
/// </typeparam>
/// <param name="span">The input to search.</param>
/// <returns>The starting index of the match the walk settles on, or -1 when no match is found.</returns>
// Mostly a copy of IndexOfAnyCore, but we may read two characters at a time in the case of surrogate pairs.
private readonly int IndexOfAnyCaseInsensitiveUnicode<TFastScanVariant>(ReadOnlySpan<char> span)
where TFastScanVariant : struct, IFastScan
{
// '\0' is not a low surrogate, so it can double as the "no pending low surrogate" marker.
const char LowSurrogateNotSet = '\0';
ref AhoCorasickNode nodes = ref MemoryMarshal.GetArrayDataReference(_nodes);
// Index of the node we are currently at (0 is the root).
int nodeIndex = 0;
// Start index of the best match seen so far, or -1 if there is none yet.
int result = -1;
// Index of the next input character to process.
int i = 0;
// The already-uppercased low surrogate of a pair whose high surrogate was processed in the previous iteration.
char lowSurrogateUpper = LowSurrogateNotSet;
FastScan:
// We are currently in the root node and trying to find the next position of any starting character.
// If all the values start with an ASCII character, use a vectorized helper to quickly skip over characters that can't start a match.
if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && typeof(TFastScanVariant) == typeof(IndexOfAnyAsciiFastScan))
{
if (lowSurrogateUpper != LowSurrogateNotSet)
{
// We read a surrogate pair in the previous loop iteration and processed the high surrogate.
// Continue with the stored low surrogate.
goto LoopWithoutRangeCheck;
}
int remainingLength = span.Length - i;
// Shorter remainders are handled by the regular char-by-char loop below.
if (remainingLength >= Vector128<ushort>.Count)
{
int offset = IndexOfAnyAsciiSearcher.IndexOfAnyVectorized<IndexOfAnyAsciiSearcher.DontNegate, IndexOfAnyAsciiSearcher.Default>(
ref Unsafe.As<char, short>(ref Unsafe.Add(ref MemoryMarshal.GetReference(span), i)),
remainingLength,
ref Unsafe.AsRef(in _startingCharsAsciiBitmap));
if (offset < 0)
{
// No starting character anywhere in the rest of the input.
goto Return;
}
i += offset;
goto LoopWithoutRangeCheck;
}
}
Loop:
if ((uint)i >= (uint)span.Length)
{
goto Return;
}
LoopWithoutRangeCheck:
// Read the next input character and either find the next potential match prefix or transition back to the root node.
Debug.Assert((uint)i < (uint)span.Length);
char c;
if (lowSurrogateUpper != LowSurrogateNotSet)
{
// We have just processed the high surrogate. Continue with the low surrogate we read in the previous iteration.
c = lowSurrogateUpper;
lowSurrogateUpper = LowSurrogateNotSet;
}
else
{
// Read the next character, check if it's a high surrogate, and transform it to its Ordinal uppercase representation.
c = Unsafe.Add(ref MemoryMarshal.GetReference(span), i);
char lowSurrogate;
// Only a high surrogate that is immediately followed by a low surrogate is treated as a pair.
if (char.IsHighSurrogate(c) &&
(uint)(i + 1) < (uint)span.Length &&
char.IsLowSurrogate(lowSurrogate = Unsafe.Add(ref MemoryMarshal.GetReference(span), i + 1)))
{
// Uppercase the pair as a unit; 'c' becomes the uppercased high surrogate and the low one is kept for the next iteration.
if (GlobalizationMode.UseNls)
{
SurrogateToUpperNLS(c, lowSurrogate, out c, out lowSurrogateUpper);
}
else
{
SurrogateCasing.ToUpper(c, lowSurrogate, out c, out lowSurrogateUpper);
}
Debug.Assert(lowSurrogateUpper != LowSurrogateNotSet);
}
else
{
c = TextInfo.ToUpperOrdinal(c);
}
#if DEBUG
// The above logic must match Ordinal.ToUpperOrdinal exactly.
Span<char> destination = new char[2]; // Avoid stackalloc in a loop
Ordinal.ToUpperOrdinal(span.Slice(i, i + 1 == span.Length ? 1 : 2), destination);
Debug.Assert(c == destination[0]);
Debug.Assert(lowSurrogateUpper == LowSurrogateNotSet || lowSurrogateUpper == destination[1]);
#endif
}
while (true)
{
Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
ref AhoCorasickNode node = ref Unsafe.Add(ref nodes, (uint)nodeIndex);
if (node.TryGetChild(c, out int childIndex))
{
// We were able to extend the current match. If this node contains a potential match, remember that.
nodeIndex = childIndex;
Debug.Assert((uint)nodeIndex < (uint)_nodes.Length);
int matchLength = Unsafe.Add(ref nodes, (uint)nodeIndex).MatchLength;
if (matchLength != 0)
{
// Any result we find from here on out may only be lower (longer match with a start closer to the beginning of the input).
Debug.Assert(result == -1 || result >= i + 1 - matchLength);
// The match ends at index 'i' (inclusive), so it starts 'matchLength - 1' characters earlier.
result = i + 1 - matchLength;
}
i++;
goto Loop;
}
if (nodeIndex == 0)
{
// We are back at the root node and none of the values start with the current character.
if (result >= 0)
{
// If we've already found a match, we can't find an earlier one anymore. This is the result.
goto Return;
}
// Go back to searching for the next possible starting character.
i++;
goto FastScan;
}
// Follow the next suffix link.
nodeIndex = node.SuffixLink;
if (nodeIndex < 0)
{
// A node with a suffix link of -1 indicates a match, see AhoCorasickBuilder.AddSuffixLinks.
Debug.Assert(nodeIndex == -1);
Debug.Assert(result >= 0);
goto Return;
}
// Try to match the current character again at the suffix link node.
}
Return:
return result;
}
/// <summary>
/// Uppercases a surrogate pair by running both characters through <c>Ordinal.ToUpperOrdinal</c>.
/// Used by the case-insensitive search when <c>GlobalizationMode.UseNls</c> is set.
/// </summary>
/// <param name="h">The high surrogate of the pair.</param>
/// <param name="l">The low surrogate of the pair.</param>
/// <param name="hr">Receives the high surrogate of the uppercased pair.</param>
/// <param name="lr">Receives the low surrogate of the uppercased pair.</param>
private static void SurrogateToUpperNLS(char h, char l, out char hr, out char lr)
{
    Debug.Assert(char.IsHighSurrogate(h));
    Debug.Assert(char.IsLowSurrogate(l));

    Span<char> pair = stackalloc char[2];
    pair[0] = h;
    pair[1] = l;

    Span<char> upperPair = stackalloc char[2];

    int charsWritten = Ordinal.ToUpperOrdinal(pair, upperPair);
    Debug.Assert(charsWritten == 2);

    (hr, lr) = (upperPair[0], upperPair[1]);

    // Uppercasing a surrogate pair is expected to yield another surrogate pair.
    Debug.Assert(char.IsHighSurrogate(hr));
    Debug.Assert(char.IsLowSurrogate(lr));
}
// Marker types passed as the TFastScanVariant generic argument of the search methods.
// The search methods compare typeof(TFastScanVariant) against IndexOfAnyAsciiFastScan to decide whether
// the vectorized starting-character scan is used; with any other type (e.g. NoFastScan) that path is skipped.
public interface IFastScan { }
// Selects the vectorized starting-character scan (when vectorization is supported).
public readonly struct IndexOfAnyAsciiFastScan : IFastScan { }
// Selects the plain character-by-character search.
public readonly struct NoFastScan : IFastScan { }
}
}

View file

@ -0,0 +1,224 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.Intrinsics;
using System.Text;
namespace System.Buffers
{
/// <summary>
/// Separated out of <see cref="AhoCorasick"/> to allow us to defer some computation costs in case we decide not to build the full thing.
/// </summary>
internal ref struct AhoCorasickBuilder
{
/// <summary>The values to build the automaton for; asserted (in debug builds) to be ordered by ascending length.</summary>
private readonly ReadOnlySpan<string> _values;

/// <summary>Whether both casings of each starting character are added to the starting-character bitmap.</summary>
private readonly bool _ignoreCase;

/// <summary>The trie nodes created so far; index 0 is the root.</summary>
private ValueListBuilder<AhoCorasickNode> _nodes;

/// <summary>For every node in <see cref="_nodes"/>, the index of its parent node (the root is its own parent).</summary>
private ValueListBuilder<int> _parents;

/// <summary>Filled in by <see cref="Build"/> when vectorization is supported and all starting characters are ASCII.</summary>
private Vector256<byte> _startingCharsAsciiBitmap;

/// <summary>
/// Stores the inputs and immediately builds the trie. Suffix links and the remaining work are deferred to <see cref="Build"/>.
/// </summary>
/// <param name="values">Non-empty set of values, ordered by ascending length; the first one must not be null or empty.</param>
/// <param name="ignoreCase">Whether the search is case-insensitive.</param>
/// <param name="unreachableValues">Receives (and is lazily allocated for) values that have an earlier value as a prefix.</param>
public AhoCorasickBuilder(ReadOnlySpan<string> values, bool ignoreCase, ref HashSet<string>? unreachableValues)
{
    Debug.Assert(!values.IsEmpty);
    Debug.Assert(!string.IsNullOrEmpty(values[0]));

#if DEBUG
    // The input should have been sorted by length
    for (int current = 0; current + 1 < values.Length; current++)
    {
        Debug.Assert(values[current].Length <= values[current + 1].Length);
    }
#endif

    _ignoreCase = ignoreCase;
    _values = values;

    BuildTrie(ref unreachableValues);
}
/// <summary>
/// Finishes the automaton: adds suffix links, optimizes each node's child storage, computes the starting-character
/// bitmap when vectorization is supported, and returns the result with the nodes copied into a new array.
/// </summary>
public AhoCorasick Build()
{
    AddSuffixLinks();

    Debug.Assert(_nodes[0].MatchLength == 0, "The root node shouldn't have a match.");

    int nodeCount = _nodes.Length;
    for (int index = 0; index < nodeCount; index++)
    {
        ref AhoCorasickNode node = ref _nodes[index];
        node.OptimizeChildren();
    }

    if (IndexOfAnyAsciiSearcher.IsVectorizationSupported)
    {
        GenerateStartingAsciiCharsBitmap();
    }

    AhoCorasickNode[] nodes = _nodes.AsSpan().ToArray();
    return new AhoCorasick(nodes, _startingCharsAsciiBitmap);
}
/// <summary>
/// Disposes the node and parent list builders.
/// NOTE(review): presumably this releases any pooled storage the builders grew into; confirm against ValueListBuilder.
/// </summary>
public void Dispose()
{
_nodes.Dispose();
_parents.Dispose();
}
/// <summary>
/// Adds every value to the trie character by character, creating nodes as needed and recording each new node's parent.
/// A value that runs into the end of an earlier value (i.e. has it as a prefix) is not added any further
/// and is recorded in <paramref name="unreachableValues"/> instead.
/// </summary>
/// <param name="unreachableValues">Lazily allocated set that receives the skipped values.</param>
private void BuildTrie(ref HashSet<string>? unreachableValues)
{
// Index 0 is the root node; it is recorded as its own parent.
_nodes.Append(new AhoCorasickNode());
_parents.Append(0);
foreach (string value in _values)
{
// Every value is inserted starting from the root.
int nodeIndex = 0;
ref AhoCorasickNode node = ref _nodes[nodeIndex];
for (int i = 0; i < value.Length; i++)
{
char c = value[i];
if (!node.TryGetChild(c, out int childIndex))
{
// The new child is appended at the end of the list, so its index is the current length.
childIndex = _nodes.Length;
// Link the child before appending. 'node' is a ref into _nodes and is re-fetched right after the append.
// NOTE(review): presumably Append may move the backing storage, which would invalidate 'node'; confirm against ValueListBuilder.
node.AddChild(c, childIndex);
_nodes.Append(new AhoCorasickNode());
_parents.Append(nodeIndex);
}
node = ref _nodes[childIndex];
nodeIndex = childIndex;
if (node.MatchLength != 0)
{
// A previous value is an exact prefix of this one.
// We're looking for the index of the first match, not necessarily the longest one, so we can skip this value.
// We've already normalized the values, so we can do ordinal comparisons here.
unreachableValues ??= new HashSet<string>(StringComparer.Ordinal);
unreachableValues.Add(value);
break;
}
if (i == value.Length - 1)
{
// Last character of the value: mark this node as the end of a match of that length.
node.MatchLength = value.Length;
break;
}
}
}
}
/// <summary>
/// Visits the trie breadth-first from the root and assigns each visited node its suffix link.
/// Nodes that end a value get the special suffix link -1 and their children are not visited;
/// other nodes inherit the match length of the node their suffix link points to.
/// </summary>
private void AddSuffixLinks()
{
// Besides the list of children which continue the current value, each node also contains a suffix link
// which points to the node with the longest suffix of the current node.
// When we're searching and can't find a child to extend the current string with, we will follow
// suffix links to find the longest string that does match up until the current point.
//
// For example if we have strings "DOTNET" and "OTTER", we want
// the 'O' and 'T' in "DOTNET" to point into 'O' and 'T' in "OTTER".
// If our text contains the word "DOTTER", we will walk it character by character.
// Once we get to "DOT" in "DOTNET" and read the next character 'T', we can no longer continue with "DOTNET",
// and will instead follow the suffix link to "OT" in "OTTER" where we can continue the search.
//
// We also remember when a node's suffix link points to the end of a different value, such that it is itself a match.
// If we also had the word "POTTERY", the 'R' would contain a suffix link to the 'R' in "OTTER",
// but also mark that it is already a length=5 match.
//
// +---> D O T N E T
// | | |
// | +--+ |
// root--+ | |
// | | +--+
// | v v
// +---> O T T E R
// | ^ ^ ^ ^ ^
// | | | | | | -- this is also a length=5 match
// | | | | | |
// +> P O T T E R Y
// Each queue entry is a node index together with the character that leads to it from its parent.
var queue = new Queue<(char Char, int Index)>();
// Start at the root; its character is a placeholder and is never used because the root's parent is 0.
queue.Enqueue(((char)0, 0));
while (queue.TryDequeue(out (char Char, int Index) trieNode))
{
ref AhoCorasickNode node = ref _nodes[trieNode.Index];
int parent = _parents[trieNode.Index];
// Start from the parent's suffix link and look for a node that can be extended with this node's character.
int suffixLink = _nodes[parent].SuffixLink;
// If this node doesn't represent the first character of a value (doesn't immediately follow the root node),
// it may have a non-zero suffix link.
if (parent != 0)
{
while (suffixLink >= 0)
{
ref AhoCorasickNode suffixNode = ref _nodes[suffixLink];
if (suffixNode.TryGetChild(trieNode.Char, out int childSuffixLink))
{
suffixLink = childSuffixLink;
break;
}
if (suffixLink == 0)
{
// Even the root can't be extended with this character, so the suffix link stays at the root.
break;
}
suffixLink = suffixNode.SuffixLink;
}
}
if (node.MatchLength != 0)
{
// This node represents the end of a match.
// Mark it in a special way we can recognize when searching.
node.SuffixLink = -1;
// If a node is a match, there is no need to assign suffix links to its children.
// If a child does not match, such that we would look at its suffix link,
// we have already seen an earlier match node that is definitely the earliest possible match.
}
else
{
node.SuffixLink = suffixLink;
if (suffixLink >= 0)
{
// Remember if this node's suffix link points to a node that is itself a match.
node.MatchLength = _nodes[suffixLink].MatchLength;
}
node.AddChildrenToQueue(queue);
}
}
}
/// <summary>
/// Collects the first character of every value (both its invariant lowercase and uppercase form when ignoring case)
/// and, if all of them are ASCII, computes the bitmap that lets IndexOfAnyAsciiSearcher
/// quickly skip to the next possible starting location in the input.
/// When any collected character is not ASCII, the bitmap is left untouched.
/// </summary>
private void GenerateStartingAsciiCharsBitmap()
{
    scoped ValueListBuilder<char> firstChars = new ValueListBuilder<char>(stackalloc char[128]);

    foreach (string value in _values)
    {
        char first = value[0];

        if (!_ignoreCase)
        {
            firstChars.Append(first);
            continue;
        }

        // Either casing of the first character may start a match in the input.
        firstChars.Append(char.ToLowerInvariant(first));
        firstChars.Append(char.ToUpperInvariant(first));
    }

    bool allAscii = Ascii.IsValid(firstChars.AsSpan());
    if (allAscii)
    {
        IndexOfAnyAsciiSearcher.ComputeBitmap(firstChars.AsSpan(), out _startingCharsAsciiBitmap, out _);
    }

    firstChars.Dispose();
}
}
}

View file

@ -0,0 +1,192 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace System.Buffers
{
internal struct AhoCorasickNode
{
/// <summary>Shared placeholder stored in <see cref="_children"/> while a node has no additional children.</summary>
private static object EmptyChildrenSentinel
{
    get { return Array.Empty<int>(); }
}

/// <summary>Index of the node to continue from when no child matches; -1 marks a node that ends a match.</summary>
public int SuffixLink;

/// <summary>Length of the match that ends at this node, or 0 if no match ends here.</summary>
public int MatchLength;

// This is not a radix tree so we may have a lot of very sparse nodes (single child).
// We save 1 child separately to avoid allocating a separate collection in such cases.

/// <summary>Character of the separately stored child, or -1 when the node has no children yet.</summary>
private int _firstChildChar;

/// <summary>Node index of the separately stored child; only meaningful when <see cref="_firstChildChar"/> is not -1.</summary>
private int _firstChildIndex;

/// <summary>The remaining children: either int[] or Dictionary&lt;char, int&gt;.</summary>
private object _children;

/// <summary>Creates a node with no children.</summary>
public AhoCorasickNode()
{
    _children = EmptyChildrenSentinel;
    _firstChildChar = -1;
}
/// <summary>Looks up the child node reached from this node via character <paramref name="c"/>.</summary>
/// <param name="c">The character to follow.</param>
/// <param name="index">On success, the index of the child node. On failure, 0 (or the dictionary's default).</param>
/// <returns><see langword="true"/> if this node has a child for <paramref name="c"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly bool TryGetChild(char c, out int index)
{
    // The separately stored child is always checked first.
    if (c == _firstChildChar)
    {
        index = _firstChildIndex;
        return true;
    }

    object remaining = _children;
    Debug.Assert(remaining is int[] || remaining is Dictionary<char, int>);

    if (remaining.GetType() != typeof(int[]))
    {
        return Unsafe.As<Dictionary<char, int>>(remaining).TryGetValue(c, out index);
    }

    // Table indexed directly by character; negative entries mean "no child".
    int[] jumpTable = Unsafe.As<int[]>(remaining);
    if (c < (uint)jumpTable.Length)
    {
        index = jumpTable[c];
        if (index >= 0)
        {
            return true;
        }
    }

    index = 0;
    return false;
}
// Registers the node at 'index' as the child reached via character 'c'.
// The first child is stored inline; any further children go into a lazily allocated dictionary.
public void AddChild(char c, int index)
{
    if (_firstChildChar < 0)
    {
        // No children yet: use the inline slot and avoid allocating a collection.
        _firstChildChar = c;
        _firstChildIndex = index;
        return;
    }

    if (ReferenceEquals(_children, EmptyChildrenSentinel))
    {
        _children = new Dictionary<char, int>();
    }

    ((Dictionary<char, int>)_children).Add(c, index);
}
// Enqueues every child of this node as a (character, node index) pair:
// the inline child first, followed by any children stored in the dictionary.
public readonly void AddChildrenToQueue(Queue<(char Char, int Index)> queue)
{
    if (_firstChildChar < 0)
    {
        // No children at all.
        return;
    }

    queue.Enqueue(((char)_firstChildChar, _firstChildIndex));

    if (_children is not Dictionary<char, int> additionalChildren)
    {
        // With a single child, the only other state expected here is the empty sentinel.
        Debug.Assert(ReferenceEquals(_children, EmptyChildrenSentinel));
        return;
    }

    foreach (KeyValuePair<char, int> child in additionalChildren)
    {
        queue.Enqueue((child.Key, child.Value));
    }
}
// Reorganizes the children of a node that has more than one child for faster lookups:
// the inline slot is given to the child with the most frequent character, and the remaining
// children are moved into an int[] jump table when that does not cost too much memory.
public void OptimizeChildren()
{
    if (_children is not Dictionary<char, int> lookup)
    {
        // Nothing beyond the inline slot is in use, so there is nothing to reorganize.
        return;
    }

    // Put the inline child back so that all children compete for the inline slot.
    lookup.Add((char)_firstChildChar, _firstChildIndex);

    // The inline slot is always checked first, so it should hold the most frequent character.
    // Non-ASCII characters have no frequency data and score -1, below every ASCII character.
    float bestFrequency = -2;

    foreach (KeyValuePair<char, int> child in lookup)
    {
        float childFrequency = -1;
        if (char.IsAscii(child.Key))
        {
            childFrequency = CharacterFrequencyHelper.AsciiFrequency[child.Key];
        }

        if (childFrequency > bestFrequency)
        {
            bestFrequency = childFrequency;
            _firstChildChar = child.Key;
            _firstChildIndex = child.Value;
        }
    }

    // The winner now lives in the inline slot and must not be stored twice.
    lookup.Remove((char)_firstChildChar);

    if (TryCreateJumpTable(lookup, out int[]? jumpTable))
    {
        _children = jumpTable;
    }

    // Builds an int[] mapping child characters to node indexes (-1 for "no child"),
    // unless the table would be much larger than the dictionary it replaces.
    static bool TryCreateJumpTable(Dictionary<char, int> children, [NotNullWhen(true)] out int[]? table)
    {
        // int[] lookups are generally faster than Dictionary<char, int> lookups, but the table
        // needs an entry for every character up to the highest one in use.
        // We accept up to ~2x the estimated memory consumption of the dictionary in exchange for the speed.
        const int AcceptableSizeMultiplier = 2;

        Debug.Assert(children.Count > 0);

        int highestChar = -1;
        foreach (char key in children.Keys)
        {
            if (key > highestChar)
            {
                highestChar = key;
            }
        }

        int tableBytes = EstimateTableBytes(highestChar);
        int dictionaryBytes = EstimateDictionaryBytes(children.Count);

        if (tableBytes > dictionaryBytes * AcceptableSizeMultiplier)
        {
            // Too many entries would stay empty. Keep the dictionary.
            table = null;
            return false;
        }

        int[] entries = new int[highestChar + 1];
        Array.Fill(entries, -1);

        foreach (KeyValuePair<char, int> child in children)
        {
            entries[child.Key] = child.Value;
        }

        table = entries;
        return true;

        // Rough byte estimate for an int[] covering characters up to 'maxValue'.
        // Only a heuristic, the numbers do not have to be exact.
        static int EstimateTableBytes(int maxValue) => 32 + (maxValue * sizeof(int));

        // Rough byte estimate for a Dictionary<char, int> holding 'childCount' entries.
        // Only a heuristic, the numbers do not have to be exact.
        static int EstimateDictionaryBytes(int childCount)
        {
            if (childCount < 4)
            {
                return 192;
            }

            if (childCount < 8)
            {
                return 272;
            }

            if (childCount < 12)
            {
                return 352;
            }

            return childCount * 25;
        }
    }
}
}
}

View file

@ -0,0 +1,127 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
namespace System.Buffers
{
internal static class CharacterFrequencyHelper
{
// Same as RegexPrefixAnalyzer.Frequency.
// https://github.com/dotnet/runtime/blob/a355d5f7db162714ee19533ca55074aa2cbd8a8c/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs#L956C43-L956C53
// A table of 128 entries indexed by ASCII character value (the trailing comment on each entry names its character).
// Higher values denote characters that are expected to appear more often.
// NOTE(review): the values look like percentages of occurrence in some text corpus - confirm against the linked source.
// Callers must only index this with ASCII characters (i.e. after a char.IsAscii check).
public static ReadOnlySpan<float> AsciiFrequency => new float[]
{
0.000f /* '\x00' */, 0.000f /* '\x01' */, 0.000f /* '\x02' */, 0.000f /* '\x03' */, 0.000f /* '\x04' */, 0.000f /* '\x05' */, 0.000f /* '\x06' */, 0.000f /* '\x07' */,
0.000f /* '\x08' */, 0.001f /* '\x09' */, 0.000f /* '\x0A' */, 0.000f /* '\x0B' */, 0.000f /* '\x0C' */, 0.000f /* '\x0D' */, 0.000f /* '\x0E' */, 0.000f /* '\x0F' */,
0.000f /* '\x10' */, 0.000f /* '\x11' */, 0.000f /* '\x12' */, 0.000f /* '\x13' */, 0.003f /* '\x14' */, 0.000f /* '\x15' */, 0.000f /* '\x16' */, 0.000f /* '\x17' */,
0.000f /* '\x18' */, 0.004f /* '\x19' */, 0.000f /* '\x1A' */, 0.000f /* '\x1B' */, 0.006f /* '\x1C' */, 0.006f /* '\x1D' */, 0.000f /* '\x1E' */, 0.000f /* '\x1F' */,
8.952f /* ' ' */, 0.065f /* ' !' */, 0.420f /* ' "' */, 0.010f /* ' #' */, 0.011f /* ' $' */, 0.005f /* ' %' */, 0.070f /* ' &' */, 0.050f /* ' '' */,
3.911f /* ' (' */, 3.910f /* ' )' */, 0.356f /* ' *' */, 2.775f /* ' +' */, 1.411f /* ' ,' */, 0.173f /* ' -' */, 2.054f /* ' .' */, 0.677f /* ' /' */,
1.199f /* ' 0' */, 0.870f /* ' 1' */, 0.729f /* ' 2' */, 0.491f /* ' 3' */, 0.335f /* ' 4' */, 0.269f /* ' 5' */, 0.435f /* ' 6' */, 0.240f /* ' 7' */,
0.234f /* ' 8' */, 0.196f /* ' 9' */, 0.144f /* ' :' */, 0.983f /* ' ;' */, 0.357f /* ' <' */, 0.661f /* ' =' */, 0.371f /* ' >' */, 0.088f /* ' ?' */,
0.007f /* ' @' */, 0.763f /* ' A' */, 0.229f /* ' B' */, 0.551f /* ' C' */, 0.306f /* ' D' */, 0.449f /* ' E' */, 0.337f /* ' F' */, 0.162f /* ' G' */,
0.131f /* ' H' */, 0.489f /* ' I' */, 0.031f /* ' J' */, 0.035f /* ' K' */, 0.301f /* ' L' */, 0.205f /* ' M' */, 0.253f /* ' N' */, 0.228f /* ' O' */,
0.288f /* ' P' */, 0.034f /* ' Q' */, 0.380f /* ' R' */, 0.730f /* ' S' */, 0.675f /* ' T' */, 0.265f /* ' U' */, 0.309f /* ' V' */, 0.137f /* ' W' */,
0.084f /* ' X' */, 0.023f /* ' Y' */, 0.023f /* ' Z' */, 0.591f /* ' [' */, 0.085f /* ' \' */, 0.590f /* ' ]' */, 0.013f /* ' ^' */, 0.797f /* ' _' */,
0.001f /* ' `' */, 4.596f /* ' a' */, 1.296f /* ' b' */, 2.081f /* ' c' */, 2.005f /* ' d' */, 6.903f /* ' e' */, 1.494f /* ' f' */, 1.019f /* ' g' */,
1.024f /* ' h' */, 3.750f /* ' i' */, 0.286f /* ' j' */, 0.439f /* ' k' */, 2.913f /* ' l' */, 1.459f /* ' m' */, 3.908f /* ' n' */, 3.230f /* ' o' */,
1.444f /* ' p' */, 0.231f /* ' q' */, 4.220f /* ' r' */, 3.924f /* ' s' */, 5.312f /* ' t' */, 2.112f /* ' u' */, 0.737f /* ' v' */, 0.573f /* ' w' */,
0.992f /* ' x' */, 1.067f /* ' y' */, 0.181f /* ' z' */, 0.391f /* ' {' */, 0.056f /* ' |' */, 0.391f /* ' }' */, 0.002f /* ' ~' */, 0.000f /* '\x7F' */,
};
// Picks the offsets of up to two additional characters of 'value' (besides the first one at offset 0)
// for a single-string search to look at, preferring the ASCII characters with the lowest expected frequency.
// On return, ch2Offset is never 0 and differs from ch3Offset. ch3Offset is 0 when no third character is used;
// otherwise the two offsets are returned in ascending order.
public static void GetSingleStringMultiCharacterOffsets(string value, bool ignoreCase, out int ch2Offset, out int ch3Offset)
{
    Debug.Assert(value.Length > 1);
    Debug.Assert(!ignoreCase || char.IsAscii(value[0]));

    int lastIndex = value.Length - 1;

    int second = IndexOfAsciiCharWithLowestFrequency(value, ignoreCase);
    int third = 0;

    if (second < 0)
    {
        // Fewer than 2 ASCII chars in the value.
        Debug.Assert(!ignoreCase);

        // There is no frequency data for non-ASCII characters, so just pick the last one.
        second = lastIndex;
    }

    if (value.Length > 2)
    {
        third = IndexOfAsciiCharWithLowestFrequency(value, ignoreCase, excludeIndex: second);

        if (third < 0)
        {
            // Fewer than 3 ASCII chars in the value.
            if (ignoreCase)
            {
                // Looking at two characters only (N=2) still works.
                third = 0;
            }
            else
            {
                // There is no frequency data for non-ASCII characters, so just pick the last one,
                // moving the second character out of the way if it already occupies that spot.
                third = lastIndex;

                if (second == third)
                {
                    second--;
                }
            }
        }
    }

    Debug.Assert(second != 0);
    Debug.Assert(second != third);

    // When a third character is in use, report the two offsets in ascending order.
    if (third > 0 && third < second)
    {
        (second, third) = (third, second);
    }

    ch2Offset = second;
    ch3Offset = third;
}
// Returns the index (never 0, never 'excludeIndex') of the ASCII character in 'span' with the
// lowest expected frequency, or -1 if there is no such character.
// On ties, the character closest to the end of the span wins.
private static int IndexOfAsciiCharWithLowestFrequency(ReadOnlySpan<char> span, bool ignoreCase, int excludeIndex = -1)
{
    float lowestScore = float.MaxValue;
    int lowestIndex = -1;

    // Index 0 is skipped because the first character is always used by the caller.
    for (int i = 1; i < span.Length; i++)
    {
        char c = span[i];

        // Non-ASCII characters have no frequency data and are never selected.
        if (i == excludeIndex || !char.IsAscii(c))
        {
            continue;
        }

        float score = AsciiFrequency[c];

        if (ignoreCase)
        {
            // The alternative character will match as well, so count its frequency too.
            score += AsciiFrequency[c ^ 0x20];
        }

        // Penalize characters at the front of the value: avoiding them for the 2nd and 3rd character
        // results in 18 % fewer false positive 3-char matches on "The Adventures of Sherlock Holmes".
        if (i <= 2)
        {
            score *= 1.5f;
        }

        if (score <= lowestScore)
        {
            lowestScore = score;
            lowestIndex = i;
        }
    }

    return lowestIndex;
}
}
}

View file

@ -0,0 +1,23 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace System.Buffers
{
// A fixed-size buffer of 8 object references stored inline in the struct (no separate array allocation).
// [InlineArray(8)] makes the single declared field act as the first of 8 consecutive elements.
[InlineArray(8)]
internal struct EightPackedReferences
{
#pragma warning disable CA1823 // Unused field -- https://github.com/dotnet/roslyn-analyzers/issues/6788
// The element field required by [InlineArray]. It is never referenced by name, hence the suppression above.
private object? _ref0;
#pragma warning restore CA1823
// Copies the given references (between 1 and 8 of them) into the leading elements of the buffer.
// Elements beyond values.Length are not written by this constructor.
public EightPackedReferences(ReadOnlySpan<object> values)
{
Debug.Assert(values.Length is > 0 and <= 8, $"Got {values.Length} values");
// 'this' serves as the destination of the copy via the inline array's span conversion.
// NOTE(review): the '!' presumably only silences the object vs. object? nullability mismatch - confirm.
values.CopyTo(this!);
}
}
}

View file

@ -0,0 +1,175 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using static System.Buffers.StringSearchValuesHelper;
namespace System.Buffers
{
/// <summary>
/// An implementation of the Rabin-Karp algorithm we use as a fallback for
/// short inputs that we can't handle with Teddy.
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
/// Has an O(i * m) worst-case, but we will only use it for very short inputs.
/// </summary>
internal readonly struct RabinKarp
{
// The number of values we'll accept before falling back to Aho-Corasick.
// This also affects when Teddy may be used.
public const int MaxValues = 80;
// This is a tradeoff between memory consumption and the number of false positives
// we have to rule out during the verification step.
private const nuint BucketCount = 64;
// 18 = Vector128<byte>.Count + 2 (MatchStartOffset for N=3)
// The logic in this class is not safe from overflows, but we avoid any issues by
// only calling into it for inputs that are too short for Teddy to handle.
private const int MaxInputLength = 18 - 1;
// We're using nuint as the rolling hash, so we can spread the hash over more bits on 64bit.
private static int HashShiftPerElement => IntPtr.Size == 8 ? 2 : 1;
// Values grouped by (hash of their first _hashLength characters) % BucketCount; empty buckets are null.
// The whole array is left null when every value is longer than MaxInputLength.
private readonly string[]?[] _buckets;
// The number of characters covered by the rolling hash; equal to the length of the shortest value.
private readonly int _hashLength;
// The weight of the oldest character inside the hash window.
// Used to remove that character's contribution when the window advances by one position.
private readonly nuint _hashUpdateMultiplier;
// Prepares the hash buckets for the given values.
// The hash covers as many leading characters as the shortest value has, and each value is
// stored in the bucket selected by the hash of its own leading characters.
public RabinKarp(ReadOnlySpan<string> values)
{
    Debug.Assert(values.Length <= MaxValues);

    int shortest = int.MaxValue;
    for (int i = 0; i < values.Length; i++)
    {
        int length = values[i].Length;
        if (length < shortest)
        {
            shortest = length;
        }
    }

    Debug.Assert(shortest > 1);

    _hashLength = shortest;
    _hashUpdateMultiplier = (nuint)1 << ((shortest - 1) * HashShiftPerElement);

    if (shortest > MaxInputLength)
    {
        // Every value is longer than any input this type will be asked to search:
        // such inputs are either handled by Teddy or cannot match at all.
        // The buckets would never be accessed, so they are not allocated.
        _buckets = null!;
        return;
    }

    string[]?[] table = new string[BucketCount][];
    _buckets = table;

    foreach (string value in values)
    {
        nuint hash = 0;
        int position = 0;
        while (position < shortest)
        {
            hash = (hash << HashShiftPerElement) + value[position];
            position++;
        }

        nuint slot = hash % BucketCount;
        string[]? existing = table[slot];
        string[] grown;

        // Buckets start with a single element and are reallocated one element larger on every collision.
        // MaxValues is similar to BucketCount, so buckets hold 1 value on average.
        if (existing is null)
        {
            grown = new string[1];
        }
        else
        {
            grown = new string[existing.Length + 1];
            Array.Copy(existing, grown, existing.Length);
        }

        grown[^1] = value;
        table[slot] = grown;
    }
}
// Entry point of the search: dispatches to the Unicode case-insensitive path when requested
// and to the generic rolling-hash implementation for every other case sensitivity.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly int IndexOfAny<TCaseSensitivity>(ReadOnlySpan<char> span)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    if (typeof(TCaseSensitivity) != typeof(CaseInsensitiveUnicode))
    {
        return IndexOfAnyCore<TCaseSensitivity>(span);
    }

    return IndexOfAnyCaseInsensitiveUnicode(span);
}
// Searches 'span' by sliding a window of _hashLength characters over it while maintaining a rolling hash.
// Whenever the bucket selected by the current hash is not empty, the values in it are verified against
// the input at the current position.
// Returns the index of the first position at which any value matches, or -1 if there is none.
// Must only be called for inputs of at most MaxInputLength characters, and never with CaseInsensitiveUnicode.
private readonly int IndexOfAnyCore<TCaseSensitivity>(ReadOnlySpan<char> span)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    Debug.Assert(typeof(TCaseSensitivity) != typeof(CaseInsensitiveUnicode));
    // This type is the fallback for inputs that are too short for Teddy,
    // so seeing a longer input here means the caller took the wrong path.
    Debug.Assert(span.Length <= MaxInputLength, "Teddy should have handled long inputs.");

    ref char current = ref MemoryMarshal.GetReference(span);

    int hashLength = _hashLength;

    // Inputs shorter than the shortest value cannot contain a match.
    if (span.Length >= hashLength)
    {
        // The last position at which a full hash window still fits into the input.
        ref char end = ref Unsafe.Add(ref MemoryMarshal.GetReference(span), (uint)(span.Length - hashLength));

        // Hash of the first window.
        nuint hash = 0;
        for (uint i = 0; i < hashLength; i++)
        {
            hash = (hash << HashShiftPerElement) + TCaseSensitivity.TransformInput(Unsafe.Add(ref current, i));
        }

        Debug.Assert(_buckets is not null);
        ref string[]? bucketsRef = ref MemoryMarshal.GetArrayDataReference(_buckets);

        while (true)
        {
            ValidateReadPosition(span, ref current);

            // A non-empty bucket only signals a potential match, the candidates still have to be verified.
            if (Unsafe.Add(ref bucketsRef, hash % BucketCount) is string[] bucket)
            {
                int startOffset = (int)((nuint)Unsafe.ByteOffset(ref MemoryMarshal.GetReference(span), ref current) / sizeof(char));

                if (StartsWith<TCaseSensitivity>(ref current, span.Length - startOffset, bucket))
                {
                    return startOffset;
                }
            }

            if (!Unsafe.IsAddressLessThan(ref current, ref end))
            {
                break;
            }

            char previous = TCaseSensitivity.TransformInput(current);
            char next = TCaseSensitivity.TransformInput(Unsafe.Add(ref current, (uint)hashLength));

            // Update the hash by removing the previous character and adding the next one.
            hash = ((hash - (previous * _hashUpdateMultiplier)) << HashShiftPerElement) + next;
            current = ref Unsafe.Add(ref current, 1);
        }
    }

    return -1;
}
// Case-insensitive search for inputs that may contain non-ASCII characters.
// The (short) input is converted to its ordinal upper-case form on the stack,
// and the converted copy is then searched case-sensitively.
private readonly int IndexOfAnyCaseInsensitiveUnicode(ReadOnlySpan<char> span)
{
    Debug.Assert(span.Length <= MaxInputLength, "Teddy should have handled long inputs.");

    if (span.Length < _hashLength)
    {
        // The input is shorter than every value, so nothing can match.
        return -1;
    }

    Span<char> buffer = stackalloc char[MaxInputLength];
    Span<char> upperCase = buffer.Slice(0, span.Length);

    int written = Ordinal.ToUpperOrdinal(span, upperCase);
    Debug.Assert(written == upperCase.Length);

    // The case conversion has already been done, so CaseSensitive is used instead of CaseInsensitiveUnicode.
    return IndexOfAnyCore<CaseSensitive>(upperCase);
}
}
}

View file

@ -0,0 +1,191 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace System.Buffers
{
// Provides implementations for helpers shared across multiple SearchValues<string> implementations,
// such as normalizing and matching values under different case sensitivity rules.
internal static class StringSearchValuesHelper
{
// Debug-only check that 'searchSpace' (plus 'offset' characters) still lies within the search space
// described by its first element and length.
[Conditional("DEBUG")]
public static void ValidateReadPosition(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, int offset = 0)
{
    Debug.Assert(searchSpaceLength >= 0);

    ReadOnlySpan<char> wholeSearchSpace = MemoryMarshal.CreateReadOnlySpan(ref searchSpaceStart, searchSpaceLength);
    ValidateReadPosition(wholeSearchSpace, ref searchSpace, offset);
}
// Debug-only check that 'searchSpace' points at a character boundary inside 'span'
// and that at least 'offset' characters are available from that position onwards.
[Conditional("DEBUG")]
public static void ValidateReadPosition(ReadOnlySpan<char> span, ref char searchSpace, int offset = 0)
{
    Debug.Assert(offset >= 0);

    ref char spanStart = ref MemoryMarshal.GetReference(span);
    nint byteDistance = Unsafe.ByteOffset(ref spanStart, ref searchSpace);

    // The position must not precede the span and must be aligned to a whole char.
    Debug.Assert(byteDistance >= 0);
    Debug.Assert((byteDistance & 1) == 0);

    int charsConsumed = (int)(byteDistance / 2);
    int charsRemaining = span.Length - charsConsumed;
    Debug.Assert(offset <= charsRemaining);
}
// Returns true if the input at 'matchStart' begins with at least one of the candidates.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool StartsWith<TCaseSensitivity>(ref char matchStart, int lengthRemaining, string[] candidates)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    for (int i = 0; i < candidates.Length; i++)
    {
        if (StartsWith<TCaseSensitivity>(ref matchStart, lengthRemaining, candidates[i]))
        {
            return true;
        }
    }

    return false;
}
// Returns true if the input at 'matchStart' begins with 'candidate' under the given case sensitivity.
// 'lengthRemaining' is the number of input characters available from 'matchStart';
// the comparison is only attempted when the whole candidate fits into them.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool StartsWith<TCaseSensitivity>(ref char matchStart, int lengthRemaining, string candidate)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    Debug.Assert(lengthRemaining > 0);

    return lengthRemaining >= candidate.Length
        && TCaseSensitivity.Equals(ref matchStart, candidate);
}
// Compares the input at 'matchStart' with 'candidate' one character at a time.
// Every input character is transformed according to the case sensitivity before the comparison;
// the candidate's characters are compared as they are.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ScalarEquals<TCaseSensitivity>(ref char matchStart, string candidate)
    where TCaseSensitivity : struct, ICaseSensitivity
{
    int position = 0;

    while (position < candidate.Length)
    {
        char actual = TCaseSensitivity.TransformInput(Unsafe.Add(ref matchStart, position));

        if (actual != candidate[position])
        {
            return false;
        }

        position++;
    }

    return true;
}
// Describes how input characters are normalized and compared for a given case sensitivity.
// All members are static abstract, so implementations are supplied as generic type arguments.
public interface ICaseSensitivity
{
// Normalizes a single input character before it is compared or hashed.
static abstract char TransformInput(char input);
// Vectorized counterparts of the above, operating on every byte of the vector.
static abstract Vector128<byte> TransformInput(Vector128<byte> input);
static abstract Vector256<byte> TransformInput(Vector256<byte> input);
static abstract Vector512<byte> TransformInput(Vector512<byte> input);
// Returns true if the input starting at 'matchStart' matches 'candidate' over candidate.Length characters.
// The caller is responsible for making sure that many characters are available.
static abstract bool Equals(ref char matchStart, string candidate);
}
// Case-sensitive matching: every input is used exactly as it is, without any transformation.
public readonly struct CaseSensitive : ICaseSensitivity
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static char TransformInput(char input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        return input;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return ScalarEquals<CaseSensitive>(ref matchStart, candidate);
    }
}
// Upper-cases inputs by clearing the 0x20 bit, which is only correct when every input character is an ASCII letter.
// For any other character the result may be wrong, and the callers must account for that.
public readonly struct CaseInsensitiveAsciiLetters : ICaseSensitivity
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static char TransformInput(char input)
    {
        return (char)(input & ~0x20);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        Vector128<byte> clearCaseBit = Vector128.Create(unchecked((byte)~0x20));
        return input & clearCaseBit;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        Vector256<byte> clearCaseBit = Vector256.Create(unchecked((byte)~0x20));
        return input & clearCaseBit;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        Vector512<byte> clearCaseBit = Vector512.Create(unchecked((byte)~0x20));
        return input & clearCaseBit;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return ScalarEquals<CaseInsensitiveAsciiLetters>(ref matchStart, candidate);
    }
}
// Upper-cases inputs under the assumption that every input character is ASCII.
// For non-ASCII inputs the result may be wrong, and the callers must account for that.
//
// The vectorized overloads subtract (128 + 'a') from every byte, which moves 'a'..'z' onto the 26 lowest
// signed byte values. A single signed comparison against (128 + 26) then identifies the lowercase letters,
// and the 0x20 case bit is flipped for exactly those bytes.
public readonly struct CaseInsensitiveAscii : ICaseSensitivity
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static char TransformInput(char input)
    {
        return TextInfo.ToUpperAsciiInvariant(input);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        Vector128<sbyte> shifted = (input - Vector128.Create((byte)(128 + 'a'))).AsSByte();
        Vector128<sbyte> limit = Vector128.Create((byte)(128 + 26)).AsSByte();
        Vector128<byte> isLowercase = Vector128.LessThan(shifted, limit).AsByte();
        Vector128<byte> caseBit = Vector128.Create((byte)0x20);

        return input ^ (isLowercase & caseBit);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        Vector256<sbyte> shifted = (input - Vector256.Create((byte)(128 + 'a'))).AsSByte();
        Vector256<sbyte> limit = Vector256.Create((byte)(128 + 26)).AsSByte();
        Vector256<byte> isLowercase = Vector256.LessThan(shifted, limit).AsByte();
        Vector256<byte> caseBit = Vector256.Create((byte)0x20);

        return input ^ (isLowercase & caseBit);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        Vector512<sbyte> shifted = (input - Vector512.Create((byte)(128 + 'a'))).AsSByte();
        Vector512<sbyte> limit = Vector512.Create((byte)(128 + 26)).AsSByte();
        Vector512<byte> isLowercase = Vector512.LessThan(shifted, limit).AsByte();
        Vector512<byte> caseBit = Vector512.Create((byte)0x20);

        return input ^ (isLowercase & caseBit);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return ScalarEquals<CaseInsensitiveAscii>(ref matchStart, candidate);
    }
}
// Non-ASCII inputs can't be efficiently mapped to their Ordinal upper-case variants,
// so this helper only supports verifying a whole candidate via Equals.
// The TransformInput overloads are never expected to be called and throw if they are.
public readonly struct CaseInsensitiveUnicode : ICaseSensitivity
{
    public static char TransformInput(char input)
    {
        throw new UnreachableException();
    }

    public static Vector128<byte> TransformInput(Vector128<byte> input)
    {
        throw new UnreachableException();
    }

    public static Vector256<byte> TransformInput(Vector256<byte> input)
    {
        throw new UnreachableException();
    }

    public static Vector512<byte> TransformInput(Vector512<byte> input)
    {
        throw new UnreachableException();
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Equals(ref char matchStart, string candidate)
    {
        return Ordinal.EqualsIgnoreCase(ref matchStart, ref candidate.GetRawStringData(), candidate.Length);
    }
}
}
}

View file

@ -0,0 +1,142 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
namespace System.Buffers
{
internal static class TeddyBucketizer
{
// Same idea as GenerateBucketizedFingerprint, except that every value forms a bucket of its own:
// the value at index i is represented by bit i in the resulting nibble bitmaps.
// 'offset' selects which character of each value is fingerprinted.
public static (Vector512<byte> Low, Vector512<byte> High) GenerateNonBucketizedFingerprint(ReadOnlySpan<string> values, int offset)
{
    Debug.Assert(values.Length <= 8);

    Vector128<byte> lowBitmap = default;
    Vector128<byte> highBitmap = default;

    int valueIndex = 0;

    foreach (string value in values)
    {
        int valueBit = 1 << valueIndex;
        valueIndex++;

        char c = value[offset];
        Debug.Assert(char.IsAscii(c));

        // Each nibble of the character selects the bitmap element in which this value's bit gets set.
        int lowIndex = c & 0xF;
        int highIndex = c >> 4;

        byte lowBits = (byte)(lowBitmap.GetElementUnsafe(lowIndex) | valueBit);
        lowBitmap.SetElementUnsafe(lowIndex, lowBits);

        byte highBits = (byte)(highBitmap.GetElementUnsafe(highIndex) | valueBit);
        highBitmap.SetElementUnsafe(highIndex, highBits);
    }

    return (DuplicateTo512(lowBitmap), DuplicateTo512(highBitmap));
}
// There can be up to 8 buckets, and each of them is represented by 1 bit.
// Every bitmap maps each of the 16 possible nibble values to an 8-bit set of buckets.
// For example, if bucket 0 contains the strings ["foo", "bar"], the lowest bit (bucket 0) is set as follows:
// 'f' is 0x66, 'b' is 0x62, so n0Low has the bit set at index 2 and 6, n0High has it set at index 6.
// 'o' is 0x6F, 'a' is 0x61, so n1Low has the bit set at index 1 and 15, n1High has it set at index 6.
// 'o' is 0x6F, 'r' is 0x72, so n2Low has the bit set at index 2 and 15, n2High has it set at index 6 and 7.
// The same is done for every bucket, with all buckets' bits ORed into a single bitmap per nibble.
// 'offset' selects which character of each value is fingerprinted.
public static (Vector512<byte> Low, Vector512<byte> High) GenerateBucketizedFingerprint(string[][] valueBuckets, int offset)
{
    Debug.Assert(valueBuckets.Length <= 8);

    Vector128<byte> lowBitmap = default;
    Vector128<byte> highBitmap = default;

    for (int bucketIndex = 0; bucketIndex < valueBuckets.Length; bucketIndex++)
    {
        string[] bucket = valueBuckets[bucketIndex];
        int bucketBit = 1 << bucketIndex;

        for (int valueIndex = 0; valueIndex < bucket.Length; valueIndex++)
        {
            char c = bucket[valueIndex][offset];
            Debug.Assert(char.IsAscii(c));

            int lowIndex = c & 0xF;
            int highIndex = c >> 4;

            byte lowBits = (byte)(lowBitmap.GetElementUnsafe(lowIndex) | bucketBit);
            lowBitmap.SetElementUnsafe(lowIndex, lowBits);

            byte highBits = (byte)(highBitmap.GetElementUnsafe(highIndex) | bucketBit);
            highBitmap.SetElementUnsafe(highIndex, highBits);
        }
    }

    return (DuplicateTo512(lowBitmap), DuplicateTo512(highBitmap));
}
// Repeats the 128-bit vector four times to fill a 512-bit vector.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<byte> DuplicateTo512(Vector128<byte> vector)
{
    Vector256<byte> doubled = Vector256.Create(lower: vector, upper: vector);
    return Vector512.Create(lower: doubled, upper: doubled);
}
// Distributes 'values' over 'bucketCount' buckets.
// Values sharing the same first 'n' characters always end up in the same bucket;
// every newly seen prefix is assigned to the next bucket in round-robin order.
public static string[][] Bucketize(ReadOnlySpan<string> values, int bucketCount, int n)
{
    Debug.Assert(bucketCount == 8, "This may change if we end up supporting the 'fat Teddy' variant.");
    Debug.Assert(values.Length > bucketCount, "Should be using a non-bucketized implementation.");
    Debug.Assert(values.Length <= RabinKarp.MaxValues);

    // For every value, the index of the bucket it belongs to.
    // Recording these first avoids allocating temporary lists while the buckets are being built up.
    Span<int> assignedBuckets = stackalloc int[RabinKarp.MaxValues].Slice(0, values.Length);

    // Patterns with the same prefix share a bucket to avoid wasting time during verification steps.
    Dictionary<int, int> bucketByPrefix = new(bucketCount);
    int distinctPrefixes = 0;

    for (int valueIndex = 0; valueIndex < values.Length; valueIndex++)
    {
        string value = values[valueIndex];

        // Pack the first n (ASCII) characters into a single integer key.
        int prefix = 0;
        int charIndex = 0;
        while (charIndex < n)
        {
            Debug.Assert(char.IsAscii(value[charIndex]));
            prefix = (prefix << 8) | value[charIndex];
            charIndex++;
        }

        if (!bucketByPrefix.TryGetValue(prefix, out int assigned))
        {
            // Potential optimization: values with different prefixes are currently merged into buckets randomly (round-robin).
            // A more sophisticated strategy could be employed here, e.g. trying to minimize the number of
            // values in each bucket, or minimizing the PopCount of final merged fingerprints.
            // Example of the latter: https://gist.github.com/MihaZupan/831324d1d646b69ae0ba4b54e3446a49
            assigned = distinctPrefixes % bucketCount;
            distinctPrefixes++;
            bucketByPrefix.Add(prefix, assigned);
        }

        assignedBuckets[valueIndex] = assigned;
    }

    string[][] buckets = new string[bucketCount][];

    for (int bucket = 0; bucket < buckets.Length; bucket++)
    {
        // Size the bucket exactly, then fill it with its values in their original order.
        string[] members = new string[assignedBuckets.Count(bucket)];
        buckets[bucket] = members;

        int filled = 0;
        for (int valueIndex = 0; valueIndex < assignedBuckets.Length; valueIndex++)
        {
            if (assignedBuckets[valueIndex] == bucket)
            {
                members[filled] = values[valueIndex];
                filled++;
            }
        }

        Debug.Assert(filled == members.Length);
    }

    return buckets;
}
}
}

View file

@ -0,0 +1,436 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace System.Buffers
{
/// <summary>
/// Contains the implementation of core vectorized Teddy matching operations.
/// They determine which buckets contain potential matches for each input position.
/// </summary>
internal static class TeddyHelper
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
public static (Vector128<byte> Result, Vector128<byte> Prev0) ProcessInputN2(
    Vector128<byte> input,
    Vector128<byte> prev0,
    Vector128<byte> n0Low, Vector128<byte> n0High,
    Vector128<byte> n1Low, Vector128<byte> n1High)
{
    // The 2-byte variant of ProcessInputN3 (see the full description there for more details).
    // It follows the same pattern, but compares 2 bytes of each bucket at a time instead of 3,
    // so there are 4 nibble bitmaps instead of 6 and only 1 carried-over result instead of 2.
    (Vector128<byte> lowNibbles, Vector128<byte> highNibbles) = GetNibbles(input);

    // Look up both nibbles of every input byte in the bitmaps of byte 0 and byte 1
    // to find out which buckets match at each position.
    Vector128<byte> firstByteMatches = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector128<byte> secondByteMatches = Shuffle(n1Low, n1High, lowNibbles, highNibbles);

    // Align the byte 0 matches with the byte 1 matches: RightShift1 shifts them to the right by 1 place
    // and shifts in 1 byte from the previous iteration.
    Vector128<byte> alignedFirstByteMatches = RightShift1(prev0, firstByteMatches);

    // Only buckets that match at all 4 nibble positions remain after the AND.
    // The unshifted byte 0 matches are returned too, to be passed back in as 'prev0' by the next loop iteration.
    return (alignedFirstByteMatches & secondByteMatches, firstByteMatches);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
public static (Vector256<byte> Result, Vector256<byte> Prev0) ProcessInputN2(
    Vector256<byte> input,
    Vector256<byte> prev0,
    Vector256<byte> n0Low, Vector256<byte> n0High,
    Vector256<byte> n1Low, Vector256<byte> n1High)
{
    // Same as 'ProcessInputN2' for Vector128<byte> (see the comments there),
    // but operates on 32 input characters at a time.
    (Vector256<byte> lowNibbles, Vector256<byte> highNibbles) = GetNibbles(input);

    Vector256<byte> firstByteMatches = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector256<byte> secondByteMatches = Shuffle(n1Low, n1High, lowNibbles, highNibbles);

    Vector256<byte> alignedFirstByteMatches = RightShift1(prev0, firstByteMatches);

    return (alignedFirstByteMatches & secondByteMatches, firstByteMatches);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
public static (Vector512<byte> Result, Vector512<byte> Prev0) ProcessInputN2(
    Vector512<byte> input,
    Vector512<byte> prev0,
    Vector512<byte> n0Low, Vector512<byte> n0High,
    Vector512<byte> n1Low, Vector512<byte> n1High)
{
    // Same as 'ProcessInputN2' for Vector128<byte> (see the comments there),
    // but operates on 64 input characters at a time.
    (Vector512<byte> lowNibbles, Vector512<byte> highNibbles) = GetNibbles(input);

    Vector512<byte> firstByteMatches = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector512<byte> secondByteMatches = Shuffle(n1Low, n1High, lowNibbles, highNibbles);

    Vector512<byte> alignedFirstByteMatches = RightShift1(prev0, firstByteMatches);

    return (alignedFirstByteMatches & secondByteMatches, firstByteMatches);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
public static (Vector128<byte> Result, Vector128<byte> Prev0, Vector128<byte> Prev1) ProcessInputN3(
Vector128<byte> input,
Vector128<byte> prev0, Vector128<byte> prev1,
Vector128<byte> n0Low, Vector128<byte> n0High,
Vector128<byte> n1Low, Vector128<byte> n1High,
Vector128<byte> n2Low, Vector128<byte> n2High)
{
// This is the core operation of the Teddy algorithm that determines which of the buckets contain potential matches.
// Every input bitmap argument (n0Low, n0High, ...) encodes a mapping of each of the possible 16 nibble values into an 8-bit bitmap.
// We test each nibble in the input against these bitmaps to determine which buckets match a given nibble.
// We then AND together these results to obtain only a list of buckets that match at all 6 nibble positions.
// Each byte of the result represents an 8-bit bitmask of buckets that may match at each position.
(Vector128<byte> low, Vector128<byte> high) = GetNibbles(input);
// Shuffle each nibble with the 3 corresponding bitmaps to determine which positions match any bucket.
Vector128<byte> match0 = Shuffle(n0Low, n0High, low, high);
Vector128<byte> match1 = Shuffle(n1Low, n1High, low, high);
Vector128<byte> result2 = Shuffle(n2Low, n2High, low, high);
// match0 contains the information for bucket matches at position 0.
// match1 contains the information for bucket matches at position 1.
// result2 contains the information for bucket matches at position 2.
// If we imagine that we only have 1 bucket with 1 string "ABC", the bitmaps we've just obtained encode the following information:
// match0 tells us at which positions we matched the letter 'A'
// match1 tells us at which positions we matched the letter 'B'
// result2 tells us at which positions we matched the letter 'C'
// If input represents the text "BC text ABC text", they would contain:
// input: [B, C, , t, e, x, t, , A, B, C, , t, e, x, t]
// match0: [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
// match1: [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
// result2: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
// ^ ^ ^
// Note how the input contains the string ABC, but the matches are not aligned, so we can't just AND them together.
// To solve this, we shift 'match0' to the right by 2 places and 'match1' to the right by 1 place.
// result0: [?, ?, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
// result1: [?, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
// result2: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
// ^ ^ ^
// The results are now aligned, but we don't know whether the first two positions matched result0 and result1.
// To replace the missing bytes, we remember the matches from the previous loop iteration, and look at their last 2 bytes.
// If the previous loop iteration ended on the character 'A', we might even have an earlier match.
// For example, if the previous input was "Random strings A":
// prev0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
// result0: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
// ^ ^
// We will merge the last two bytes of 'prev0' into 'result0' and the last byte of 'prev1' into 'result1'
// result0: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
// result1: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
// result2: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
//
// RightShift1 and RightShift2 perform the above operation of shifting the match vectors
// to the right by 1 and 2 places and shifting in the bytes from the previous iteration.
Vector128<byte> result0 = RightShift2(prev0, match0);
Vector128<byte> result1 = RightShift1(prev1, match1);
// AND the results together to obtain a list of only buckets that match at all 6 nibble positions.
// result: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
// ^ ^
// Note that we found the match at index 1, even though that match started 2 bytes earlier, at the end of the previous iteration.
// The caller must account for that when verifying potential matches, see 'MatchStartOffsetN3 = 2' in 'AsciiStringSearchValuesTeddyBase'.
Vector128<byte> result = result0 & result1 & result2;
// Return the result and the current matches for byte 0 and 1.
// The next loop iteration, 'match0' and 'match1' will be passed back to this method as 'prev0' and 'prev1'.
return (result, match0, match1);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
public static (Vector256<byte> Result, Vector256<byte> Prev0, Vector256<byte> Prev1) ProcessInputN3(
    Vector256<byte> input,
    Vector256<byte> prev0, Vector256<byte> prev1,
    Vector256<byte> n0Low, Vector256<byte> n0High,
    Vector256<byte> n1Low, Vector256<byte> n1High,
    Vector256<byte> n2Low, Vector256<byte> n2High)
{
    // 32-byte variant of the Vector128<byte> 'ProcessInputN3' overload; the algorithm is described there.
    (Vector256<byte> lowNibbles, Vector256<byte> highNibbles) = GetNibbles(input);

    // Bucket bitmaps for prefix bytes 0, 1 and 2 at every input position.
    Vector256<byte> current0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector256<byte> current1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);
    Vector256<byte> current2 = Shuffle(n2Low, n2High, lowNibbles, highNibbles);

    // Align bytes 0 and 1 with byte 2 (pulling in the tail of the previous iteration), then intersect.
    Vector256<byte> aligned0 = RightShift2(prev0, current0);
    Vector256<byte> aligned1 = RightShift1(prev1, current1);
    Vector256<byte> candidates = aligned0 & aligned1 & current2;

    // 'current0' and 'current1' become 'prev0' and 'prev1' of the next call.
    return (candidates, current0, current1);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
public static (Vector512<byte> Result, Vector512<byte> Prev0, Vector512<byte> Prev1) ProcessInputN3(
    Vector512<byte> input,
    Vector512<byte> prev0, Vector512<byte> prev1,
    Vector512<byte> n0Low, Vector512<byte> n0High,
    Vector512<byte> n1Low, Vector512<byte> n1High,
    Vector512<byte> n2Low, Vector512<byte> n2High)
{
    // 64-byte variant of the Vector128<byte> 'ProcessInputN3' overload; the algorithm is described there.
    (Vector512<byte> lowNibbles, Vector512<byte> highNibbles) = GetNibbles(input);

    // Bucket bitmaps for prefix bytes 0, 1 and 2 at every input position.
    Vector512<byte> current0 = Shuffle(n0Low, n0High, lowNibbles, highNibbles);
    Vector512<byte> current1 = Shuffle(n1Low, n1High, lowNibbles, highNibbles);
    Vector512<byte> current2 = Shuffle(n2Low, n2High, lowNibbles, highNibbles);

    // Align bytes 0 and 1 with byte 2 (pulling in the tail of the previous iteration), then intersect.
    Vector512<byte> aligned0 = RightShift2(prev0, current0);
    Vector512<byte> aligned1 = RightShift1(prev1, current1);
    Vector512<byte> candidates = aligned0 & aligned1 & current2;

    // 'current0' and 'current1' become 'prev0' and 'prev1' of the next call.
    return (candidates, current0, current1);
}
// Read two consecutive Vector128<ushort> (16 chars) and narrow them with saturation into a single Vector128<byte>.
// On X86, characters above 32767 are turned into 0, but we account for that by not using Teddy if any of the string values contain a 0.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Sse2))]
[CompExactlyDependsOn(typeof(AdvSimd))]
public static Vector128<byte> LoadAndPack16AsciiChars(ref char source)
{
    Vector128<ushort> firstHalf = Vector128.LoadUnsafe(ref source);
    Vector128<ushort> secondHalf = Vector128.LoadUnsafe(ref source, (nuint)Vector128<ushort>.Count);

    if (Sse2.IsSupported)
    {
        // The X86 pack instruction interprets its inputs as signed 16-bit values.
        return Sse2.PackUnsignedSaturate(firstHalf.AsInt16(), secondHalf.AsInt16());
    }

    // Arm path: narrow the first 8 chars into the lower half, then the next 8 into the upper half.
    Vector64<byte> lowerBytes = AdvSimd.ExtractNarrowingSaturateLower(firstHalf);
    return AdvSimd.ExtractNarrowingSaturateUpper(lowerBytes, secondHalf);
}
// Read two consecutive Vector256<ushort> (32 chars) and narrow them with saturation into a single Vector256<byte>.
// Characters above 32767 are turned into 0, but we account for that by not using Teddy if any of the string values contain a 0.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
public static Vector256<byte> LoadAndPack32AsciiChars(ref char source)
{
    Vector256<short> firstHalf = Vector256.LoadUnsafe(ref source).AsInt16();
    Vector256<short> secondHalf = Vector256.LoadUnsafe(ref source, (nuint)Vector256<ushort>.Count).AsInt16();

    // NOTE(review): the fix-up helper presumably restores element order after the per-lane pack - confirm in PackedSpanHelpers.
    return PackedSpanHelpers.FixUpPackedVector256Result(Avx2.PackUnsignedSaturate(firstHalf, secondHalf));
}
// Read two consecutive Vector512<ushort> (64 chars) and narrow them with saturation into a single Vector512<byte>.
// Characters above 32767 are turned into 0, but we account for that by not using Teddy if any of the string values contain a 0.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
public static Vector512<byte> LoadAndPack64AsciiChars(ref char source)
{
    Vector512<short> firstHalf = Vector512.LoadUnsafe(ref source).AsInt16();
    Vector512<short> secondHalf = Vector512.LoadUnsafe(ref source, (nuint)Vector512<ushort>.Count).AsInt16();

    // NOTE(review): the fix-up helper presumably restores element order after the per-lane pack - confirm in PackedSpanHelpers.
    return PackedSpanHelpers.FixUpPackedVector512Result(Avx512BW.PackUnsignedSaturate(firstHalf, secondHalf));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd))]
private static (Vector128<byte> Low, Vector128<byte> High) GetNibbles(Vector128<byte> input)
{
    // Splits every input byte into the two 4-bit values used to index the nibble bitmaps.
    Vector128<byte> highNibbles = input >>> 4;

    if (Ssse3.IsSupported)
    {
        // Not a true low nibble: the upper bits are left in place because Ssse3.Shuffle
        // does an implicit 'AND 0xF' on its indices, which makes an explicit mask redundant.
        return (input, highNibbles);
    }

    return (input & Vector128.Create((byte)0xF), highNibbles);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector256<byte> Low, Vector256<byte> High) GetNibbles(Vector256<byte> input)
{
    // The 'Low' element is the unmasked input: Avx2.Shuffle does an implicit 'AND 0xF'
    // on its indices, so masking here would be redundant. 'High' is the upper nibble of each byte.
    return (input, input >>> 4);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector512<byte> Low, Vector512<byte> High) GetNibbles(Vector512<byte> input)
{
    // The 'Low' element is the unmasked input: Avx512BW.Shuffle does an implicit 'AND 0xF'
    // on its indices, so masking here would be redundant. 'High' is the upper nibble of each byte.
    return (input, input >>> 4);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static Vector128<byte> Shuffle(Vector128<byte> maskLow, Vector128<byte> maskHigh, Vector128<byte> low, Vector128<byte> high)
{
    // Look up the bitmap for the low nibble and the one for the high nibble of every byte;
    // only bits present in both lookups survive the AND.
    Vector128<byte> lowLookup = Vector128.ShuffleUnsafe(maskLow, low);
    Vector128<byte> highLookup = Vector128.ShuffleUnsafe(maskHigh, high);
    return lowLookup & highLookup;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> Shuffle(Vector256<byte> maskLow, Vector256<byte> maskHigh, Vector256<byte> low, Vector256<byte> high)
{
    // Look up the bitmap for the low nibble and the one for the high nibble of every byte;
    // only bits present in both lookups survive the AND.
    Vector256<byte> lowLookup = Avx2.Shuffle(maskLow, low);
    Vector256<byte> highLookup = Avx2.Shuffle(maskHigh, high);
    return lowLookup & highLookup;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> Shuffle(Vector512<byte> maskLow, Vector512<byte> maskHigh, Vector512<byte> low, Vector512<byte> high)
{
    // Look up the bitmap for the low nibble and the one for the high nibble of every byte;
    // only bits present in both lookups survive the AND.
    Vector512<byte> lowLookup = Avx512BW.Shuffle(maskLow, low);
    Vector512<byte> highLookup = Avx512BW.Shuffle(maskHigh, high);
    return lowLookup & highLookup;
}
// Shifts 'right' towards higher indices by one byte, filling index 0 with the last byte of 'left'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static Vector128<byte> RightShift1(Vector128<byte> left, Vector128<byte> right)
{
// Given input vectors like
// left:   [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]
// right:  [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
// We want to shift the last element of left (15) to be the first element of the result
// result: [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
if (Ssse3.IsSupported)
{
return Ssse3.AlignRight(right, left, 15);
}
else
{
// We create a temporary 'leftShifted' vector where the 1st element is the 16th element of the input.
// We then use TBX to move every element of 'right' up by one index (index i of the result reads 'right[i - 1]').
// 0xFF is an out-of-range index for the first element, so that element keeps the value from 'leftShifted'.
Vector128<byte> leftShifted = Vector128.Shuffle(left, Vector128.Create(15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte());
return AdvSimd.Arm64.VectorTableLookupExtension(leftShifted, right, Vector128.Create(0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14));
}
}
// Shifts 'right' towards higher indices by two bytes, filling indices 0 and 1 with the last two bytes of 'left'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Ssse3))]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static Vector128<byte> RightShift2(Vector128<byte> left, Vector128<byte> right)
{
// Given input vectors like
// left:   [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]
// right:  [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
// We want to shift the last two elements of left (14, 15) to be the first elements of the result
// result: [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
if (Ssse3.IsSupported)
{
return Ssse3.AlignRight(right, left, 14);
}
else
{
// We create a temporary 'leftShifted' vector where the 1st and 2nd element are the 15th and 16th element of the input.
// We then use TBX to move every element of 'right' up by two indices (index i of the result reads 'right[i - 2]').
// 0xFF is an out-of-range index for the first two elements, so those elements keep the values from 'leftShifted'.
Vector128<byte> leftShifted = Vector128.Shuffle(left, Vector128.Create(14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte());
return AdvSimd.Arm64.VectorTableLookupExtension(leftShifted, right, Vector128.Create(0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13));
}
}
// Shifts 'right' towards higher indices by one byte across the whole 256-bit vector,
// filling index 0 with the last byte of 'left'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> RightShift1(Vector256<byte> left, Vector256<byte> right)
{
// Given input vectors like
// left:      0,  1,  2,  3,  4,  5, ... , 26, 27, 28, 29, 30, [31]
// right:    32, 33, 34, 35, 36, 37, ... , 58, 59, 60, 61, 62,  63
// We want to shift the last element of left (31) to be the first element of the result
// result: [31], 32, 33, 34, 35, 36, ... , 57, 58, 59, 60, 61,  62
//
// Avx2.AlignRight acts like two separate Ssse3.AlignRight calls on the lower and upper halves of the source operands.
// Result of Avx2.AlignRight(right, left, 15) is
// lower: [15], 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
// upper: [31], 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62
// note how elements at indexes 0 and 16 are off by 16 places.
// We want to read 31 instead of 15 and 47 instead of 31.
//
// To achieve that we create a temporary value where we combine the second half of the first operand and the first half of the second operand (Permute2x128).
// left: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ] control: (1 << 0)
// right: [ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 ], 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 control: (2 << 4)
// result: 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, [31], 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, [47]
// This effectively shifts the 0th and 16th element by 16 places (note values 31 and 47).
Vector256<byte> leftShifted = Avx2.Permute2x128(left, right, (1 << 0) + (2 << 4));
// With 'leftShifted' as the second operand, each 128-bit lane now pulls in the byte that directly precedes it.
return Avx2.AlignRight(right, leftShifted, 15);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> RightShift2(Vector256<byte> left, Vector256<byte> right)
{
    // Two-byte variant of 'RightShift1(Vector256<byte> left, Vector256<byte> right)'; the lane handling is explained there.
    // Control byte: upper half of 'left' goes to the lower lane, lower half of 'right' goes to the upper lane.
    const byte UpperLeftThenLowerRight = (1 << 0) + (2 << 4);

    Vector256<byte> precedingLanes = Avx2.Permute2x128(left, right, UpperLeftThenLowerRight);
    return Avx2.AlignRight(right, precedingLanes, 14);
}
// Shifts 'right' towards higher indices by one byte across the whole 512-bit vector,
// filling index 0 with the last byte of 'left'.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> RightShift1(Vector512<byte> left, Vector512<byte> right)
{
// Given input vectors like
// left:      0,  1,  2,  3,  4,  5, ... ,  58,  59,  60,  61,  62, [63]
// right:    64, 65, 66, 67, 68, 69, ... , 122, 123, 124, 125, 126, 127
// We want to shift the last element of left (63) to be the first element of the result
// result: [63], 64, 65, 66, 67, 68, ... , 121, 122, 123, 124, 125, 126
//
// Avx512BW.AlignRight acts like four separate Ssse3.AlignRight calls on each 128-bit pair of the source operands.
// Result of Avx512BW.AlignRight(right, left, 15) is
// lower: [15], 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, [31], 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
// upper: [47], 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, [63], 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
// note how elements at indexes 0, 16, 32 and 48 are off by 48 places.
// We want to read 63 instead of 15, 79 instead of 31, 95 instead of 47, and 111 instead of 63.
//
// Similar to Avx2 above, we create a temporary value where we shift these positions by 48 places - shift 8-byte values by 6 places (PermuteVar8x64x2).
// The indices vector below could be [6, 7, 8, 9, 10, 11, 12, 13], but we only care about the last byte in each 128-bit block (positions with value 0 don't affect the result).
Vector512<byte> leftShifted = Avx512F.PermuteVar8x64x2(left.AsInt64(), Vector512.Create(0, 7, 0, 9, 0, 11, 0, 13), right.AsInt64()).AsByte();
// With 'leftShifted' as the second operand, each 128-bit lane now pulls in the byte that directly precedes it.
return Avx512BW.AlignRight(right, leftShifted, 15);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
private static Vector512<byte> RightShift2(Vector512<byte> left, Vector512<byte> right)
{
    // Two-byte variant of 'RightShift1(Vector512<byte> left, Vector512<byte> right)'; the lane handling is explained there.
    // Only the odd 64-bit elements matter: they hold the trailing bytes that each 128-bit lane pulls in.
    Vector512<long> precedingElementIndices = Vector512.Create(0, 7, 0, 9, 0, 11, 0, 13);

    Vector512<byte> precedingLanes = Avx512F.PermuteVar8x64x2(left.AsInt64(), precedingElementIndices, right.AsInt64()).AsByte();
    return Avx512BW.AlignRight(right, precedingLanes, 14);
}
}
}

View file

@ -0,0 +1,42 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
namespace System.Buffers
{
internal sealed class MultiStringIgnoreCaseSearchValuesFallback : StringSearchValuesBase
{
    // Private snapshot of the unique values, taken once at construction.
    private readonly string[] _values;

    public MultiStringIgnoreCaseSearchValuesFallback(HashSet<string> uniqueValues) : base(uniqueValues)
    {
        string[] snapshot = new string[uniqueValues.Count];
        uniqueValues.CopyTo(snapshot, 0);
        _values = snapshot;
    }

    /// <summary>
    /// Returns the first index in <paramref name="span"/> at which any of the values starts
    /// (compared with <see cref="StringComparison.OrdinalIgnoreCase"/>), or -1 if there is none.
    /// This method is intentionally implemented in a way that checks haystack positions one at a time.
    /// See the description in <see cref="SpanHelpers.IndexOfAny{T}(ref T, int, ref T, int)"/>.
    /// </summary>
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        string[] candidates = _values;
        int position = 0;

        while (position < span.Length)
        {
            ReadOnlySpan<char> tail = span.Slice(position);

            for (int j = 0; j < candidates.Length; j++)
            {
                if (tail.StartsWith(candidates[j], StringComparison.OrdinalIgnoreCase))
                {
                    return position;
                }
            }

            position++;
        }

        return -1;
    }
}
}

View file

@ -0,0 +1,26 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Globalization;
using System.Runtime.CompilerServices;
namespace System.Buffers
{
internal sealed class SingleStringSearchValuesFallback<TIgnoreCase> : StringSearchValuesBase
    where TIgnoreCase : struct, SearchValues.IRuntimeConst
{
    // The single value to search for.
    private readonly string _value;

    public SingleStringSearchValuesFallback(string value, HashSet<string> uniqueValues) : base(uniqueValues)
    {
        _value = value;
    }

    // Returns the index of '_value' in 'span' using an ordinal comparison;
    // TIgnoreCase.Value selects between the case-insensitive and the case-sensitive search.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        if (TIgnoreCase.Value)
        {
            return Ordinal.IndexOfOrdinalIgnoreCase(span, _value);
        }

        return span.IndexOf(_value);
    }
}
}

View file

@ -0,0 +1,416 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using static System.Buffers.StringSearchValuesHelper;
namespace System.Buffers
{
// Based on SpanHelpers.IndexOf(ref char, int, ref char, int)
// This implementation uses 3 precomputed anchor points when searching.
// This implementation may also be used for length=2 values, in which case two anchors point at the same position.
// Has an O(n * m) worst-case (n = input length, m = value length), with the expected time closer to O(n) for most inputs.
internal sealed class SingleStringSearchValuesThreeChars<TCaseSensitivity> : SearchValues<string>
where TCaseSensitivity : struct, ICaseSensitivity
{
// ANDing a char with this mask clears bit 0x20, which uppercases ASCII letters.
private const ushort CaseConversionMask = unchecked((ushort)~0x20);
// The value being searched for.
private readonly string _value;
// -(_value.Length - 1): added to the input length to get the number of candidate start positions.
private readonly nint _minusValueTailLength;
// Offsets in bytes (char offset * 2) of the second and third anchor characters from the start of the value.
private readonly nuint _ch2ByteOffset;
private readonly nuint _ch3ByteOffset;
// The three anchor characters; masked with CaseConversionMask when ignoring case.
private readonly ushort _ch1;
private readonly ushort _ch2;
private readonly ushort _ch3;
// True for every TCaseSensitivity other than CaseSensitive.
private static bool IgnoreCase => typeof(TCaseSensitivity) != typeof(CaseSensitive);
public SingleStringSearchValuesThreeChars(string value)
{
    // Values shorter than 2 characters are not expected here.
    Debug.Assert(value.Length > 1);

    // Pick the positions of the second and third anchor characters.
    CharacterFrequencyHelper.GetSingleStringMultiCharacterOffsets(value, IgnoreCase, out int ch2Offset, out int ch3Offset);

    // The third anchor is either past the second one, or 0 (it then coincides with the first anchor).
    Debug.Assert(ch3Offset == 0 || ch3Offset > ch2Offset);

    _value = value;
    _minusValueTailLength = -(value.Length - 1);

    // When ignoring case, clear bit 0x20 of each anchor; otherwise the mask leaves the anchors untouched.
    ushort anchorMask = IgnoreCase ? CaseConversionMask : ushort.MaxValue;
    _ch1 = (ushort)(value[0] & anchorMask);
    _ch2 = (ushort)(value[ch2Offset] & anchorMask);
    _ch3 = (ushort)(value[ch3Offset] & anchorMask);

    // Anchor offsets are stored in bytes (2 bytes per char).
    _ch2ByteOffset = (nuint)ch2Offset * 2;
    _ch3ByteOffset = (nuint)ch3Offset * 2;
}
// Entry point for span searches: forwards the span's first-element reference and length to IndexOf.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
{
    ref char start = ref MemoryMarshal.GetReference(span);
    return IndexOf(ref start, span.Length);
}
// Searches the 'searchSpaceLength' chars starting at 'searchSpace' for '_value'.
// Returns the index of the first match, or -1 if there is none.
// Inputs with enough candidate positions take a vectorized path (widest accelerated vector size that fits);
// everything else is handled by the scalar loop at 'ShortInput'.
private int IndexOf(ref char searchSpace, int searchSpaceLength)
{
// Remember the start: 'searchSpace' is advanced below and match offsets are reported relative to the start.
ref char searchSpaceStart = ref searchSpace;
// Number of positions at which a full-length match could start: searchSpaceLength - (_value.Length - 1).
// This is zero or negative when the input is shorter than the value.
nint searchSpaceMinusValueTailLength = searchSpaceLength + _minusValueTailLength;
if (!Vector128.IsHardwareAccelerated || searchSpaceMinusValueTailLength < Vector128<ushort>.Count)
{
goto ShortInput;
}
nuint ch2ByteOffset = _ch2ByteOffset;
nuint ch3ByteOffset = _ch3ByteOffset;
if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<ushort>.Count >= 0)
{
Vector512<ushort> ch1 = Vector512.Create(_ch1);
Vector512<ushort> ch2 = Vector512.Create(_ch2);
Vector512<ushort> ch3 = Vector512.Create(_ch3);
// Last position from which a full vector of candidate start positions can still be examined.
ref char lastSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceMinusValueTailLength - Vector512<ushort>.Count);
while (true)
{
// NOTE(review): ValidateReadPosition comes from StringSearchValuesHelper; presumably a bounds assertion for the three loads below - confirm.
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count);
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count + (int)(_ch2ByteOffset / 2));
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector512<ushort>.Count + (int)(_ch3ByteOffset / 2));
// Find which starting positions likely contain a match (likely match all 3 anchor characters).
Vector512<byte> result = GetComparisonResult(ref searchSpace, ch2ByteOffset, ch3ByteOffset, ch1, ch2, ch3);
if (result != Vector512<byte>.Zero)
{
goto CandidateFound;
}
LoopFooter:
// We haven't found a match. Update the input position and check if we've reached the end.
searchSpace = ref Unsafe.Add(ref searchSpace, Vector512<ushort>.Count);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpace))
{
// Exactly one vector past 'lastSearchSpace' means the final vector has already been processed.
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpace, Vector512<ushort>.Count)))
{
return -1;
}
// We have fewer than 32 characters remaining. Adjust the input position such that we will do one last loop iteration.
searchSpace = ref lastSearchSpace;
}
continue;
CandidateFound:
// We found potential matches, but they may be false-positives, so we must verify each one.
if (TryMatch(ref searchSpaceStart, searchSpaceLength, ref searchSpace, result.ExtractMostSignificantBits(), out int offset))
{
return offset;
}
goto LoopFooter;
}
}
else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
{
// Same loop as the Vector512 branch above, processing 16 candidate positions per iteration.
Vector256<ushort> ch1 = Vector256.Create(_ch1);
Vector256<ushort> ch2 = Vector256.Create(_ch2);
Vector256<ushort> ch3 = Vector256.Create(_ch3);
ref char lastSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceMinusValueTailLength - Vector256<ushort>.Count);
while (true)
{
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector256<ushort>.Count);
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector256<ushort>.Count + (int)(_ch2ByteOffset / 2));
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector256<ushort>.Count + (int)(_ch3ByteOffset / 2));
// Find which starting positions likely contain a match (likely match all 3 anchor characters).
Vector256<byte> result = GetComparisonResult(ref searchSpace, ch2ByteOffset, ch3ByteOffset, ch1, ch2, ch3);
if (result != Vector256<byte>.Zero)
{
goto CandidateFound;
}
LoopFooter:
searchSpace = ref Unsafe.Add(ref searchSpace, Vector256<ushort>.Count);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpace))
{
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpace, Vector256<ushort>.Count)))
{
return -1;
}
// We have fewer than 16 characters remaining. Adjust the input position such that we will do one last loop iteration.
searchSpace = ref lastSearchSpace;
}
continue;
CandidateFound:
// We found potential matches, but they may be false-positives, so we must verify each one.
if (TryMatch(ref searchSpaceStart, searchSpaceLength, ref searchSpace, result.ExtractMostSignificantBits(), out int offset))
{
return offset;
}
goto LoopFooter;
}
}
else
{
// Same loop as the Vector512 branch above, processing 8 candidate positions per iteration.
Vector128<ushort> ch1 = Vector128.Create(_ch1);
Vector128<ushort> ch2 = Vector128.Create(_ch2);
Vector128<ushort> ch3 = Vector128.Create(_ch3);
ref char lastSearchSpace = ref Unsafe.Add(ref searchSpace, searchSpaceMinusValueTailLength - Vector128<ushort>.Count);
while (true)
{
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector128<ushort>.Count);
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector128<ushort>.Count + (int)(_ch2ByteOffset / 2));
ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref searchSpace, Vector128<ushort>.Count + (int)(_ch3ByteOffset / 2));
// Find which starting positions likely contain a match (likely match all 3 anchor characters).
Vector128<byte> result = GetComparisonResult(ref searchSpace, ch2ByteOffset, ch3ByteOffset, ch1, ch2, ch3);
if (result != Vector128<byte>.Zero)
{
goto CandidateFound;
}
LoopFooter:
searchSpace = ref Unsafe.Add(ref searchSpace, Vector128<ushort>.Count);
if (Unsafe.IsAddressGreaterThan(ref searchSpace, ref lastSearchSpace))
{
if (Unsafe.AreSame(ref searchSpace, ref Unsafe.Add(ref lastSearchSpace, Vector128<ushort>.Count)))
{
return -1;
}
// We have fewer than 8 characters remaining. Adjust the input position such that we will do one last loop iteration.
searchSpace = ref lastSearchSpace;
}
continue;
CandidateFound:
// We found potential matches, but they may be false-positives, so we must verify each one.
if (TryMatch(ref searchSpaceStart, searchSpaceLength, ref searchSpace, result.ExtractMostSignificantBits(), out int offset))
{
return offset;
}
goto LoopFooter;
}
}
ShortInput:
// Scalar fallback: test every candidate start position one at a time.
// The loop does not execute when 'searchSpaceMinusValueTailLength' is zero or negative.
string value = _value;
// NOTE(review): GetRawStringData presumably returns a ref to the first char, making this a copy of value[0] - confirm.
char valueHead = value.GetRawStringData();
for (nint i = 0; i < searchSpaceMinusValueTailLength; i++)
{
ref char cur = ref Unsafe.Add(ref searchSpace, i);
// CaseInsensitiveUnicode doesn't support single-character transformations, so we skip checking the first character first.
if ((typeof(TCaseSensitivity) == typeof(CaseInsensitiveUnicode) || TCaseSensitivity.TransformInput(cur) == valueHead) &&
TCaseSensitivity.Equals(ref cur, value))
{
return (int)i;
}
}
return -1;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector128<ushort> ch1, Vector128<ushort> ch2, Vector128<ushort> ch3)
{
    // Load 3 vectors of 8 chars from the input: one at the current position and
    // two at the byte offsets of the second and third anchor characters.
    ref byte searchSpaceBytes = ref Unsafe.As<char, byte>(ref searchSpace);
    Vector128<ushort> input1 = Vector128.LoadUnsafe(ref searchSpace);
    Vector128<ushort> input2 = Vector128.LoadUnsafe(ref searchSpaceBytes, ch2ByteOffset).AsUInt16();
    Vector128<ushort> input3 = Vector128.LoadUnsafe(ref searchSpaceBytes, ch3ByteOffset).AsUInt16();

    if (typeof(TCaseSensitivity) != typeof(CaseSensitive))
    {
        // AND every char with ~0x20 so that ASCII letters are uppercased.
        // For other characters this may produce wrong results, but only false-positives,
        // which are rejected by the verification step.
        Vector128<ushort> caseConversion = Vector128.Create(CaseConversionMask);
        input1 &= caseConversion;
        input2 &= caseConversion;
        input3 &= caseConversion;
    }

    // A position is a candidate only if all 3 anchors (likely) match there.
    Vector128<ushort> anchor1 = Vector128.Equals(ch1, input1);
    Vector128<ushort> anchor2 = Vector128.Equals(ch2, input2);
    Vector128<ushort> anchor3 = Vector128.Equals(ch3, input3);
    return (anchor1 & anchor2 & anchor3).AsByte();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector256<ushort> ch1, Vector256<ushort> ch2, Vector256<ushort> ch3)
{
    // Same as the Vector128<ushort> 'GetComparisonResult' overload, but examines 16 input characters at a time.
    ref byte searchSpaceBytes = ref Unsafe.As<char, byte>(ref searchSpace);
    Vector256<ushort> input1 = Vector256.LoadUnsafe(ref searchSpace);
    Vector256<ushort> input2 = Vector256.LoadUnsafe(ref searchSpaceBytes, ch2ByteOffset).AsUInt16();
    Vector256<ushort> input3 = Vector256.LoadUnsafe(ref searchSpaceBytes, ch3ByteOffset).AsUInt16();

    if (typeof(TCaseSensitivity) != typeof(CaseSensitive))
    {
        // Uppercase ASCII letters; any false-positives are rejected by the verification step.
        Vector256<ushort> caseConversion = Vector256.Create(CaseConversionMask);
        input1 &= caseConversion;
        input2 &= caseConversion;
        input3 &= caseConversion;
    }

    // A position is a candidate only if all 3 anchors (likely) match there.
    Vector256<ushort> anchor1 = Vector256.Equals(ch1, input1);
    Vector256<ushort> anchor2 = Vector256.Equals(ch2, input2);
    Vector256<ushort> anchor3 = Vector256.Equals(ch3, input3);
    return (anchor1 & anchor2 & anchor3).AsByte();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector512<byte> GetComparisonResult(ref char searchSpace, nuint ch2ByteOffset, nuint ch3ByteOffset, Vector512<ushort> ch1, Vector512<ushort> ch2, Vector512<ushort> ch3)
{
    // Same as the Vector128<ushort> 'GetComparisonResult' overload, but examines 32 input characters at a time.
    ref byte searchSpaceBytes = ref Unsafe.As<char, byte>(ref searchSpace);
    Vector512<ushort> input1 = Vector512.LoadUnsafe(ref searchSpace);
    Vector512<ushort> input2 = Vector512.LoadUnsafe(ref searchSpaceBytes, ch2ByteOffset).AsUInt16();
    Vector512<ushort> input3 = Vector512.LoadUnsafe(ref searchSpaceBytes, ch3ByteOffset).AsUInt16();

    if (typeof(TCaseSensitivity) != typeof(CaseSensitive))
    {
        // Uppercase ASCII letters; any false-positives are rejected by the verification step.
        Vector512<ushort> caseConversion = Vector512.Create(CaseConversionMask);
        input1 &= caseConversion;
        input2 &= caseConversion;
        input3 &= caseConversion;
    }

    // A position is a candidate only if all 3 anchors (likely) match there.
    Vector512<ushort> anchor1 = Vector512.Equals(ch1, input1);
    Vector512<ushort> anchor2 = Vector512.Equals(ch2, input2);
    Vector512<ushort> anchor3 = Vector512.Equals(ch3, input3);
    return (anchor1 & anchor2 & anchor3).AsByte();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, uint mask, out int offsetFromStart)
{
    // 'mask' holds one bit per byte of the comparison result, so every candidate char position
    // shows up as a pair of bits and the index of the lower bit is its byte offset from 'searchSpace'.
    // Candidates are verified from the lowest position upwards; at least one candidate is always examined.
    string value = _value;

    while (true)
    {
        int byteOffset = BitOperations.TrailingZeroCount(mask);
        Debug.Assert(byteOffset % 2 == 0);

        ref char candidate = ref Unsafe.AddByteOffset(ref searchSpace, byteOffset);
        ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref candidate, value.Length);

        if (TCaseSensitivity.Equals(ref candidate, value))
        {
            // Convert the byte distance from the start of the input into a char index.
            offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref candidate) / 2);
            return true;
        }

        // Drop both bits that belong to the rejected candidate.
        mask = BitOperations.ResetLowestSetBit(mask);
        mask = BitOperations.ResetLowestSetBit(mask);

        if (mask == 0)
        {
            break;
        }
    }

    // Every candidate was a false-positive; the caller resumes the vectorized loop.
    offsetFromStart = 0;
    return false;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool TryMatch(ref char searchSpaceStart, int searchSpaceLength, ref char searchSpace, ulong mask, out int offsetFromStart)
{
    // 64-bit variant of the 'uint' overload, used for masks extracted from a Vector512<byte>.
    // Every candidate char position shows up as a pair of bits and the index of the lower bit
    // is its byte offset from 'searchSpace'. At least one candidate is always examined.
    string value = _value;

    while (true)
    {
        int byteOffset = BitOperations.TrailingZeroCount(mask);
        Debug.Assert(byteOffset % 2 == 0);

        ref char candidate = ref Unsafe.AddByteOffset(ref searchSpace, byteOffset);
        ValidateReadPosition(ref searchSpaceStart, searchSpaceLength, ref candidate, value.Length);

        if (TCaseSensitivity.Equals(ref candidate, value))
        {
            // Convert the byte distance from the start of the input into a char index.
            offsetFromStart = (int)((nuint)Unsafe.ByteOffset(ref searchSpaceStart, ref candidate) / 2);
            return true;
        }

        // Drop both bits that belong to the rejected candidate.
        mask = BitOperations.ResetLowestSetBit(mask);
        mask = BitOperations.ResetLowestSetBit(mask);

        if (mask == 0)
        {
            break;
        }
    }

    // Every candidate was a false-positive; the caller resumes the vectorized loop.
    offsetFromStart = 0;
    return false;
}
/// <summary>
/// Reports whether <paramref name="value"/> equals the single stored value, using an ordinal
/// comparison that ignores case when <c>IgnoreCase</c> is set.
/// </summary>
internal override bool ContainsCore(string value)
{
    StringComparison comparison = IgnoreCase
        ? StringComparison.OrdinalIgnoreCase
        : StringComparison.Ordinal;

    return _value.Equals(value, comparison);
}
/// <summary>Returns a new one-element array holding the single stored value.</summary>
internal override string[] GetValues()
{
    return new[] { _value };
}
/// <summary>Index of the first element of <paramref name="span"/> accepted by <c>ContainsCore</c>, or -1.</summary>
internal override int IndexOfAny(ReadOnlySpan<string> span)
{
    return IndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);
}
/// <summary>Same scan as <c>IndexOfAny</c>, but instantiated with the <c>Negate</c> negator, or -1 when nothing is reported.</summary>
internal override int IndexOfAnyExcept(ReadOnlySpan<string> span)
{
    return IndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);
}
/// <summary>Index of the last element of <paramref name="span"/> accepted by <c>ContainsCore</c>, or -1.</summary>
internal override int LastIndexOfAny(ReadOnlySpan<string> span)
{
    return LastIndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);
}
/// <summary>Same backwards scan as <c>LastIndexOfAny</c>, but instantiated with the <c>Negate</c> negator, or -1 when nothing is reported.</summary>
internal override int LastIndexOfAnyExcept(ReadOnlySpan<string> span)
{
    return LastIndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);
}
/// <summary>
/// Scans <paramref name="span"/> front to back and returns the index of the first element for which
/// <c>TNegator.NegateIfNeeded(ContainsCore(element))</c> is <c>true</c>, or -1 if there is none.
/// </summary>
private int IndexOfAny<TNegator>(ReadOnlySpan<string> span)
    where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
{
    int index = 0;

    foreach (string candidate in span)
    {
        bool reported = TNegator.NegateIfNeeded(ContainsCore(candidate));
        if (reported)
        {
            return index;
        }

        index++;
    }

    return -1;
}
/// <summary>
/// Scans <paramref name="span"/> back to front and returns the index of the last element for which
/// <c>TNegator.NegateIfNeeded(ContainsCore(element))</c> is <c>true</c>, or -1 if there is none.
/// </summary>
private int LastIndexOfAny<TNegator>(ReadOnlySpan<string> span)
    where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
{
    int index = span.Length;

    while (--index >= 0)
    {
        bool reported = TNegator.NegateIfNeeded(ContainsCore(span[index]));
        if (reported)
        {
            return index;
        }
    }

    return -1;
}
}
}

View file

@ -0,0 +1,414 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Text;
using static System.Buffers.StringSearchValuesHelper;
namespace System.Buffers
{
internal static class StringSearchValues
{
// The 52 ASCII letters: 'A'-'Z' followed by 'a'-'z'.
private static readonly SearchValues<char> s_asciiLetters =
SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
// Every ASCII character (U+0000 through U+007F) except the lowercase letters 'a'-'z'.
// Anything outside this set is therefore either a lowercase ASCII letter or a non-ASCII character.
private static readonly SearchValues<char> s_allAsciiExceptLowercase =
SearchValues.Create("\0\u0001\u0002\u0003\u0004\u0005\u0006\a\b\t\n\v\f\r\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`{|}~\u007F");
/// <summary>
/// Picks and constructs the <see cref="SearchValues{T}"/> implementation for a set of strings.
/// </summary>
/// <param name="values">The strings to search for. No element may be <c>null</c>.</param>
/// <param name="ignoreCase">Whether matching should ignore case.</param>
/// <returns>
/// An empty instance for no values, a single-value implementation for one value, a fallback when the
/// set contains the empty string, and otherwise whatever <c>CreateFromNormalizedValues</c> selects.
/// </returns>
/// <exception cref="ArgumentNullException">An element of <paramref name="values"/> is <c>null</c>.</exception>
public static SearchValues<string> Create(ReadOnlySpan<string> values, bool ignoreCase)
{
    switch (values.Length)
    {
        case 0:
            return new EmptySearchValues<string>();

        case 1:
        {
            // A lone value does not need the set/trie machinery below.
            string single = values[0];
            ArgumentNullException.ThrowIfNull(single, nameof(values));

            string normalizedSingle = NormalizeIfNeeded(single, ignoreCase);
            AnalyzeValues(new ReadOnlySpan<string>(ref normalizedSingle), ref ignoreCase, out bool singleIsAscii, out bool singleIsAsciiLettersOnly, out _, out _);
            return CreateForSingleValue(normalizedSingle, uniqueValues: null, ignoreCase, singleIsAscii, singleIsAsciiLettersOnly);
        }
    }

    // De-duplicate the inputs under the requested comparison while validating them.
    StringComparer comparer = ignoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal;
    var uniqueValues = new HashSet<string>(values.Length, comparer);

    foreach (string candidate in values)
    {
        ArgumentNullException.ThrowIfNull(candidate, nameof(values));
        uniqueValues.Add(candidate);
    }

    // A set containing the empty string is handed to the single-value fallback keyed on string.Empty.
    if (uniqueValues.Contains(string.Empty))
    {
        return new SingleStringSearchValuesFallback<SearchValues.FalseConst>(string.Empty, uniqueValues);
    }

    string[] normalizedArray = new string[uniqueValues.Count];
    int filled = 0;

    foreach (string unique in uniqueValues)
    {
        normalizedArray[filled] = NormalizeIfNeeded(unique, ignoreCase);
        filled++;
    }

    Debug.Assert(filled == normalizedArray.Length);

    Span<string> normalizedValues = normalizedArray;

    // The Aho-Corasick builder requires its input ordered by length.
    normalizedValues.Sort(static (left, right) => left.Length.CompareTo(right.Length));

    // The trie is built up front even though another implementation may be chosen later:
    // its construction reports the values that can never be reached.
    HashSet<string>? unreachableValues = null;
    var ahoCorasickBuilder = new AhoCorasickBuilder(normalizedValues, ignoreCase, ref unreachableValues);

    if (unreachableValues is not null)
    {
        // Values that have another value as an exact prefix can be dropped, which
        // means fewer buckets and cheaper verification while searching.
        normalizedValues = RemoveUnreachableValues(normalizedValues, unreachableValues);
    }

    SearchValues<string> result = CreateFromNormalizedValues(normalizedValues, uniqueValues, ignoreCase, ref ahoCorasickBuilder);
    ahoCorasickBuilder.Dispose();
    return result;

    // Upper-cases the value (ordinal) when ignoring case and it contains anything other than
    // the characters in s_allAsciiExceptLowercase; otherwise returns it untouched.
    static string NormalizeIfNeeded(string value, bool ignoreCase)
    {
        if (!ignoreCase || !value.AsSpan().ContainsAnyExcept(s_allAsciiExceptLowercase))
        {
            return value;
        }

        string upperCase = string.FastAllocateString(value.Length);
        int charsWritten = Ordinal.ToUpperOrdinal(value, new Span<char>(ref upperCase.GetRawStringData(), upperCase.Length));
        Debug.Assert(charsWritten == upperCase.Length);
        return upperCase;
    }

    // Compacts the reachable values to the front of the span (in place) and returns that prefix.
    static Span<string> RemoveUnreachableValues(Span<string> values, HashSet<string> unreachableValues)
    {
        int kept = 0;

        for (int i = 0; i < values.Length; i++)
        {
            string current = values[i];
            if (unreachableValues.Contains(current))
            {
                continue;
            }

            values[kept] = current;
            kept++;
        }

        Debug.Assert(kept <= values.Length - unreachableValues.Count);
        Debug.Assert(kept > 0);

        return values.Slice(0, kept);
    }
}
/// <summary>
/// Chooses the multi-value implementation for an already normalized, de-duplicated set of values:
/// the single-value path if only one value is left, Teddy when available and applicable,
/// and otherwise an Aho-Corasick variant (or the slow ignore-case fallback).
/// </summary>
/// <param name="values">The normalized values to search for.</param>
/// <param name="uniqueValues">The original unique values, forwarded to the created instance.</param>
/// <param name="ignoreCase">Whether matching should ignore case; may be relaxed by <c>AnalyzeValues</c>.</param>
/// <param name="ahoCorasickBuilder">Builder used to produce the automaton when Aho-Corasick is selected.</param>
private static SearchValues<string> CreateFromNormalizedValues(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    bool ignoreCase,
    ref AhoCorasickBuilder ahoCorasickBuilder)
{
    AnalyzeValues(values, ref ignoreCase, out bool allAscii, out bool asciiLettersOnly, out bool nonAsciiAffectedByCaseConversion, out int minLength);

    // Removing unreachable values can leave us with just one value.
    if (values.Length == 1)
    {
        return CreateForSingleValue(values[0], uniqueValues, ignoreCase, allAscii, asciiLettersOnly);
    }

    if (Ssse3.IsSupported || AdvSimd.Arm64.IsSupported)
    {
        SearchValues<string>? teddy = TryGetTeddyAcceleratedValues(values, uniqueValues, ignoreCase, allAscii, asciiLettersOnly, nonAsciiAffectedByCaseConversion, minLength);
        if (teddy is not null)
        {
            return teddy;
        }
    }

    // Every other multi-value set goes through Aho-Corasick.
    AhoCorasick ahoCorasick = ahoCorasickBuilder.Build();

    if (ignoreCase)
    {
        if (nonAsciiAffectedByCaseConversion)
        {
            // Standalone surrogate code units have matching semantics Aho-Corasick can't handle,
            // so such sets use a slow but correct O(n * m) implementation instead.
            return ContainsIncompleteSurrogatePairs(values)
                ? new MultiStringIgnoreCaseSearchValuesFallback(uniqueValues)
                : PickAhoCorasickImplementation<CaseInsensitiveUnicode>(ahoCorasick, uniqueValues);
        }

        return asciiLettersOnly
            ? PickAhoCorasickImplementation<CaseInsensitiveAsciiLetters>(ahoCorasick, uniqueValues)
            : PickAhoCorasickImplementation<CaseInsensitiveAscii>(ahoCorasick, uniqueValues);
    }

    return PickAhoCorasickImplementation<CaseSensitive>(ahoCorasick, uniqueValues);

    // Selects the fast-scan or plain variant depending on what the automaton reports.
    static SearchValues<string> PickAhoCorasickImplementation<TCaseSensitivity>(AhoCorasick ahoCorasick, HashSet<string> uniqueValues)
        where TCaseSensitivity : struct, ICaseSensitivity
    {
        if (ahoCorasick.ShouldUseAsciiFastScan)
        {
            return new StringSearchValuesAhoCorasick<TCaseSensitivity, AhoCorasick.IndexOfAnyAsciiFastScan>(ahoCorasick, uniqueValues);
        }

        return new StringSearchValuesAhoCorasick<TCaseSensitivity, AhoCorasick.NoFastScan>(ahoCorasick, uniqueValues);
    }
}
/// <summary>
/// Tries to create a Teddy-based implementation for the given values.
/// </summary>
/// <param name="values">The normalized values to search for.</param>
/// <param name="uniqueValues">The original unique values, forwarded to the created instance.</param>
/// <param name="ignoreCase">Whether matching should ignore case.</param>
/// <param name="allAscii">Whether every value consists of ASCII characters only.</param>
/// <param name="asciiLettersOnly">Whether every value consists of ASCII letters only.</param>
/// <param name="nonAsciiAffectedByCaseConversion">Whether non-ASCII characters have to be case-converted while matching.</param>
/// <param name="minLength">Length of the shortest value.</param>
/// <returns>The created instance, or <c>null</c> when Teddy is not used for this set.</returns>
private static SearchValues<string>? TryGetTeddyAcceleratedValues(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    bool ignoreCase,
    bool allAscii,
    bool asciiLettersOnly,
    bool nonAsciiAffectedByCaseConversion,
    int minLength)
{
    if (minLength == 1)
    {
        // 'N=1' could be implemented, but SearchValues<char> is the better tool in that case.
        // Worth revisiting if Regex ends up hitting this.
        return null;
    }

    if (values.Length > RabinKarp.MaxValues)
    {
        // More values mean more hash/fingerprint collisions and thus more time spent verifying.
        // Aho-Corasick guarantees O(n), so defer to it. If this limit turns out to be hit often,
        // the bucket count can be tweaked or different variants used depending on the input.
        return null;
    }

    // Number of leading characters of each value that Teddy works with.
    int n = minLength == 2 ? 2 : 3;

    if (Ssse3.IsSupported && AnyPrefixContainsNullChar(values, n))
    {
        // Teddy would still be correct with null chars, but it would see more false positives for
        // the verification step to reject. A generic flag such as Ssse3AndWasmHandleZeroInNeedle
        // could be flowed through, but such values should be too rare to justify the extra code.
        return null;
    }

    // Values with non-ASCII chars are still fine as long as their first N characters are ASCII.
    if (!allAscii && !AllPrefixesAreAscii(values, n))
    {
        // A vectorized path for non-ASCII values is possible and can be added if this proves common.
        return null;
    }

    if (!ignoreCase)
    {
        return PickTeddyImplementation<CaseSensitive, CaseSensitive>(values, uniqueValues, n);
    }

    if (asciiLettersOnly)
    {
        return PickTeddyImplementation<CaseInsensitiveAsciiLetters, CaseInsensitiveAsciiLetters>(values, uniqueValues, n);
    }

    // The vectorized part only looks at the first N characters, so it can use a cheaper
    // approach whenever those are all letters, or contain no letters at all.
    bool asciiStartLettersOnly = true;
    bool asciiStartUnaffectedByCaseConversion = true;

    for (int i = 0; i < values.Length; i++)
    {
        ReadOnlySpan<char> prefix = values[i].AsSpan(0, n);

        if (asciiStartLettersOnly && prefix.ContainsAnyExcept(s_asciiLetters))
        {
            asciiStartLettersOnly = false;
        }

        if (asciiStartUnaffectedByCaseConversion && prefix.ContainsAny(s_asciiLetters))
        {
            asciiStartUnaffectedByCaseConversion = false;
        }
    }

    Debug.Assert(!(asciiStartLettersOnly && asciiStartUnaffectedByCaseConversion));

    if (asciiStartUnaffectedByCaseConversion)
    {
        if (nonAsciiAffectedByCaseConversion)
        {
            return PickTeddyImplementation<CaseSensitive, CaseInsensitiveUnicode>(values, uniqueValues, n);
        }

        return PickTeddyImplementation<CaseSensitive, CaseInsensitiveAscii>(values, uniqueValues, n);
    }

    if (asciiStartLettersOnly)
    {
        return nonAsciiAffectedByCaseConversion
            ? PickTeddyImplementation<CaseInsensitiveAsciiLetters, CaseInsensitiveUnicode>(values, uniqueValues, n)
            : PickTeddyImplementation<CaseInsensitiveAsciiLetters, CaseInsensitiveAscii>(values, uniqueValues, n);
    }

    return nonAsciiAffectedByCaseConversion
        ? PickTeddyImplementation<CaseInsensitiveAscii, CaseInsensitiveUnicode>(values, uniqueValues, n)
        : PickTeddyImplementation<CaseInsensitiveAscii, CaseInsensitiveAscii>(values, uniqueValues, n);

    // True if the first 'length' characters of any value include '\0'.
    static bool AnyPrefixContainsNullChar(ReadOnlySpan<string> values, int length)
    {
        foreach (string value in values)
        {
            if (value.AsSpan(0, length).Contains('\0'))
            {
                return true;
            }
        }

        return false;
    }

    // True if the first 'length' characters of every value are ASCII.
    static bool AllPrefixesAreAscii(ReadOnlySpan<string> values, int length)
    {
        foreach (string value in values)
        {
            if (!Ascii.IsValid(value.AsSpan(0, length)))
            {
                return false;
            }
        }

        return true;
    }
}
/// <summary>
/// Instantiates the Teddy variant matching the number of values (bucketized above 8 values,
/// non-bucketized otherwise) and the prefix length <paramref name="n"/> (2 or 3).
/// </summary>
/// <typeparam name="TStartCaseSensitivity">Case handling for the leading characters; asserted not to be <c>CaseInsensitiveUnicode</c>.</typeparam>
/// <typeparam name="TCaseSensitivity">Case handling passed to the created type alongside <typeparamref name="TStartCaseSensitivity"/>.</typeparam>
/// <param name="values">The normalized values; more than one is expected.</param>
/// <param name="uniqueValues">The original unique values, forwarded to the created instance.</param>
/// <param name="n">Number of leading characters used; must be 2 or 3.</param>
private static SearchValues<string> PickTeddyImplementation<TStartCaseSensitivity, TCaseSensitivity>(
    ReadOnlySpan<string> values,
    HashSet<string> uniqueValues,
    int n)
    where TStartCaseSensitivity : struct, ICaseSensitivity
    where TCaseSensitivity : struct, ICaseSensitivity
{
    Debug.Assert(typeof(TStartCaseSensitivity) != typeof(CaseInsensitiveUnicode));
    Debug.Assert(values.Length > 1);
    Debug.Assert(n is 2 or 3);

    const int BucketCount = 8;
    bool useTwoChars = n == 2;

    if (values.Length <= BucketCount)
    {
        // Few enough values that no bucketizing is needed.
        if (useTwoChars)
        {
            return new AsciiStringSearchValuesTeddyNonBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>(values, uniqueValues);
        }

        return new AsciiStringSearchValuesTeddyNonBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>(values, uniqueValues);
    }

    string[][] buckets = TeddyBucketizer.Bucketize(values, bucketCount: BucketCount, n);

    // Potential optimization: the fingerprint doesn't have to be the first N characters.
    // Choosing different offsets can noticeably improve throughput (e.g. 2x).
    if (useTwoChars)
    {
        return new AsciiStringSearchValuesTeddyBucketizedN2<TStartCaseSensitivity, TCaseSensitivity>(buckets, values, uniqueValues);
    }

    return new AsciiStringSearchValuesTeddyBucketizedN3<TStartCaseSensitivity, TCaseSensitivity>(buckets, values, uniqueValues);
}
/// <summary>
/// Creates the implementation used when only one value has to be searched for.
/// </summary>
/// <param name="value">The (normalized) value.</param>
/// <param name="uniqueValues">The set to hand to the fallback; a one-element set is created when <c>null</c>.</param>
/// <param name="ignoreCase">Whether matching should ignore case.</param>
/// <param name="allAscii">Whether <paramref name="value"/> is ASCII only.</param>
/// <param name="asciiLettersOnly">Whether <paramref name="value"/> consists of ASCII letters only.</param>
private static SearchValues<string> CreateForSingleValue(
    string value,
    HashSet<string>? uniqueValues,
    bool ignoreCase,
    bool allAscii,
    bool asciiLettersOnly)
{
    // Some optimizations used by the vectorized path may overflow on 32bit systems for long values.
    int maxLength = IntPtr.Size == 4 ? 1_000_000_000 : int.MaxValue;

    bool canUseVectorizedPath =
        Vector128.IsHardwareAccelerated &&
        value.Length > 1 &&
        value.Length <= maxLength;

    if (canUseVectorizedPath)
    {
        if (!ignoreCase)
        {
            return new SingleStringSearchValuesThreeChars<CaseSensitive>(value);
        }
        else if (asciiLettersOnly)
        {
            return new SingleStringSearchValuesThreeChars<CaseInsensitiveAsciiLetters>(value);
        }
        else if (allAscii)
        {
            return new SingleStringSearchValuesThreeChars<CaseInsensitiveAscii>(value);
        }

        // When ignoring casing, every anchor char we search for has to be ASCII:
        // require an ASCII first char and at least one more ASCII char after it.
        bool firstCharIsAscii = char.IsAscii(value[0]);
        if (firstCharIsAscii && value.AsSpan().LastIndexOfAnyInRange((char)0, (char)127) > 0)
        {
            return new SingleStringSearchValuesThreeChars<CaseInsensitiveUnicode>(value);
        }
    }

    uniqueValues ??= new HashSet<string>(1, ignoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal) { value };

    if (ignoreCase)
    {
        return new SingleStringSearchValuesFallback<SearchValues.TrueConst>(value, uniqueValues);
    }

    return new SingleStringSearchValuesFallback<SearchValues.FalseConst>(value, uniqueValues);
}
/// <summary>
/// Collects the properties of <paramref name="values"/> that drive implementation selection.
/// </summary>
/// <param name="values">The values to inspect.</param>
/// <param name="ignoreCase">
/// Whether matching should ignore case. Reset to <c>false</c> when all values are ASCII
/// and none of them contains an ASCII letter.
/// </param>
/// <param name="allAscii">Set to whether every value is ASCII only.</param>
/// <param name="asciiLettersOnly">Set to whether every value consists of ASCII letters only.</param>
/// <param name="nonAsciiAffectedByCaseConversion">Set to <c>true</c> when ignoring case and some value is not ASCII only.</param>
/// <param name="minLength">Set to the length of the shortest value (<see cref="int.MaxValue"/> for no values).</param>
private static void AnalyzeValues(
    ReadOnlySpan<string> values,
    ref bool ignoreCase,
    out bool allAscii,
    out bool asciiLettersOnly,
    out bool nonAsciiAffectedByCaseConversion,
    out int minLength)
{
    bool everyValueIsAscii = true;
    bool everyValueIsLettersOnly = true;
    int shortest = int.MaxValue;

    for (int i = 0; i < values.Length; i++)
    {
        string current = values[i];

        if (everyValueIsAscii && !Ascii.IsValid(current))
        {
            everyValueIsAscii = false;
        }

        if (everyValueIsLettersOnly && current.AsSpan().ContainsAnyExcept(s_asciiLetters))
        {
            everyValueIsLettersOnly = false;
        }

        if (current.Length < shortest)
        {
            shortest = current.Length;
        }
    }

    allAscii = everyValueIsAscii;
    asciiLettersOnly = everyValueIsLettersOnly;
    minLength = shortest;

    // Potential optimization: only some characters take part in Unicode case conversion.
    // If none of the non-ASCII characters present do, searching could use the same
    // faster paths as ASCII-only values.
    nonAsciiAffectedByCaseConversion = ignoreCase && !everyValueIsAscii;

    // When no character in the values is affected by casing, the ignoreCase overhead can be skipped.
    if (ignoreCase && !nonAsciiAffectedByCaseConversion && !everyValueIsLettersOnly)
    {
        ignoreCase = AnyValueContainsAsciiLetter(values);
    }

    static bool AnyValueContainsAsciiLetter(ReadOnlySpan<string> values)
    {
        foreach (string value in values)
        {
            if (value.AsSpan().ContainsAny(s_asciiLetters))
            {
                return true;
            }
        }

        return false;
    }
}
/// <summary>
/// Determines whether any value contains a surrogate code unit that is not part of a
/// well-formed pair (a high surrogate immediately followed by a low surrogate).
/// </summary>
/// <param name="values">The values to inspect.</param>
/// <returns><c>true</c> if a standalone high or low surrogate was found; otherwise <c>false</c>.</returns>
private static bool ContainsIncompleteSurrogatePairs(ReadOnlySpan<string> values)
{
    foreach (string value in values)
    {
        ReadOnlySpan<char> chars = value.AsSpan();

        // Jump straight to the first surrogate, if the value has one at all.
        int position = chars.IndexOfAnyInRange(CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);

        if (position >= 0)
        {
            while (position < chars.Length)
            {
                char current = chars[position];

                if (char.IsHighSurrogate(current))
                {
                    int next = position + 1;
                    if (next >= chars.Length || !char.IsLowSurrogate(chars[next]))
                    {
                        // A high surrogate that isn't followed by a low surrogate.
                        return true;
                    }

                    // Well-formed pair: step over both code units.
                    position += 2;
                }
                else if (char.IsLowSurrogate(current))
                {
                    // A low surrogate that wasn't preceded by a high surrogate.
                    return true;
                }
                else
                {
                    position++;
                }
            }
        }
    }

    return false;
}
}
}

View file

@ -0,0 +1,22 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Runtime.CompilerServices;
namespace System.Buffers
{
/// <summary>
/// <see cref="SearchValues{T}"/> of <see cref="string"/> that forwards multi-string searches to an
/// <see cref="AhoCorasick"/> instance, specialized by case sensitivity and fast-scan variant.
/// </summary>
internal sealed class StringSearchValuesAhoCorasick<TCaseSensitivity, TFastScanVariant> : StringSearchValuesBase
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
    where TFastScanVariant : struct, AhoCorasick.IFastScan
{
    private readonly AhoCorasick _ahoCorasick;

    /// <summary>Stores the automaton and passes <paramref name="uniqueValues"/> on to the base class.</summary>
    public StringSearchValuesAhoCorasick(AhoCorasick ahoCorasick, HashSet<string> uniqueValues)
        : base(uniqueValues)
    {
        _ahoCorasick = ahoCorasick;
    }

    /// <summary>Returns the result of the automaton's <c>IndexOfAny</c> for <paramref name="span"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal override int IndexOfAnyMultiString(ReadOnlySpan<char> span)
    {
        return _ahoCorasick.IndexOfAny<TCaseSensitivity, TFastScanVariant>(span);
    }
}
}

View file

@ -0,0 +1,72 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
namespace System.Buffers
{
/// <summary>
/// Implements the base <see cref="SearchValues{T}"/> {Last}IndexOfAny{Except} operations.
/// While these operations are exposed such that you can call string[].IndexOfAny(searchValues),
/// they are not expected to be used in performance-critical paths.
/// <see cref="MemoryExtensions.IndexOfAny(ReadOnlySpan{char}, SearchValues{string})"/> is the main
/// reason why someone would create an instance of <see cref="string"/> <see cref="SearchValues{T}"/>.
/// </summary>
internal abstract class StringSearchValuesBase : SearchValues<string>
{
    // Backs Contains/GetValues; membership is decided by whatever comparer the set was created with.
    private readonly HashSet<string> _uniqueValues;

    /// <summary>Initializes the base with the set of unique values this instance represents.</summary>
    /// <param name="uniqueValues">The set used for membership checks. It is stored as-is, not copied.</param>
    // Abstract types can only be constructed through a derived type, so the constructor is protected (CA1012).
    protected StringSearchValuesBase(HashSet<string> uniqueValues) =>
        _uniqueValues = uniqueValues;

    /// <summary>Reports whether <paramref name="value"/> is in the set of unique values.</summary>
    internal sealed override bool ContainsCore(string value) =>
        _uniqueValues.Contains(value);

    /// <summary>Returns a new array holding a copy of the unique values.</summary>
    internal sealed override string[] GetValues()
    {
        string[] values = new string[_uniqueValues.Count];
        _uniqueValues.CopyTo(values);
        return values;
    }

    /// <summary>Index of the first element of <paramref name="span"/> that is in the set, or -1.</summary>
    internal sealed override int IndexOfAny(ReadOnlySpan<string> span) =>
        IndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);

    /// <summary>Same scan as <see cref="IndexOfAny(ReadOnlySpan{string})"/>, instantiated with the <c>Negate</c> negator.</summary>
    internal sealed override int IndexOfAnyExcept(ReadOnlySpan<string> span) =>
        IndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);

    /// <summary>Index of the last element of <paramref name="span"/> that is in the set, or -1.</summary>
    internal sealed override int LastIndexOfAny(ReadOnlySpan<string> span) =>
        LastIndexOfAny<IndexOfAnyAsciiSearcher.DontNegate>(span);

    /// <summary>Same backwards scan as <see cref="LastIndexOfAny(ReadOnlySpan{string})"/>, instantiated with the <c>Negate</c> negator.</summary>
    internal sealed override int LastIndexOfAnyExcept(ReadOnlySpan<string> span) =>
        LastIndexOfAny<IndexOfAnyAsciiSearcher.Negate>(span);

    // Forward linear scan; TNegator decides how the membership result is turned into a hit.
    private int IndexOfAny<TNegator>(ReadOnlySpan<string> span)
        where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
    {
        for (int i = 0; i < span.Length; i++)
        {
            if (TNegator.NegateIfNeeded(_uniqueValues.Contains(span[i])))
            {
                return i;
            }
        }

        return -1;
    }

    // Backward linear scan; TNegator decides how the membership result is turned into a hit.
    private int LastIndexOfAny<TNegator>(ReadOnlySpan<string> span)
        where TNegator : struct, IndexOfAnyAsciiSearcher.INegator
    {
        for (int i = span.Length - 1; i >= 0; i--)
        {
            if (TNegator.NegateIfNeeded(_uniqueValues.Contains(span[i])))
            {
                return i;
            }
        }

        return -1;
    }
}
}

View file

@ -0,0 +1,21 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Runtime.CompilerServices;
namespace System.Buffers
{
/// <summary>
/// Base class for string <see cref="SearchValues{T}"/> implementations that also keep a
/// <see cref="RabinKarp"/> searcher over the same values, exposed through <see cref="ShortInputFallback"/>.
/// </summary>
/// <typeparam name="TCaseSensitivity">Case handling passed to the <see cref="RabinKarp"/> search.</typeparam>
internal abstract class StringSearchValuesRabinKarp<TCaseSensitivity> : StringSearchValuesBase
    where TCaseSensitivity : struct, StringSearchValuesHelper.ICaseSensitivity
{
    private readonly RabinKarp _rabinKarp;

    /// <summary>Builds the Rabin-Karp searcher from <paramref name="values"/> and passes <paramref name="uniqueValues"/> to the base class.</summary>
    // Abstract types can only be constructed through a derived type, so the constructor is protected (CA1012).
    protected StringSearchValuesRabinKarp(ReadOnlySpan<string> values, HashSet<string> uniqueValues)
        : base(uniqueValues)
    {
        _rabinKarp = new RabinKarp(values);
    }

    /// <summary>
    /// Returns the result of the Rabin-Karp <c>IndexOfAny</c> for <paramref name="span"/>.
    /// NOTE(review): going by the name, derived types presumably call this for inputs that are
    /// too short for their vectorized path - confirm against the callers.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    protected int ShortInputFallback(ReadOnlySpan<char> span) =>
        _rabinKarp.IndexOfAny<TCaseSensitivity>(span);
}
}

View file

@ -1264,7 +1264,7 @@ namespace System
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> FixUpPackedVector256Result(Vector256<byte> result)
internal static Vector256<byte> FixUpPackedVector256Result(Vector256<byte> result)
{
Debug.Assert(Avx2.IsSupported);
// Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in
@ -1276,14 +1276,12 @@ namespace System
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512F))]
private static Vector512<byte> FixUpPackedVector512Result(Vector512<byte> result)
internal static Vector512<byte> FixUpPackedVector512Result(Vector512<byte> result)
{
Debug.Assert(Avx512F.IsSupported);
// Avx512BW.PackUnsignedSaturate(Vector512.Create((short)1), Vector512.Create((short)2)) will result in
// 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2
// We want to swap the X and Y bits
// 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2
return Avx512F.PermuteVar8x64(result.AsInt64(), Vector512.Create((long)0, 2, 4, 6, 1, 3, 5, 7)).AsByte();
// Avx512BW.PackUnsignedSaturate will interleave the inputs in 8-byte blocks.
// We want to preserve the order of the two input vectors, so we deinterleave the packed value.
return Avx512F.PermuteVar8x64(result.AsInt64(), Vector512.Create(0, 2, 4, 6, 1, 3, 5, 7)).AsByte();
}
}
}

View file

@ -7392,6 +7392,7 @@ namespace System.Buffers
{
public static System.Buffers.SearchValues<byte> Create(System.ReadOnlySpan<byte> values) { throw null; }
public static System.Buffers.SearchValues<char> Create(System.ReadOnlySpan<char> values) { throw null; }
public static System.Buffers.SearchValues<string> Create(System.ReadOnlySpan<string> values, System.StringComparison comparisonType) { throw null; }
}
public partial interface IPinnable
{