mirror of
https://github.com/VSadov/Satori.git
synced 2025-06-09 09:34:49 +09:00
Adding AVX512 path to Base64 encoding/Decoding (#92241)
* Adding AVX512 path to Base64 encoding/Decoding * Addressing review Comments. Signed-off-by: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com> * Removing fallback path. * Updating Third Party Notice. * Addressing review comments --------- Signed-off-by: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
This commit is contained in:
parent
3cd64558e8
commit
9ad24aec01
3 changed files with 202 additions and 2 deletions
|
@ -1297,3 +1297,37 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
|
||||
License notice for Avx512Vbmi base64 encoding / decoding
|
||||
--------------------------------------------------------
|
||||
|
||||
Copyright (c) 2015-2018, Wojciech Muła
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
--------------------------------------------------------
|
||||
|
||||
Aspects of base64 encoding / decoding are based on algorithm described in "Base64 encoding and decoding at almost the speed of a memory
|
||||
copy", Wojciech Muła and Daniel Lemire. https://arxiv.org/pdf/1910.05109.pdf
|
||||
|
|
|
@ -68,7 +68,18 @@ namespace System.Buffers.Text
|
|||
|
||||
if (maxSrcLength >= 24)
|
||||
{
|
||||
byte* end = srcMax - 45;
|
||||
byte* end = srcMax - 88;
|
||||
if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && (end >= src))
|
||||
{
|
||||
Avx512Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
|
||||
|
||||
if (src == srcEnd)
|
||||
{
|
||||
goto DoneExit;
|
||||
}
|
||||
}
|
||||
|
||||
end = srcMax - 45;
|
||||
if (Avx2.IsSupported && (end >= src))
|
||||
{
|
||||
Avx2Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
|
||||
|
@ -616,6 +627,78 @@ namespace System.Buffers.Text
|
|||
return status;
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
[CompExactlyDependsOn(typeof(Avx512Vbmi))]
private static unsafe void Avx512Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
    // Decodes base64 input 64 bytes (48 output bytes) per iteration using AVX512VBMI,
    // advancing srcBytes/destBytes past everything successfully consumed. Stops early
    // (without reporting an error itself) on the first 64-byte chunk containing any
    // character outside the base64 alphabet, leaving the scalar path to handle it.
    //
    // Reference for VBMI implementation : https://github.com/WojciechMula/base64simd/tree/master/decode
    // If we have AVX512 support, pick off 64 bytes at a time for as long as we can,
    // but make sure that we quit before seeing any == markers at the end of the
    // string. Also, because we write 16 zeroes at the end of the output, ensure
    // that there are at least 22 valid bytes of input data remaining to close the
    // gap. 64 + 2 + 22 = 88 bytes.
    byte* src = srcBytes;
    byte* dest = destBytes;

    // The JIT won't hoist these "constants", so help it.
    // vbmiLookup0/vbmiLookup1 form a 128-entry ASCII -> 6-bit-value table consumed by
    // PermuteVar64x8x2; entries of 0x80 mark characters that are not in the base64
    // alphabet (the high bit doubles as the error flag checked below).
    Vector512<sbyte> vbmiLookup0 = Vector512.Create(
        0x80808080, 0x80808080, 0x80808080, 0x80808080,
        0x80808080, 0x80808080, 0x80808080, 0x80808080,
        0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
        0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080).AsSByte();
    Vector512<sbyte> vbmiLookup1 = Vector512.Create(
        0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
        0x1211100f, 0x16151413, 0x80191817, 0x80808080,
        0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
        0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080).AsSByte();
    // Byte-permute control that compacts the 48 payload bytes (12 per 128-bit lane,
    // 3 of every 4 bytes) into the low 48 bytes of the vector; the top 16 select byte 0.
    Vector512<byte> vbmiPackedLanesControl = Vector512.Create(
        0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
        0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
        0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
        0x00000000, 0x00000000, 0x00000000, 0x00000000).AsByte();

    // Multiplier pairs used by the two MultiplyAddAdjacent steps that merge
    // four 6-bit fields into three 8-bit bytes.
    Vector512<sbyte> mergeConstant0 = Vector512.Create(0x01400140).AsSByte();
    Vector512<short> mergeConstant1 = Vector512.Create(0x00011000).AsInt16();

    // This algorithm requires AVX512VBMI support.
    // Vbmi was first introduced in CannonLake and is available from IceLake on.
    do
    {
        AssertRead<Vector512<sbyte>>(src, srcStart, sourceLength);
        Vector512<sbyte> str = Vector512.Load(src).AsSByte();

        // Step 1: Translate encoded Base64 input to their original indices
        // This step also checks for invalid inputs and exits.
        // After this, we have indices which are verified to have upper 2 bits set to 0 in each byte.
        // origIndex = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
        Vector512<sbyte> origIndex = Avx512Vbmi.PermuteVar64x8x2(vbmiLookup0, str, vbmiLookup1);
        // High bit set in origIndex means the table flagged an invalid character;
        // high bit set in str means non-ASCII input. Either way, bail to scalar handling.
        Vector512<sbyte> errorVec = (origIndex.AsInt32() | str.AsInt32()).AsSByte();
        if (errorVec.ExtractMostSignificantBits() != 0)
        {
            break;
        }

        // Step 2: Now we need to reshuffle bits to remove the 0 bits.
        // multiAdd1: [...|0000cccc|ccdddddd|0000aaaa|aabbbbbb]
        Vector512<short> multiAdd1 = Avx512BW.MultiplyAddAdjacent(origIndex.AsByte(), mergeConstant0);
        // multiAdd1: [...|00000000|aaaaaabb|bbbbcccc|ccdddddd]
        Vector512<int> multiAdd2 = Avx512BW.MultiplyAddAdjacent(multiAdd1, mergeConstant1);

        // Step 3: Pack 48 bytes
        str = Avx512Vbmi.PermuteVar64x8(multiAdd2.AsByte(), vbmiPackedLanesControl).AsSByte();

        // Full 64-byte store: 48 payload bytes + 16 zero bytes. The "- 88" guard in
        // the caller guarantees the tail write stays inside the destination buffer.
        AssertWrite<Vector512<sbyte>>(dest, destStart, destLength);
        str.Store((sbyte*)dest);
        src += 64;
        dest += 48;
    }
    while (src <= srcEnd);

    srcBytes = src;
    destBytes = dest;
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
[CompExactlyDependsOn(typeof(Avx2))]
|
||||
private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
|
||||
|
|
|
@ -67,7 +67,16 @@ namespace System.Buffers.Text
|
|||
|
||||
if (maxSrcLength >= 16)
|
||||
{
|
||||
byte* end = srcMax - 32;
|
||||
byte* end = srcMax - 64;
|
||||
if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && (end >= src))
|
||||
{
|
||||
Avx512Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
|
||||
|
||||
if (src == srcEnd)
|
||||
goto DoneExit;
|
||||
}
|
||||
|
||||
end = srcMax - 64;
|
||||
if (Avx2.IsSupported && (end >= src))
|
||||
{
|
||||
Avx2Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
|
||||
|
@ -226,6 +235,80 @@ namespace System.Buffers.Text
|
|||
}
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
[CompExactlyDependsOn(typeof(Avx512Vbmi))]
private static unsafe void Avx512Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
    // Encodes 48 input bytes to 64 base64 output bytes per iteration using AVX512VBMI,
    // advancing srcBytes/destBytes past everything consumed; the scalar path finishes
    // the remainder.
    //
    // Reference for VBMI implementation : https://github.com/WojciechMula/base64simd/tree/master/encode
    // If we have AVX512 support, pick off 48 bytes at a time for as long as we can.
    // But because we read 64 bytes at a time, ensure we have enough room to do a
    // full 64-byte read without segfaulting.

    byte* src = srcBytes;
    byte* dest = destBytes;

    // The JIT won't hoist these "constants", so help it.
    // shuffleVecVbmi expands each 3-byte input group into 4 bytes laid out so that the
    // mask/shift steps below can carve out the four 6-bit fields.
    Vector512<sbyte> shuffleVecVbmi = Vector512.Create(
        0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
        0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
        0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
        0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e).AsSByte();
    // 64-entry table mapping a 6-bit index straight to its base64 ASCII character.
    Vector512<sbyte> vbmiLookup = Vector512.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"u8).AsSByte();

    // Masks/shift counts used to isolate the 'a'/'c' fields (maskAC, shiftAC) and to
    // position the 'b'/'d' fields (maskBB, shiftBB) within each 32-bit group.
    Vector512<ushort> maskAC = Vector512.Create((uint)0x0fc0fc00).AsUInt16();
    Vector512<uint> maskBB = Vector512.Create((uint)0x3f003f00);
    Vector512<ushort> shiftAC = Vector512.Create((uint)0x0006000a).AsUInt16();
    Vector512<ushort> shiftBB = Vector512.Create((uint)0x00080004).AsUInt16();

    AssertRead<Vector256<sbyte>>(src, srcStart, sourceLength);

    // This algorithm requires AVX512VBMI support.
    // Vbmi was first introduced in CannonLake and is available from IceLake on.

    // str = [...|PONM|LKJI|HGFE|DCBA]
    Vector512<sbyte> str = Vector512.Load(src).AsSByte();

    while (true)
    {
        // Step 1 : Split 48 bytes into 64 bytes with each byte using 6-bits from input
        // str = [...|KLJK|HIGH|EFDE|BCAB]
        str = Avx512Vbmi.PermuteVar64x8(str, shuffleVecVbmi);

        // TO-DO- This can be achieved faster with multishift
        // Consider the first 4 bytes - BCAB
        // temp1 = [...|0000cccc|cc000000|aaaaaa00|00000000]
        Vector512<ushort> temp1 = (str.AsUInt16() & maskAC);

        // temp2 = [...|00000000|00cccccc|00000000|00aaaaaa]
        Vector512<ushort> temp2 = Avx512BW.ShiftRightLogicalVariable(temp1, shiftAC).AsUInt16();

        // temp3 = [...|ccdddddd|00000000|aabbbbbb|cccc0000]
        Vector512<ushort> temp3 = Avx512BW.ShiftLeftLogicalVariable(str.AsUInt16(), shiftBB).AsUInt16();

        // str = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
        str = Vector512.ConditionalSelect(maskBB, temp3.AsUInt32(), temp2.AsUInt32()).AsSByte();

        // Step 2: Now we have the indices calculated. Next step is to use these indices to translate.
        // Each byte holds a 6-bit value, so the single-table PermuteVar64x8 lookup suffices.
        str = Avx512Vbmi.PermuteVar64x8(vbmiLookup, str);

        AssertWrite<Vector512<sbyte>>(dest, destStart, destLength);
        str.Store((sbyte*)dest);

        src += 48;
        dest += 64;

        // srcEnd was set by the caller so that a full 64-byte load at src stays in
        // bounds; stop before reading past it.
        if (src > srcEnd)
            break;

        AssertRead<Vector512<sbyte>>(src, srcStart, sourceLength);
        str = Vector512.Load(src).AsSByte();
    }

    srcBytes = src;
    destBytes = dest;
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
[CompExactlyDependsOn(typeof(Avx2))]
|
||||
private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue