Adding AVX512 path to Base64 encoding/Decoding (#92241)
* Adding AVX512 path to Base64 encoding/Decoding
* Addressing review comments.
* Removing fallback path.
* Updating Third Party Notice.
* Addressing review comments

Signed-off-by: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
parent 3cd64558e8
commit 9ad24aec01
3 changed files with 202 additions and 2 deletions
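For orientation (not part of the diff): the new `Avx512Decode`/`Avx512Encode` helpers are internal and are reached only through the public `System.Buffers.Text.Base64` API, which picks the widest supported path at run time. A minimal round-trip sketch that would exercise the new code on AVX-512 VBMI hardware (illustrative only, using the existing public API):

```csharp
using System;
using System.Buffers;
using System.Buffers.Text;

class Base64RoundTrip
{
    static void Main()
    {
        // A 1 KiB buffer is comfortably larger than the 88-byte (decode) and
        // 64-byte (encode) cushions in this change, so the vectorized paths can engage.
        byte[] data = new byte[1024];
        new Random(42).NextBytes(data);

        byte[] utf8 = new byte[Base64.GetMaxEncodedToUtf8Length(data.Length)];
        OperationStatus status = Base64.EncodeToUtf8(data, utf8, out int consumed, out int written);
        Console.WriteLine($"Encode: {status}, {consumed} bytes in, {written} chars out");

        byte[] decoded = new byte[Base64.GetMaxDecodedFromUtf8Length(written)];
        status = Base64.DecodeFromUtf8(utf8.AsSpan(0, written), decoded, out consumed, out int produced);
        Console.WriteLine($"Decode: {status}, round-trip ok = {decoded.AsSpan(0, produced).SequenceEqual(data)}");
    }
}
```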
@@ -1297,3 +1297,37 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
+
+License notice for Avx512Vbmi base64 encoding / decoding
+--------------------------------------------------------
+
+Copyright (c) 2015-2018, Wojciech Muła
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------
+
+Aspects of base64 encoding / decoding are based on algorithm described in "Base64 encoding and decoding at almost the speed of a memory
+copy", Wojciech Muła and Daniel Lemire. https://arxiv.org/pdf/1910.05109.pdf
@@ -68,7 +68,18 @@ namespace System.Buffers.Text

                 if (maxSrcLength >= 24)
                 {
-                    byte* end = srcMax - 45;
+                    byte* end = srcMax - 88;
+                    if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && (end >= src))
+                    {
+                        Avx512Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
+
+                        if (src == srcEnd)
+                        {
+                            goto DoneExit;
+                        }
+                    }
+
+                    end = srcMax - 45;
                     if (Avx2.IsSupported && (end >= src))
                     {
                         Avx2Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
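A note on the dispatch above (illustrative, not from the commit): the AVX-512 path is attempted only when both `Vector512.IsHardwareAccelerated` and `Avx512Vbmi.IsSupported` are true and at least 88 bytes of input remain; otherwise the existing narrower tiers run unchanged. The same conditions can be probed directly:

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

class SimdPathProbe
{
    static void Main()
    {
        // Mirrors the tier selection in the hunk above for a sufficiently long input.
        string path = Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported ? "AVX-512 VBMI"
                    : Avx2.IsSupported ? "AVX2"
                    : Ssse3.IsSupported ? "SSSE3"
                    : "scalar";
        Console.WriteLine($"Base64 decode would use the {path} path on this machine.");
    }
}
```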
@@ -616,6 +627,78 @@ namespace System.Buffers.Text
             return status;
         }

+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [CompExactlyDependsOn(typeof(Avx512BW))]
+        [CompExactlyDependsOn(typeof(Avx512Vbmi))]
+        private static unsafe void Avx512Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
+        {
+            // Reference for VBMI implementation: https://github.com/WojciechMula/base64simd/tree/master/decode
+            // If we have AVX512 support, pick off 64 bytes at a time for as long as we can,
+            // but make sure that we quit before seeing any == markers at the end of the
+            // string. Also, because we write 16 zeroes at the end of the output, ensure
+            // that there are at least 22 valid bytes of input data remaining to close the
+            // gap. 64 + 2 + 22 = 88 bytes.
+            byte* src = srcBytes;
+            byte* dest = destBytes;
+
+            // The JIT won't hoist these "constants", so help it
+            Vector512<sbyte> vbmiLookup0 = Vector512.Create(
+                0x80808080, 0x80808080, 0x80808080, 0x80808080,
+                0x80808080, 0x80808080, 0x80808080, 0x80808080,
+                0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
+                0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080).AsSByte();
+            Vector512<sbyte> vbmiLookup1 = Vector512.Create(
+                0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
+                0x1211100f, 0x16151413, 0x80191817, 0x80808080,
+                0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
+                0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080).AsSByte();
+            Vector512<byte> vbmiPackedLanesControl = Vector512.Create(
+                0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
+                0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
+                0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
+                0x00000000, 0x00000000, 0x00000000, 0x00000000).AsByte();
+
+            Vector512<sbyte> mergeConstant0 = Vector512.Create(0x01400140).AsSByte();
+            Vector512<short> mergeConstant1 = Vector512.Create(0x00011000).AsInt16();
+
+            // This algorithm requires AVX512VBMI support.
+            // Vbmi was first introduced in CannonLake and is available from IceLake on.
+            do
+            {
+                AssertRead<Vector512<sbyte>>(src, srcStart, sourceLength);
+                Vector512<sbyte> str = Vector512.Load(src).AsSByte();
+
+                // Step 1: Translate encoded Base64 input to their original indices
+                // This step also checks for invalid inputs and exits.
+                // After this, we have indices which are verified to have upper 2 bits set to 0 in each byte.
+                // origIndex = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
+                Vector512<sbyte> origIndex = Avx512Vbmi.PermuteVar64x8x2(vbmiLookup0, str, vbmiLookup1);
+                Vector512<sbyte> errorVec = (origIndex.AsInt32() | str.AsInt32()).AsSByte();
+                if (errorVec.ExtractMostSignificantBits() != 0)
+                {
+                    break;
+                }
+
+                // Step 2: Now we need to reshuffle bits to remove the 0 bits.
+                // multiAdd1: [...|0000cccc|ccdddddd|0000aaaa|aabbbbbb]
+                Vector512<short> multiAdd1 = Avx512BW.MultiplyAddAdjacent(origIndex.AsByte(), mergeConstant0);
+                // multiAdd2: [...|00000000|aaaaaabb|bbbbcccc|ccdddddd]
+                Vector512<int> multiAdd2 = Avx512BW.MultiplyAddAdjacent(multiAdd1, mergeConstant1);
+
+                // Step 3: Pack 48 bytes
+                str = Avx512Vbmi.PermuteVar64x8(multiAdd2.AsByte(), vbmiPackedLanesControl).AsSByte();
+
+                AssertWrite<Vector512<sbyte>>(dest, destStart, destLength);
+                str.Store((sbyte*)dest);
+                src += 64;
+                dest += 48;
+            }
+            while (src <= srcEnd);
+
+            srcBytes = src;
+            destBytes = dest;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Avx2))]
         private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
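For readers following the comments in `Avx512Decode`: the two `MultiplyAddAdjacent` steps plus the final byte permute are a vectorized form of the usual 4-chars-to-3-bytes merge. A scalar sketch of that merge for a single quartet of already-translated 6-bit indices (illustrative only; the helper below is hypothetical and not part of this change):

```csharp
using System;

static class Base64DecodeSketch
{
    // a, b, c, d are 6-bit values produced by the lookup step (Step 1).
    static void MergeQuartet(byte a, byte b, byte c, byte d, Span<byte> dest3)
    {
        // Equivalent of MultiplyAddAdjacent with mergeConstant0 (0x40, 0x01 per byte pair):
        int ab = (a << 6) | b;        // [0000aaaa|aabbbbbb]
        int cd = (c << 6) | d;        // [0000cccc|ccdddddd]

        // Equivalent of MultiplyAddAdjacent with mergeConstant1 (0x1000, 0x0001 per word pair):
        int merged = (ab << 12) | cd; // [00000000|aaaaaabb|bbbbcccc|ccdddddd]

        // Equivalent of the PermuteVar64x8 pack (Step 3): keep the three payload
        // bytes in big-endian order and drop the zero byte.
        dest3[0] = (byte)(merged >> 16);
        dest3[1] = (byte)(merged >> 8);
        dest3[2] = (byte)merged;
    }
}
```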
@@ -67,7 +67,16 @@ namespace System.Buffers.Text

                 if (maxSrcLength >= 16)
                 {
-                    byte* end = srcMax - 32;
+                    byte* end = srcMax - 64;
+                    if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && (end >= src))
+                    {
+                        Avx512Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
+
+                        if (src == srcEnd)
+                            goto DoneExit;
+                    }
+
+                    end = srcMax - 64;
                     if (Avx2.IsSupported && (end >= src))
                     {
                         Avx2Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
@@ -226,6 +235,80 @@ namespace System.Buffers.Text
             }
         }

+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [CompExactlyDependsOn(typeof(Avx512BW))]
+        [CompExactlyDependsOn(typeof(Avx512Vbmi))]
+        private static unsafe void Avx512Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
+        {
+            // Reference for VBMI implementation: https://github.com/WojciechMula/base64simd/tree/master/encode
+            // If we have AVX512 support, pick off 48 bytes at a time for as long as we can.
+            // But because we read 64 bytes at a time, ensure we have enough room to do a
+            // full 64-byte read without segfaulting.
+
+            byte* src = srcBytes;
+            byte* dest = destBytes;
+
+            // The JIT won't hoist these "constants", so help it
+            Vector512<sbyte> shuffleVecVbmi = Vector512.Create(
+                0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
+                0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
+                0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
+                0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e).AsSByte();
+            Vector512<sbyte> vbmiLookup = Vector512.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"u8).AsSByte();
+
+            Vector512<ushort> maskAC = Vector512.Create((uint)0x0fc0fc00).AsUInt16();
+            Vector512<uint> maskBB = Vector512.Create((uint)0x3f003f00);
+            Vector512<ushort> shiftAC = Vector512.Create((uint)0x0006000a).AsUInt16();
+            Vector512<ushort> shiftBB = Vector512.Create((uint)0x00080004).AsUInt16();
+
+            AssertRead<Vector256<sbyte>>(src, srcStart, sourceLength);
+
+            // This algorithm requires AVX512VBMI support.
+            // Vbmi was first introduced in CannonLake and is available from IceLake on.
+
+            // str = [...|PONM|LKJI|HGFE|DCBA]
+            Vector512<sbyte> str = Vector512.Load(src).AsSByte();
+
+            while (true)
+            {
+                // Step 1: Split 48 bytes into 64 bytes, with each byte using 6 bits from the input
+                // str = [...|KLJK|HIGH|EFDE|BCAB]
+                str = Avx512Vbmi.PermuteVar64x8(str, shuffleVecVbmi);
+
+                // TODO: This can be achieved faster with multishift
+                // Consider the first 4 bytes - BCAB
+                // temp1 = [...|0000cccc|cc000000|aaaaaa00|00000000]
+                Vector512<ushort> temp1 = (str.AsUInt16() & maskAC);
+
+                // temp2 = [...|00000000|00cccccc|00000000|00aaaaaa]
+                Vector512<ushort> temp2 = Avx512BW.ShiftRightLogicalVariable(temp1, shiftAC).AsUInt16();
+
+                // temp3 = [...|ccdddddd|00000000|aabbbbbb|cccc0000]
+                Vector512<ushort> temp3 = Avx512BW.ShiftLeftLogicalVariable(str.AsUInt16(), shiftBB).AsUInt16();
+
+                // str = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
+                str = Vector512.ConditionalSelect(maskBB, temp3.AsUInt32(), temp2.AsUInt32()).AsSByte();
+
+                // Step 2: Now we have the indices calculated. Next step is to use these indices to translate.
+                str = Avx512Vbmi.PermuteVar64x8(vbmiLookup, str);
+
+                AssertWrite<Vector512<sbyte>>(dest, destStart, destLength);
+                str.Store((sbyte*)dest);
+
+                src += 48;
+                dest += 64;
+
+                if (src > srcEnd)
+                    break;
+
+                AssertRead<Vector512<sbyte>>(src, srcStart, sourceLength);
+                str = Vector512.Load(src).AsSByte();
+            }
+
+            srcBytes = src;
+            destBytes = dest;
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CompExactlyDependsOn(typeof(Avx2))]
         private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
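And the encode counterpart: Step 1's shuffle/shift/ternary-select sequence computes, per 3-byte input group, the same four 6-bit indices a scalar encoder extracts, and Step 2's `PermuteVar64x8` against `vbmiLookup` is a 64-entry alphabet lookup. A scalar sketch for one group (illustrative only; the helper is hypothetical and not part of this change):

```csharp
using System;

static class Base64EncodeSketch
{
    const string Alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    // x, y, z are three plain input bytes; dest4 receives four Base64 characters.
    static void EncodeTriple(byte x, byte y, byte z, Span<byte> dest4)
    {
        // Step 1 equivalent: carve the 24 input bits into four 6-bit indices.
        int a = x >> 2;
        int b = ((x & 0x03) << 4) | (y >> 4);
        int c = ((y & 0x0F) << 2) | (z >> 6);
        int d = z & 0x3F;

        // Step 2 equivalent: table lookup into the standard alphabet.
        dest4[0] = (byte)Alphabet[a];
        dest4[1] = (byte)Alphabet[b];
        dest4[2] = (byte)Alphabet[c];
        dest4[3] = (byte)Alphabet[d];
    }
}
```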