
Decompose some bitwise operations in HIR to allow more overall optimizations to kick in (#104517)

* Decompose some bitwise operations in HIR to allow more overall optimizations to kick in

* Ensure that we actually remove the underlying op

* Ensure the AND_NOT decomposition is still folded during import for minopts

* Ensure we propagate AllBitsSet into simd GT_XOR on xarch

* Ensure that we prefer AndNot over TernaryLogic

* Cleanup the TernaryLogic lowering code

* Ensure that TernaryLogic picks the best operand for containment

* Ensure we swap the operands that are being checked for containment

* Ensure that TernaryLogic is simplified where possible

* Apply formatting patch
Tanner Gooding 2024-07-13 07:01:55 -07:00 committed by GitHub
parent 5336e1815c
commit 6d3cb53af9
Signed by: github
GPG key ID: B5690EEEBB952194
14 changed files with 1488 additions and 585 deletions
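For context on the core transformation: an AndNot is `x & ~y`, and this commit makes the `~y` an explicit NOT node feeding a plain AND during import, so that generic folding rules can simplify it before lowering recombines the pair into a native and-not instruction. A minimal standalone C++ sketch of the scalar identities involved (illustrative only, not JIT code; the helper name is made up):

#include <cassert>
#include <cstdint>

// AndNot(x, y) == x & ~y. With the NOT as a separate step, ordinary folds apply:
// x & ~0 == x (since ~0 is AllBitsSet) and x & ~AllBitsSet == 0.
static uint32_t AndNot(uint32_t x, uint32_t y)
{
    uint32_t notY = ~y; // explicit NOT, analogous to the GT_NOT node built during import
    return x & notY;    // plain AND, analogous to the GT_AND node
}

int main()
{
    uint32_t x = 0x12345678u;
    assert(AndNot(x, 0u) == x);   // y == 0          => x & AllBitsSet == x
    assert(AndNot(x, ~0u) == 0u); // y == AllBitsSet => x & 0 == 0
    return 0;
}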

View file

@ -20506,10 +20506,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si
GenTree* bitMask;
bitMask = gtNewDconNode(-0.0, simdBaseType);
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
if (simdBaseType == TYP_FLOAT)
{
bitMask = gtNewIconNode(0x7FFFFFFF);
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize);
}
else
{
bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF);
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize);
}
return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize);
}
NamedIntrinsic intrinsic = NI_Illegal;
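The gtNewSimdAbsNode hunk above replaces the AND_NOT against a broadcast -0.0 with a plain AND against a constant that has every bit set except the sign bit; both forms clear the IEEE-754 sign bit, which is all a floating-point abs needs. A standalone scalar sketch of the same bit trick (illustrative only, not JIT code):

#include <cassert>
#include <cstdint>
#include <cstring>

static float AbsViaMask(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0x7FFFFFFFu; // clear the sign bit; the same constant the float path uses above
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

int main()
{
    assert(AbsViaMask(-3.5f) == 3.5f);
    assert(AbsViaMask(2.25f) == 2.25f);
    return 0;
}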
@ -20750,12 +20757,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
}
}
if (op == GT_AND_NOT)
{
// GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
needsReverseOps = true;
}
break;
}
#endif // TARGET_XARCH
@ -20786,11 +20787,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
if (intrinsic != NI_Illegal)
{
if (op == GT_AND_NOT)
{
assert(fgNodeThreading == NodeThreading::LIR);
#if defined(TARGET_XARCH)
// GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
// We specially handle this here since we're only producing a
// native intrinsic node in LIR
std::swap(op1, op2);
#endif // TARGET_XARCH
}
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
}
switch (op)
{
case GT_AND_NOT:
{
// Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
// appropriately optimize around the individual operations being performed, particularly ~op2,
// and produce overall better codegen.
assert(fgNodeThreading != NodeThreading::LIR);
op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
}
#if defined(TARGET_XARCH)
case GT_RSZ:
{
@ -20955,9 +20979,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
}
// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());
// Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
widenedSimdBaseJitType, widenedSimdSize);
@ -21922,9 +21943,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
CORINFO_TYPE_INT, simdSize);
// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());
op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
}
@ -24315,9 +24333,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
GenTree* vecCon2 = gtCloneExpr(vecCon1);
// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@ -24356,9 +24371,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
GenTree* vecCon2 = gtCloneExpr(vecCon1);
// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@ -24460,9 +24472,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
GenTree* vecCon2 = gtCloneExpr(vecCon1);
// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
@ -24499,9 +24508,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(
GenTree* vecCon2 = gtCloneExpr(vecCon1);
// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());
tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
@ -28120,6 +28126,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
assert(!isScalar);
assert(op2->TypeIs(simdType));
if (comp->fgNodeThreading != NodeThreading::LIR)
{
// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead.
break;
}
#if defined(TARGET_XARCH)
if (simdSize == 64)
{
@ -29187,6 +29201,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec
return IsUserCall() && (operand == Op(2));
}
#if defined(TARGET_XARCH)
case NI_SSE_Xor:
case NI_SSE2_Xor:
case NI_AVX_Xor:
case NI_AVX2_Xor:
case NI_AVX512F_Xor:
case NI_AVX512DQ_Xor:
case NI_AVX10v1_V512_Xor:
{
// We recognize this as GT_NOT which can enable other optimizations
assert(GetOperandCount() == 2);
return vecCon->IsVectorAllBitsSet();
}
#endif // TARGET_XARCH
default:
{
break;
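The ShouldConstantProp change above adds the SSE/AVX Xor intrinsics so that an AllBitsSet vector constant gets propagated into them; the payoff is the identity `x ^ AllBitsSet == ~x`, which lets such an XOR be recognized as a NOT and feed further folds. A scalar sketch of the identity (illustrative only):

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x = 0xDEADBEEFu;
    // XOR against an all-bits-set constant is bitwise NOT, which is why propagating
    // AllBitsSet into a vector XOR allows it to be treated as GT_NOT.
    assert((x ^ 0xFFFFFFFFu) == ~x);
    return 0;
}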
@ -29936,7 +29965,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const
unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3)
{
#if defined(TARGET_XARCH)
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId));
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) ||
HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId));
#elif defined(TARGET_ARM64)
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId));
#endif
@ -29980,85 +30010,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree
return 0;
}
//------------------------------------------------------------------------
// GetTernaryControlByte: calculate the value of the control byte for ternary node
// with given logic nodes on the input.
//
// Return value: the value of the ternary control byte.
uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const
{
// we assume we have a structure like:
/*
/- A
+- B
t1 = binary logical op1
/- C
+- t1
t2 = binary logical op2
*/
// To calculate the control byte value:
// The way the constants work is we have three keys:
// * A: 0xF0
// * B: 0xCC
// * C: 0xAA
//
// To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you
// wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A.
assert(second->Op(1) == this || second->Op(2) == this);
const uint8_t A = 0xF0;
const uint8_t B = 0xCC;
const uint8_t C = 0xAA;
bool isScalar = false;
genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar);
assert(!isScalar);
genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar);
assert(!isScalar);
uint8_t AB = 0;
uint8_t ABC = 0;
if (firstOper == GT_AND)
{
AB = A & B;
}
else if (firstOper == GT_OR)
{
AB = A | B;
}
else if (firstOper == GT_XOR)
{
AB = A ^ B;
}
else
{
unreached();
}
if (secondOper == GT_AND)
{
ABC = AB & C;
}
else if (secondOper == GT_OR)
{
ABC = AB | C;
}
else if (secondOper == GT_XOR)
{
ABC = AB ^ C;
}
else
{
unreached();
}
return ABC;
}
#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS
unsigned GenTreeLclFld::GetSize() const
@ -30454,13 +30405,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);
#if defined(TARGET_XARCH)
if (oper == GT_AND_NOT)
{
// xarch does: ~op1 & op2, we need op1 & ~op2
std::swap(op1, op2);
}
#endif // TARGET_XARCH
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);
GenTree* cnsNode = nullptr;
GenTree* otherNode = nullptr;
@ -30973,31 +30919,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
break;
}
case GT_AND_NOT:
{
// Handle `x & ~0 == x` and `0 & ~x == 0`
if (cnsNode->IsVectorZero())
{
if (cnsNode == op1)
{
resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
break;
}
else
{
resultNode = otherNode;
}
break;
}
// Handle `x & ~AllBitsSet == 0`
if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
{
resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
}
break;
}
case GT_DIV:
{
if (varTypeIsFloating(simdBaseType))
@ -31388,12 +31309,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
switch (ni)
{
case NI_Vector128_ConditionalSelect:
#if defined(TARGET_XARCH)
case NI_Vector128_ConditionalSelect:
case NI_Vector256_ConditionalSelect:
case NI_Vector512_ConditionalSelect:
#elif defined(TARGET_ARM64)
case NI_Vector64_ConditionalSelect:
case NI_AdvSimd_BitwiseSelect:
case NI_Sve_ConditionalSelect:
#endif
{

View file

@ -6527,7 +6527,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
bool OperRequiresGlobRefFlag() const;
unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3);
uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const;
ClassLayout* GetLayout(Compiler* compiler) const;

View file

@ -352,6 +352,365 @@ const TernaryLogicInfo& TernaryLogicInfo::lookup(uint8_t control)
return ternaryLogicFlags[control];
}
//------------------------------------------------------------------------
// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
// given the oper and two existing control bytes
//
// Arguments:
// oper -- the operation being performed
// op1 -- the control byte for op1
// op2 -- the control byte for op2
//
// Return Value:
// The new control byte evaluated from performing oper on op1 and op2
//
uint8_t TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2)
{
switch (oper)
{
case GT_AND:
{
return static_cast<uint8_t>(op1 & op2);
}
case GT_AND_NOT:
{
return static_cast<uint8_t>(~op1 & op2);
}
case GT_OR:
{
return static_cast<uint8_t>(op1 | op2);
}
case GT_XOR:
{
return static_cast<uint8_t>(op1 ^ op2);
}
default:
{
unreached();
}
}
}
//------------------------------------------------------------------------
// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
// given a ternary logic oper and two inputs
//
// Arguments:
// oper -- the operation being performed
// op1 -- the control byte for op1; ignored for a unary oper
// op2 -- the control byte for op2
//
// Return Value:
// The new control byte evaluated from performing oper on op1 and op2
//
uint8_t TernaryLogicInfo::GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2)
{
switch (oper)
{
case TernaryLogicOperKind::Select:
{
return op2;
}
case TernaryLogicOperKind::Not:
{
return ~op2;
}
case TernaryLogicOperKind::And:
{
return op1 & op2;
}
case TernaryLogicOperKind::Nand:
{
return ~(op1 & op2);
}
case TernaryLogicOperKind::Or:
{
return op1 | op2;
}
case TernaryLogicOperKind::Nor:
{
return ~(op1 | op2);
}
case TernaryLogicOperKind::Xor:
{
return op1 ^ op2;
}
case TernaryLogicOperKind::Xnor:
{
return ~(op1 ^ op2);
}
default:
{
unreached();
}
}
}
//------------------------------------------------------------------------
// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
// given an existing info and three control bytes
//
// Arguments:
// info -- the info describing the operation being performed
// op1 -- the control byte for op1
// op2 -- the control byte for op2
// op3 -- the control byte for op3
//
// Return Value:
// The new control byte evaluated from performing info on op1, op2, and op3
//
uint8_t TernaryLogicInfo::GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3)
{
uint8_t oper1Result;
switch (info.oper1Use)
{
case TernaryLogicUseFlags::None:
{
assert(info.oper2 == TernaryLogicOperKind::None);
assert(info.oper2Use == TernaryLogicUseFlags::None);
assert(info.oper3 == TernaryLogicOperKind::None);
assert(info.oper3Use == TernaryLogicUseFlags::None);
switch (info.oper1)
{
case TernaryLogicOperKind::False:
{
oper1Result = 0x00;
break;
}
case TernaryLogicOperKind::True:
{
oper1Result = 0xFF;
break;
}
default:
{
unreached();
}
}
break;
}
case TernaryLogicUseFlags::A:
{
oper1Result = GetTernaryControlByte(info.oper1, 0x00, op1);
break;
}
case TernaryLogicUseFlags::B:
{
oper1Result = GetTernaryControlByte(info.oper1, 0x00, op2);
break;
}
case TernaryLogicUseFlags::C:
{
oper1Result = GetTernaryControlByte(info.oper1, 0x00, op3);
break;
}
case TernaryLogicUseFlags::AB:
{
oper1Result = GetTernaryControlByte(info.oper1, op1, op2);
break;
}
case TernaryLogicUseFlags::AC:
{
oper1Result = GetTernaryControlByte(info.oper1, op1, op3);
break;
}
case TernaryLogicUseFlags::BC:
{
oper1Result = GetTernaryControlByte(info.oper1, op2, op3);
break;
}
case TernaryLogicUseFlags::ABC:
{
assert(info.oper2 == TernaryLogicOperKind::None);
assert(info.oper2Use == TernaryLogicUseFlags::None);
assert(info.oper3 == TernaryLogicOperKind::None);
assert(info.oper3Use == TernaryLogicUseFlags::None);
switch (info.oper1)
{
case TernaryLogicOperKind::Nor:
{
oper1Result = ~(op1 | op2 | op3);
break;
}
case TernaryLogicOperKind::Minor:
{
oper1Result = 0x17;
break;
}
case TernaryLogicOperKind::Xnor:
{
oper1Result = ~(op1 ^ op2 ^ op3);
break;
}
case TernaryLogicOperKind::Nand:
{
oper1Result = ~(op1 & op2 & op3);
break;
}
case TernaryLogicOperKind::And:
{
oper1Result = op1 & op2 & op3;
break;
}
case TernaryLogicOperKind::Xor:
{
oper1Result = op1 ^ op2 ^ op3;
break;
}
case TernaryLogicOperKind::Major:
{
oper1Result = 0xE8;
break;
}
case TernaryLogicOperKind::Or:
{
oper1Result = op1 | op2 | op3;
break;
}
default:
{
unreached();
}
}
break;
}
default:
{
unreached();
}
}
uint8_t oper2Result;
switch (info.oper2Use)
{
case TernaryLogicUseFlags::None:
{
assert(info.oper3 == TernaryLogicOperKind::None);
assert(info.oper3Use == TernaryLogicUseFlags::None);
oper2Result = oper1Result;
break;
}
case TernaryLogicUseFlags::A:
{
oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op1);
break;
}
case TernaryLogicUseFlags::B:
{
oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op2);
break;
}
case TernaryLogicUseFlags::C:
{
oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op3);
break;
}
case TernaryLogicUseFlags::AB:
{
oper2Result = GetTernaryControlByte(info.oper2, op1, op2);
break;
}
case TernaryLogicUseFlags::AC:
{
oper2Result = GetTernaryControlByte(info.oper2, op1, op3);
break;
}
case TernaryLogicUseFlags::BC:
{
oper2Result = GetTernaryControlByte(info.oper2, op2, op3);
break;
}
default:
{
unreached();
}
}
uint8_t oper3Result;
switch (info.oper3Use)
{
case TernaryLogicUseFlags::None:
{
assert(info.oper3 == TernaryLogicOperKind::None);
oper3Result = oper2Result;
break;
}
case TernaryLogicUseFlags::A:
{
assert(info.oper3 == TernaryLogicOperKind::Cond);
oper3Result = (oper1Result & op1) | (oper2Result & ~op1);
break;
}
case TernaryLogicUseFlags::B:
{
assert(info.oper3 == TernaryLogicOperKind::Cond);
oper3Result = (oper1Result & op2) | (oper2Result & ~op2);
break;
}
case TernaryLogicUseFlags::C:
{
assert(info.oper3 == TernaryLogicOperKind::Cond);
oper3Result = (oper1Result & op3) | (oper2Result & ~op3);
break;
}
default:
{
unreached();
}
}
return oper3Result;
}
#endif // TARGET_XARCH
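The new TernaryLogicInfo::GetTernaryControlByte helpers above build vpternlog immediates by evaluating the requested operation over the standard key bytes A = 0xF0, B = 0xCC, C = 0xAA. A small standalone sketch of that evaluation (illustrative only; the real helpers additionally route operands through the oper1/oper2/oper3 use flags):

#include <cassert>
#include <cstdint>

int main()
{
    const uint8_t A = 0xF0;
    const uint8_t B = 0xCC;
    const uint8_t C = 0xAA;

    // (A & B) ^ C -> 0x6A, the worked example in the removed GetTernaryControlByte comment.
    assert(static_cast<uint8_t>((A & B) ^ C) == 0x6A);

    // ~A & B -> 0x0C, the GT_AND_NOT case of the new genTreeOps overload.
    assert(static_cast<uint8_t>(~A & B) == 0x0C);

    // A | (B ^ C) -> 0xF6, the control built by one of the expansion paths removed later in this diff.
    assert(static_cast<uint8_t>(A | (B ^ C)) == 0xF6);
    return 0;
}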
//------------------------------------------------------------------------

View file

@ -481,6 +481,10 @@ struct TernaryLogicInfo
static const TernaryLogicInfo& lookup(uint8_t control);
static uint8_t GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2);
static uint8_t GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2);
static uint8_t GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3);
TernaryLogicUseFlags GetAllUseFlags() const
{
uint8_t useFlagsBits = 0;
@ -1024,6 +1028,11 @@ struct HWIntrinsicInfo
HWIntrinsicFlag flags = lookupFlags(id);
return (flags & HW_Flag_PermuteVar2x) != 0;
}
static bool IsTernaryLogic(NamedIntrinsic id)
{
return (id == NI_AVX512F_TernaryLogic) || (id == NI_AVX512F_VL_TernaryLogic) || (id == NI_AVX10v1_TernaryLogic);
}
#endif // TARGET_XARCH
#if defined(TARGET_ARM64)

View file

@ -611,15 +611,39 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
case NI_AdvSimd_BitwiseClear:
case NI_Vector64_AndNot:
case NI_Vector128_AndNot:
{
assert(sig->numArgs == 2);
// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead, so decompose into the individual operations
// on import
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();
retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize);
op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
break;
}
case NI_AdvSimd_OrNot:
{
assert(sig->numArgs == 2);
// We don't want to support creating OR_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead, so decompose into the individual operations
// on import
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();
op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize);
break;
}
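The NI_AdvSimd_OrNot decomposition above follows the same pattern as AndNot: `x OrNot y` is `x | ~y`, and making the NOT explicit exposes folds such as `x | ~AllBitsSet == x` and `x | ~0 == AllBitsSet`. A scalar sketch (illustrative only; the helper name is made up):

#include <cassert>
#include <cstdint>

static uint32_t OrNot(uint32_t x, uint32_t y)
{
    return x | ~y; // explicit NOT feeding a plain OR, as in the decomposed import
}

int main()
{
    uint32_t x = 0x12345678u;
    assert(OrNot(x, ~0u) == x);  // y == AllBitsSet => x | 0 == x
    assert(OrNot(x, 0u) == ~0u); // y == 0          => x | AllBitsSet == AllBitsSet
    return 0;
}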

View file

@ -1298,9 +1298,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
// non-RMW based codegen.
#if defined(DEBUG)
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
assert((intrinsicId == NI_AVX512F_TernaryLogic) || (intrinsicId == NI_AVX512F_VL_TernaryLogic) ||
(intrinsicId == NI_AVX10v1_TernaryLogic));
assert(HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId()));
uint8_t control = static_cast<uint8_t>(ival);
const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
@ -1311,6 +1309,19 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
op2Reg = targetReg;
}
else
{
#if defined(DEBUG)
if (HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId()))
{
uint8_t control = static_cast<uint8_t>(ival);
const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
TernaryLogicUseFlags useFlags = info.GetAllUseFlags();
assert(useFlags == TernaryLogicUseFlags::BC);
}
#endif // DEBUG
}
}
assert(targetReg != REG_NA);
@ -2856,6 +2867,46 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOption
break;
}
case NI_EVEX_XnorMask:
{
assert(instOptions == INS_OPTS_NONE);
uint32_t simdSize = node->GetSimdSize();
uint32_t count = simdSize / genTypeSize(baseType);
if (count <= 8)
{
assert((count == 2) || (count == 4) || (count == 8));
ins = INS_kxnorb;
}
else if (count == 16)
{
ins = INS_kxnorw;
}
else if (count == 32)
{
ins = INS_kxnord;
}
else
{
assert(count == 64);
ins = INS_kxnorq;
}
op1Reg = op1->GetRegNum();
GenTree* op2 = node->Op(2);
regNumber op2Reg = op2->GetRegNum();
assert(emitter::isMaskReg(targetReg));
assert(emitter::isMaskReg(op1Reg));
assert(emitter::isMaskReg(op2Reg));
// Use EA_32BYTE to ensure the VEX.L bit gets set
emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg);
break;
}
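The NI_EVEX_XnorMask handling above selects the kxnor variant from the mask element count (simdSize divided by the element size): counts up to 8 use kxnorb, 16 uses kxnorw, 32 uses kxnord, and 64 uses kxnorq. A small sketch of that selection (illustrative only, not the JIT's emitter code):

#include <cassert>
#include <cstdint>
#include <string>

static std::string XnorMaskIns(uint32_t simdSize, uint32_t elemSize)
{
    uint32_t count = simdSize / elemSize;
    if (count <= 8)
    {
        return "kxnorb"; // 2, 4, or 8 mask bits
    }
    if (count == 16)
    {
        return "kxnorw";
    }
    if (count == 32)
    {
        return "kxnord";
    }
    return "kxnorq"; // count == 64
}

int main()
{
    assert(XnorMaskIns(64, 4) == "kxnorw"); // Vector512 of 4-byte elements -> 16 mask bits
    assert(XnorMaskIns(64, 1) == "kxnorq"); // Vector512 of byte elements   -> 64 mask bits
    return 0;
}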
case NI_AVX512F_ConvertToInt32:
case NI_AVX512F_ConvertToUInt32:
case NI_AVX512F_ConvertToUInt32WithTruncation:

View file

@ -263,7 +263,7 @@ HARDWARE_INTRINSIC(AdvSimd, AddScalar,
HARDWARE_INTRINSIC(AdvSimd, AddWideningLower, 8, 2, true, {INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddw, INS_uaddw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, AddWideningUpper, 16, 2, true, {INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddw2, INS_uaddw2, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, And, -1, 2, true, {INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd, BitwiseSelect, -1, 3, true, {INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl}, HW_Category_SIMD, HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, Ceiling, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_invalid}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, CeilingScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_frintp}, HW_Category_SIMD, HW_Flag_SIMDScalar)
@ -401,7 +401,7 @@ HARDWARE_INTRINSIC(AdvSimd, NegateSaturate,
HARDWARE_INTRINSIC(AdvSimd, NegateScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fneg, INS_fneg}, HW_Category_SIMD, HW_Flag_SIMDScalar)
HARDWARE_INTRINSIC(AdvSimd, Not, -1, 1, true, {INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, Or, -1, 2, true, {INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiply, -1, 2, true, {INS_pmul, INS_pmul, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningLower, 8, 2, true, {INS_pmull, INS_pmull, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningUpper, 16, 2, true, {INS_pmull2, INS_pmull2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative)

View file

@ -388,7 +388,7 @@ HARDWARE_INTRINSIC(X86Base_X64, DivRem,
HARDWARE_INTRINSIC(SSE, Add, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE, And, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
@ -468,7 +468,7 @@ HARDWARE_INTRINSIC(SSE, Subtract,
HARDWARE_INTRINSIC(SSE, SubtractScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE, UnpackHigh, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, UnpackLow, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_SSE NI_SSE_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -492,7 +492,7 @@ HARDWARE_INTRINSIC(SSE2, Add,
HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE2, And, 16, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, Average, 16, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, true, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
@ -590,7 +590,7 @@ HARDWARE_INTRINSIC(SSE2, SubtractScalar,
HARDWARE_INTRINSIC(SSE2, SumAbsoluteDifferences, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE2, UnpackHigh, 16, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, UnpackLow, 16, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_SSE2 NI_SSE2_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -723,7 +723,7 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32,
HARDWARE_INTRINSIC(AVX, Add, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, And, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, Blend, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
@ -791,7 +791,7 @@ HARDWARE_INTRINSIC(AVX, TestNotZAndNotC,
HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, true, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX NI_AVX_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -805,7 +805,7 @@ HARDWARE_INTRINSIC(AVX2, Add,
HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, false, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, And, 32, 2, false, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Average, 32, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, true, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, false, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
@ -865,7 +865,7 @@ HARDWARE_INTRINSIC(AVX2, SubtractSaturate,
HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2, UnpackHigh, 32, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, UnpackLow, 32, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX2 NI_AVX2_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -880,7 +880,7 @@ HARDWARE_INTRINSIC(AVX512F, AddScalar,
HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad)
@ -1002,7 +1002,7 @@ HARDWARE_INTRINSIC(AVX512F, SubtractScalar,
HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX512F NI_AVX512F_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -1183,7 +1183,7 @@ HARDWARE_INTRINSIC(AVX512CD_VL, LeadingZeroCount,
// AVX512DQ Intrinsics
#define FIRST_NI_AVX512DQ NI_AVX512DQ_And
HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
@ -1203,7 +1203,7 @@ HARDWARE_INTRINSIC(AVX512DQ, Range,
HARDWARE_INTRINSIC(AVX512DQ, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512DQ, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX512DQ NI_AVX512DQ_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -1368,7 +1368,7 @@ HARDWARE_INTRINSIC(AVX10v1, TernaryLogic,
// AVX10V1_V512 Intrinsics
#define FIRST_NI_AVX10v1_V512 NI_AVX10v1_V512_And
HARDWARE_INTRINSIC(AVX10v1_V512, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
@ -1391,7 +1391,7 @@ HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8,
HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8x2, 64, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX10v1_V512 NI_AVX10v1_V512_Xor
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -1438,7 +1438,7 @@ HARDWARE_INTRINSIC(AES, KeygenAssist,
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// BMI1 Intrinsics
#define FIRST_NI_BMI1 NI_BMI1_AndNot
HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
@ -1452,7 +1452,7 @@ HARDWARE_INTRINSIC(BMI1, TrailingZeroCount,
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// BMI1 Intrinsics
#define FIRST_NI_BMI1_X64 NI_BMI1_X64_AndNot
HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
@ -1599,6 +1599,7 @@ HARDWARE_INTRINSIC(EVEX, OrMask,
HARDWARE_INTRINSIC(EVEX, ShiftLeftMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(EVEX, ShiftRightMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(EVEX, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(EVEX, XnorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
#endif // FEATURE_HW_INTRINSIC

View file

@ -1394,19 +1394,62 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
case NI_SSE_AndNot:
case NI_SSE2_AndNot:
case NI_AVX_AndNot:
case NI_AVX2_AndNot:
case NI_AVX512F_AndNot:
case NI_AVX512DQ_AndNot:
case NI_AVX10v1_V512_AndNot:
{
assert(sig->numArgs == 2);
// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead, so decompose into the individual operations
// on import, taking into account that despite the name, these APIs
// do (~op1 & op2), so we need to account for that
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();
op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
break;
}
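
The operand-order subtlety this comment calls out can be sanity-checked with a tiny scalar model. The sketch below is plain C++ rather than JIT code, uses uint32_t values as stand-ins for vector lanes, and assumes only what the comment states: these xarch AndNot APIs compute (~op1 & op2), which the explicit NOT-then-AND decomposition reproduces.

    #include <cassert>
    #include <cstdint>

    // Hardware semantics of xarch andn/andnps: ~a & b (note which operand is complemented).
    static uint32_t HwAndNot(uint32_t a, uint32_t b) { return ~a & b; }

    // Shape produced at import: an explicit NOT on op1 followed by an ordinary AND.
    static uint32_t Decomposed(uint32_t a, uint32_t b) { return (~a) & b; }

    int main()
    {
        const uint32_t values[] = {0x00000000u, 0xFFFFFFFFu, 0x12345678u, 0x80000001u};
        for (uint32_t a : values)
            for (uint32_t b : values)
                assert(HwAndNot(a, b) == Decomposed(a, b)); // bitwise-identical results
        return 0;
    }
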
case NI_BMI1_AndNot:
case NI_BMI1_X64_AndNot:
{
assert(sig->numArgs == 2);
// The same general reasoning for the decomposition exists here as
// given above for the SIMD AndNot APIs.
op2 = impPopStack().val;
op1 = impPopStack().val;
op1 = gtFoldExpr(gtNewOperNode(GT_NOT, retType, op1));
retNode = gtNewOperNode(GT_AND, retType, op1, op2);
break;
}
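
A hypothetical scalar illustration of why the NOT is wrapped in gtFoldExpr here: Bmi1.AndNot computes (~op1 & op2), so when op1 is a constant its complement folds to another constant and the remaining AND behaves like any ordinary masking operation. The example below is not JIT code; it only demonstrates the arithmetic identity being relied on.

    #include <cassert>
    #include <cstdint>

    // Scalar ANDN semantics: ~a & b.
    static uint64_t AndNot(uint64_t a, uint64_t b) { return ~a & b; }

    int main()
    {
        const uint64_t mask = 0x00000000000000FFull;   // constant op1
        const uint64_t x    = 0x0123456789ABCDEFull;

        // ~mask is itself a constant, so the whole operation reduces to a plain AND with it.
        assert(AndNot(mask, x) == (x & 0xFFFFFFFFFFFFFF00ull));
        return 0;
    }
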
case NI_Vector128_AndNot:
case NI_Vector256_AndNot:
case NI_Vector512_AndNot:
{
assert(sig->numArgs == 2);
impSpillSideEffect(true,
verCurrentState.esStackDepth - 2 DEBUGARG("Spilling op1 side effects for HWIntrinsic"));
// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead, so decompose into the individual operations
// on import
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();
retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize);
op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
break;
}
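
For the platform-neutral helpers the operand order is the opposite of the xarch instruction: Vector128.AndNot(left, right) is documented as (left & ~right), which is why the NOT is applied to op2 here (with op1's side effects spilled first so evaluation order is preserved). A small sketch, plain C++ with scalar stand-ins for lanes, contrasting the two conventions:

    #include <cassert>
    #include <cstdint>

    // Platform-neutral VectorXxx.AndNot(left, right): left & ~right.
    static uint32_t VectorAndNot(uint32_t left, uint32_t right) { return left & ~right; }

    // xarch andn/andnps instruction: ~left & right.
    static uint32_t XarchAndNot(uint32_t left, uint32_t right) { return ~left & right; }

    int main()
    {
        uint32_t a = 0xF0F0F0F0u, b = 0xFF00FF00u;
        assert(VectorAndNot(a, b) == 0x00F000F0u); // keep the bits of a that are clear in b
        assert(XarchAndNot(a, b)  == 0x0F000F00u); // swapped complement, different result
        return 0;
    }
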

View file

@ -160,20 +160,8 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize);
#if defined(TARGET_XARCH)
if (canUseEvexEncoding())
{
GenTree* control;
control = gtNewIconNode(static_cast<uint8_t>((0xF0 | 0xCC) ^ 0xAA)); // (A | B) ^ C
xor1 = gtNewSimdTernaryLogicNode(simdType, vec1, toLowerVec1, cnsVec1, control, baseType, simdSize);
}
else
#endif // TARGET_XARCH
{
vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize);
xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize);
}
vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize);
xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize);
vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize);
}
@ -184,22 +172,10 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
// ((v1 ^ cns1) | (v2 ^ cns2)) == zero
#if defined(TARGET_XARCH)
if (canUseEvexEncoding())
{
GenTree* control;
GenTree* xor2;
control = gtNewIconNode(static_cast<uint8_t>(0xF0 | (0xCC ^ 0xAA))); // A | (B ^ C)
orr = gtNewSimdTernaryLogicNode(simdType, xor1, vec2, cnsVec2, control, baseType, simdSize);
}
else
#endif // TARGET_XARCH
{
GenTree* xor2;
xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize);
orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize);
}
xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize);
orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize);
// Optimization: use a single load when byteLen equals simdSize.
// For code simplicity we always create nodes for two vectors case.
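
The EVEX paths removed above built vpternlog control bytes from the usual truth-table constants A=0xF0, B=0xCC, C=0xAA. The sketch below is a plain C++ model of that encoding, not JIT code: for every bit position the three input bits index into the control byte, so (A | B) ^ C and A | (B ^ C) select exactly the (a | b) ^ c and a | (b ^ c) results that the OR/XOR fallback now always computes.

    #include <cassert>
    #include <cstdint>

    // Bit-by-bit model of vpternlog: the three input bits form an index into the control byte.
    static uint32_t TernaryLogic(uint32_t a, uint32_t b, uint32_t c, uint8_t control)
    {
        uint32_t result = 0;
        for (int bit = 0; bit < 32; bit++)
        {
            int index = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
            result |= static_cast<uint32_t>((control >> index) & 1) << bit;
        }
        return result;
    }

    int main()
    {
        const uint8_t A = 0xF0, B = 0xCC, C = 0xAA; // truth tables of the inputs themselves
        uint32_t a = 0x12345678u, b = 0x0F0F0F0Fu, c = 0xFF00FF00u;

        assert(TernaryLogic(a, b, c, (A | B) ^ C) == ((a | b) ^ c)); // (A | B) ^ C
        assert(TernaryLogic(a, b, c, A | (B ^ C)) == (a | (b ^ c))); // A | (B ^ C)
        return 0;
    }
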

View file

@ -1233,6 +1233,87 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
bool isScalar = false;
genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar);
switch (oper)
{
case GT_AND:
case GT_OR:
{
// We want to recognize (~op1 & op2) and transform it
// into AdvSimd.AndNot(op2, op1), as well as (op1 & ~op2),
// transforming it into AdvSimd.AndNot(op1, op2)
//
// We want to similarly handle (~op1 | op2) and (op1 | ~op2)
bool transform = false;
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
if (op2->OperIsHWIntrinsic())
{
GenTreeHWIntrinsic* op2Intrin = op2->AsHWIntrinsic();
bool op2IsScalar = false;
genTreeOps op2Oper = op2Intrin->GetOperForHWIntrinsicId(&op2IsScalar);
if (op2Oper == GT_NOT)
{
assert(!op2IsScalar);
transform = true;
op2 = op2Intrin->Op(1);
BlockRange().Remove(op2Intrin);
}
}
if (!transform && op1->OperIsHWIntrinsic())
{
GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic();
bool op1IsScalar = false;
genTreeOps op1Oper = opIntrin->GetOperForHWIntrinsicId(&op1IsScalar);
if (op1Oper == GT_NOT)
{
assert(!op1IsScalar);
transform = true;
op1 = opIntrin->Op(1);
BlockRange().Remove(opIntrin);
std::swap(op1, op2);
}
}
if (transform)
{
if (oper == GT_AND)
{
oper = GT_AND_NOT;
intrinsicId = NI_AdvSimd_BitwiseClear;
}
else
{
assert(oper == GT_OR);
oper = GT_NONE;
intrinsicId = NI_AdvSimd_OrNot;
}
node->ChangeHWIntrinsicId(intrinsicId, op1, op2);
oper = GT_AND_NOT;
}
break;
}
default:
{
break;
}
}
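
What this new switch recognizes can be modeled with scalars: Arm64 BIC computes (op1 & ~op2) and ORN computes (op1 | ~op2), so a NOT on the second operand maps directly onto AdvSimd.BitwiseClear/OrNot, while a NOT on the first operand needs the std::swap performed above. A rough sketch, assuming those instruction semantics and using plain C++ values in place of vector lanes:

    #include <cassert>
    #include <cstdint>

    static uint32_t Bic(uint32_t a, uint32_t b) { return a & ~b; } // models AdvSimd.BitwiseClear
    static uint32_t Orn(uint32_t a, uint32_t b) { return a | ~b; } // models AdvSimd.OrNot

    int main()
    {
        uint32_t x = 0xDEADBEEFu, y = 0x0000FFFFu;

        assert((x & ~y) == Bic(x, y)); // NOT on op2: operands pass through unchanged
        assert((~x & y) == Bic(y, x)); // NOT on op1: operands must be swapped first

        assert((x | ~y) == Orn(x, y));
        assert((~x | y) == Orn(y, x));
        return 0;
    }
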
switch (intrinsicId)
{
case NI_Vector64_Create:

File diff suppressed because it is too large

View file

@ -9921,6 +9921,9 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar);
genTreeOps oper = actualOper;
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);
if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
{
GenTree* op1 = node->Op(1);
@ -9994,12 +9997,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
case GT_AND_NOT:
{
maskIntrinsicId = NI_EVEX_AndNotMask;
break;
}
case GT_NOT:
{
maskIntrinsicId = NI_EVEX_NotMask;
@ -10079,91 +10076,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
switch (oper)
{
// Transforms:
// 1. (~v1 & v2) to VectorXxx.AndNot(v2, v1)
// 2. (v1 & ~v2) to VectorXxx.AndNot(v1, v2)
case GT_AND:
{
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
GenTree* lhs = nullptr;
GenTree* rhs = nullptr;
if (op1->OperIsHWIntrinsic())
{
// Try handle: ~op1 & op2
GenTreeHWIntrinsic* hw = op1->AsHWIntrinsic();
genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar);
if (isScalar)
{
return node;
}
#if defined(TARGET_ARM64)
if (hwOper == GT_NOT)
{
lhs = op2;
rhs = hw->Op(1);
}
#elif defined(TARGET_XARCH)
if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet())
{
lhs = op2;
rhs = hw->Op(1);
}
#endif // !TARGET_ARM64 && !TARGET_XARCH
}
if ((lhs == nullptr) && op2->OperIsHWIntrinsic())
{
// Try handle: op1 & ~op2
GenTreeHWIntrinsic* hw = op2->AsHWIntrinsic();
genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar);
if (isScalar)
{
return node;
}
#if defined(TARGET_ARM64)
if (hwOper == GT_NOT)
{
lhs = op1;
rhs = hw->Op(1);
}
#elif defined(TARGET_XARCH)
if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet())
{
lhs = op1;
rhs = hw->Op(1);
}
#endif // !TARGET_ARM64 && !TARGET_XARCH
}
if (lhs == nullptr)
{
break;
}
assert(rhs != nullptr);
// Filter out side effecting cases for several reasons:
// 1. gtNewSimdBinOpNode may swap operand order.
// 2. The code above will swap operand order.
// 3. The code above does not handle GTF_REVERSE_OPS.
if (((lhs->gtFlags | rhs->gtFlags) & GTF_ALL_EFFECT) != 0)
{
break;
}
GenTree* andnNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, lhs, rhs, simdBaseJitType, simdSize);
DEBUG_DESTROY_NODE(node);
INDEBUG(andnNode->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED);
return andnNode;
}
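
The side-effect filter in the block removed above existed because rewriting (~v1 & v2) into an AndNot call changes which operand is evaluated first. A minimal sketch of the hazard, using plain C++ functions with observable side effects in place of GenTree operands (statement order is used to make evaluation order explicit, since C++ itself leaves the order of '&' operands unspecified):

    #include <cassert>

    static int g_log = 0; // records the order in which the "operands" were evaluated

    static int OpA() { g_log = g_log * 10 + 1; return 0b1100; }
    static int OpB() { g_log = g_log * 10 + 2; return 0b1010; }

    int main()
    {
        // Original tree shape: op1 evaluated, then op2.
        g_log = 0;
        int op1 = OpA();
        int op2 = OpB();
        int original = op1 & ~op2;
        assert(g_log == 12);

        // Rewritten shape swaps the operands, so what used to be op2 runs first.
        g_log = 0;
        int rhs = OpB();
        int lhs = OpA();
        int rewritten = ~rhs & lhs;

        assert(rewritten == original); // same value...
        assert(g_log == 21);           // ...but the side effects are now reordered
        return 0;
    }
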
#if defined(TARGET_ARM64)
// Transforms:
// 1. -(-v1) to v1

View file

@ -7919,13 +7919,11 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
if (oper != GT_NONE)
{
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);
#if defined(TARGET_XARCH)
if (oper == GT_AND_NOT)
{
// xarch does: ~arg0VN & arg1VN
std::swap(arg0VN, arg1VN);
}
else if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ))
if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ))
{
if (TypeOfVN(arg1VN) == TYP_SIMD16)
{
@ -8047,6 +8045,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);
if (isScalar)
{
// We don't support folding scalars today
@ -8108,37 +8109,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
break;
}
case GT_AND_NOT:
{
#if defined(TARGET_XARCH)
std::swap(arg0VN, arg1VN);
#endif // TARGET_XARCH
// Handle `x & ~0 == x` and `0 & ~x == 0`
ValueNum zeroVN = VNZeroForType(type);
if (cnsVN == zeroVN)
{
if (cnsVN == arg0VN)
{
return zeroVN;
}
return argVN;
}
// Handle `x & ~AllBitsSet == 0`
ValueNum allBitsVN = VNAllBitsForType(type);
if (cnsVN == allBitsVN)
{
if (cnsVN == arg1VN)
{
return zeroVN;
}
}
break;
}
case GT_DIV:
{
if (varTypeIsFloating(baseType))
@ -8397,6 +8367,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);
if (isScalar)
{
// We don't support folding scalars today
@ -8411,12 +8384,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
return arg0VN;
}
case GT_AND_NOT:
{
// Handle `x & ~x == 0`
return VNZeroForType(type);
}
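
The GT_AND_NOT folds removed in this file encoded the standard identities (`x & ~0 == x`, `0 & ~x == 0`, `x & ~AllBitsSet == 0`, `x & ~x == 0`); after the HIR decomposition the same simplifications fall out of the separate GT_NOT and GT_AND folds instead. A scalar sketch of those identities, plain C++ rather than value-numbering code:

    #include <cassert>
    #include <cstdint>

    static uint64_t AndNot(uint64_t x, uint64_t y) { return x & ~y; }

    int main()
    {
        const uint64_t zero    = 0;
        const uint64_t allBits = ~0ull;
        const uint64_t x       = 0xCAFEBABE12345678ull;

        assert(AndNot(x, zero)    == x); // x & ~0 == x
        assert(AndNot(zero, x)    == 0); // 0 & ~x == 0
        assert(AndNot(x, allBits) == 0); // x & ~AllBitsSet == 0
        assert(AndNot(x, x)       == 0); // x & ~x == 0
        return 0;
    }
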
case GT_OR:
{
// Handle `x | x == x`
@ -8575,12 +8542,12 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary(GenTreeHWIntrinsic* tree,
switch (ni)
{
case NI_Vector128_ConditionalSelect:
#if defined(TARGET_XARCH)
case NI_Vector128_ConditionalSelect:
case NI_Vector256_ConditionalSelect:
case NI_Vector512_ConditionalSelect:
#elif defined(TARGET_ARM64)
case NI_Vector64_ConditionalSelect:
case NI_AdvSimd_BitwiseSelect:
case NI_Sve_ConditionalSelect:
#endif
{