mirror of https://github.com/VSadov/Satori.git
synced 2025-06-09 17:44:48 +09:00
Decompose some bitwise operations in HIR to allow more overall optimizations to kick in (#104517)
* Decompose some bitwise operations in HIR to allow more overall optimizations to kick in
* Ensure that we actually remove the underlying op
* Ensure the AND_NOT decomposition is still folded during import for minopts
* Ensure we propagate AllBitsSet into simd GT_XOR on xarch
* Ensure that we prefer AndNot over TernaryLogic
* Cleanup the TernaryLogic lowering code
* Ensure that TernaryLogic picks the best operand for containment
* Ensure we swap the operands that are being checked for containment
* Ensure that TernaryLogic is simplified where possible
* Apply formatting patch
This commit is contained in:
parent 5336e1815c
commit 6d3cb53af9
14 changed files with 1488 additions and 585 deletions
@@ -20506,10 +20506,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si
GenTree* bitMask;

bitMask = gtNewDconNode(-0.0, simdBaseType);
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
if (simdBaseType == TYP_FLOAT)
{
bitMask = gtNewIconNode(0x7FFFFFFF);
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize);
}
else
{
bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF);
bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize);
}
return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize);
}

NamedIntrinsic intrinsic = NI_Illegal;
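A note on the hunk above: the old code cleared the sign bit via `x & ~(-0.0)` (an AND_NOT, since -0.0 has only the sign bit set), while the new code broadcasts an explicit 0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF mask and uses a plain AND; both compute Abs because only the sign bit differs. A minimal standalone C++ sketch of the per-lane trick (my own illustration, not JIT code; absViaMask is a hypothetical name):

#include <cassert>
#include <cstdint>
#include <cstring>

static float absViaMask(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // reinterpret the float's bits
    bits &= 0x7FFFFFFFu;                  // clear only the IEEE-754 sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

int main()
{
    assert(absViaMask(-3.5f) == 3.5f);
    assert(absViaMask(2.0f) == 2.0f);
    return 0;
}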
@@ -20750,12 +20757,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
}
}

if (op == GT_AND_NOT)
{
// GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
needsReverseOps = true;
}
break;
}
#endif // TARGET_XARCH
@@ -20786,11 +20787,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(

if (intrinsic != NI_Illegal)
{
if (op == GT_AND_NOT)
{
assert(fgNodeThreading == NodeThreading::LIR);

#if defined(TARGET_XARCH)
// GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
// We specially handle this here since we're only producing a
// native intrinsic node in LIR

std::swap(op1, op2);
#endif // TARGET_XARCH
}
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
}

switch (op)
{
case GT_AND_NOT:
{
// Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
// appropriately optimize around the individual operations being performed, particularly ~op2,
// and produce overall better codegen.
assert(fgNodeThreading != NodeThreading::LIR);

op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
}

#if defined(TARGET_XARCH)
case GT_RSZ:
{
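The swap above exists because the IR and the hardware disagree about which operand gets complemented: GT_AND_NOT means `op1 & ~op2`, while xarch's andn-style instructions compute `~src1 & src2`. A scalar model (my own sketch, not JIT code) showing that swapping the operands reconciles the two:

#include <cassert>
#include <cstdint>

// IR semantics: GT_AND_NOT(op1, op2) == op1 & ~op2.
static uint64_t irAndNot(uint64_t op1, uint64_t op2) { return op1 & ~op2; }

// xarch pandn/andnps semantics: dst = ~src1 & src2, hence the std::swap above.
static uint64_t xarchAndn(uint64_t src1, uint64_t src2) { return ~src1 & src2; }

int main()
{
    uint64_t a = 0xF0F0, b = 0x00FF;
    assert(irAndNot(a, b) == xarchAndn(b, a)); // swapping operands reconciles the two
    return 0;
}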
@@ -20955,9 +20979,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
}

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());

// Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
widenedSimdBaseJitType, widenedSimdSize);
@@ -21922,9 +21943,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
CORINFO_TYPE_INT, simdSize);

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());

op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
}
@@ -24315,9 +24333,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24356,9 +24371,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24460,9 +24472,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
@@ -24499,9 +24508,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

// Validate we can't use AVX512F_VL_TernaryLogic here
assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
@@ -28120,6 +28126,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
assert(!isScalar);
assert(op2->TypeIs(simdType));

if (comp->fgNodeThreading != NodeThreading::LIR)
{
// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce these
// in lowering instead.
break;
}

#if defined(TARGET_XARCH)
if (simdSize == 64)
{
@@ -29187,6 +29201,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec
return IsUserCall() && (operand == Op(2));
}

#if defined(TARGET_XARCH)
case NI_SSE_Xor:
case NI_SSE2_Xor:
case NI_AVX_Xor:
case NI_AVX2_Xor:
case NI_AVX512F_Xor:
case NI_AVX512DQ_Xor:
case NI_AVX10v1_V512_Xor:
{
// We recognize this as GT_NOT which can enable other optimizations
assert(GetOperandCount() == 2);
return vecCon->IsVectorAllBitsSet();
}
#endif // TARGET_XARCH

default:
{
break;
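Flagging these Xor intrinsics for constant propagation pays off because, per lane, `x ^ AllBitsSet` is exactly `~x`, so a propagated all-bits-set constant lets later phases treat the node as GT_NOT. A one-lane standalone check of the identity (my sketch, not JIT code):

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t lane       = 0x12345678u;
    uint32_t allBitsSet = 0xFFFFFFFFu;
    assert((lane ^ allBitsSet) == ~lane); // x ^ AllBitsSet == ~x per lane
    return 0;
}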
@@ -29936,7 +29965,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const
unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3)
{
#if defined(TARGET_XARCH)
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId));
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) ||
HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId));
#elif defined(TARGET_ARM64)
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId));
#endif
@@ -29980,85 +30010,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree

return 0;
}

//------------------------------------------------------------------------
// GetTernaryControlByte: calculate the value of the control byte for ternary node
// with given logic nodes on the input.
//
// Return value: the value of the ternary control byte.
uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const
{
// we assume we have a structure like:
/*
    /- A
 +- B
 t1 = binary logical op1

    /- C
 +- t1
 t2 = binary logical op2
*/

// To calculate the control byte value:
// The way the constants work is we have three keys:
// * A: 0xF0
// * B: 0xCC
// * C: 0xAA
//
// To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you
// wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A.
assert(second->Op(1) == this || second->Op(2) == this);
const uint8_t A = 0xF0;
const uint8_t B = 0xCC;
const uint8_t C = 0xAA;

bool isScalar = false;

genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar);
assert(!isScalar);

genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar);
assert(!isScalar);

uint8_t AB = 0;
uint8_t ABC = 0;

if (firstOper == GT_AND)
{
AB = A & B;
}
else if (firstOper == GT_OR)
{
AB = A | B;
}
else if (firstOper == GT_XOR)
{
AB = A ^ B;
}
else
{
unreached();
}

if (secondOper == GT_AND)
{
ABC = AB & C;
}
else if (secondOper == GT_OR)
{
ABC = AB | C;
}
else if (secondOper == GT_XOR)
{
ABC = AB ^ C;
}
else
{
unreached();
}

return ABC;
}
#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS

unsigned GenTreeLclFld::GetSize() const
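The key scheme in the removed comment works because 0xF0, 0xCC, and 0xAA are the 8-entry truth tables of inputs A, B, and C, so evaluating the desired boolean expression directly on the keys yields the vpternlog control byte. A standalone check of the comment's (A & B) ^ C example (my sketch, not JIT code):

#include <cassert>
#include <cstdint>

int main()
{
    const uint8_t A = 0xF0, B = 0xCC, C = 0xAA; // canonical vpternlog keys
    // (A & B) ^ C: evaluate the expression on the keys to get the control byte.
    uint8_t control = static_cast<uint8_t>((A & B) ^ C);
    assert(control == 0x6A); // matches the comment's worked example
    return 0;
}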
@@ -30454,13 +30405,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);

#if defined(TARGET_XARCH)
if (oper == GT_AND_NOT)
{
// xarch does: ~op1 & op2, we need op1 & ~op2
std::swap(op1, op2);
}
#endif // TARGET_XARCH
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);

GenTree* cnsNode = nullptr;
GenTree* otherNode = nullptr;
@@ -30973,31 +30919,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
break;
}

case GT_AND_NOT:
{
// Handle `x & ~0 == x` and `0 & ~x == 0`
if (cnsNode->IsVectorZero())
{
if (cnsNode == op1)
{
resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
break;
}
else
{
resultNode = otherNode;
}
break;
}

// Handle `x & ~AllBitsSet == 0`
if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
{
resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
}
break;
}

case GT_DIV:
{
if (varTypeIsFloating(simdBaseType))
@@ -31388,12 +31309,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
switch (ni)
{
case NI_Vector128_ConditionalSelect:
#if defined(TARGET_XARCH)
case NI_Vector128_ConditionalSelect:
case NI_Vector256_ConditionalSelect:
case NI_Vector512_ConditionalSelect:
#elif defined(TARGET_ARM64)
case NI_Vector64_ConditionalSelect:
case NI_AdvSimd_BitwiseSelect:
case NI_Sve_ConditionalSelect:
#endif
{
@@ -6527,7 +6527,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
bool OperRequiresGlobRefFlag() const;

unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3);
uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const;

ClassLayout* GetLayout(Compiler* compiler) const;
@@ -352,6 +352,365 @@ const TernaryLogicInfo& TernaryLogicInfo::lookup(uint8_t control)

return ternaryLogicFlags[control];
}

//------------------------------------------------------------------------
// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
// given the oper and two existing control bytes
//
// Arguments:
// oper -- the operation being performed
// op1 -- the control byte for op1
// op2 -- the control byte for op2
//
// Return Value:
// The new control byte evaluated from performing oper on op1 and op2
//
uint8_t TernaryLogicInfo::GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2)
{
switch (oper)
{
case GT_AND:
{
return static_cast<uint8_t>(op1 & op2);
}

case GT_AND_NOT:
{
return static_cast<uint8_t>(~op1 & op2);
}

case GT_OR:
{
return static_cast<uint8_t>(op1 | op2);
}

case GT_XOR:
{
return static_cast<uint8_t>(op1 ^ op2);
}

default:
{
unreached();
}
}
}
//------------------------------------------------------------------------
// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
// given a ternary logic oper and two inputs
//
// Arguments:
// oper -- the operation being performed
// op1 -- the control byte for op1, this is ignored for unary oper
// op2 -- the control byte for op2
//
// Return Value:
// The new control byte evaluated from performing oper on op1 and op2
//
uint8_t TernaryLogicInfo::GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2)
{
switch (oper)
{
case TernaryLogicOperKind::Select:
{
return op2;
}

case TernaryLogicOperKind::Not:
{
return ~op2;
}

case TernaryLogicOperKind::And:
{
return op1 & op2;
}

case TernaryLogicOperKind::Nand:
{
return ~(op1 & op2);
}

case TernaryLogicOperKind::Or:
{
return op1 | op2;
}

case TernaryLogicOperKind::Nor:
{
return ~(op1 | op2);
}

case TernaryLogicOperKind::Xor:
{
return op1 ^ op2;
}

case TernaryLogicOperKind::Xnor:
{
return ~(op1 ^ op2);
}

default:
{
unreached();
}
}
}
//------------------------------------------------------------------------
// GetTernaryControlByte: Get the control byte for a TernaryLogic operation
// given an existing info and three control bytes
//
// Arguments:
// info -- the info describing the operation being performed
// op1 -- the control byte for op1
// op2 -- the control byte for op2
// op3 -- the control byte for op3
//
// Return Value:
// The new control byte evaluated from performing info on op1, op2, and op3
//
uint8_t TernaryLogicInfo::GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3)
{
uint8_t oper1Result;

switch (info.oper1Use)
{
case TernaryLogicUseFlags::None:
{
assert(info.oper2 == TernaryLogicOperKind::None);
assert(info.oper2Use == TernaryLogicUseFlags::None);

assert(info.oper3 == TernaryLogicOperKind::None);
assert(info.oper3Use == TernaryLogicUseFlags::None);

switch (info.oper1)
{
case TernaryLogicOperKind::False:
{
oper1Result = 0x00;
break;
}

case TernaryLogicOperKind::True:
{
oper1Result = 0xFF;
break;
}

default:
{
unreached();
}
}
break;
}

case TernaryLogicUseFlags::A:
{
oper1Result = GetTernaryControlByte(info.oper1, 0x00, op1);
break;
}

case TernaryLogicUseFlags::B:
{
oper1Result = GetTernaryControlByte(info.oper1, 0x00, op2);
break;
}

case TernaryLogicUseFlags::C:
{
oper1Result = GetTernaryControlByte(info.oper1, 0x00, op3);
break;
}

case TernaryLogicUseFlags::AB:
{
oper1Result = GetTernaryControlByte(info.oper1, op1, op2);
break;
}

case TernaryLogicUseFlags::AC:
{
oper1Result = GetTernaryControlByte(info.oper1, op1, op3);
break;
}

case TernaryLogicUseFlags::BC:
{
oper1Result = GetTernaryControlByte(info.oper1, op2, op3);
break;
}

case TernaryLogicUseFlags::ABC:
{
assert(info.oper2 == TernaryLogicOperKind::None);
assert(info.oper2Use == TernaryLogicUseFlags::None);

assert(info.oper3 == TernaryLogicOperKind::None);
assert(info.oper3Use == TernaryLogicUseFlags::None);

switch (info.oper1)
{
case TernaryLogicOperKind::Nor:
{
oper1Result = ~(op1 | op2 | op3);
break;
}

case TernaryLogicOperKind::Minor:
{
oper1Result = 0x17;
break;
}

case TernaryLogicOperKind::Xnor:
{
oper1Result = ~(op1 ^ op2 ^ op3);
break;
}

case TernaryLogicOperKind::Nand:
{
oper1Result = ~(op1 & op2 & op3);
break;
}

case TernaryLogicOperKind::And:
{
oper1Result = op1 & op2 & op3;
break;
}

case TernaryLogicOperKind::Xor:
{
oper1Result = op1 ^ op2 ^ op3;
break;
}

case TernaryLogicOperKind::Major:
{
oper1Result = 0xE8;
break;
}

case TernaryLogicOperKind::Or:
{
oper1Result = op1 | op2 | op3;
break;
}

default:
{
unreached();
}
}
break;
}

default:
{
unreached();
}
}

uint8_t oper2Result;

switch (info.oper2Use)
{
case TernaryLogicUseFlags::None:
{
assert(info.oper3 == TernaryLogicOperKind::None);
assert(info.oper3Use == TernaryLogicUseFlags::None);

oper2Result = oper1Result;
break;
}

case TernaryLogicUseFlags::A:
{
oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op1);
break;
}

case TernaryLogicUseFlags::B:
{
oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op2);
break;
}

case TernaryLogicUseFlags::C:
{
oper2Result = GetTernaryControlByte(info.oper2, oper1Result, op3);
break;
}

case TernaryLogicUseFlags::AB:
{
oper2Result = GetTernaryControlByte(info.oper2, op1, op2);
break;
}

case TernaryLogicUseFlags::AC:
{
oper2Result = GetTernaryControlByte(info.oper2, op1, op3);
break;
}

case TernaryLogicUseFlags::BC:
{
oper2Result = GetTernaryControlByte(info.oper2, op2, op3);
break;
}

default:
{
unreached();
}
}

uint8_t oper3Result;

switch (info.oper3Use)
{
case TernaryLogicUseFlags::None:
{
assert(info.oper3 == TernaryLogicOperKind::None);
oper3Result = oper2Result;
break;
}

case TernaryLogicUseFlags::A:
{
assert(info.oper3 == TernaryLogicOperKind::Cond);
oper3Result = (oper1Result & op1) | (oper2Result & ~op1);
break;
}

case TernaryLogicUseFlags::B:
{
assert(info.oper3 == TernaryLogicOperKind::Cond);
oper3Result = (oper1Result & op2) | (oper2Result & ~op2);
break;
}

case TernaryLogicUseFlags::C:
{
assert(info.oper3 == TernaryLogicOperKind::Cond);
oper3Result = (oper1Result & op3) | (oper2Result & ~op3);
break;
}

default:
{
unreached();
}
}

return oper3Result;
}
#endif // TARGET_XARCH

//------------------------------------------------------------------------
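The genTreeOps overload above composes control bytes the same way the removed GenTree helper did: fold the first operator over two key bytes, then fold the second operator over that result and the remaining key. A standalone mirror of that logic (a sketch with stand-in enum values, not the JIT's types):

#include <cassert>
#include <cstdint>

enum Oper { AND, AND_NOT, OR, XOR }; // stand-ins for GT_AND, GT_AND_NOT, GT_OR, GT_XOR

static uint8_t controlByte(Oper oper, uint8_t op1, uint8_t op2)
{
    switch (oper)
    {
        case AND:     return static_cast<uint8_t>(op1 & op2);
        case AND_NOT: return static_cast<uint8_t>(~op1 & op2); // xarch operand order
        case OR:      return static_cast<uint8_t>(op1 | op2);
        case XOR:     return static_cast<uint8_t>(op1 ^ op2);
    }
    return 0;
}

int main()
{
    // Fold (A & B) ^ C by composing per-operator control bytes.
    uint8_t ab  = controlByte(AND, 0xF0, 0xCC);
    uint8_t abc = controlByte(XOR, ab, 0xAA);
    assert(abc == 0x6A);
    return 0;
}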
@@ -481,6 +481,10 @@ struct TernaryLogicInfo

static const TernaryLogicInfo& lookup(uint8_t control);

static uint8_t GetTernaryControlByte(genTreeOps oper, uint8_t op1, uint8_t op2);
static uint8_t GetTernaryControlByte(TernaryLogicOperKind oper, uint8_t op1, uint8_t op2);
static uint8_t GetTernaryControlByte(const TernaryLogicInfo& info, uint8_t op1, uint8_t op2, uint8_t op3);

TernaryLogicUseFlags GetAllUseFlags() const
{
uint8_t useFlagsBits = 0;
@@ -1024,6 +1028,11 @@ struct HWIntrinsicInfo
HWIntrinsicFlag flags = lookupFlags(id);
return (flags & HW_Flag_PermuteVar2x) != 0;
}

static bool IsTernaryLogic(NamedIntrinsic id)
{
return (id == NI_AVX512F_TernaryLogic) || (id == NI_AVX512F_VL_TernaryLogic) || (id == NI_AVX10v1_TernaryLogic);
}
#endif // TARGET_XARCH

#if defined(TARGET_ARM64)
@@ -611,15 +611,39 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_AdvSimd_BitwiseClear:
case NI_Vector64_AndNot:
case NI_Vector128_AndNot:
{
assert(sig->numArgs == 2);

// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce these
// in lowering instead, so decompose into the individual operations
// on import

op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize);
op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
break;
}

case NI_AdvSimd_OrNot:
{
assert(sig->numArgs == 2);

// We don't want to support creating OR_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce these
// in lowering instead, so decompose into the individual operations
// on import

op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize);
break;
}
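For reference, the ARM64 semantics being decomposed here are `bic(a, b) = a & ~b` and `orn(a, b) = a | ~b`; the importer now materializes the `~b` as a separate GT_NOT (folded eagerly via gtFoldExpr) feeding a plain GT_AND or GT_OR. A scalar model of the equivalence (my sketch, not JIT code):

#include <cassert>
#include <cstdint>

static uint64_t bic(uint64_t a, uint64_t b) { return a & ~b; } // AdvSimd.BitwiseClear
static uint64_t orn(uint64_t a, uint64_t b) { return a | ~b; } // AdvSimd.OrNot

int main()
{
    uint64_t a = 0xFF00, b = 0x0F0F;
    uint64_t notB = ~b; // the separate GT_NOT node produced at import
    assert(bic(a, b) == (a & notB)); // GT_AND over the decomposed pieces
    assert(orn(a, b) == (a | notB)); // GT_OR over the decomposed pieces
    return 0;
}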
@@ -1298,9 +1298,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
// non-RMW based codegen.

#if defined(DEBUG)
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
assert((intrinsicId == NI_AVX512F_TernaryLogic) || (intrinsicId == NI_AVX512F_VL_TernaryLogic) ||
(intrinsicId == NI_AVX10v1_TernaryLogic));
assert(HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId()));

uint8_t control = static_cast<uint8_t>(ival);
const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
@@ -1311,6 +1309,19 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(

op2Reg = targetReg;
}
else
{
#if defined(DEBUG)
if (HWIntrinsicInfo::IsTernaryLogic(node->GetHWIntrinsicId()))
{
uint8_t control = static_cast<uint8_t>(ival);
const TernaryLogicInfo& info = TernaryLogicInfo::lookup(control);
TernaryLogicUseFlags useFlags = info.GetAllUseFlags();

assert(useFlags == TernaryLogicUseFlags::BC);
}
#endif // DEBUG
}
}

assert(targetReg != REG_NA);
@@ -2856,6 +2867,46 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOption
break;
}

case NI_EVEX_XnorMask:
{
assert(instOptions == INS_OPTS_NONE);

uint32_t simdSize = node->GetSimdSize();
uint32_t count = simdSize / genTypeSize(baseType);

if (count <= 8)
{
assert((count == 2) || (count == 4) || (count == 8));
ins = INS_kxnorb;
}
else if (count == 16)
{
ins = INS_kxnorw;
}
else if (count == 32)
{
ins = INS_kxnord;
}
else
{
assert(count == 64);
ins = INS_kxnorq;
}

op1Reg = op1->GetRegNum();

GenTree* op2 = node->Op(2);
regNumber op2Reg = op2->GetRegNum();

assert(emitter::isMaskReg(targetReg));
assert(emitter::isMaskReg(op1Reg));
assert(emitter::isMaskReg(op2Reg));

// Use EA_32BYTE to ensure the VEX.L bit gets set
emit->emitIns_R_R_R(ins, EA_32BYTE, targetReg, op1Reg, op2Reg);
break;
}

case NI_AVX512F_ConvertToInt32:
case NI_AVX512F_ConvertToUInt32:
case NI_AVX512F_ConvertToUInt32WithTruncation:
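The width selection above follows from mask registers holding one bit per vector element: kxnorb/kxnorw/kxnord/kxnorq operate on 8, 16, 32, and 64 mask bits respectively, so the element count picks the instruction. A standalone sketch of the mapping (my illustration; instruction names as plain strings):

#include <cassert>
#include <cstdint>
#include <cstring>

static const char* pickKxnor(uint32_t count)
{
    if (count <= 8)  return "kxnorb"; // 2, 4, or 8 elements fit in the low mask byte
    if (count == 16) return "kxnorw";
    if (count == 32) return "kxnord";
    return "kxnorq";                  // count == 64
}

int main()
{
    // e.g. a 64-byte vector of 4-byte elements: 16 lanes -> kxnorw
    assert(std::strcmp(pickKxnor(64 / 4), "kxnorw") == 0);
    return 0;
}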
@@ -263,7 +263,7 @@ HARDWARE_INTRINSIC(AdvSimd, AddScalar,
HARDWARE_INTRINSIC(AdvSimd, AddWideningLower, 8, 2, true, {INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddl, INS_uaddl, INS_saddw, INS_uaddw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, AddWideningUpper, 16, 2, true, {INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddl2, INS_uaddl2, INS_saddw2, INS_uaddw2, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromSecondArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, And, -1, 2, true, {INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and, INS_and}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, BitwiseClear, -1, 2, true, {INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic, INS_bic}, HW_Category_SIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd, BitwiseSelect, -1, 3, true, {INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl, INS_bsl}, HW_Category_SIMD, HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd, Ceiling, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_invalid}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, CeilingScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_frintp, INS_frintp}, HW_Category_SIMD, HW_Flag_SIMDScalar)
@@ -401,7 +401,7 @@ HARDWARE_INTRINSIC(AdvSimd, NegateSaturate,
HARDWARE_INTRINSIC(AdvSimd, NegateScalar, 8, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fneg, INS_fneg}, HW_Category_SIMD, HW_Flag_SIMDScalar)
HARDWARE_INTRINSIC(AdvSimd, Not, -1, 1, true, {INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn, INS_mvn}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, Or, -1, 2, true, {INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr, INS_orr}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, OrNot, -1, 2, true, {INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn, INS_orn}, HW_Category_SIMD, HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiply, -1, 2, true, {INS_pmul, INS_pmul, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningLower, 8, 2, true, {INS_pmull, INS_pmull, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, PolynomialMultiplyWideningUpper, 16, 2, true, {INS_pmull2, INS_pmull2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_Commutative)
@@ -388,7 +388,7 @@ HARDWARE_INTRINSIC(X86Base_X64, DivRem,
HARDWARE_INTRINSIC(SSE, Add, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE, And, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
@@ -468,7 +468,7 @@ HARDWARE_INTRINSIC(SSE, Subtract,
HARDWARE_INTRINSIC(SSE, SubtractScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE, UnpackHigh, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, UnpackLow, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE, Xor, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_SSE NI_SSE_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -492,7 +492,7 @@ HARDWARE_INTRINSIC(SSE2, Add,
HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE2, And, 16, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, Average, 16, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, true, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, true, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
@@ -590,7 +590,7 @@ HARDWARE_INTRINSIC(SSE2, SubtractScalar,
HARDWARE_INTRINSIC(SSE2, SumAbsoluteDifferences, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE2, UnpackHigh, 16, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, UnpackLow, 16, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_SSE2 NI_SSE2_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -723,7 +723,7 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32,
HARDWARE_INTRINSIC(AVX, Add, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, And, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, Blend, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
@@ -791,7 +791,7 @@ HARDWARE_INTRINSIC(AVX, TestNotZAndNotC,
HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, true, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX, Xor, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX NI_AVX_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -805,7 +805,7 @@ HARDWARE_INTRINSIC(AVX2, Add,
HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, true, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, false, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, And, 32, 2, false, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, false, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Average, 32, 2, true, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, true, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, false, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
@@ -865,7 +865,7 @@ HARDWARE_INTRINSIC(AVX2, SubtractSaturate,
HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2, UnpackHigh, 32, 2, true, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, UnpackLow, 32, 2, true, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, false, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX2 NI_AVX2_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -880,7 +880,7 @@ HARDWARE_INTRINSIC(AVX512F, AddScalar,
HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, And, 64, 2, true, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, true, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad)
@@ -1002,7 +1002,7 @@ HARDWARE_INTRINSIC(AVX512F, SubtractScalar,
HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, true, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX512F NI_AVX512F_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -1183,7 +1183,7 @@ HARDWARE_INTRINSIC(AVX512CD_VL, LeadingZeroCount,
// AVX512DQ Intrinsics
#define FIRST_NI_AVX512DQ NI_AVX512DQ_And
HARDWARE_INTRINSIC(AVX512DQ, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512DQ, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
@@ -1203,7 +1203,7 @@ HARDWARE_INTRINSIC(AVX512DQ, Range,
HARDWARE_INTRINSIC(AVX512DQ, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512DQ, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX512DQ, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX512DQ NI_AVX512DQ_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -1368,7 +1368,7 @@ HARDWARE_INTRINSIC(AVX10v1, TernaryLogic,
// AVX10V1_V512 Intrinsics
#define FIRST_NI_AVX10v1_V512 NI_AVX10v1_V512_And
HARDWARE_INTRINSIC(AVX10v1_V512, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
@@ -1391,7 +1391,7 @@ HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8,
HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8x2, 64, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible)
HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp)
#define LAST_NI_AVX10v1_V512 NI_AVX10v1_V512_Xor

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@ -1438,7 +1438,7 @@ HARDWARE_INTRINSIC(AES, KeygenAssist,
|
|||
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
|
||||
// BMI1 Intrinsics
|
||||
#define FIRST_NI_BMI1 NI_BMI1_AndNot
|
||||
HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
|
||||
|
@ -1452,7 +1452,7 @@ HARDWARE_INTRINSIC(BMI1, TrailingZeroCount,
|
|||
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
|
||||
// BMI1 Intrinsics
|
||||
#define FIRST_NI_BMI1_X64 NI_BMI1_X64_AndNot
|
||||
HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialImport|HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
|
||||
HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
|
||||
|
@ -1599,6 +1599,7 @@ HARDWARE_INTRINSIC(EVEX, OrMask,
|
|||
HARDWARE_INTRINSIC(EVEX, ShiftLeftMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
|
||||
HARDWARE_INTRINSIC(EVEX, ShiftRightMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_SpecialCodeGen)
|
||||
HARDWARE_INTRINSIC(EVEX, XorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
|
||||
HARDWARE_INTRINSIC(EVEX, XnorMask, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
|
||||
|
||||
#endif // FEATURE_HW_INTRINSIC
|
||||
|
||||
|
|
|
@ -1394,19 +1394,62 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_SSE_AndNot:
case NI_SSE2_AndNot:
case NI_AVX_AndNot:
case NI_AVX2_AndNot:
case NI_AVX512F_AndNot:
case NI_AVX512DQ_AndNot:
case NI_AVX10v1_V512_AndNot:
{
assert(sig->numArgs == 2);

// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead, so decompose into the individual operations
// on import. Note that despite the name, these APIs
// do (~op1 & op2), so we need to account for that.

op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

op1 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
break;
}

case NI_BMI1_AndNot:
case NI_BMI1_X64_AndNot:
{
assert(sig->numArgs == 2);

// The same general reasoning for the decomposition exists here as
// given above for the SIMD AndNot APIs.

op2 = impPopStack().val;
op1 = impPopStack().val;

op1 = gtFoldExpr(gtNewOperNode(GT_NOT, retType, op1));
retNode = gtNewOperNode(GT_AND, retType, op1, op2);
break;
}

case NI_Vector128_AndNot:
case NI_Vector256_AndNot:
case NI_Vector512_AndNot:
{
assert(sig->numArgs == 2);

impSpillSideEffect(true,
verCurrentState.esStackDepth - 2 DEBUGARG("Spilling op1 side effects for HWIntrinsic"));

// We don't want to support creating AND_NOT nodes prior to LIR
// as it can break important optimizations. We'll produce this
// in lowering instead, so decompose into the individual operations
// on import.

op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, op1, op2, simdBaseJitType, simdSize);
op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize));
retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize);
break;
}
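A scalar sketch of what this import-time decomposition buys (illustrative only, not code from the diff): once AndNot is split into GT_NOT and GT_AND, gtFoldExpr can fold the NOT of a constant operand into a plain mask, and the remaining AND participates in the usual commutative and bitwise optimizations. Note the two operand conventions the cases above handle:

    // C++ analogue of the decomposed import shapes.
    #include <cstdint>
    #include <cassert>

    // Hardware-style and-not: Sse.AndNot(a, b) and Bmi1.AndNot(a, b)
    // compute (~a & b), so the importer negates op1 before the AND.
    static uint32_t ImportHardwareAndNot(uint32_t a, uint32_t b)
    {
        uint32_t notA = ~a; // GT_NOT, folded to a constant when `a` is constant
        return notA & b;    // GT_AND
    }

    // API-style and-not: Vector128.AndNot(a, b) computes (a & ~b),
    // so the importer negates op2 instead.
    static uint32_t ImportVectorAndNot(uint32_t a, uint32_t b)
    {
        return a & ~b;
    }

    int main()
    {
        assert(ImportHardwareAndNot(0x0F0F0F0Fu, 0xFFFF0000u) == 0xF0F00000u);
        assert(ImportVectorAndNot(0xFFFF0000u, 0x0F0F0F0Fu) == 0xF0F00000u);
        return 0;
    }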
@ -160,20 +160,8 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize);

#if defined(TARGET_XARCH)
if (canUseEvexEncoding())
{
GenTree* control;

control = gtNewIconNode(static_cast<uint8_t>((0xF0 | 0xCC) ^ 0xAA)); // (A | B) ^ C
xor1 = gtNewSimdTernaryLogicNode(simdType, vec1, toLowerVec1, cnsVec1, control, baseType, simdSize);
}
else
#endif // TARGET_XARCH
{
vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize);
xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize);
}
vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize);
xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize);

vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize);
}

@ -184,22 +172,10 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(

// ((v1 ^ cns1) | (v2 ^ cns2)) == zero

#if defined(TARGET_XARCH)
if (canUseEvexEncoding())
{
GenTree* control;
GenTree* xor2;

control = gtNewIconNode(static_cast<uint8_t>(0xF0 | (0xCC ^ 0xAA))); // A | (B ^ C)
orr = gtNewSimdTernaryLogicNode(simdType, xor1, vec2, cnsVec2, control, baseType, simdSize);
}
else
#endif // TARGET_XARCH
{
GenTree* xor2;

xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize);
orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize);
}
xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize);
orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize);

// Optimization: use a single load when byteLen equals simdSize.
// For code simplicity we always create nodes for the two-vectors case.
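The removed branches built vpternlog control immediates from truth-table constants. A runnable check of that construction (illustrative, not from the diff): 0xF0, 0xCC, and 0xAA are the 8-entry truth tables of operands A, B, and C, so evaluating the desired boolean expression over those constants yields the immediate that encodes it.

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;
        // The two control bytes the removed code computed:
        printf("(A | B) ^ C -> 0x%02X\n", (uint8_t)((A | B) ^ C)); // 0x56
        printf("A | (B ^ C) -> 0x%02X\n", (uint8_t)(A | (B ^ C))); // 0xF6
        return 0;
    }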
@ -1233,6 +1233,87 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)

NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();

bool isScalar = false;
genTreeOps oper = node->GetOperForHWIntrinsicId(&isScalar);

switch (oper)
{
case GT_AND:
case GT_OR:
{
// We want to recognize (~op1 & op2) and transform it
// into AdvSimd.AndNot(op2, op1), as well as (op1 & ~op2),
// transforming it into AdvSimd.AndNot(op1, op2).
//
// We want to similarly handle (~op1 | op2) and (op1 | ~op2).

bool transform = false;

GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);

if (op2->OperIsHWIntrinsic())
{
GenTreeHWIntrinsic* op2Intrin = op2->AsHWIntrinsic();

bool op2IsScalar = false;
genTreeOps op2Oper = op2Intrin->GetOperForHWIntrinsicId(&op2IsScalar);

if (op2Oper == GT_NOT)
{
assert(!op2IsScalar);
transform = true;

op2 = op2Intrin->Op(1);
BlockRange().Remove(op2Intrin);
}
}

if (!transform && op1->OperIsHWIntrinsic())
{
GenTreeHWIntrinsic* opIntrin = op1->AsHWIntrinsic();

bool op1IsScalar = false;
genTreeOps op1Oper = opIntrin->GetOperForHWIntrinsicId(&op1IsScalar);

if (op1Oper == GT_NOT)
{
assert(!op1IsScalar);
transform = true;

op1 = opIntrin->Op(1);
BlockRange().Remove(opIntrin);

std::swap(op1, op2);
}
}

if (transform)
{
if (oper == GT_AND)
{
oper = GT_AND_NOT;
intrinsicId = NI_AdvSimd_BitwiseClear;
}
else
{
assert(oper == GT_OR);
oper = GT_NONE;
intrinsicId = NI_AdvSimd_OrNot;
}

node->ChangeHWIntrinsicId(intrinsicId, op1, op2);
}
break;
}

default:
{
break;
}
}

switch (intrinsicId)
{
case NI_Vector64_Create:
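A scalar analogue (illustrative, not JIT code) of the patterns this lowering recognizes: both operand orders of AND and OR with a negated operand collapse to the single ARM64 and-not (BIC) or or-not (ORN) forms, which is why the pass swaps operands when the NOT sits on op1.

    #include <cstdint>
    #include <cassert>

    // Shapes produced by the rewrite above:
    static uint32_t BitwiseClear(uint32_t a, uint32_t b) { return a & ~b; } // NI_AdvSimd_BitwiseClear (BIC)
    static uint32_t OrNot(uint32_t a, uint32_t b)        { return a | ~b; } // NI_AdvSimd_OrNot (ORN)

    int main()
    {
        uint32_t x = 0x12345678u, y = 0x0F0F0F0Fu;
        assert((x & ~y) == BitwiseClear(x, y)); // op1 & ~op2
        assert((~x & y) == BitwiseClear(y, x)); // ~op1 & op2: operands swapped
        assert((x | ~y) == OrNot(x, y));        // op1 | ~op2
        assert((~x | y) == OrNot(y, x));        // ~op1 | op2: operands swapped
        return 0;
    }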
File diff suppressed because it is too large
@ -9921,6 +9921,9 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar);
genTreeOps oper = actualOper;

// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);

if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
{
GenTree* op1 = node->Op(1);

@ -9994,12 +9997,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case GT_AND_NOT:
{
maskIntrinsicId = NI_EVEX_AndNotMask;
break;
}

case GT_NOT:
{
maskIntrinsicId = NI_EVEX_NotMask;

@ -10079,91 +10076,6 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)

switch (oper)
{
// Transforms:
// 1. (~v1 & v2) to VectorXxx.AndNot(v2, v1)
// 2. (v1 & ~v2) to VectorXxx.AndNot(v1, v2)
case GT_AND:
{
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
GenTree* lhs = nullptr;
GenTree* rhs = nullptr;

if (op1->OperIsHWIntrinsic())
{
// Try handle: ~op1 & op2
GenTreeHWIntrinsic* hw = op1->AsHWIntrinsic();
genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar);

if (isScalar)
{
return node;
}

#if defined(TARGET_ARM64)
if (hwOper == GT_NOT)
{
lhs = op2;
rhs = hw->Op(1);
}
#elif defined(TARGET_XARCH)
if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet())
{
lhs = op2;
rhs = hw->Op(1);
}
#endif // !TARGET_ARM64 && !TARGET_XARCH
}

if ((lhs == nullptr) && op2->OperIsHWIntrinsic())
{
// Try handle: op1 & ~op2
GenTreeHWIntrinsic* hw = op2->AsHWIntrinsic();
genTreeOps hwOper = hw->GetOperForHWIntrinsicId(&isScalar);

if (isScalar)
{
return node;
}

#if defined(TARGET_ARM64)
if (hwOper == GT_NOT)
{
lhs = op1;
rhs = hw->Op(1);
}
#elif defined(TARGET_XARCH)
if ((hwOper == GT_XOR) && hw->Op(2)->IsVectorAllBitsSet())
{
lhs = op1;
rhs = hw->Op(1);
}
#endif // !TARGET_ARM64 && !TARGET_XARCH
}

if (lhs == nullptr)
{
break;
}
assert(rhs != nullptr);

// Filter out side effecting cases for several reasons:
// 1. gtNewSimdBinOpNode may swap operand order.
// 2. The code above will swap operand order.
// 3. The code above does not handle GTF_REVERSE_OPS.
if (((lhs->gtFlags | rhs->gtFlags) & GTF_ALL_EFFECT) != 0)
{
break;
}

GenTree* andnNode = gtNewSimdBinOpNode(GT_AND_NOT, retType, lhs, rhs, simdBaseJitType, simdSize);

DEBUG_DESTROY_NODE(node);
INDEBUG(andnNode->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED);

return andnNode;
}

#if defined(TARGET_ARM64)
// Transforms:
// 1. -(-v1) to v1
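One detail in the removed matcher worth a runnable note (illustrative only): xarch has no single vector NOT instruction, so the JIT represents ~v as (v ^ AllBitsSet). That is why the xarch arm of the pattern checked for GT_XOR against an all-bits-set constant where the arm64 arm checked for GT_NOT directly.

    #include <cstdint>
    #include <cassert>

    int main()
    {
        uint32_t v = 0xDEADBEEFu;
        assert((v ^ 0xFFFFFFFFu) == ~v); // ~v is encoded as v ^ AllBitsSet
        return 0;
    }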
@ -7919,13 +7919,11 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,

if (oper != GT_NONE)
{
// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);

#if defined(TARGET_XARCH)
if (oper == GT_AND_NOT)
{
// xarch does: ~arg0VN & arg1VN
std::swap(arg0VN, arg1VN);
}
else if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ))
if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ))
{
if (TypeOfVN(arg1VN) == TYP_SIMD16)
{

@ -8047,6 +8045,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);

// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);

if (isScalar)
{
// We don't support folding scalars today

@ -8108,37 +8109,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
break;
}

case GT_AND_NOT:
{
#if defined(TARGET_XARCH)
std::swap(arg0VN, arg1VN);
#endif // TARGET_XARCH

// Handle `x & ~0 == x` and `0 & ~x == 0`
ValueNum zeroVN = VNZeroForType(type);

if (cnsVN == zeroVN)
{
if (cnsVN == arg0VN)
{
return zeroVN;
}
return argVN;
}

// Handle `x & ~AllBitsSet == 0`
ValueNum allBitsVN = VNAllBitsForType(type);

if (cnsVN == allBitsVN)
{
if (cnsVN == arg1VN)
{
return zeroVN;
}
}
break;
}

case GT_DIV:
{
if (varTypeIsFloating(baseType))

@ -8397,6 +8367,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);

// We shouldn't find AND_NOT nodes since they should only be produced in lowering
assert(oper != GT_AND_NOT);

if (isScalar)
{
// We don't support folding scalars today

@ -8411,12 +8384,6 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(GenTreeHWIntrinsic* tree,
return arg0VN;
}

case GT_AND_NOT:
{
// Handle `x & ~x == 0`
return VNZeroForType(type);
}

case GT_OR:
{
// Handle `x | x == x`
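The removed value-numbering cases encoded a handful of and-not identities. A runnable scalar check of them (illustrative only; the vector forms behave the same way per element):

    #include <cstdint>
    #include <cassert>

    int main()
    {
        uint32_t x = 0xCAFEF00Du;
        const uint32_t allBits = 0xFFFFFFFFu;
        assert((x & ~0u) == x);       // x & ~0 == x
        assert((0u & ~x) == 0u);      // 0 & ~x == 0
        assert((x & ~allBits) == 0u); // x & ~AllBitsSet == 0
        assert((x & ~x) == 0u);       // x & ~x == 0
        return 0;
    }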
@ -8575,12 +8542,12 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary(GenTreeHWIntrinsic* tree,

switch (ni)
{
case NI_Vector128_ConditionalSelect:
#if defined(TARGET_XARCH)
case NI_Vector128_ConditionalSelect:
case NI_Vector256_ConditionalSelect:
case NI_Vector512_ConditionalSelect:
#elif defined(TARGET_ARM64)
case NI_Vector64_ConditionalSelect:
case NI_AdvSimd_BitwiseSelect:
case NI_Sve_ConditionalSelect:
#endif
{