Mirror of https://github.com/VSadov/Satori.git, synced 2025-06-09 09:34:49 +09:00
Faster unsigned division by constants (#49585)
* Faster unsigned division by constants
* Fix Arm build and add some tests.
* Improve register allocation
* Fix ARM64 codegen
* Fix MULHI flags
* Remove ARM32 codegen
* Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation.
* Always widen 32bit UDIV to 64bit MUL/MULHI
* Cleanup
* Final optimization
* Fix typo
Parent: 63ea9c6b96
Commit: e4b4807e2f
15 changed files with 328 additions and 120 deletions
@@ -942,3 +942,13 @@ OF SUCH DAMAGES.
You acknowledge that this software is not designed, licensed or
intended for use in the design, construction, operation or
maintenance of any nuclear facility.


License notice for "Faster Unsigned Division by Constants"
------------------------------

Reference implementations of computing and using the "magic number" approach to dividing
by constants, including codegen instructions. The unsigned division incorporates the
"round down" optimization per ridiculous_fish.

This is free and unencumbered software. Any copyright is dedicated to the Public Domain.
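In outline, the technique the notice covers replaces an unsigned division n / d by a multiplication with a precomputed constant. For N-bit operands and a suitable shift s, either

    round up:   M = ceil(2^(N+s) / d),  then n / d == (n * M) >> (N + s)
    round down: M = floor(2^(N+s) / d), then n / d == (saturate(n + 1) * M) >> (N + s)

The round-down form is the ridiculous_fish refinement referenced above; roughly, it is chosen when no suitable round-up constant fits in N bits, and the saturating increment it needs is what the new GT_INC_SATURATE node in this commit provides.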
@@ -5152,8 +5152,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta
case GT_INTRINSIC:
break;

case GT_INC_SATURATE:
case GT_MULHI:
assert(false && "Unexpected GT_MULHI node encountered before lowering");
assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
break;

case GT_JTRUE:
@@ -843,6 +843,7 @@ protected:

void genCodeForDivMod(GenTreeOp* treeNode);
void genCodeForMul(GenTreeOp* treeNode);
void genCodeForIncSaturate(GenTree* treeNode);
void genCodeForMulHi(GenTreeOp* treeNode);
void genLeaInstruction(GenTreeAddrMode* lea);
void genSetRegToCond(regNumber dstReg, GenTree* tree);
@@ -1753,6 +1753,28 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
}
}

// Produce code for a GT_INC_SATURATE node.
void CodeGen::genCodeForIncSaturate(GenTree* tree)
{
regNumber targetReg = tree->GetRegNum();
var_types targetType = tree->TypeGet();

// The arithmetic node must be sitting in a register (since it's not contained)
assert(!tree->isContained());
// The dst can only be a register.
assert(targetReg != REG_NA);

GenTree* operand = tree->gtGetOp1();
assert(!operand->isContained());
// The src must be a register.
regNumber operandReg = genConsumeReg(operand);

GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1);
GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS);

genProduceReg(tree);
}

// Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{
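GT_INC_SATURATE is an unsigned increment that sticks at the all-ones value instead of wrapping to zero; the ARM64 sequence above gets there with adds (which sets the carry flag exactly when the increment wraps) followed by cinv ..., hs (which inverts the wrapped zero back to all ones when the carry is set). A minimal sketch of the intended semantics in plain C++ (illustrative only, not JIT code):

    #include <cstdint>

    // Saturating increment: UINT64_MAX stays UINT64_MAX instead of wrapping to 0.
    // This is what the adds/cinv pair above computes for a 64-bit register.
    inline uint64_t IncSaturate(uint64_t x)
    {
        return (x == UINT64_MAX) ? UINT64_MAX : x + 1;
    }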
@@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)

#ifdef TARGET_ARM64

case GT_INC_SATURATE:
genCodeForIncSaturate(treeNode);
break;

case GT_MULHI:
genCodeForMulHi(treeNode->AsOp());
break;
@@ -605,6 +605,27 @@ void CodeGen::genCodeForBswap(GenTree* tree)
genProduceReg(tree);
}

// Produce code for a GT_INC_SATURATE node.
void CodeGen::genCodeForIncSaturate(GenTree* tree)
{
regNumber targetReg = tree->GetRegNum();
var_types targetType = tree->TypeGet();

GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
regNumber operandReg = genConsumeReg(operand);

if (operandReg != targetReg)
{
inst_RV_RV(INS_mov, targetReg, operandReg, targetType);
}

inst_RV_IV(INS_add, targetReg, 1, emitActualTypeSize(targetType));
inst_RV_IV(INS_sbb, targetReg, 0, emitActualTypeSize(targetType));

genProduceReg(tree);
}

// Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{
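The x86 flavor reaches the same result through the carry flag: add reg, 1 sets CF only when the increment wraps to zero, and sbb reg, 0 then subtracts that borrow, turning the wrapped 0 back into all ones. The same idea in portable C++ (a hedged illustration, not part of the diff):

    #include <cstdint>

    // Carry-flag formulation of the saturating increment used above: the carry is 1
    // exactly when x + 1 wraps to 0, and subtracting it from the wrapped value
    // (0 - 1) restores the all-ones saturated result.
    inline uint64_t IncSaturateViaCarry(uint64_t x)
    {
        uint64_t sum   = x + 1;          // add reg, 1
        uint64_t carry = (sum == 0);     // CF after the add
        return sum - carry;              // sbb reg, 0
    }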
@@ -1608,6 +1629,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
genCodeForIndir(treeNode->AsIndir());
break;

case GT_INC_SATURATE:
genCodeForIncSaturate(treeNode);
break;

case GT_MULHI:
#ifdef TARGET_X86
case GT_MUL_LONG:
@@ -10793,6 +10793,7 @@ public:
case GT_RETFILT:
case GT_RUNTIMELOOKUP:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
{
GenTreeUnOp* const unOp = node->AsUnOp();
if (unOp->gtOp1 != nullptr)
@@ -4328,6 +4328,7 @@ void GenTree::VisitOperands(TVisitor visitor)
#endif // FEATURE_ARG_SPLIT
case GT_RETURNTRAP:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
visitor(this->AsUnOp()->gtOp1);
return;

@@ -5217,6 +5217,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
case GT_BSWAP:
case GT_BSWAP16:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
if (def == this->AsUnOp()->gtOp1)
{
*use = &this->AsUnOp()->gtOp1;
@@ -9315,6 +9316,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
case GT_BSWAP:
case GT_BSWAP16:
case GT_KEEPALIVE:
case GT_INC_SATURATE:
#if FEATURE_ARG_SPLIT
case GT_PUTARG_SPLIT:
#endif // FEATURE_ARG_SPLIT
@@ -132,6 +132,7 @@ GTNODE(RSH , GenTreeOp ,0,GTK_BINOP)
GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROL , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROR , GenTreeOp ,0,GTK_BINOP)
GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
// GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
// the div into a MULHI + some adjustments. In codegen, we only use the
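For reference, MULHI on 32-bit operands is simply the upper half of the widened product; a hedged one-liner showing the operation these node definitions describe (illustrative, not JIT code):

    #include <cstdint>

    // High 32 bits of a 32x32 -> 64 bit unsigned multiply, i.e. what a 32-bit
    // GT_MULHI with GTF_UNSIGNED produces for operands a and b.
    inline uint32_t MulHi32(uint32_t a, uint32_t b)
    {
        return (uint32_t)(((uint64_t)a * b) >> 32);
    }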
@@ -5171,31 +5171,48 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
if (!comp->opts.MinOpts() && (divisorValue >= 3))
{
size_t magic;
bool add;
int shift;
bool increment;
int preShift;
int postShift;
bool simpleMul = false;

if (type == TYP_INT)
{
magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift);
magic =
MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift, &postShift);

#ifdef TARGET_64BIT
// avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
if (increment || (preShift
#ifdef TARGET_XARCH
// IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
&& static_cast<int32_t>(magic) < 0
#endif
))
{
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
&postShift, 32);
}
// otherwise just widen to regular multiplication
else
{
postShift += 32;
simpleMul = true;
}
#endif
}
else
{
#ifdef TARGET_64BIT
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift);
magic =
MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift, &postShift);
#else
unreached();
#endif
}
assert(divMod->MarkedDivideByConstOptimized());

// Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate:
// add == false (when divisor == 3 for example):
// div = (dividend MULHI magic) RSZ shift
// add == true (when divisor == 7 for example):
// mulhi = dividend MULHI magic
// div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
const bool requiresAdjustment = add;
const bool requiresDividendMultiuse = requiresAdjustment || !isDiv;
const bool requiresDividendMultiuse = !isDiv;
const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp);

if (requiresDividendMultiuse)
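On 64-bit targets the common outcome of the logic above is the simpleMul path: the 32-bit magic is applied with one ordinary 64-bit multiply and a single shift of 32 + postShift. A hedged sketch of what the lowered tree then computes, using divisor 3 and its usual magic constant 0xAAAAAAAB with postShift 1 (an assumption picked for illustration; the actual constant comes from GetUnsigned32Magic):

    #include <cstdint>

    // The "widen to regular multiplication" shape: x / 3 becomes a 64-bit multiply
    // by the 32-bit magic followed by a right shift of 32 + postShift.
    inline uint32_t DivBy3(uint32_t x)
    {
        return (uint32_t)(((uint64_t)x * 0xAAAAAAABu) >> 33);
    }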
@@ -5204,41 +5221,66 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
dividend = ReplaceWithLclVar(dividendUse);
}

GenTree* firstNode = nullptr;
GenTree* adjustedDividend = dividend;

// If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
if (increment)
{
adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
BlockRange().InsertBefore(divMod, adjustedDividend);
firstNode = adjustedDividend;
assert(!preShift);
}
// if "preShift" is required, then do a right shift before
else if (preShift)
{
GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
adjustedDividend = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
firstNode = preShiftBy;
}
else if (type != TYP_I_IMPL)
{
adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
BlockRange().InsertBefore(divMod, adjustedDividend);
firstNode = adjustedDividend;
}

#ifdef TARGET_XARCH
// force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
// redundant copies otherwise
if (firstNode && !simpleMul)
adjustedDividend->SetRegNum(REG_RAX);
#endif

divisor->gtType = TYP_I_IMPL;
divisor->AsIntCon()->SetIconValue(magic);

if (isDiv && !postShift && type == TYP_I_IMPL)
{
divMod->SetOper(GT_MULHI);
divMod->gtOp1 = adjustedDividend;
divMod->gtFlags |= GTF_UNSIGNED;
}
else
{
// Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
// The existing node will later be transformed into a GT_RSZ/GT_SUB that
// computes the final result. This way we don't need to find and change the use
// of the existing node.
GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor);
GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
mulhi->gtFlags |= GTF_UNSIGNED;
divisor->AsIntCon()->SetIconValue(magic);
BlockRange().InsertBefore(divMod, mulhi);
GenTree* firstNode = mulhi;
if (!firstNode)
firstNode = mulhi;

if (requiresAdjustment)
if (postShift)
{
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
BlockRange().InsertBefore(divMod, dividend, sub);

GenTree* one = comp->gtNewIconNode(1, TYP_INT);
GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
BlockRange().InsertBefore(divMod, one, rsz);

LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub);
mulhi = ReplaceWithLclVar(mulhiUse);

mulhi = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
BlockRange().InsertBefore(divMod, mulhi, add);

mulhi = add;
shift -= 1;
}

GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
BlockRange().InsertBefore(divMod, shiftBy);

if (isDiv)
if (isDiv && type == TYP_I_IMPL)
{
divMod->SetOper(GT_RSZ);
divMod->gtOp1 = mulhi;
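For a 64-bit divisor the non-simpleMul path keeps GT_MULHI, i.e. the true high half of a 128-bit product. Taking the new 64-bit table entry for 7 below, {0x9249249249249249, increment, postShift 2}, the lowered sequence corresponds to the following hedged sketch (unsigned __int128 stands in for GT_MULHI; illustrative only):

    #include <cstdint>

    // Shape produced for an unsigned 64-bit divide by 7 with the round-down magic:
    // saturating increment, multiply-high by the magic, then shift right by postShift.
    inline uint64_t DivBy7(uint64_t n)
    {
        uint64_t inc = (n == UINT64_MAX) ? n : n + 1;                                       // GT_INC_SATURATE
        uint64_t hi  = (uint64_t)(((unsigned __int128)inc * 0x9249249249249249ull) >> 64);  // GT_MULHI
        return hi >> 2;                                                                     // postShift
    }

For n = ulong.MaxValue this yields 0x2492492492492492, the value the new DivideByConst test checks.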
@@ -5246,19 +5288,39 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
}
else
{
GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy);
mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
BlockRange().InsertBefore(divMod, mulhi);
}
}

if (!isDiv)
{
// divisor UMOD dividend = dividend SUB (div MUL divisor)
GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor);
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());

divMod->SetOper(GT_SUB);
divMod->gtOp1 = dividend;
divMod->gtOp2 = mul;

BlockRange().InsertBefore(divMod, div, divisor, mul, dividend);
BlockRange().InsertBefore(divMod, divisor, mul, dividend);
}
else if (type != TYP_I_IMPL)
{
#ifdef TARGET_ARMARCH
divMod->SetOper(GT_CAST);
divMod->gtFlags |= GTF_UNSIGNED;
divMod->AsCast()->gtCastType = TYP_UINT;
#else
divMod->SetOper(GT_BITCAST);
#endif
divMod->gtOp1 = mulhi;
divMod->gtOp2 = nullptr;
}
}

if (firstNode)
ContainCheckRange(firstNode, divMod);
return true;
}
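The comment in this hunk states the identity the GT_UMOD rewrite relies on: the remainder is recovered from the quotient as n - (n / d) * d. As a quick worked example, for n = 100 and d = 7 the magic-number sequence yields the quotient 14, and the GT_SUB/GT_MUL pair then computes 100 - 14 * 7 = 2, which is 100 % 7.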
@@ -2242,8 +2242,8 @@ struct UnsignedMagic
typedef T DivisorType;

T magic;
bool add;
int shift;
bool increment;
char postShift;
};

template <typename T>
@@ -2260,7 +2260,7 @@ const UnsignedMagic<uint32_t>* TryGetUnsignedMagic(uint32_t divisor)
{},
{0xcccccccd, false, 2}, // 5
{0xaaaaaaab, false, 2}, // 6
{0x24924925, true, 3}, // 7
{0x49249249, true, 1}, // 7
{},
{0x38e38e39, false, 1}, // 9
{0xcccccccd, false, 3}, // 10
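The replaced entry for 7 is the round-down constant: 0x49249249 = floor(2^33 / 7), and ((x + 1) * 0x49249249) >> 33, with the increment saturating at 0xFFFFFFFF, reproduces x / 7 for every 32-bit x; for x = 0xFFFFFFFF it gives 0x24924924, the value the new test checks. The old entry 0x24924925 was the round-up magic that required the sub/shift/add fix-up described in the removed lowering comment.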
@@ -2279,7 +2279,7 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
{},
{0xcccccccccccccccd, false, 2}, // 5
{0xaaaaaaaaaaaaaaab, false, 2}, // 6
{0x2492492492492493, true, 3}, // 7
{0x9249249249249249, true, 2}, // 7
{},
{0xe38e38e38e38e38f, false, 3}, // 9
{0xcccccccccccccccd, false, 3}, // 10
@@ -2296,99 +2296,138 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
//
// Arguments:
// d - The divisor
// add - Pointer to a flag indicating the kind of code to generate
// shift - Pointer to the shift value to be returned
// increment - Pointer to a flag indicating if incrementing the numerator is required
// preShift - Pointer to the pre-shift value to be returned
// postShift - Pointer to the post-shift value to be returned
//
// Returns:
// The magic number.
//
// Notes:
// This code is adapted from _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58.
// The paper is based on "Division by invariant integers using multiplication"
// by Torbjorn Granlund and Peter L. Montgomery in PLDI 94
// Based on "Faster Unsigned Division by Constants" by ridiculous_fish.
// https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
// https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c

template <typename T>
T GetUnsignedMagic(T d, bool* add /*out*/, int* shift /*out*/)
T GetUnsignedMagic(T d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned num_bits)
{
assert((d >= 3) && !isPow2(d));

// The numerator must fit in a uint
assert(num_bits > 0 && num_bits <= sizeof(T) * CHAR_BIT);

// Bits in a uint
const unsigned UINT_BITS = sizeof(T) * CHAR_BIT;

if (num_bits == UINT_BITS)
{
const UnsignedMagic<T>* magic = TryGetUnsignedMagic(d);

if (magic != nullptr)
{
*shift = magic->shift;
*add = magic->add;
*increment = magic->increment;
*preShift = 0;
*postShift = magic->postShift;
return magic->magic;
}
}

typedef typename std::make_signed<T>::type ST;
// The extra shift implicit in the difference between UINT_BITS and num_bits
const unsigned extra_shift = UINT_BITS - num_bits;

const unsigned bits = sizeof(T) * 8;
const unsigned bitsMinus1 = bits - 1;
const T twoNMinus1 = T(1) << bitsMinus1;
// The initial power of 2 is one less than the first one that can possibly work
const T initial_power_of_2 = (T)1 << (UINT_BITS - 1);

*add = false;
const T nc = -ST(1) - -ST(d) % ST(d);
unsigned p = bitsMinus1;
T q1 = twoNMinus1 / nc;
T r1 = twoNMinus1 - (q1 * nc);
T q2 = (twoNMinus1 - 1) / d;
T r2 = (twoNMinus1 - 1) - (q2 * d);
T delta;
// The remainder and quotient of our power of 2 divided by d
T quotient = initial_power_of_2 / d, remainder = initial_power_of_2 % d;

do
// The magic info for the variant "round down" algorithm
T down_multiplier = 0;
unsigned down_exponent = 0;
int has_magic_down = 0;

// Compute ceil(log_2 D)
unsigned ceil_log_2_D = 0;
for (T tmp = d; tmp > 0; tmp >>= 1)
ceil_log_2_D += 1;

// Begin a loop that increments the exponent, until we find a power of 2 that works.
unsigned exponent;
for (exponent = 0;; exponent++)
{
p++;

if (r1 >= (nc - r1))
// Quotient and remainder is from previous exponent; compute it for this exponent.
if (remainder >= d - remainder)
{
q1 = 2 * q1 + 1;
r1 = 2 * r1 - nc;
// Doubling remainder will wrap around D
quotient = quotient * 2 + 1;
remainder = remainder * 2 - d;
}
else
{
q1 = 2 * q1;
r1 = 2 * r1;
// Remainder will not wrap
quotient = quotient * 2;
remainder = remainder * 2;
}

if ((r2 + 1) >= (d - r2))
// We're done if this exponent works for the round_up algorithm.
// Note that exponent may be larger than the maximum shift supported,
// so the check for >= ceil_log_2_D is critical.
if ((exponent + extra_shift >= ceil_log_2_D) || (d - remainder) <= ((T)1 << (exponent + extra_shift)))
break;

// Set magic_down if we have not set it yet and this exponent works for the round_down algorithm
if (!has_magic_down && remainder <= ((T)1 << (exponent + extra_shift)))
{
if (q2 >= (twoNMinus1 - 1))
{
*add = true;
has_magic_down = 1;
down_multiplier = quotient;
down_exponent = exponent;
}
}

q2 = 2 * q2 + 1;
r2 = 2 * r2 + 1 - d;
if (exponent < ceil_log_2_D)
{
// magic_up is efficient
*increment = false;
*preShift = 0;
*postShift = (int)exponent;
return quotient + 1;
}
else if (d & 1)
{
// Odd divisor, so use magic_down, which must have been set
assert(has_magic_down);
*increment = true;
*preShift = 0;
*postShift = (int)down_exponent;
return down_multiplier;
}
else
{
if (q2 >= twoNMinus1)
// Even divisor, so use a prefix-shifted dividend
unsigned pre_shift = 0;
T shifted_D = d;
while ((shifted_D & 1) == 0)
{
*add = true;
shifted_D >>= 1;
pre_shift += 1;
}

q2 = 2 * q2;
r2 = 2 * r2 + 1;
T result = GetUnsignedMagic<T>(shifted_D, increment, preShift, postShift, num_bits - pre_shift);
assert(*increment == 0 && *preShift == 0); // expect no increment or pre_shift in this path
*preShift = (int)pre_shift;
return result;
}

delta = d - 1 - r2;

} while ((p < (bits * 2)) && ((q1 < delta) || ((q1 == delta) && (r1 == 0))));

*shift = p - bits; // resulting shift
return q2 + 1; // resulting magic number
}

uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/)
uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/)
{
return GetUnsignedMagic<uint32_t>(d, add, shift);
return GetUnsignedMagic<uint32_t>(d, increment, preShift, postShift, 32);
}

#ifdef TARGET_64BIT
uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/)
uint64_t GetUnsigned64Magic(
uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits)
{
return GetUnsignedMagic<uint64_t>(d, add, shift);
return GetUnsignedMagic<uint64_t>(d, increment, preShift, postShift, bits);
}
#endif
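Putting the outputs together: the caller shifts the dividend right by preShift, or saturating-increments it when increment is set (the asserts in this hunk and in the lowering code treat the two as mutually exclusive), multiplies by the returned magic, keeps the high word, and finally shifts right by postShift. A hedged reference routine for the 32-bit case, mirroring the libdivide codegen reference linked in the comment rather than the JIT's actual lowering:

    #include <cstdint>

    // How the values produced by GetUnsignedMagic<uint32_t> are meant to be applied:
    // quotient = highword((increment ? sat_inc(n) : n >> preShift) * magic) >> postShift
    static uint32_t ApplyUnsignedMagic(uint32_t n, uint32_t magic, bool increment, int preShift, int postShift)
    {
        uint32_t adjusted = increment ? (n == UINT32_MAX ? n : n + 1) : (n >> preShift);
        uint32_t hi       = (uint32_t)(((uint64_t)adjusted * magic) >> 32);
        return hi >> postShift;
    }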
@@ -756,9 +756,10 @@ private:

namespace MagicDivide
{
uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/);
uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/);
#ifdef TARGET_64BIT
uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/);
uint64_t GetUnsigned64Magic(
uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits = 64);
#endif
int32_t GetSigned32Magic(int32_t d, int* shift /*out*/);
#ifdef TARGET_64BIT
src/tests/JIT/Math/Functions/DivideByConst.cs (new file, 36 lines)
@@ -0,0 +1,36 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Runtime.CompilerServices;

namespace System.MathBenchmarks
{
public static class DivideByConst
{
public static void Test()
{
Verify(ulong.MaxValue, long.MaxValue, uint.MaxValue, int.MaxValue);

[MethodImpl(MethodImplOptions.NoInlining)]
static void Verify(ulong u64, long i64, uint u32, int i32)
{
if (u64 / 7 != 0x2492492492492492) throw new Exception($"{u64:x}/7={u64 / 7:x}");
if (i64 / 7 != 0x1249249249249249) throw new Exception($"{i64:x}/7={i64 / 7:x}");
if (u32 / 7 != 0x24924924) throw new Exception($"{u32:x}/7={u32 / 7:x}");
if (i32 / 7 != 0x12492492) throw new Exception($"{i32:x}/7={i32 / 7:x}");

if (u64 / 14 != 0x1249249249249249) throw new Exception($"{u64:x}/14={u64 / 14:x}");
if (i64 / 14 != 0x924924924924924) throw new Exception($"{i64:x}/14={i64 / 14:x}");
if (u32 / 14 != 0x12492492) throw new Exception($"{u32:x}/14={u32 / 14:x}");
if (i32 / 14 != 0x9249249) throw new Exception($"{i32:x}/14={i32 / 14:x}");

if (u64 / 56 != 0x492492492492492) throw new Exception($"{u64:x}/56={u64 / 56:x}");
if (i64 / 56 != 0x249249249249249) throw new Exception($"{i64:x}/56={i64 / 56:x}");
if (u32 / 56 != 0x4924924) throw new Exception($"{u32:x}/56={u32 / 56:x}");
if (i32 / 56 != 0x2492492) throw new Exception($"{i32:x}/56={i32 / 56:x}");
}
}
}
}
@@ -74,6 +74,8 @@ namespace System.MathBenchmarks
result += Test(singleTests.Tan);
result += Test(singleTests.Tanh);

result += Test(DivideByConst.Test);

return result;
}