1
0
Fork 0
mirror of https://github.com/VSadov/Satori.git synced 2025-06-09 17:44:48 +09:00

Faster unsigned division by constants (#49585)

* Faster unsigned division by constants

* Fix Arm build and add some tests.

* Improve register allocation

* Fix ARM64 codegen

* Fix MULHI flags

* Remove ARM32 codegen

* Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation.

* Always widen 32bit UDIV to 64bit MUL/MULHI

* Cleanup

* Final optimization

* Fix typo
This commit is contained in:
Pent Ploompuu 2021-05-18 06:06:50 +03:00 committed by GitHub
parent 63ea9c6b96
commit e4b4807e2f
Signed by: github
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 328 additions and 120 deletions

View file

@ -942,3 +942,13 @@ OF SUCH DAMAGES.
You acknowledge that this software is not designed, licensed or You acknowledge that this software is not designed, licensed or
intended for use in the design, construction, operation or intended for use in the design, construction, operation or
maintenance of any nuclear facility. maintenance of any nuclear facility.
License notice for "Faster Unsigned Division by Constants"
------------------------------
Reference implementations of computing and using the "magic number" approach to dividing
by constants, including codegen instructions. The unsigned division incorporates the
"round down" optimization per ridiculous_fish.
This is free and unencumbered software. Any copyright is dedicated to the Public Domain.

View file

@ -5152,8 +5152,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta
case GT_INTRINSIC: case GT_INTRINSIC:
break; break;
case GT_INC_SATURATE:
case GT_MULHI: case GT_MULHI:
assert(false && "Unexpected GT_MULHI node encountered before lowering"); assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
break; break;
case GT_JTRUE: case GT_JTRUE:

View file

@ -843,6 +843,7 @@ protected:
void genCodeForDivMod(GenTreeOp* treeNode); void genCodeForDivMod(GenTreeOp* treeNode);
void genCodeForMul(GenTreeOp* treeNode); void genCodeForMul(GenTreeOp* treeNode);
void genCodeForIncSaturate(GenTree* treeNode);
void genCodeForMulHi(GenTreeOp* treeNode); void genCodeForMulHi(GenTreeOp* treeNode);
void genLeaInstruction(GenTreeAddrMode* lea); void genLeaInstruction(GenTreeAddrMode* lea);
void genSetRegToCond(regNumber dstReg, GenTree* tree); void genSetRegToCond(regNumber dstReg, GenTree* tree);

View file

@ -1753,6 +1753,28 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
} }
} }
//------------------------------------------------------------------------
// genCodeForIncSaturate: Produce code for a GT_INC_SATURATE node.
//
// Arguments:
//    tree - the GT_INC_SATURATE node; its operand must be in a register
//
// Notes:
//    Emits "adds dst, src, #1" followed by "cinv dst, dst, hs". The conditional
//    invert keys off the carry flag produced by the add, so an increment that
//    wrapped around to zero is turned into all ones (the saturated maximum).
//
void CodeGen::genCodeForIncSaturate(GenTree* tree)
{
    regNumber targetReg = tree->GetRegNum();

    // The arithmetic node must be sitting in a register (since it's not contained)
    assert(!tree->isContained());
    // The dst can only be a register.
    assert(targetReg != REG_NA);

    GenTree* operand = tree->gtGetOp1();
    assert(!operand->isContained());
    // The src must be a register.
    regNumber operandReg = genConsumeReg(operand);

    // dst = src + 1, setting the flags so the carry-out can be tested below.
    GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1);
    // On carry (HS), invert the wrapped result in place.
    GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS);

    genProduceReg(tree);
}
// Generate code to get the high N bits of a N*N=2N bit multiplication result // Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{ {

View file

@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
#ifdef TARGET_ARM64 #ifdef TARGET_ARM64
case GT_INC_SATURATE:
genCodeForIncSaturate(treeNode);
break;
case GT_MULHI: case GT_MULHI:
genCodeForMulHi(treeNode->AsOp()); genCodeForMulHi(treeNode->AsOp());
break; break;

View file

@ -605,6 +605,27 @@ void CodeGen::genCodeForBswap(GenTree* tree)
genProduceReg(tree); genProduceReg(tree);
} }
//------------------------------------------------------------------------
// genCodeForIncSaturate: Produce code for a GT_INC_SATURATE node.
//
// Arguments:
//    tree - the GT_INC_SATURATE node; its operand must be used from a register
//
// Notes:
//    The operand is copied into the target register when the two differ, then
//    "add dst, 1" and "sbb dst, 0" are emitted on the target register.
//
void CodeGen::genCodeForIncSaturate(GenTree* tree)
{
    GenTree* src = tree->gtGetOp1();
    assert(src->isUsedFromReg());

    var_types dstType = tree->TypeGet();
    regNumber dstReg  = tree->GetRegNum();
    regNumber srcReg  = genConsumeReg(src);

    // Bring the value into the destination register before modifying it in place.
    if (dstReg != srcReg)
    {
        inst_RV_RV(INS_mov, dstReg, srcReg, dstType);
    }

    // Increment, then subtract the borrow: the sbb consumes the carry left by the add.
    inst_RV_IV(INS_add, dstReg, 1, emitActualTypeSize(dstType));
    inst_RV_IV(INS_sbb, dstReg, 0, emitActualTypeSize(dstType));

    genProduceReg(tree);
}
// Generate code to get the high N bits of a N*N=2N bit multiplication result // Generate code to get the high N bits of a N*N=2N bit multiplication result
void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
{ {
@ -1608,6 +1629,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
genCodeForIndir(treeNode->AsIndir()); genCodeForIndir(treeNode->AsIndir());
break; break;
case GT_INC_SATURATE:
genCodeForIncSaturate(treeNode);
break;
case GT_MULHI: case GT_MULHI:
#ifdef TARGET_X86 #ifdef TARGET_X86
case GT_MUL_LONG: case GT_MUL_LONG:

View file

@ -10793,6 +10793,7 @@ public:
case GT_RETFILT: case GT_RETFILT:
case GT_RUNTIMELOOKUP: case GT_RUNTIMELOOKUP:
case GT_KEEPALIVE: case GT_KEEPALIVE:
case GT_INC_SATURATE:
{ {
GenTreeUnOp* const unOp = node->AsUnOp(); GenTreeUnOp* const unOp = node->AsUnOp();
if (unOp->gtOp1 != nullptr) if (unOp->gtOp1 != nullptr)

View file

@ -4328,6 +4328,7 @@ void GenTree::VisitOperands(TVisitor visitor)
#endif // FEATURE_ARG_SPLIT #endif // FEATURE_ARG_SPLIT
case GT_RETURNTRAP: case GT_RETURNTRAP:
case GT_KEEPALIVE: case GT_KEEPALIVE:
case GT_INC_SATURATE:
visitor(this->AsUnOp()->gtOp1); visitor(this->AsUnOp()->gtOp1);
return; return;

View file

@ -5217,6 +5217,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
case GT_BSWAP: case GT_BSWAP:
case GT_BSWAP16: case GT_BSWAP16:
case GT_KEEPALIVE: case GT_KEEPALIVE:
case GT_INC_SATURATE:
if (def == this->AsUnOp()->gtOp1) if (def == this->AsUnOp()->gtOp1)
{ {
*use = &this->AsUnOp()->gtOp1; *use = &this->AsUnOp()->gtOp1;
@ -9315,6 +9316,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
case GT_BSWAP: case GT_BSWAP:
case GT_BSWAP16: case GT_BSWAP16:
case GT_KEEPALIVE: case GT_KEEPALIVE:
case GT_INC_SATURATE:
#if FEATURE_ARG_SPLIT #if FEATURE_ARG_SPLIT
case GT_PUTARG_SPLIT: case GT_PUTARG_SPLIT:
#endif // FEATURE_ARG_SPLIT #endif // FEATURE_ARG_SPLIT

View file

@ -132,6 +132,7 @@ GTNODE(RSH , GenTreeOp ,0,GTK_BINOP)
GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP) GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROL , GenTreeOp ,0,GTK_BINOP) GTNODE(ROL , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROR , GenTreeOp ,0,GTK_BINOP) GTNODE(ROR , GenTreeOp ,0,GTK_BINOP)
GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply) GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
// GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn // GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
// the div into a MULHI + some adjustments. In codegen, we only use the // the div into a MULHI + some adjustments. In codegen, we only use the

View file

@ -5171,31 +5171,48 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
if (!comp->opts.MinOpts() && (divisorValue >= 3)) if (!comp->opts.MinOpts() && (divisorValue >= 3))
{ {
size_t magic; size_t magic;
bool add; bool increment;
int shift; int preShift;
int postShift;
bool simpleMul = false;
if (type == TYP_INT) if (type == TYP_INT)
{ {
magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift); magic =
MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift, &postShift);
#ifdef TARGET_64BIT
// avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
if (increment || (preShift
#ifdef TARGET_XARCH
// IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
&& static_cast<int32_t>(magic) < 0
#endif
))
{
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
&postShift, 32);
}
// otherwise just widen to regular multiplication
else
{
postShift += 32;
simpleMul = true;
}
#endif
} }
else else
{ {
#ifdef TARGET_64BIT #ifdef TARGET_64BIT
magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift); magic =
MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift, &postShift);
#else #else
unreached(); unreached();
#endif #endif
} }
assert(divMod->MarkedDivideByConstOptimized()); assert(divMod->MarkedDivideByConstOptimized());
// Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate: const bool requiresDividendMultiuse = !isDiv;
// add == false (when divisor == 3 for example):
// div = (dividend MULHI magic) RSZ shift
// add == true (when divisor == 7 for example):
// mulhi = dividend MULHI magic
// div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
const bool requiresAdjustment = add;
const bool requiresDividendMultiuse = requiresAdjustment || !isDiv;
const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp); const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp);
if (requiresDividendMultiuse) if (requiresDividendMultiuse)
@ -5204,41 +5221,66 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
dividend = ReplaceWithLclVar(dividendUse); dividend = ReplaceWithLclVar(dividendUse);
} }
GenTree* firstNode = nullptr;
GenTree* adjustedDividend = dividend;
// If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
if (increment)
{
adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
BlockRange().InsertBefore(divMod, adjustedDividend);
firstNode = adjustedDividend;
assert(!preShift);
}
// if "preShift" is required, then do a right shift before
else if (preShift)
{
GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
adjustedDividend = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
firstNode = preShiftBy;
}
else if (type != TYP_I_IMPL)
{
adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
BlockRange().InsertBefore(divMod, adjustedDividend);
firstNode = adjustedDividend;
}
#ifdef TARGET_XARCH
// force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
// redundant copies otherwise
if (firstNode && !simpleMul)
adjustedDividend->SetRegNum(REG_RAX);
#endif
divisor->gtType = TYP_I_IMPL;
divisor->AsIntCon()->SetIconValue(magic);
if (isDiv && !postShift && type == TYP_I_IMPL)
{
divMod->SetOper(GT_MULHI);
divMod->gtOp1 = adjustedDividend;
divMod->gtFlags |= GTF_UNSIGNED;
}
else
{
// Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node. // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
// The existing node will later be transformed into a GT_RSZ/GT_SUB that // The existing node will later be transformed into a GT_RSZ/GT_SUB that
// computes the final result. This way don't need to find and change the use // computes the final result. This way don't need to find and change the use
// of the existing node. // of the existing node.
GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor); GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
mulhi->gtFlags |= GTF_UNSIGNED; mulhi->gtFlags |= GTF_UNSIGNED;
divisor->AsIntCon()->SetIconValue(magic);
BlockRange().InsertBefore(divMod, mulhi); BlockRange().InsertBefore(divMod, mulhi);
GenTree* firstNode = mulhi; if (!firstNode)
firstNode = mulhi;
if (requiresAdjustment) if (postShift)
{ {
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()); GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
BlockRange().InsertBefore(divMod, dividend, sub);
GenTree* one = comp->gtNewIconNode(1, TYP_INT);
GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
BlockRange().InsertBefore(divMod, one, rsz);
LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub);
mulhi = ReplaceWithLclVar(mulhiUse);
mulhi = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
BlockRange().InsertBefore(divMod, mulhi, add);
mulhi = add;
shift -= 1;
}
GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
BlockRange().InsertBefore(divMod, shiftBy); BlockRange().InsertBefore(divMod, shiftBy);
if (isDiv) if (isDiv && type == TYP_I_IMPL)
{ {
divMod->SetOper(GT_RSZ); divMod->SetOper(GT_RSZ);
divMod->gtOp1 = mulhi; divMod->gtOp1 = mulhi;
@ -5246,19 +5288,39 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
} }
else else
{ {
GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy); mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
BlockRange().InsertBefore(divMod, mulhi);
}
}
if (!isDiv)
{
// divisor UMOD dividend = dividend SUB (div MUL divisor) // divisor UMOD dividend = dividend SUB (div MUL divisor)
GenTree* divisor = comp->gtNewIconNode(divisorValue, type); GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor); GenTree* mul = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()); dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
divMod->SetOper(GT_SUB); divMod->SetOper(GT_SUB);
divMod->gtOp1 = dividend; divMod->gtOp1 = dividend;
divMod->gtOp2 = mul; divMod->gtOp2 = mul;
BlockRange().InsertBefore(divMod, div, divisor, mul, dividend); BlockRange().InsertBefore(divMod, divisor, mul, dividend);
} }
else if (type != TYP_I_IMPL)
{
#ifdef TARGET_ARMARCH
divMod->SetOper(GT_CAST);
divMod->gtFlags |= GTF_UNSIGNED;
divMod->AsCast()->gtCastType = TYP_UINT;
#else
divMod->SetOper(GT_BITCAST);
#endif
divMod->gtOp1 = mulhi;
divMod->gtOp2 = nullptr;
}
}
if (firstNode)
ContainCheckRange(firstNode, divMod); ContainCheckRange(firstNode, divMod);
return true; return true;
} }

View file

@ -2242,8 +2242,8 @@ struct UnsignedMagic
typedef T DivisorType; typedef T DivisorType;
T magic; T magic;
bool add; bool increment;
int shift; char postShift;
}; };
template <typename T> template <typename T>
@ -2260,7 +2260,7 @@ const UnsignedMagic<uint32_t>* TryGetUnsignedMagic(uint32_t divisor)
{}, {},
{0xcccccccd, false, 2}, // 5 {0xcccccccd, false, 2}, // 5
{0xaaaaaaab, false, 2}, // 6 {0xaaaaaaab, false, 2}, // 6
{0x24924925, true, 3}, // 7 {0x49249249, true, 1}, // 7
{}, {},
{0x38e38e39, false, 1}, // 9 {0x38e38e39, false, 1}, // 9
{0xcccccccd, false, 3}, // 10 {0xcccccccd, false, 3}, // 10
@ -2279,7 +2279,7 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
{}, {},
{0xcccccccccccccccd, false, 2}, // 5 {0xcccccccccccccccd, false, 2}, // 5
{0xaaaaaaaaaaaaaaab, false, 2}, // 6 {0xaaaaaaaaaaaaaaab, false, 2}, // 6
{0x2492492492492493, true, 3}, // 7 {0x9249249249249249, true, 2}, // 7
{}, {},
{0xe38e38e38e38e38f, false, 3}, // 9 {0xe38e38e38e38e38f, false, 3}, // 9
{0xcccccccccccccccd, false, 3}, // 10 {0xcccccccccccccccd, false, 3}, // 10
@ -2296,99 +2296,138 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
// //
// Arguments: // Arguments:
// d - The divisor // d - The divisor
// add - Pointer to a flag indicating the kind of code to generate // increment - Pointer to a flag indicating if incrementing the numerator is required
// shift - Pointer to the shift value to be returned // preShift - Pointer to the pre-shift value to be returned
// postShift - Pointer to the post-shift value to be returned
// //
// Returns: // Returns:
// The magic number. // The magic number.
// //
// Notes: // Notes:
// This code is adapted from _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58. // Based on "Faster Unsigned Division by Constants" by ridiculous_fish.
// The paper is based on "Division by invariant integers using multiplication" // https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
// by Torbjorn Granlund and Peter L. Montgomery in PLDI 94 // https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c
template <typename T> template <typename T>
T GetUnsignedMagic(T d, bool* add /*out*/, int* shift /*out*/) T GetUnsignedMagic(T d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned num_bits)
{ {
assert((d >= 3) && !isPow2(d)); assert((d >= 3) && !isPow2(d));
// The numerator must fit in a uint
assert(num_bits > 0 && num_bits <= sizeof(T) * CHAR_BIT);
// Bits in a uint
const unsigned UINT_BITS = sizeof(T) * CHAR_BIT;
if (num_bits == UINT_BITS)
{
const UnsignedMagic<T>* magic = TryGetUnsignedMagic(d); const UnsignedMagic<T>* magic = TryGetUnsignedMagic(d);
if (magic != nullptr) if (magic != nullptr)
{ {
*shift = magic->shift; *increment = magic->increment;
*add = magic->add; *preShift = 0;
*postShift = magic->postShift;
return magic->magic; return magic->magic;
} }
}
typedef typename std::make_signed<T>::type ST; // The extra shift implicit in the difference between UINT_BITS and num_bits
const unsigned extra_shift = UINT_BITS - num_bits;
const unsigned bits = sizeof(T) * 8; // The initial power of 2 is one less than the first one that can possibly work
const unsigned bitsMinus1 = bits - 1; const T initial_power_of_2 = (T)1 << (UINT_BITS - 1);
const T twoNMinus1 = T(1) << bitsMinus1;
*add = false; // The remainder and quotient of our power of 2 divided by d
const T nc = -ST(1) - -ST(d) % ST(d); T quotient = initial_power_of_2 / d, remainder = initial_power_of_2 % d;
unsigned p = bitsMinus1;
T q1 = twoNMinus1 / nc;
T r1 = twoNMinus1 - (q1 * nc);
T q2 = (twoNMinus1 - 1) / d;
T r2 = (twoNMinus1 - 1) - (q2 * d);
T delta;
do // The magic info for the variant "round down" algorithm
T down_multiplier = 0;
unsigned down_exponent = 0;
int has_magic_down = 0;
// Compute ceil(log_2 D)
unsigned ceil_log_2_D = 0;
for (T tmp = d; tmp > 0; tmp >>= 1)
ceil_log_2_D += 1;
// Begin a loop that increments the exponent, until we find a power of 2 that works.
unsigned exponent;
for (exponent = 0;; exponent++)
{ {
p++; // Quotient and remainder is from previous exponent; compute it for this exponent.
if (remainder >= d - remainder)
if (r1 >= (nc - r1))
{ {
q1 = 2 * q1 + 1; // Doubling remainder will wrap around D
r1 = 2 * r1 - nc; quotient = quotient * 2 + 1;
remainder = remainder * 2 - d;
} }
else else
{ {
q1 = 2 * q1; // Remainder will not wrap
r1 = 2 * r1; quotient = quotient * 2;
remainder = remainder * 2;
} }
if ((r2 + 1) >= (d - r2)) // We're done if this exponent works for the round_up algorithm.
// Note that exponent may be larger than the maximum shift supported,
// so the check for >= ceil_log_2_D is critical.
if ((exponent + extra_shift >= ceil_log_2_D) || (d - remainder) <= ((T)1 << (exponent + extra_shift)))
break;
// Set magic_down if we have not set it yet and this exponent works for the round_down algorithm
if (!has_magic_down && remainder <= ((T)1 << (exponent + extra_shift)))
{ {
if (q2 >= (twoNMinus1 - 1)) has_magic_down = 1;
{ down_multiplier = quotient;
*add = true; down_exponent = exponent;
}
} }
q2 = 2 * q2 + 1; if (exponent < ceil_log_2_D)
r2 = 2 * r2 + 1 - d; {
// magic_up is efficient
*increment = false;
*preShift = 0;
*postShift = (int)exponent;
return quotient + 1;
}
else if (d & 1)
{
// Odd divisor, so use magic_down, which must have been set
assert(has_magic_down);
*increment = true;
*preShift = 0;
*postShift = (int)down_exponent;
return down_multiplier;
} }
else else
{ {
if (q2 >= twoNMinus1) // Even divisor, so use a prefix-shifted dividend
unsigned pre_shift = 0;
T shifted_D = d;
while ((shifted_D & 1) == 0)
{ {
*add = true; shifted_D >>= 1;
pre_shift += 1;
} }
T result = GetUnsignedMagic<T>(shifted_D, increment, preShift, postShift, num_bits - pre_shift);
q2 = 2 * q2; assert(*increment == 0 && *preShift == 0); // expect no increment or pre_shift in this path
r2 = 2 * r2 + 1; *preShift = (int)pre_shift;
return result;
} }
delta = d - 1 - r2;
} while ((p < (bits * 2)) && ((q1 < delta) || ((q1 == delta) && (r1 == 0))));
*shift = p - bits; // resulting shift
return q2 + 1; // resulting magic number
} }
uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/) uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/)
{ {
return GetUnsignedMagic<uint32_t>(d, add, shift); return GetUnsignedMagic<uint32_t>(d, increment, preShift, postShift, 32);
} }
#ifdef TARGET_64BIT #ifdef TARGET_64BIT
uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/) uint64_t GetUnsigned64Magic(
uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits)
{ {
return GetUnsignedMagic<uint64_t>(d, add, shift); return GetUnsignedMagic<uint64_t>(d, increment, preShift, postShift, bits);
} }
#endif #endif

View file

@ -756,9 +756,10 @@ private:
namespace MagicDivide namespace MagicDivide
{ {
uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/); uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/);
#ifdef TARGET_64BIT #ifdef TARGET_64BIT
uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/); uint64_t GetUnsigned64Magic(
uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits = 64);
#endif #endif
int32_t GetSigned32Magic(int32_t d, int* shift /*out*/); int32_t GetSigned32Magic(int32_t d, int* shift /*out*/);
#ifdef TARGET_64BIT #ifdef TARGET_64BIT

View file

@ -0,0 +1,36 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Runtime.CompilerServices;
namespace System.MathBenchmarks
{
    /// <summary>
    /// Checks integer division by the constants 7, 14 and 56 for the maximum value of
    /// each of the four 32/64-bit signed and unsigned integer types.
    /// </summary>
    public static class DivideByConst
    {
        /// <summary>
        /// Runs every check; throws an <see cref="Exception"/> describing the first
        /// quotient that does not match its expected value.
        /// </summary>
        public static void Test()
        {
            // The dividends are passed through a non-inlined local function so that they
            // reach the divisions as ordinary arguments, while the divisors stay literals.
            [MethodImpl(MethodImplOptions.NoInlining)]
            static void Verify(ulong maxU64, long maxI64, uint maxU32, int maxI32)
            {
                // Divisor 7
                if (maxU64 / 7 != 0x2492492492492492)
                {
                    throw new Exception($"{maxU64:x}/7={maxU64 / 7:x}");
                }
                if (maxI64 / 7 != 0x1249249249249249)
                {
                    throw new Exception($"{maxI64:x}/7={maxI64 / 7:x}");
                }
                if (maxU32 / 7 != 0x24924924)
                {
                    throw new Exception($"{maxU32:x}/7={maxU32 / 7:x}");
                }
                if (maxI32 / 7 != 0x12492492)
                {
                    throw new Exception($"{maxI32:x}/7={maxI32 / 7:x}");
                }

                // Divisor 14
                if (maxU64 / 14 != 0x1249249249249249)
                {
                    throw new Exception($"{maxU64:x}/14={maxU64 / 14:x}");
                }
                if (maxI64 / 14 != 0x924924924924924)
                {
                    throw new Exception($"{maxI64:x}/14={maxI64 / 14:x}");
                }
                if (maxU32 / 14 != 0x12492492)
                {
                    throw new Exception($"{maxU32:x}/14={maxU32 / 14:x}");
                }
                if (maxI32 / 14 != 0x9249249)
                {
                    throw new Exception($"{maxI32:x}/14={maxI32 / 14:x}");
                }

                // Divisor 56
                if (maxU64 / 56 != 0x492492492492492)
                {
                    throw new Exception($"{maxU64:x}/56={maxU64 / 56:x}");
                }
                if (maxI64 / 56 != 0x249249249249249)
                {
                    throw new Exception($"{maxI64:x}/56={maxI64 / 56:x}");
                }
                if (maxU32 / 56 != 0x4924924)
                {
                    throw new Exception($"{maxU32:x}/56={maxU32 / 56:x}");
                }
                if (maxI32 / 56 != 0x2492492)
                {
                    throw new Exception($"{maxI32:x}/56={maxI32 / 56:x}");
                }
            }

            Verify(ulong.MaxValue, long.MaxValue, uint.MaxValue, int.MaxValue);
        }
    }
}

View file

@ -74,6 +74,8 @@ namespace System.MathBenchmarks
result += Test(singleTests.Tan); result += Test(singleTests.Tan);
result += Test(singleTests.Tanh); result += Test(singleTests.Tanh);
result += Test(DivideByConst.Test);
return result; return result;
} }