Faster unsigned division by constants (#49585)

* Faster unsigned division by constants * Fix Arm build and add some tests. * Improve register allocation * Fix ARM64 codegen * Fix MULHI flags * Remove ARM32 codegen * Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation. * Always widen 32bit UDIV to 64bit MUL/MULHI * Cleanup * Final optimization * Fix typo
2025-06-09 17:44:48 +09:00 · 2021-05-18 06:06:50 +03:00 · 2021-05-18 06:06:50 +03:00 · e4b4807e2f
commit e4b4807e2f
parent 63ea9c6b96
15 changed files with 328 additions and 120 deletions
--- a/THIRD-PARTY-NOTICES.TXT
+++ b/THIRD-PARTY-NOTICES.TXT
@ -942,3 +942,13 @@ OF SUCH DAMAGES.
 You acknowledge that this software is not designed, licensed or
 intended  for  use  in  the  design, construction, operation or
 maintenance of any nuclear facility.
 License notice for "Faster Unsigned Division by Constants"
 ------------------------------
 Reference implementations of computing and using the "magic number" approach to dividing
 by constants, including codegen instructions. The unsigned division incorporates the
 "round down" optimization per ridiculous_fish.
 This is free and unencumbered software. Any copyright is dedicated to the Public Domain.
--- a/src/coreclr/jit/assertionprop.cpp
+++ b/src/coreclr/jit/assertionprop.cpp
@ -5152,8 +5152,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta
        case GT_INTRINSIC:
            break;
        case GT_INC_SATURATE:
        case GT_MULHI:
-            assert(false && "Unexpected GT_MULHI node encountered before lowering");
+            assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
            break;
        case GT_JTRUE:
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@ -843,6 +843,7 @@ protected:
    void genCodeForDivMod(GenTreeOp* treeNode);
    void genCodeForMul(GenTreeOp* treeNode);
    void genCodeForIncSaturate(GenTree* treeNode);
    void genCodeForMulHi(GenTreeOp* treeNode);
    void genLeaInstruction(GenTreeAddrMode* lea);
    void genSetRegToCond(regNumber dstReg, GenTree* tree);
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@ -1753,6 +1753,28 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
    }
 }
 //------------------------------------------------------------------------
 // genCodeForIncSaturate: Produce code for a GT_INC_SATURATE node.
 //
 // Arguments:
 //    tree - the GT_INC_SATURATE node; its single operand and its result
 //           must both be in registers (neither may be contained).
 //
 // Notes:
 //    Emits "adds target, operand, #1" followed by "cinv target, target, hs".
 //    The intent is a saturating increment: when the add carries out (the
 //    value wrapped to zero) the conditional invert turns the result into
 //    all ones; otherwise the incremented value is kept.
 //
 void CodeGen::genCodeForIncSaturate(GenTree* tree)
 {
    regNumber targetReg = tree->GetRegNum();
    // The arithmetic node must be sitting in a register (since it's not contained)
    assert(!tree->isContained());
    // The dst can only be a register.
    assert(targetReg != REG_NA);
    GenTree* operand = tree->gtGetOp1();
    assert(!operand->isContained());
    // The src must be a register.
    regNumber operandReg = genConsumeReg(operand);
    // Flag-setting add so the following conditional instruction can observe the carry.
    GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1);
    // Invert the (wrapped) result only when the add produced a carry (HS == carry set).
    GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS);
    genProduceReg(tree);
 }
 // Generate code to get the high N bits of a N*N=2N bit multiplication result
 void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
 {
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
 #ifdef TARGET_ARM64
        case GT_INC_SATURATE:
            genCodeForIncSaturate(treeNode);
            break;
        case GT_MULHI:
            genCodeForMulHi(treeNode->AsOp());
            break;
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@ -605,6 +605,27 @@ void CodeGen::genCodeForBswap(GenTree* tree)
    genProduceReg(tree);
 }
 // Emit the instruction sequence for a GT_INC_SATURATE node (saturating increment).
 void CodeGen::genCodeForIncSaturate(GenTree* tree)
 {
    regNumber dstReg  = tree->GetRegNum();
    var_types dstType = tree->TypeGet();
    GenTree* src = tree->gtGetOp1();
    assert(src->isUsedFromReg());
    regNumber srcReg = genConsumeReg(src);
    // Bring the operand into the destination register unless it already lives there.
    if (srcReg != dstReg)
    {
        inst_RV_RV(INS_mov, dstReg, srcReg, dstType);
    }
    // ADD 1, then SBB 0: the SBB subtracts the carry left by the ADD, so an
    // increment that wrapped around to zero ends up as all ones instead.
    inst_RV_IV(INS_add, dstReg, 1, emitActualTypeSize(dstType));
    inst_RV_IV(INS_sbb, dstReg, 0, emitActualTypeSize(dstType));
    genProduceReg(tree);
 }
 // Generate code to get the high N bits of a N*N=2N bit multiplication result
 void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
 {
@ -1608,6 +1629,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
            genCodeForIndir(treeNode->AsIndir());
            break;
        case GT_INC_SATURATE:
            genCodeForIncSaturate(treeNode);
            break;
        case GT_MULHI:
 #ifdef TARGET_X86
        case GT_MUL_LONG:
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@ -10793,6 +10793,7 @@ public:
            case GT_RETFILT:
            case GT_RUNTIMELOOKUP:
            case GT_KEEPALIVE:
            case GT_INC_SATURATE:
            {
                GenTreeUnOp* const unOp = node->AsUnOp();
                if (unOp->gtOp1 != nullptr)
--- a/src/coreclr/jit/compiler.hpp
+++ b/src/coreclr/jit/compiler.hpp
@ -4328,6 +4328,7 @@ void GenTree::VisitOperands(TVisitor visitor)
 #endif // FEATURE_ARG_SPLIT
        case GT_RETURNTRAP:
        case GT_KEEPALIVE:
        case GT_INC_SATURATE:
            visitor(this->AsUnOp()->gtOp1);
            return;
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@ -5217,6 +5217,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use)
        case GT_BSWAP:
        case GT_BSWAP16:
        case GT_KEEPALIVE:
        case GT_INC_SATURATE:
            if (def == this->AsUnOp()->gtOp1)
            {
                *use = &this->AsUnOp()->gtOp1;
@ -9315,6 +9316,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node)
        case GT_BSWAP:
        case GT_BSWAP16:
        case GT_KEEPALIVE:
        case GT_INC_SATURATE:
 #if FEATURE_ARG_SPLIT
        case GT_PUTARG_SPLIT:
 #endif // FEATURE_ARG_SPLIT
--- a/src/coreclr/jit/gtlist.h
+++ b/src/coreclr/jit/gtlist.h
@ -132,6 +132,7 @@ GTNODE(RSH              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(RSZ              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(ROL              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(ROR              , GenTreeOp          ,0,GTK_BINOP)
 GTNODE(INC_SATURATE     , GenTreeOp          ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
 GTNODE(MULHI            , GenTreeOp          ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
                                                           // GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
                                                           // the div into a MULHI + some adjustments. In codegen, we only use the
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@ -5171,31 +5171,48 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
    if (!comp->opts.MinOpts() && (divisorValue >= 3))
    {
        size_t magic;
-        bool   add;
+        bool   increment;
-        int    shift;
+        int    preShift;
        int    postShift;
        bool   simpleMul = false;
        if (type == TYP_INT)
        {
-            magic = MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &add, &shift);
+            magic =
                MagicDivide::GetUnsigned32Magic(static_cast<uint32_t>(divisorValue), &increment, &preShift, &postShift);
 #ifdef TARGET_64BIT
            // avoid inc_saturate/multiple shifts by widening to 32x64 MULHI
            if (increment || (preShift
 #ifdef TARGET_XARCH
                              // IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension
                              && static_cast<int32_t>(magic) < 0
 #endif
                              ))
            {
                magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift,
                                                        &postShift, 32);
            }
            // otherwise just widen to regular multiplication
            else
            {
                postShift += 32;
                simpleMul = true;
            }
 #endif
        }
        else
        {
 #ifdef TARGET_64BIT
-            magic = MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &add, &shift);
+            magic =
                MagicDivide::GetUnsigned64Magic(static_cast<uint64_t>(divisorValue), &increment, &preShift, &postShift);
 #else
            unreached();
 #endif
        }
        assert(divMod->MarkedDivideByConstOptimized());
-        // Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate:
+        const bool                 requiresDividendMultiuse = !isDiv;
        // add == false (when divisor == 3 for example):
        //     div = (dividend MULHI magic) RSZ shift
        // add == true (when divisor == 7 for example):
        //     mulhi = dividend MULHI magic
        //     div   = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1)
        const bool                 requiresAdjustment       = add;
        const bool                 requiresDividendMultiuse = requiresAdjustment || !isDiv;
        const BasicBlock::weight_t curBBWeight              = m_block->getBBWeight(comp);
        if (requiresDividendMultiuse)
@ -5204,41 +5221,66 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
            dividend = ReplaceWithLclVar(dividendUse);
        }
        GenTree* firstNode        = nullptr;
        GenTree* adjustedDividend = dividend;
        // If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first
        if (increment)
        {
            adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend);
            BlockRange().InsertBefore(divMod, adjustedDividend);
            firstNode = adjustedDividend;
            assert(!preShift);
        }
        // if "preShift" is required, then do a right shift before
        else if (preShift)
        {
            GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT);
            adjustedDividend    = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy);
            BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
            firstNode = preShiftBy;
        }
        else if (type != TYP_I_IMPL)
        {
            adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
            BlockRange().InsertBefore(divMod, adjustedDividend);
            firstNode = adjustedDividend;
        }
 #ifdef TARGET_XARCH
        // force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes
        // redundant copies otherwise
        if (firstNode && !simpleMul)
            adjustedDividend->SetRegNum(REG_RAX);
 #endif
        divisor->gtType = TYP_I_IMPL;
        divisor->AsIntCon()->SetIconValue(magic);
        if (isDiv && !postShift && type == TYP_I_IMPL)
        {
            divMod->SetOper(GT_MULHI);
            divMod->gtOp1 = adjustedDividend;
            divMod->gtFlags |= GTF_UNSIGNED;
        }
        else
        {
            // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node.
            // The existing node will later be transformed into a GT_RSZ/GT_SUB that
            // computes the final result. This way don't need to find and change the use
            // of the existing node.
-        GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor);
+            GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
            mulhi->gtFlags |= GTF_UNSIGNED;
        divisor->AsIntCon()->SetIconValue(magic);
            BlockRange().InsertBefore(divMod, mulhi);
-        GenTree* firstNode = mulhi;
+            if (!firstNode)
                firstNode = mulhi;
-        if (requiresAdjustment)
+            if (postShift)
            {
-            dividend     = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
+                GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT);
            GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi);
            BlockRange().InsertBefore(divMod, dividend, sub);
            GenTree* one = comp->gtNewIconNode(1, TYP_INT);
            GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one);
            BlockRange().InsertBefore(divMod, one, rsz);
            LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub);
            mulhi = ReplaceWithLclVar(mulhiUse);
            mulhi        = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet());
            GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi);
            BlockRange().InsertBefore(divMod, mulhi, add);
            mulhi = add;
            shift -= 1;
        }
        GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT);
                BlockRange().InsertBefore(divMod, shiftBy);
-        if (isDiv)
+                if (isDiv && type == TYP_I_IMPL)
                {
                    divMod->SetOper(GT_RSZ);
                    divMod->gtOp1 = mulhi;
@ -5246,19 +5288,39 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
                }
                else
                {
-            GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy);
+                    mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy);
                    BlockRange().InsertBefore(divMod, mulhi);
                }
            }
            if (!isDiv)
            {
                // dividend UMOD divisor = dividend SUB (div MUL divisor)
                GenTree* divisor = comp->gtNewIconNode(divisorValue, type);
-            GenTree* mul     = comp->gtNewOperNode(GT_MUL, type, div, divisor);
+                GenTree* mul     = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor);
                dividend         = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet());
                divMod->SetOper(GT_SUB);
                divMod->gtOp1 = dividend;
                divMod->gtOp2 = mul;
-            BlockRange().InsertBefore(divMod, div, divisor, mul, dividend);
+                BlockRange().InsertBefore(divMod, divisor, mul, dividend);
            }
            else if (type != TYP_I_IMPL)
            {
 #ifdef TARGET_ARMARCH
                divMod->SetOper(GT_CAST);
                divMod->gtFlags |= GTF_UNSIGNED;
                divMod->AsCast()->gtCastType = TYP_UINT;
 #else
                divMod->SetOper(GT_BITCAST);
 #endif
                divMod->gtOp1 = mulhi;
                divMod->gtOp2 = nullptr;
            }
        }
        if (firstNode)
            ContainCheckRange(firstNode, divMod);
        return true;
    }
--- a/src/coreclr/jit/utils.cpp
+++ b/src/coreclr/jit/utils.cpp
@ -2242,8 +2242,8 @@ struct UnsignedMagic
    typedef T DivisorType;
    T    magic;
-    bool add;
+    bool increment;
-    int  shift;
+    char postShift;
 };
 template <typename T>
@ -2260,7 +2260,7 @@ const UnsignedMagic<uint32_t>* TryGetUnsignedMagic(uint32_t divisor)
        {},
        {0xcccccccd, false, 2}, // 5
        {0xaaaaaaab, false, 2}, // 6
-        {0x24924925, true, 3},  // 7
+        {0x49249249, true, 1},  // 7
        {},
        {0x38e38e39, false, 1}, // 9
        {0xcccccccd, false, 3}, // 10
@ -2279,7 +2279,7 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
        {},
        {0xcccccccccccccccd, false, 2}, // 5
        {0xaaaaaaaaaaaaaaab, false, 2}, // 6
-        {0x2492492492492493, true, 3},  // 7
+        {0x9249249249249249, true, 2},  // 7
        {},
        {0xe38e38e38e38e38f, false, 3}, // 9
        {0xcccccccccccccccd, false, 3}, // 10
@ -2296,99 +2296,138 @@ const UnsignedMagic<uint64_t>* TryGetUnsignedMagic(uint64_t divisor)
 //
 // Arguments:
 //    d     - The divisor
-//    add   - Pointer to a flag indicating the kind of code to generate
+//    increment   - Pointer to a flag indicating if incrementing the numerator is required
-//    shift - Pointer to the shift value to be returned
+//    preShift - Pointer to the pre-shift value to be returned
 //    postShift - Pointer to the post-shift value to be returned
 //
 // Returns:
 //    The magic number.
 //
 // Notes:
-//    This code is adapted from _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58.
+//    Based on "Faster Unsigned Division by Constants" by ridiculous_fish.
-//    The paper is based on "Division by invariant integers using multiplication"
+//    https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf
-//    by Torbjorn Granlund and Peter L. Montgomery in PLDI 94
+//    https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c
 template <typename T>
-T GetUnsignedMagic(T d, bool* add /*out*/, int* shift /*out*/)
+T GetUnsignedMagic(T d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned num_bits)
 {
    assert((d >= 3) && !isPow2(d));
    // The numerator must fit in a uint
    assert(num_bits > 0 && num_bits <= sizeof(T) * CHAR_BIT);
    // Bits in a uint
    const unsigned UINT_BITS = sizeof(T) * CHAR_BIT;
    if (num_bits == UINT_BITS)
    {
        const UnsignedMagic<T>* magic = TryGetUnsignedMagic(d);
        if (magic != nullptr)
        {
-        *shift = magic->shift;
+            *increment = magic->increment;
-        *add   = magic->add;
+            *preShift  = 0;
            *postShift = magic->postShift;
            return magic->magic;
        }
    }
-    typedef typename std::make_signed<T>::type ST;
+    // The extra shift implicit in the difference between UINT_BITS and num_bits
    const unsigned extra_shift = UINT_BITS - num_bits;
-    const unsigned bits       = sizeof(T) * 8;
+    // The initial power of 2 is one less than the first one that can possibly work
-    const unsigned bitsMinus1 = bits - 1;
+    const T initial_power_of_2 = (T)1 << (UINT_BITS - 1);
    const T        twoNMinus1 = T(1) << bitsMinus1;
-    *add        = false;
+    // The remainder and quotient of our power of 2 divided by d
-    const T  nc = -ST(1) - -ST(d) % ST(d);
+    T quotient = initial_power_of_2 / d, remainder = initial_power_of_2 % d;
    unsigned p  = bitsMinus1;
    T        q1 = twoNMinus1 / nc;
    T        r1 = twoNMinus1 - (q1 * nc);
    T        q2 = (twoNMinus1 - 1) / d;
    T        r2 = (twoNMinus1 - 1) - (q2 * d);
    T        delta;
-    do
+    // The magic info for the variant "round down" algorithm
    T        down_multiplier = 0;
    unsigned down_exponent   = 0;
    int      has_magic_down  = 0;
    // Compute ceil(log_2 D)
    unsigned ceil_log_2_D = 0;
    for (T tmp = d; tmp > 0; tmp >>= 1)
        ceil_log_2_D += 1;
    // Begin a loop that increments the exponent, until we find a power of 2 that works.
    unsigned exponent;
    for (exponent = 0;; exponent++)
    {
-        p++;
+        // Quotient and remainder is from previous exponent; compute it for this exponent.
-
+        if (remainder >= d - remainder)
        if (r1 >= (nc - r1))
        {
-            q1 = 2 * q1 + 1;
+            // Doubling remainder will wrap around D
-            r1 = 2 * r1 - nc;
+            quotient  = quotient * 2 + 1;
            remainder = remainder * 2 - d;
        }
        else
        {
-            q1 = 2 * q1;
+            // Remainder will not wrap
-            r1 = 2 * r1;
+            quotient  = quotient * 2;
            remainder = remainder * 2;
        }
-        if ((r2 + 1) >= (d - r2))
+        // We're done if this exponent works for the round_up algorithm.
        // Note that exponent may be larger than the maximum shift supported,
        // so the check for >= ceil_log_2_D is critical.
        if ((exponent + extra_shift >= ceil_log_2_D) || (d - remainder) <= ((T)1 << (exponent + extra_shift)))
            break;
        // Set magic_down if we have not set it yet and this exponent works for the round_down algorithm
        if (!has_magic_down && remainder <= ((T)1 << (exponent + extra_shift)))
        {
-            if (q2 >= (twoNMinus1 - 1))
+            has_magic_down  = 1;
-            {
+            down_multiplier = quotient;
-                *add = true;
+            down_exponent   = exponent;
        }
    }
-            q2 = 2 * q2 + 1;
+    if (exponent < ceil_log_2_D)
-            r2 = 2 * r2 + 1 - d;
+    {
        // magic_up is efficient
        *increment = false;
        *preShift  = 0;
        *postShift = (int)exponent;
        return quotient + 1;
    }
    else if (d & 1)
    {
        // Odd divisor, so use magic_down, which must have been set
        assert(has_magic_down);
        *increment = true;
        *preShift  = 0;
        *postShift = (int)down_exponent;
        return down_multiplier;
    }
    else
    {
-            if (q2 >= twoNMinus1)
+        // Even divisor, so use a prefix-shifted dividend
        unsigned pre_shift = 0;
        T        shifted_D = d;
        while ((shifted_D & 1) == 0)
        {
-                *add = true;
+            shifted_D >>= 1;
            pre_shift += 1;
        }
-
+        T result = GetUnsignedMagic<T>(shifted_D, increment, preShift, postShift, num_bits - pre_shift);
-            q2 = 2 * q2;
+        assert(*increment == 0 && *preShift == 0); // expect no increment or pre_shift in this path
-            r2 = 2 * r2 + 1;
+        *preShift = (int)pre_shift;
        return result;
    }
        delta = d - 1 - r2;
    } while ((p < (bits * 2)) && ((q1 < delta) || ((q1 == delta) && (r1 == 0))));
    *shift = p - bits; // resulting shift
    return q2 + 1;     // resulting magic number
 }
-uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/)
+uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/)
 {
-    return GetUnsignedMagic<uint32_t>(d, add, shift);
+    return GetUnsignedMagic<uint32_t>(d, increment, preShift, postShift, 32);
 }
 #ifdef TARGET_64BIT
-uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/)
+uint64_t GetUnsigned64Magic(
    uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits)
 {
-    return GetUnsignedMagic<uint64_t>(d, add, shift);
+    return GetUnsignedMagic<uint64_t>(d, increment, preShift, postShift, bits);
 }
 #endif
--- a/src/coreclr/jit/utils.h
+++ b/src/coreclr/jit/utils.h
@ -756,9 +756,10 @@ private:
 namespace MagicDivide
 {
-uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/);
+uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/);
 #ifdef TARGET_64BIT
-uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/);
+uint64_t GetUnsigned64Magic(
    uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits = 64);
 #endif
 int32_t GetSigned32Magic(int32_t d, int* shift /*out*/);
 #ifdef TARGET_64BIT
--- a/src/tests/JIT/Math/Functions/DivideByConst.cs
+++ b/src/tests/JIT/Math/Functions/DivideByConst.cs
@ -0,0 +1,36 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 using System;
 using System.Runtime.CompilerServices;
 namespace System.MathBenchmarks
 {
    // Checks integer division by the literal constants 7, 14 and 56 for
    // ulong, long, uint and int operands against hard-coded expected quotients.
    public static class DivideByConst
    {
        // Runs every check once; throws an Exception whose message shows the
        // dividend, divisor and computed quotient (in hex) for the first mismatch.
        public static void Test()
        {
            // Each operand is the maximum value of its type.
            Verify(ulong.MaxValue, long.MaxValue, uint.MaxValue, int.MaxValue);
            // NOTE(review): NoInlining presumably keeps the JIT from seeing the
            // arguments as constants, so the divisions below are really compiled
            // as "variable / constant" - confirm.
            [MethodImpl(MethodImplOptions.NoInlining)]
            static void Verify(ulong u64, long i64, uint u32, int i32)
            {
                // Divisor 7
                if (u64 / 7 != 0x2492492492492492) throw new Exception($"{u64:x}/7={u64 / 7:x}");
                if (i64 / 7 != 0x1249249249249249) throw new Exception($"{i64:x}/7={i64 / 7:x}");
                if (u32 / 7 != 0x24924924) throw new Exception($"{u32:x}/7={u32 / 7:x}");
                if (i32 / 7 != 0x12492492) throw new Exception($"{i32:x}/7={i32 / 7:x}");
                // Divisor 14 (even: 2 * 7)
                if (u64 / 14 != 0x1249249249249249) throw new Exception($"{u64:x}/14={u64 / 14:x}");
                if (i64 / 14 != 0x924924924924924) throw new Exception($"{i64:x}/14={i64 / 14:x}");
                if (u32 / 14 != 0x12492492) throw new Exception($"{u32:x}/14={u32 / 14:x}");
                if (i32 / 14 != 0x9249249) throw new Exception($"{i32:x}/14={i32 / 14:x}");
                // Divisor 56 (even: 8 * 7)
                if (u64 / 56 != 0x492492492492492) throw new Exception($"{u64:x}/56={u64 / 56:x}");
                if (i64 / 56 != 0x249249249249249) throw new Exception($"{i64:x}/56={i64 / 56:x}");
                if (u32 / 56 != 0x4924924) throw new Exception($"{u32:x}/56={u32 / 56:x}");
                if (i32 / 56 != 0x2492492) throw new Exception($"{i32:x}/56={i32 / 56:x}");
            }
        }
    }
 }
--- a/src/tests/JIT/Math/Functions/Program.cs
+++ b/src/tests/JIT/Math/Functions/Program.cs
@ -74,6 +74,8 @@ namespace System.MathBenchmarks
            result += Test(singleTests.Tan);
            result += Test(singleTests.Tanh);
            result += Test(DivideByConst.Test);
            return result;
        }