From 6986c63ade66988c29495bb41efdabb276b20059 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Sun, 14 Mar 2021 00:09:24 +0200 Subject: [PATCH 01/11] Faster unsigned division by constants --- THIRD-PARTY-NOTICES.TXT | 10 ++ src/coreclr/jit/assertionprop.cpp | 3 +- src/coreclr/jit/codegen.h | 1 + src/coreclr/jit/codegenarmarch.cpp | 26 +++++ src/coreclr/jit/codegenxarch.cpp | 25 +++++ src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/compiler.hpp | 1 + src/coreclr/jit/gentree.cpp | 2 + src/coreclr/jit/gtlist.h | 1 + src/coreclr/jit/lower.cpp | 126 ++++++++++++---------- src/coreclr/jit/utils.cpp | 162 ++++++++++++++++++----------- src/coreclr/jit/utils.h | 4 +- 12 files changed, 241 insertions(+), 121 deletions(-) diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index b5571e67209f7a..0b28a517123d04 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -948,3 +948,13 @@ OF SUCH DAMAGES. You acknowledge that this software is not designed, licensed or intended for use in the design, construction, operation or maintenance of any nuclear facility. + + +License notice for "Faster Unsigned Division by Constants" +------------------------------ + +Reference implementations of computing and using the "magic number" approach to dividing +by constants, including codegen instructions. The unsigned division incorporates the +"round down" optimization per ridiculous_fish. + +This is free and unencumbered software. Any copyright is dedicated to the Public Domain. diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index fd4eec7118ce49..959391f2661ae3 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -5136,8 +5136,9 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta case GT_INTRINSIC: break; + case GT_INC_SATURATE: case GT_MULHI: - assert(false && "Unexpected GT_MULHI node encountered before lowering"); + assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering"); break; case GT_JTRUE: diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 1770947d640210..c7d6c010171835 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -843,6 +843,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genCodeForDivMod(GenTreeOp* treeNode); void genCodeForMul(GenTreeOp* treeNode); + void genCodeForIncSaturate(GenTree* treeNode); void genCodeForMulHi(GenTreeOp* treeNode); void genLeaInstruction(GenTreeAddrMode* lea); void genSetRegToCond(regNumber dstReg, GenTree* tree); diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 51ab100a33eac0..6797304df3a887 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -303,6 +303,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) #ifdef TARGET_ARM64 + case GT_INC_SATURATE: + genCodeForIncSaturate(treeNode); + break; + case GT_MULHI: genCodeForMulHi(treeNode->AsOp()); break; @@ -4239,4 +4243,26 @@ void CodeGen::genPushCalleeSavedRegisters() #endif // TARGET_ARM64 } +// Produce code for a GT_INC_SATURATE node. +void CodeGen::genCodeForIncSaturate(GenTree* tree) +{ + regNumber targetReg = tree->GetRegNum(); + var_types targetType = tree->TypeGet(); + + // The arithmetic node must be sitting in a register (since it's not contained) + assert(!tree->isContained()); + // The dst can only be a register. + assert(targetReg != REG_NA); + + GenTree* operand = tree->gtGetOp1(); + assert(!operand->isContained()); + // The src must be a register. + regNumber operandReg = genConsumeReg(operand); + + GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1); + GetEmitter()->emitIns_R_R_COND(INS_mvn, emitActualTypeSize(tree), targetReg, REG_ZR, INS_COND_HS); + + genProduceReg(tree); +} + #endif // TARGET_ARMARCH diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index f1f08a68e855e2..b343c13d3407db 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -611,6 +611,27 @@ void CodeGen::genCodeForBswap(GenTree* tree) genProduceReg(tree); } +// Produce code for a GT_INC_SATURATE node. +void CodeGen::genCodeForIncSaturate(GenTree* tree) +{ + regNumber targetReg = tree->GetRegNum(); + var_types targetType = tree->TypeGet(); + + GenTree* operand = tree->gtGetOp1(); + assert(operand->isUsedFromReg()); + regNumber operandReg = genConsumeReg(operand); + + if (operandReg != targetReg) + { + inst_RV_RV(INS_mov, targetReg, operandReg, targetType); + } + + inst_RV_IV(INS_add, targetReg, 1, emitActualTypeSize(targetType)); + inst_RV_IV(INS_sbb, targetReg, 0, emitActualTypeSize(targetType)); + + genProduceReg(tree); +} + // Generate code to get the high N bits of a N*N=2N bit multiplication result void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) { @@ -1633,6 +1654,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) genCodeForIndir(treeNode->AsIndir()); break; + case GT_INC_SATURATE: + genCodeForIncSaturate(treeNode); + break; + case GT_MULHI: #ifdef TARGET_X86 case GT_MUL_LONG: diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 64635e15cb2960..bc4765e09688c8 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10636,6 +10636,7 @@ class GenTreeVisitor case GT_RETFILT: case GT_RUNTIMELOOKUP: case GT_KEEPALIVE: + case GT_INC_SATURATE: { GenTreeUnOp* const unOp = node->AsUnOp(); if (unOp->gtOp1 != nullptr) diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 1266978683adff..7f20201e62f1a5 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -4399,6 +4399,7 @@ void GenTree::VisitOperands(TVisitor visitor) #endif // FEATURE_ARG_SPLIT case GT_RETURNTRAP: case GT_KEEPALIVE: + case GT_INC_SATURATE: visitor(this->AsUnOp()->gtOp1); return; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ec5352b1338f16..2d6dd20838bbeb 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -5210,6 +5210,7 @@ bool GenTree::TryGetUse(GenTree* def, GenTree*** use) case GT_BSWAP: case GT_BSWAP16: case GT_KEEPALIVE: + case GT_INC_SATURATE: if (def == this->AsUnOp()->gtOp1) { *use = &this->AsUnOp()->gtOp1; @@ -9215,6 +9216,7 @@ GenTreeUseEdgeIterator::GenTreeUseEdgeIterator(GenTree* node) case GT_BSWAP: case GT_BSWAP16: case GT_KEEPALIVE: + case GT_INC_SATURATE: #if FEATURE_ARG_SPLIT case GT_PUTARG_SPLIT: #endif // FEATURE_ARG_SPLIT diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index 638bbf3d39c128..248bada7752035 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -132,6 +132,7 @@ GTNODE(RSH , GenTreeOp ,0,GTK_BINOP) GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP) GTNODE(ROL , GenTreeOp ,0,GTK_BINOP) GTNODE(ROR , GenTreeOp ,0,GTK_BINOP) +GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod) GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply) // GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn // the div into a MULHI + some adjustments. In codegen, we only use the diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6641f412a417cf..ff145481c5fef3 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5171,31 +5171,27 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) if (!comp->opts.MinOpts() && (divisorValue >= 3)) { size_t magic; - bool add; - int shift; + bool increment; + int preShift; + int postShift; if (type == TYP_INT) { - magic = MagicDivide::GetUnsigned32Magic(static_cast(divisorValue), &add, &shift); + magic = + MagicDivide::GetUnsigned32Magic(static_cast(divisorValue), &increment, &preShift, &postShift); } else { #ifdef TARGET_64BIT - magic = MagicDivide::GetUnsigned64Magic(static_cast(divisorValue), &add, &shift); + magic = + MagicDivide::GetUnsigned64Magic(static_cast(divisorValue), &increment, &preShift, &postShift); #else unreached(); #endif } assert(divMod->MarkedDivideByConstOptimized()); - // Depending on the "add" flag returned by GetUnsignedMagicNumberForDivide we need to generate: - // add == false (when divisor == 3 for example): - // div = (dividend MULHI magic) RSZ shift - // add == true (when divisor == 7 for example): - // mulhi = dividend MULHI magic - // div = (((dividend SUB mulhi) RSZ 1) ADD mulhi)) RSZ (shift - 1) - const bool requiresAdjustment = add; - const bool requiresDividendMultiuse = requiresAdjustment || !isDiv; + const bool requiresDividendMultiuse = !isDiv; const BasicBlock::weight_t curBBWeight = m_block->getBBWeight(comp); if (requiresDividendMultiuse) @@ -5204,62 +5200,80 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) dividend = ReplaceWithLclVar(dividendUse); } - // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node. - // The existing node will later be transformed into a GT_RSZ/GT_SUB that - // computes the final result. This way don't need to find and change the use - // of the existing node. - GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, dividend, divisor); - mulhi->gtFlags |= GTF_UNSIGNED; - divisor->AsIntCon()->SetIconValue(magic); - BlockRange().InsertBefore(divMod, mulhi); - GenTree* firstNode = mulhi; + GenTree* firstNode = nullptr; + GenTree* adjustedDividend = dividend; - if (requiresAdjustment) + // If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first + if (increment) { - dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()); - GenTree* sub = comp->gtNewOperNode(GT_SUB, type, dividend, mulhi); - BlockRange().InsertBefore(divMod, dividend, sub); - - GenTree* one = comp->gtNewIconNode(1, TYP_INT); - GenTree* rsz = comp->gtNewOperNode(GT_RSZ, type, sub, one); - BlockRange().InsertBefore(divMod, one, rsz); - - LIR::Use mulhiUse(BlockRange(), &sub->AsOp()->gtOp2, sub); - mulhi = ReplaceWithLclVar(mulhiUse); - - mulhi = comp->gtNewLclvNode(mulhi->AsLclVar()->GetLclNum(), mulhi->TypeGet()); - GenTree* add = comp->gtNewOperNode(GT_ADD, type, rsz, mulhi); - BlockRange().InsertBefore(divMod, mulhi, add); - - mulhi = add; - shift -= 1; + adjustedDividend = comp->gtNewOperNode(GT_INC_SATURATE, type, adjustedDividend); + BlockRange().InsertBefore(divMod, adjustedDividend); + firstNode = adjustedDividend; + assert(!preShift); + } + // if "preShift" is required, then do a right shift before + else if (preShift) + { + GenTree* preShiftBy = comp->gtNewIconNode(preShift, TYP_INT); + adjustedDividend = comp->gtNewOperNode(GT_RSZ, type, adjustedDividend, preShiftBy); + BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend); + firstNode = preShiftBy; } - GenTree* shiftBy = comp->gtNewIconNode(shift, TYP_INT); - BlockRange().InsertBefore(divMod, shiftBy); - - if (isDiv) + if (isDiv && !postShift) { - divMod->SetOper(GT_RSZ); - divMod->gtOp1 = mulhi; - divMod->gtOp2 = shiftBy; + divMod->SetOper(GT_MULHI); + divMod->gtOp1 = adjustedDividend; + divisor->AsIntCon()->SetIconValue(magic); } else { - GenTree* div = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy); + // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node. + // The existing node will later be transformed into a GT_RSZ/GT_SUB that + // computes the final result. This way don't need to find and change the use + // of the existing node. + GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, adjustedDividend, divisor); + mulhi->gtFlags |= GTF_UNSIGNED; + divisor->AsIntCon()->SetIconValue(magic); + BlockRange().InsertBefore(divMod, mulhi); + if (!firstNode) + firstNode = mulhi; + + if (postShift) + { + GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT); + BlockRange().InsertBefore(divMod, shiftBy); - // divisor UMOD dividend = dividend SUB (div MUL divisor) - GenTree* divisor = comp->gtNewIconNode(divisorValue, type); - GenTree* mul = comp->gtNewOperNode(GT_MUL, type, div, divisor); - dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()); + if (isDiv) + { + divMod->SetOper(GT_RSZ); + divMod->gtOp1 = mulhi; + divMod->gtOp2 = shiftBy; + } + else + { + mulhi = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy); + BlockRange().InsertBefore(divMod, mulhi); + } + } + + if (!isDiv) + { + // divisor UMOD dividend = dividend SUB (div MUL divisor) + GenTree* divisor = comp->gtNewIconNode(divisorValue, type); + GenTree* mul = comp->gtNewOperNode(GT_MUL, type, mulhi, divisor); + dividend = comp->gtNewLclvNode(dividend->AsLclVar()->GetLclNum(), dividend->TypeGet()); - divMod->SetOper(GT_SUB); - divMod->gtOp1 = dividend; - divMod->gtOp2 = mul; + divMod->SetOper(GT_SUB); + divMod->gtOp1 = dividend; + divMod->gtOp2 = mul; - BlockRange().InsertBefore(divMod, div, divisor, mul, dividend); + BlockRange().InsertBefore(divMod, divisor, mul, dividend); + } } - ContainCheckRange(firstNode, divMod); + + if (firstNode) + ContainCheckRange(firstNode, divMod); return true; } #endif diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp index dafa816193cb77..0d78efc1f7acf8 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -2198,8 +2198,8 @@ struct UnsignedMagic typedef T DivisorType; T magic; - bool add; - int shift; + bool increment; + char postShift; }; template @@ -2216,7 +2216,7 @@ const UnsignedMagic* TryGetUnsignedMagic(uint32_t divisor) {}, {0xcccccccd, false, 2}, // 5 {0xaaaaaaab, false, 2}, // 6 - {0x24924925, true, 3}, // 7 + {0x49249249, true, 1}, // 7 {}, {0x38e38e39, false, 1}, // 9 {0xcccccccd, false, 3}, // 10 @@ -2235,7 +2235,7 @@ const UnsignedMagic* TryGetUnsignedMagic(uint64_t divisor) {}, {0xcccccccccccccccd, false, 2}, // 5 {0xaaaaaaaaaaaaaaab, false, 2}, // 6 - {0x2492492492492493, true, 3}, // 7 + {0x9249249249249249, true, 2}, // 7 {}, {0xe38e38e38e38e38f, false, 3}, // 9 {0xcccccccccccccccd, false, 3}, // 10 @@ -2252,99 +2252,137 @@ const UnsignedMagic* TryGetUnsignedMagic(uint64_t divisor) // // Arguments: // d - The divisor -// add - Pointer to a flag indicating the kind of code to generate -// shift - Pointer to the shift value to be returned +// increment - Pointer to a flag indicating if incrementing the numerator is required +// preShift - Pointer to the pre-shift value to be returned +// postShift - Pointer to the post-shift value to be returned // // Returns: // The magic number. // // Notes: -// This code is adapted from _The_PowerPC_Compiler_Writer's_Guide_, pages 57-58. -// The paper is based on "Division by invariant integers using multiplication" -// by Torbjorn Granlund and Peter L. Montgomery in PLDI 94 +// Based on "Faster Unsigned Division by Constants" by ridiculous_fish. +// https://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf +// https://github.com/ridiculousfish/libdivide/blob/master/doc/divide_by_constants_codegen_reference.c template -T GetUnsignedMagic(T d, bool* add /*out*/, int* shift /*out*/) +T GetUnsignedMagic(T d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned num_bits) { assert((d >= 3) && !isPow2(d)); - const UnsignedMagic* magic = TryGetUnsignedMagic(d); + // The numerator must fit in a uint + assert(num_bits > 0 && num_bits <= sizeof(T) * CHAR_BIT); - if (magic != nullptr) + // Bits in a uint + const unsigned UINT_BITS = sizeof(T) * CHAR_BIT; + + if (num_bits == UINT_BITS) { - *shift = magic->shift; - *add = magic->add; - return magic->magic; + const UnsignedMagic* magic = TryGetUnsignedMagic(d); + + if (magic != nullptr) + { + *increment = magic->increment; + *preShift = 0; + *postShift = magic->postShift; + return magic->magic; + } } - typedef typename std::make_signed::type ST; + // The extra shift implicit in the difference between UINT_BITS and num_bits + const unsigned extra_shift = UINT_BITS - num_bits; - const unsigned bits = sizeof(T) * 8; - const unsigned bitsMinus1 = bits - 1; - const T twoNMinus1 = T(1) << bitsMinus1; + // The initial power of 2 is one less than the first one that can possibly work + const T initial_power_of_2 = (T)1 << (UINT_BITS - 1); - *add = false; - const T nc = -ST(1) - -ST(d) % ST(d); - unsigned p = bitsMinus1; - T q1 = twoNMinus1 / nc; - T r1 = twoNMinus1 - (q1 * nc); - T q2 = (twoNMinus1 - 1) / d; - T r2 = (twoNMinus1 - 1) - (q2 * d); - T delta; + // The remainder and quotient of our power of 2 divided by d + T quotient = initial_power_of_2 / d, remainder = initial_power_of_2 % d; - do - { - p++; + // The magic info for the variant "round down" algorithm + T down_multiplier = 0; + unsigned down_exponent = 0; + int has_magic_down = 0; + + // Compute ceil(log_2 D) + unsigned ceil_log_2_D = 0; + for (T tmp = d; tmp > 0; tmp >>= 1) + ceil_log_2_D += 1; - if (r1 >= (nc - r1)) + // Begin a loop that increments the exponent, until we find a power of 2 that works. + unsigned exponent; + for (exponent = 0;; exponent++) + { + // Quotient and remainder is from previous exponent; compute it for this exponent. + if (remainder >= d - remainder) { - q1 = 2 * q1 + 1; - r1 = 2 * r1 - nc; + // Doubling remainder will wrap around D + quotient = quotient * 2 + 1; + remainder = remainder * 2 - d; } else { - q1 = 2 * q1; - r1 = 2 * r1; + // Remainder will not wrap + quotient = quotient * 2; + remainder = remainder * 2; } - if ((r2 + 1) >= (d - r2)) - { - if (q2 >= (twoNMinus1 - 1)) - { - *add = true; - } + // We're done if this exponent works for the round_up algorithm. + // Note that exponent may be larger than the maximum shift supported, + // so the check for >= ceil_log_2_D is critical. + if ((exponent + extra_shift >= ceil_log_2_D) || (d - remainder) <= ((T)1 << (exponent + extra_shift))) + break; - q2 = 2 * q2 + 1; - r2 = 2 * r2 + 1 - d; - } - else + // Set magic_down if we have not set it yet and this exponent works for the round_down algorithm + if (!has_magic_down && remainder <= ((T)1 << (exponent + extra_shift))) { - if (q2 >= twoNMinus1) - { - *add = true; - } - - q2 = 2 * q2; - r2 = 2 * r2 + 1; + has_magic_down = 1; + down_multiplier = quotient; + down_exponent = exponent; } + } - delta = d - 1 - r2; - - } while ((p < (bits * 2)) && ((q1 < delta) || ((q1 == delta) && (r1 == 0)))); - - *shift = p - bits; // resulting shift - return q2 + 1; // resulting magic number + if (exponent < ceil_log_2_D) + { + // magic_up is efficient + *increment = false; + *preShift = 0; + *postShift = (int)exponent; + return quotient + 1; + } + else if (d & 1) + { + // Odd divisor, so use magic_down, which must have been set + assert(has_magic_down); + *increment = true; + *preShift = 0; + *postShift = (int)down_exponent; + return down_multiplier; + } + else + { + // Even divisor, so use a prefix-shifted dividend + unsigned pre_shift = 0; + T shifted_D = d; + while ((shifted_D & 1) == 0) + { + shifted_D >>= 1; + pre_shift += 1; + } + T result = GetUnsignedMagic(shifted_D, increment, preShift, postShift, num_bits - pre_shift); + assert(*increment == 0 && *preShift == 0); // expect no increment or pre_shift in this path + *preShift = (int)pre_shift; + return result; + } } -uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/) +uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/) { - return GetUnsignedMagic(d, add, shift); + return GetUnsignedMagic(d, increment, preShift, postShift, 32); } #ifdef TARGET_64BIT -uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/) +uint64_t GetUnsigned64Magic(uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/) { - return GetUnsignedMagic(d, add, shift); + return GetUnsignedMagic(d, increment, preShift, postShift, 64); } #endif diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h index 3f08f3e25618d7..0c293a54e5816b 100644 --- a/src/coreclr/jit/utils.h +++ b/src/coreclr/jit/utils.h @@ -756,9 +756,9 @@ class CritSecHolder namespace MagicDivide { -uint32_t GetUnsigned32Magic(uint32_t d, bool* add /*out*/, int* shift /*out*/); +uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/); #ifdef TARGET_64BIT -uint64_t GetUnsigned64Magic(uint64_t d, bool* add /*out*/, int* shift /*out*/); +uint64_t GetUnsigned64Magic(uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/); #endif int32_t GetSigned32Magic(int32_t d, int* shift /*out*/); #ifdef TARGET_64BIT From adea5a0fa42f13d4e5fd8c6eb7b3d435e8122590 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Sun, 14 Mar 2021 02:55:36 +0200 Subject: [PATCH 02/11] Fix Arm build and add some tests. --- src/coreclr/jit/codegenarmarch.cpp | 10 +++++- src/tests/JIT/Math/Functions/DivideByConst.cs | 36 +++++++++++++++++++ src/tests/JIT/Math/Functions/Program.cs | 2 ++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 src/tests/JIT/Math/Functions/DivideByConst.cs diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 6797304df3a887..a1ab758533f182 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -4259,8 +4259,16 @@ void CodeGen::genCodeForIncSaturate(GenTree* tree) // The src must be a register. regNumber operandReg = genConsumeReg(operand); +#ifdef TARGET_ARM64 GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1); - GetEmitter()->emitIns_R_R_COND(INS_mvn, emitActualTypeSize(tree), targetReg, REG_ZR, INS_COND_HS); + GetEmitter()->emitIns_R_R_R_COND(INS_csinv, emitActualTypeSize(tree), targetReg, targetReg, REG_ZR, INS_COND_LO); +#else + regNumber tmpReg = tree->GetSingleTempReg(); + GetEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(tree), tmpReg, 0); + GetEmitter()->emitIns_R_R_I(INS_add, emitActualTypeSize(tree), targetReg, operandReg, 1, INS_FLAGS_SET); + GetEmitter()->emitIns_R_R(INS_adc, emitActualTypeSize(tree), tmpReg, tmpReg); + GetEmitter()->emitIns_R_R(INS_sub, emitActualTypeSize(tree), targetReg, tmpReg); +#endif genProduceReg(tree); } diff --git a/src/tests/JIT/Math/Functions/DivideByConst.cs b/src/tests/JIT/Math/Functions/DivideByConst.cs new file mode 100644 index 00000000000000..600213370e9b09 --- /dev/null +++ b/src/tests/JIT/Math/Functions/DivideByConst.cs @@ -0,0 +1,36 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Runtime.CompilerServices; + +namespace System.MathBenchmarks +{ + public static class DivideByConst + { + public static void Test() + { + Verify(ulong.MaxValue, long.MaxValue, uint.MaxValue, int.MaxValue); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Verify(ulong u64, long i64, uint u32, int i32) + { + if (u64 / 7 != 0x2492492492492492) throw new Exception($"{u64:x}/7={u64 / 7:x}"); + if (i64 / 7 != 0x1249249249249249) throw new Exception($"{i64:x}/7={i64 / 7:x}"); + if (u32 / 7 != 0x24924924) throw new Exception($"{u32:x}/7={u32 / 7:x}"); + if (i32 / 7 != 0x12492492) throw new Exception($"{i32:x}/7={i32 / 7:x}"); + + if (u64 / 14 != 0x1249249249249249) throw new Exception($"{u64:x}/14={u64 / 14:x}"); + if (i64 / 14 != 0x924924924924924) throw new Exception($"{i64:x}/14={i64 / 14:x}"); + if (u32 / 14 != 0x12492492) throw new Exception($"{u32:x}/14={u32 / 14:x}"); + if (i32 / 14 != 0x9249249) throw new Exception($"{i32:x}/14={i32 / 14:x}"); + + if (u64 / 56 != 0x492492492492492) throw new Exception($"{u64:x}/56={u64 / 56:x}"); + if (i64 / 56 != 0x249249249249249) throw new Exception($"{i64:x}/56={i64 / 56:x}"); + if (u32 / 56 != 0x4924924) throw new Exception($"{u32:x}/56={u32 / 56:x}"); + if (i32 / 56 != 0x2492492) throw new Exception($"{i32:x}/56={i32 / 56:x}"); + } + } + } +} diff --git a/src/tests/JIT/Math/Functions/Program.cs b/src/tests/JIT/Math/Functions/Program.cs index ba502dd5c7ec00..ae9105eab6dbde 100644 --- a/src/tests/JIT/Math/Functions/Program.cs +++ b/src/tests/JIT/Math/Functions/Program.cs @@ -74,6 +74,8 @@ public static int Main(string[] args) result += Test(singleTests.Tan); result += Test(singleTests.Tanh); + result += Test(DivideByConst.Test); + return result; } From e00ff7661d780edad202b7779f423d8a460b7dab Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Sun, 14 Mar 2021 03:57:48 +0200 Subject: [PATCH 03/11] Improve register allocation --- src/coreclr/jit/lsraxarch.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 3872e794aa884e..24217375e77576 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -320,6 +320,11 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = BuildModDiv(tree->AsOp()); break; + case GT_INC_SATURATE: + srcCount = BuildOperandUses(tree->gtGetOp1()); + BuildDef(tree, RBM_EAX); // force EAX because the following MULHI will require EAX:EDX input anyway + break; + #if defined(TARGET_X86) case GT_MUL_LONG: dstCount = 2; From ee00ec5b5d4487bcd0fdb95541ff94324c360ec2 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Sun, 14 Mar 2021 04:04:10 +0200 Subject: [PATCH 04/11] Fix ARM64 codegen --- src/coreclr/jit/codegenarmarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index a1ab758533f182..8b443dd931c0f2 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -4261,7 +4261,7 @@ void CodeGen::genCodeForIncSaturate(GenTree* tree) #ifdef TARGET_ARM64 GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1); - GetEmitter()->emitIns_R_R_R_COND(INS_csinv, emitActualTypeSize(tree), targetReg, targetReg, REG_ZR, INS_COND_LO); + GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS); #else regNumber tmpReg = tree->GetSingleTempReg(); GetEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(tree), tmpReg, 0); From cb4cebc837b38a9e72eb788a03bb7b1c228afe99 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Mon, 15 Mar 2021 02:07:32 +0200 Subject: [PATCH 05/11] Fix MULHI flags --- src/coreclr/jit/lower.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index ff145481c5fef3..ad7c92a5ba2ae1 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5224,6 +5224,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) { divMod->SetOper(GT_MULHI); divMod->gtOp1 = adjustedDividend; + divMod->gtFlags |= GTF_UNSIGNED; divisor->AsIntCon()->SetIconValue(magic); } else From 37177ee0db48116353141c7e65d3c6addaee92a6 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Mon, 15 Mar 2021 02:16:57 +0200 Subject: [PATCH 06/11] Remove ARM32 codegen --- src/coreclr/jit/codegenarm64.cpp | 22 ++++++++++++++++++++++ src/coreclr/jit/codegenarmarch.cpp | 30 ------------------------------ 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 737db8dc424a65..b0377ea45389e9 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -1753,6 +1753,28 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre } } +// Produce code for a GT_INC_SATURATE node. +void CodeGen::genCodeForIncSaturate(GenTree* tree) +{ + regNumber targetReg = tree->GetRegNum(); + var_types targetType = tree->TypeGet(); + + // The arithmetic node must be sitting in a register (since it's not contained) + assert(!tree->isContained()); + // The dst can only be a register. + assert(targetReg != REG_NA); + + GenTree* operand = tree->gtGetOp1(); + assert(!operand->isContained()); + // The src must be a register. + regNumber operandReg = genConsumeReg(operand); + + GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1); + GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS); + + genProduceReg(tree); +} + // Generate code to get the high N bits of a N*N=2N bit multiplication result void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) { diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 8b443dd931c0f2..f8d87e41232c49 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -4243,34 +4243,4 @@ void CodeGen::genPushCalleeSavedRegisters() #endif // TARGET_ARM64 } -// Produce code for a GT_INC_SATURATE node. -void CodeGen::genCodeForIncSaturate(GenTree* tree) -{ - regNumber targetReg = tree->GetRegNum(); - var_types targetType = tree->TypeGet(); - - // The arithmetic node must be sitting in a register (since it's not contained) - assert(!tree->isContained()); - // The dst can only be a register. - assert(targetReg != REG_NA); - - GenTree* operand = tree->gtGetOp1(); - assert(!operand->isContained()); - // The src must be a register. - regNumber operandReg = genConsumeReg(operand); - -#ifdef TARGET_ARM64 - GetEmitter()->emitIns_R_R_I(INS_adds, emitActualTypeSize(tree), targetReg, operandReg, 1); - GetEmitter()->emitIns_R_R_COND(INS_cinv, emitActualTypeSize(tree), targetReg, targetReg, INS_COND_HS); -#else - regNumber tmpReg = tree->GetSingleTempReg(); - GetEmitter()->emitIns_R_I(INS_mov, emitActualTypeSize(tree), tmpReg, 0); - GetEmitter()->emitIns_R_R_I(INS_add, emitActualTypeSize(tree), targetReg, operandReg, 1, INS_FLAGS_SET); - GetEmitter()->emitIns_R_R(INS_adc, emitActualTypeSize(tree), tmpReg, tmpReg); - GetEmitter()->emitIns_R_R(INS_sub, emitActualTypeSize(tree), targetReg, tmpReg); -#endif - - genProduceReg(tree); -} - #endif // TARGET_ARMARCH From ba227e503ad2e658bc8bd50ea31faee39b34a4c0 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Tue, 16 Mar 2021 05:44:04 +0200 Subject: [PATCH 07/11] Widen 32bit UDIV to 64bit MULHI when possible. Improve register allocation. --- src/coreclr/jit/lower.cpp | 49 +++++++++++++++++++++++++++++------ src/coreclr/jit/lsraxarch.cpp | 5 ---- src/coreclr/jit/utils.cpp | 5 ++-- src/coreclr/jit/utils.h | 3 ++- 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index ad7c92a5ba2ae1..d45bc804350726 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5170,15 +5170,27 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!comp->opts.MinOpts() && (divisorValue >= 3)) { - size_t magic; - bool increment; - int preShift; - int postShift; + size_t magic; + bool increment; + int preShift; + int postShift; + var_types mulType = type; if (type == TYP_INT) { magic = MagicDivide::GetUnsigned32Magic(static_cast(divisorValue), &increment, &preShift, &postShift); + +#ifdef TARGET_64BIT + // avoid inc_saturate/multiple shifts by widening the multiplication to 32x64 + if (increment || preShift && postShift) + { + mulType = TYP_LONG; + divisor->gtType = TYP_LONG; + magic = MagicDivide::GetUnsigned64Magic(static_cast(divisorValue), &increment, &preShift, + &postShift, 32); + } +#endif } else { @@ -5219,13 +5231,26 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend); firstNode = preShiftBy; } + else if (mulType != type) + { + adjustedDividend = comp->gtNewCastNode(mulType, adjustedDividend, true, TYP_ULONG); + BlockRange().InsertBefore(divMod, adjustedDividend); + firstNode = adjustedDividend; + } - if (isDiv && !postShift) +#ifdef TARGET_XARCH + // force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes + // reduntant copies otherwise + if (firstNode) + adjustedDividend->SetRegNum(REG_RAX); +#endif + + divisor->AsIntCon()->SetIconValue(magic); + if (isDiv && !postShift && mulType == type) { divMod->SetOper(GT_MULHI); divMod->gtOp1 = adjustedDividend; divMod->gtFlags |= GTF_UNSIGNED; - divisor->AsIntCon()->SetIconValue(magic); } else { @@ -5233,9 +5258,8 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) // The existing node will later be transformed into a GT_RSZ/GT_SUB that // computes the final result. This way don't need to find and change the use // of the existing node. - GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, type, adjustedDividend, divisor); + GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, mulType, adjustedDividend, divisor); mulhi->gtFlags |= GTF_UNSIGNED; - divisor->AsIntCon()->SetIconValue(magic); BlockRange().InsertBefore(divMod, mulhi); if (!firstNode) firstNode = mulhi; @@ -5257,6 +5281,15 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) BlockRange().InsertBefore(divMod, mulhi); } } + else if (isDiv) + { + assert(mulType != type); + divMod->SetOper(GT_CAST); + divMod->gtOp1 = mulhi; + divMod->gtOp2 = nullptr; + divMod->gtFlags |= GTF_UNSIGNED; + divMod->AsCast()->gtCastType = TYP_UINT; + } if (!isDiv) { diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 24217375e77576..3872e794aa884e 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -320,11 +320,6 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = BuildModDiv(tree->AsOp()); break; - case GT_INC_SATURATE: - srcCount = BuildOperandUses(tree->gtGetOp1()); - BuildDef(tree, RBM_EAX); // force EAX because the following MULHI will require EAX:EDX input anyway - break; - #if defined(TARGET_X86) case GT_MUL_LONG: dstCount = 2; diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp index 0d78efc1f7acf8..dbd5b7150cdcd1 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -2380,9 +2380,10 @@ uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift / } #ifdef TARGET_64BIT -uint64_t GetUnsigned64Magic(uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/) +uint64_t GetUnsigned64Magic( + uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits) { - return GetUnsignedMagic(d, increment, preShift, postShift, 64); + return GetUnsignedMagic(d, increment, preShift, postShift, bits); } #endif diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h index 0c293a54e5816b..b51c7a9810f996 100644 --- a/src/coreclr/jit/utils.h +++ b/src/coreclr/jit/utils.h @@ -758,7 +758,8 @@ namespace MagicDivide { uint32_t GetUnsigned32Magic(uint32_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/); #ifdef TARGET_64BIT -uint64_t GetUnsigned64Magic(uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/); +uint64_t GetUnsigned64Magic( + uint64_t d, bool* increment /*out*/, int* preShift /*out*/, int* postShift /*out*/, unsigned bits = 64); #endif int32_t GetSigned32Magic(int32_t d, int* shift /*out*/); #ifdef TARGET_64BIT From d3246185d0e6dee6907fc379960878b34bb537e8 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Tue, 16 Mar 2021 21:14:31 +0200 Subject: [PATCH 08/11] Always widen 32bit UDIV to 64bit MUL/MULHI --- src/coreclr/jit/lower.cpp | 67 ++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index d45bc804350726..5f831854d4e908 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5170,11 +5170,11 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!comp->opts.MinOpts() && (divisorValue >= 3)) { - size_t magic; - bool increment; - int preShift; - int postShift; - var_types mulType = type; + size_t magic; + bool increment; + int preShift; + int postShift; + bool simpleMul = false; if (type == TYP_INT) { @@ -5182,14 +5182,23 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) MagicDivide::GetUnsigned32Magic(static_cast(divisorValue), &increment, &preShift, &postShift); #ifdef TARGET_64BIT - // avoid inc_saturate/multiple shifts by widening the multiplication to 32x64 - if (increment || preShift && postShift) + // avoid inc_saturate/multiple shifts by widening to 32x64 MULHI + if (increment || (preShift +#ifdef TARGET_XARCH + // IMUL reg,reg,imm32 can't be used if magic<0 because of sign-extension + && static_cast(magic) < 0 +#endif + )) { - mulType = TYP_LONG; - divisor->gtType = TYP_LONG; magic = MagicDivide::GetUnsigned64Magic(static_cast(divisorValue), &increment, &preShift, &postShift, 32); } + // otherwise just widen to regular multiplication + else + { + postShift += 32; + simpleMul = true; + } #endif } else @@ -5231,9 +5240,9 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend); firstNode = preShiftBy; } - else if (mulType != type) + else if (type != TYP_I_IMPL) { - adjustedDividend = comp->gtNewCastNode(mulType, adjustedDividend, true, TYP_ULONG); + adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL); BlockRange().InsertBefore(divMod, adjustedDividend); firstNode = adjustedDividend; } @@ -5241,12 +5250,13 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) #ifdef TARGET_XARCH // force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes // reduntant copies otherwise - if (firstNode) + if (firstNode && !simpleMul) adjustedDividend->SetRegNum(REG_RAX); #endif + divisor->gtType = TYP_I_IMPL; divisor->AsIntCon()->SetIconValue(magic); - if (isDiv && !postShift && mulType == type) + if (isDiv && !postShift && type == TYP_I_IMPL) { divMod->SetOper(GT_MULHI); divMod->gtOp1 = adjustedDividend; @@ -5258,7 +5268,15 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) // The existing node will later be transformed into a GT_RSZ/GT_SUB that // computes the final result. This way don't need to find and change the use // of the existing node. - GenTree* mulhi = comp->gtNewOperNode(GT_MULHI, mulType, adjustedDividend, divisor); + GenTree* mulhi; + if (simpleMul) + { + mulhi = comp->gtNewOperNode(GT_MUL, TYP_I_IMPL, adjustedDividend, divisor); + } + else + { + mulhi = comp->gtNewOperNode(GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor); + } mulhi->gtFlags |= GTF_UNSIGNED; BlockRange().InsertBefore(divMod, mulhi); if (!firstNode) @@ -5269,7 +5287,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT); BlockRange().InsertBefore(divMod, shiftBy); - if (isDiv) + if (isDiv && type == TYP_I_IMPL) { divMod->SetOper(GT_RSZ); divMod->gtOp1 = mulhi; @@ -5277,19 +5295,10 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) } else { - mulhi = comp->gtNewOperNode(GT_RSZ, type, mulhi, shiftBy); + mulhi = comp->gtNewOperNode(GT_RSZ, TYP_I_IMPL, mulhi, shiftBy); BlockRange().InsertBefore(divMod, mulhi); } } - else if (isDiv) - { - assert(mulType != type); - divMod->SetOper(GT_CAST); - divMod->gtOp1 = mulhi; - divMod->gtOp2 = nullptr; - divMod->gtFlags |= GTF_UNSIGNED; - divMod->AsCast()->gtCastType = TYP_UINT; - } if (!isDiv) { @@ -5304,6 +5313,14 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) BlockRange().InsertBefore(divMod, divisor, mul, dividend); } + else if (type != TYP_I_IMPL) + { + divMod->SetOper(GT_CAST); + divMod->gtOp1 = mulhi; + divMod->gtOp2 = nullptr; + divMod->gtFlags |= GTF_UNSIGNED; + divMod->AsCast()->gtCastType = TYP_UINT; + } } if (firstNode) From ab7ce58f0daca169a7c662e69d966a0fc4774344 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Tue, 16 Mar 2021 22:22:15 +0200 Subject: [PATCH 09/11] Cleanup --- src/coreclr/jit/lower.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 5f831854d4e908..7b3144eb11cfdb 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5256,6 +5256,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) divisor->gtType = TYP_I_IMPL; divisor->AsIntCon()->SetIconValue(magic); + if (isDiv && !postShift && type == TYP_I_IMPL) { divMod->SetOper(GT_MULHI); @@ -5268,15 +5269,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) // The existing node will later be transformed into a GT_RSZ/GT_SUB that // computes the final result. This way don't need to find and change the use // of the existing node. - GenTree* mulhi; - if (simpleMul) - { - mulhi = comp->gtNewOperNode(GT_MUL, TYP_I_IMPL, adjustedDividend, divisor); - } - else - { - mulhi = comp->gtNewOperNode(GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor); - } + GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor); mulhi->gtFlags |= GTF_UNSIGNED; BlockRange().InsertBefore(divMod, mulhi); if (!firstNode) From 77c85b34a0115ae966cfeff0084d7c0053914b67 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Wed, 17 Mar 2021 14:04:07 +0200 Subject: [PATCH 10/11] Final optimization --- src/coreclr/jit/lower.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 7b3144eb11cfdb..6ab90ebdf6b6fb 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5308,11 +5308,15 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) } else if (type != TYP_I_IMPL) { +#ifdef TARGET_ARM divMod->SetOper(GT_CAST); - divMod->gtOp1 = mulhi; - divMod->gtOp2 = nullptr; divMod->gtFlags |= GTF_UNSIGNED; divMod->AsCast()->gtCastType = TYP_UINT; +#else + divMod->SetOper(GT_BITCAST); +#endif + divMod->gtOp1 = mulhi; + divMod->gtOp2 = nullptr; } } From c9f6ef63a9b8e7f3531a812451859f878ed1a299 Mon Sep 17 00:00:00 2001 From: Pent Ploompuu Date: Wed, 17 Mar 2021 17:30:08 +0200 Subject: [PATCH 11/11] Fix typo --- src/coreclr/jit/lower.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6ab90ebdf6b6fb..b6a30118907306 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5308,7 +5308,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) } else if (type != TYP_I_IMPL) { -#ifdef TARGET_ARM +#ifdef TARGET_ARMARCH divMod->SetOper(GT_CAST); divMod->gtFlags |= GTF_UNSIGNED; divMod->AsCast()->gtCastType = TYP_UINT;