From ff6057ad5cbe28b7008da0602b18d3fb89465a4a Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Sun, 22 Aug 2021 22:18:03 +0300 Subject: [PATCH 1/6] Refactor LONG_MUL recognition Move it to a GenTree method so that it can be reused in lowering. No diffs except for a couple (~5 across all SPMI collection) on ARM due to the new operand swapping behavior. --- src/coreclr/jit/gentree.cpp | 124 +++++++++++++++++++++++++++++------- src/coreclr/jit/gentree.h | 8 ++- src/coreclr/jit/gtlist.h | 11 ++-- src/coreclr/jit/morph.cpp | 94 ++++----------------------- 4 files changed, 129 insertions(+), 108 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index a9f5402b50852c..36d530e5264565 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2445,41 +2445,81 @@ GenTree* Compiler::gtReverseCond(GenTree* tree) return tree; } -/*****************************************************************************/ +#if !defined(TARGET_64BIT) || defined(TARGET_ARM64) +//------------------------------------------------------------------------------ +// IsValidLongMul : Check for long multiplication with 32 bit operands. +// +// Recognizes the following tree: MUL(CAST(long <- int), CAST(long <- int) or CONST), +// where CONST must be an integer constant that fits in 32 bits. Will try to detect +// cases when the multiplication cannot overflow and return "true" for them. +// +// This function does not change the state of the tree and is usable in LIR. +// +// Return Value: +// Whether this GT_MUL tree is a valid long multiplication candidate. +// +bool GenTreeOp::IsValidLongMul() +{ + assert(OperIs(GT_MUL)); -#ifdef DEBUG + GenTree* op1 = gtGetOp1(); + GenTree* op2 = gtGetOp2(); -bool GenTree::gtIsValid64RsltMul() -{ - if (!OperIs(GT_MUL) || !Is64RsltMul()) + if (!TypeIs(TYP_LONG)) { return false; } - GenTree* op1 = AsOp()->gtGetOp1(); - GenTree* op2 = AsOp()->gtGetOp2(); + assert(op1->TypeIs(TYP_LONG)); + assert(op2->TypeIs(TYP_LONG)); - if (!TypeIs(TYP_LONG) || !op1->TypeIs(TYP_LONG) || !op2->TypeIs(TYP_LONG)) + if (!(op1->OperIs(GT_CAST) && genActualTypeIsInt(op1->AsCast()->CastOp()))) { return false; } - if (gtOverflow()) + if (!(op2->OperIs(GT_CAST) && genActualTypeIsInt(op2->AsCast()->CastOp())) && + !(op2->IsIntegralConst() && FitsIn(op2->AsIntConCommon()->IntegralValue()))) { return false; } - // op1 has to be CAST(long <- int). - if (!(op1->OperIs(GT_CAST) && genActualTypeIsInt(op1->AsCast()->CastOp()))) + if (op1->gtOverflow() || op2->gtOverflowEx()) { return false; } - // op2 has to be CAST(long <- int) or a suitably small constant. - if (!(op2->OperIs(GT_CAST) && genActualTypeIsInt(op2->AsCast()->CastOp())) && - !(op2->IsIntegralConst() && FitsIn(op2->AsIntConCommon()->IntegralValue()))) + if (gtOverflow()) { - return false; + auto getMaxValue = [this](GenTree* op) -> int64_t { + if (op->OperIs(GT_CAST)) + { + if (op->IsUnsigned()) + { + switch (op->AsCast()->CastOp()->TypeGet()) + { + case TYP_UBYTE: + return UINT8_MAX; + case TYP_USHORT: + return UINT16_MAX; + default: + return UINT32_MAX; + } + } + + return IsUnsigned() ? static_cast(UINT64_MAX) : INT32_MIN; + } + + return op->AsIntConCommon()->IntegralValue(); + }; + + int64_t maxOp1 = getMaxValue(op1); + int64_t maxOp2 = getMaxValue(op2); + + if (CheckedOps::MulOverflows(maxOp1, maxOp2, IsUnsigned())) + { + return false; + } } // Both operands must extend the same way. 
@@ -2491,16 +2531,56 @@ bool GenTree::gtIsValid64RsltMul() return false; } - // Do unsigned mul iff both operands are zero-extending. - if (op1->IsUnsigned() != IsUnsigned()) - { - return false; - } - return true; } -#endif // DEBUG +#if !defined(TARGET_64BIT) && defined(DEBUG) +//------------------------------------------------------------------------------ +// DebugCheckLongMul : Checks that a GTF_MUL_64RSLT tree is a valid MUL_LONG. +// +// Notes: +// This function is defined for 32 bit targets only because we *must* maintain +// the MUL_LONG-compatible tree shape throughout the compilation from morph to +// decomposition, since we do not have (great) ability to create new calls in LIR. +// +// It is for this reason that we recognize MUL_LONGs early in morph, mark them with +// a flag and then pessimize various places (e. g. assertion propagation) to not look +// at them. In contrast, on ARM64 we recognize MUL_LONGs late, in lowering, and thus +// do not need this function. +// +void GenTreeOp::DebugCheckLongMul() +{ + assert(OperIs(GT_MUL)); + assert(Is64RsltMul()); + assert(TypeIs(TYP_LONG)); + assert(!gtOverflow()); + + GenTree* op1 = gtGetOp1(); + GenTree* op2 = gtGetOp2(); + + assert(op1->TypeIs(TYP_LONG)); + assert(op2->TypeIs(TYP_LONG)); + + // op1 has to be CAST(long <- int) + assert(op1->OperIs(GT_CAST) && genActualTypeIsInt(op1->AsCast()->CastOp())); + assert(!op1->gtOverflow()); + + // op2 has to be CAST(long <- int) or a suitably small constant. + assert((op2->OperIs(GT_CAST) && genActualTypeIsInt(op2->AsCast()->CastOp())) || + (op2->IsIntegralConst() && FitsIn(op2->AsIntConCommon()->IntegralValue()))); + assert(!op2->gtOverflowEx()); + + // Both operands must extend the same way. + bool op1ZeroExtends = op1->IsUnsigned(); + bool op2ZeroExtends = op2->OperIs(GT_CAST) ? op2->IsUnsigned() : op2->AsIntConCommon()->IntegralValue() >= 0; + bool op2AnyExtensionIsSuitable = op2->IsIntegralConst() && op2ZeroExtends; + assert((op1ZeroExtends == op2ZeroExtends) || op2AnyExtensionIsSuitable); + + // Do unsigned mul iff both operands are zero-extending. + assert(op1->IsUnsigned() == IsUnsigned()); +} +#endif // !defined(TARGET_64BIT) && defined(DEBUG) +#endif // !defined(TARGET_64BIT) || defined(TARGET_ARM64) //------------------------------------------------------------------------------ // gtSetListOrder : Figure out the evaluation order for a list of values. diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 53f73dbf7d1906..90681f1daf86b4 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -2229,7 +2229,6 @@ struct GenTree bool gtRequestSetFlags(); #ifdef DEBUG - bool gtIsValid64RsltMul(); static int gtDispFlags(GenTreeFlags flags, GenTreeDebugFlags debugFlags); #endif @@ -3017,6 +3016,13 @@ struct GenTreeOp : public GenTreeUnOp return (gtFlags & GTF_DIV_BY_CNS_OPT) != 0; } +#if !defined(TARGET_64BIT) || defined(TARGET_ARM64) + bool IsValidLongMul(); +#if !defined(TARGET_64BIT) && defined(DEBUG) + void DebugCheckLongMul(); +#endif +#endif + #if DEBUGGABLE_GENTREE GenTreeOp() : GenTreeUnOp(), gtOp2(nullptr) { diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index d5da31b1f9df24..08b19e13f81cfa 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -188,11 +188,12 @@ GTNODE(SUB_LO , GenTreeOp ,0,GTK_BINOP) GTNODE(SUB_HI , GenTreeOp ,0,GTK_BINOP) // A mul that returns the 2N bit result of an NxN multiply. This op is used for -// multiplies that take two ints and return a long result. 
All other multiplies -// with long results are morphed into helper calls. It is similar to GT_MULHI, -// the difference being that GT_MULHI drops the lo part of the result, whereas -// GT_MUL_LONG keeps both parts of the result. -#if !defined(TARGET_64BIT) +// multiplies that take two ints and return a long result. On 32 bit targets, +// all other multiplies with long results are morphed into helper calls. +// It is similar to GT_MULHI, the difference being that GT_MULHI drops the lo +// part of the result, whereas GT_MUL_LONG keeps both parts of the result. +// MUL_LONG is also used on AMR64, where 64 bit multiplication is more expensive. +#if !defined(TARGET_64BIT) || defined(TARGET_ARM64) GTNODE(MUL_LONG , GenTreeMultiRegOp ,1,GTK_BINOP) #endif diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index a5fab4626e2e57..420b061170971a 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -10958,19 +10958,17 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, MorphAddrContext* mac) // We are seeing this node again. // Morph only the children of casts, // so as to avoid losing them. - assert(tree->gtIsValid64RsltMul()); tree = fgMorphLongMul(tree->AsOp()); goto DONE_MORPHING_CHILDREN; } tree = fgRecognizeAndMorphLongMul(tree->AsOp()); + op1 = tree->AsOp()->gtGetOp1(); + op2 = tree->AsOp()->gtGetOp2(); if (tree->Is64RsltMul()) { - op1 = tree->AsOp()->gtGetOp1(); - op2 = tree->AsOp()->gtGetOp2(); - goto DONE_MORPHING_CHILDREN; } else @@ -11968,7 +11966,7 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, MorphAddrContext* mac) if (typ == TYP_LONG) { // This must be GTF_MUL_64RSLT - assert(tree->gtIsValid64RsltMul()); + INDEBUG(tree->AsOp()->DebugCheckLongMul()); return tree; } #endif // TARGET_64BIT @@ -14469,17 +14467,15 @@ GenTree* Compiler::fgRecognizeAndMorphBitwiseRotation(GenTree* tree) //------------------------------------------------------------------------------ // fgRecognizeAndMorphLongMul : Check for and morph long multiplication with 32 bit operands. // -// Recognizes the following tree: MUL(CAST(long <- int) or CONST, CAST(long <- int) or CONST), -// where CONST must be an integer constant that fits in 32 bits. Note that if both operands are -// constants, the original tree is returned unmodified, i. e. the caller is responsible for -// folding or correct code generation (e. g. for overflow cases). May swap operands if the -// first one is a constant and the second one is not. +// Uses "GenTree::IsValidLongMul" to check for the long multiplication pattern. Will swap +// operands if the first one is a constant and the second one is not, even for trees which +// end up not being eligibile for long multiplication. // // Arguments: // mul - GT_MUL tree to check for a long multiplication opportunity // // Return Value: -// The original tree unmodified if it is not eligible for long multiplication. +// The original tree, with operands possibly swapped, if it is not eligible for long multiplication. // Tree with GTF_MUL_64RSLT set, side effect flags propagated, and children morphed if it is. 
// GenTreeOp* Compiler::fgRecognizeAndMorphLongMul(GenTreeOp* mul) @@ -14490,85 +14486,19 @@ GenTreeOp* Compiler::fgRecognizeAndMorphLongMul(GenTreeOp* mul) GenTree* op1 = mul->gtGetOp1(); GenTree* op2 = mul->gtGetOp2(); - assert(op1->TypeIs(TYP_LONG) && op2->TypeIs(TYP_LONG)); - - if (!(op1->OperIs(GT_CAST) && genActualTypeIsInt(op1->AsCast()->CastOp())) && - !(op1->IsIntegralConst() && FitsIn(op1->AsIntConCommon()->IntegralValue()))) - { - return mul; - } - - if (!(op2->OperIs(GT_CAST) && genActualTypeIsInt(op2->AsCast()->CastOp())) && - !(op2->IsIntegralConst() && FitsIn(op2->AsIntConCommon()->IntegralValue()))) - { - return mul; - } - - // Let fgMorphSmpOp take care of folding. - if (op1->IsIntegralConst() && op2->IsIntegralConst()) - { - return mul; - } - - // We don't handle checked casts. - if (op1->gtOverflowEx() || op2->gtOverflowEx()) - { - return mul; - } - - // Move the constant operand to the right to make the logic below more straightfoward. - if (op2->OperIs(GT_CAST) && op1->IsIntegralConst()) + // "IsValidLongMul" and decomposition do not handle constant op1. + if (op1->IsIntegralConst()) { std::swap(op1, op2); mul->gtOp1 = op1; mul->gtOp2 = op2; } - // The operands must have the same extending behavior, since the instruction - // used to compute the result will sign/zero-extend both operands at once. - bool op1ZeroExtends = op1->IsUnsigned(); - bool op2ZeroExtends = op2->OperIs(GT_CAST) ? op2->IsUnsigned() : op2->AsIntConCommon()->IntegralValue() >= 0; - bool op2AnyExtensionIsSuitable = op2->IsIntegralConst() && op2ZeroExtends; - if ((op1ZeroExtends != op2ZeroExtends) && !op2AnyExtensionIsSuitable) + if (!mul->IsValidLongMul()) { return mul; } - if (mul->gtOverflow()) - { - auto getMaxValue = [mul](GenTree* op) -> int64_t { - if (op->OperIs(GT_CAST)) - { - if (op->IsUnsigned()) - { - switch (op->AsCast()->CastOp()->TypeGet()) - { - case TYP_UBYTE: - return UINT8_MAX; - case TYP_USHORT: - return UINT16_MAX; - default: - return UINT32_MAX; - } - } - - return mul->IsUnsigned() ? static_cast(UINT64_MAX) : INT32_MIN; - } - - return op->AsIntConCommon()->IntegralValue(); - }; - - int64_t maxOp1 = getMaxValue(op1); - int64_t maxOp2 = getMaxValue(op2); - - if (CheckedOps::MulOverflows(maxOp1, maxOp2, mul->IsUnsigned())) - { - return mul; - } - - mul->ClearOverflow(); - } - // MUL_LONG needs to do the work the casts would have done. mul->ClearUnsigned(); if (op1->IsUnsigned()) @@ -14576,6 +14506,8 @@ GenTreeOp* Compiler::fgRecognizeAndMorphLongMul(GenTreeOp* mul) mul->SetUnsigned(); } + // "IsValidLongMul" returned "true", so this GT_MUL cannot overflow. + mul->ClearOverflow(); mul->Set64RsltMul(); return fgMorphLongMul(mul); @@ -14595,6 +14527,8 @@ GenTreeOp* Compiler::fgRecognizeAndMorphLongMul(GenTreeOp* mul) // GenTreeOp* Compiler::fgMorphLongMul(GenTreeOp* mul) { + INDEBUG(mul->DebugCheckLongMul()); + GenTree* op1 = mul->gtGetOp1(); GenTree* op2 = mul->gtGetOp2(); From 47f3e0be98c3c8c901a5285385a87c65867fc16d Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Sun, 22 Aug 2021 23:28:23 +0300 Subject: [PATCH 2/6] Fix the definition for MUL_LONG Move it out of the "#if !defined(TARGET_64BIT)" block. Also some miscellaneous moving of comments around. 
--- src/coreclr/jit/gtlist.h | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index 08b19e13f81cfa..127cf9b8811df1 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -133,11 +133,25 @@ GTNODE(RSH , GenTreeOp ,0,GTK_BINOP) GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP) GTNODE(ROL , GenTreeOp ,0,GTK_BINOP) GTNODE(ROR , GenTreeOp ,0,GTK_BINOP) -GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod) -GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply) - // GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn - // the div into a MULHI + some adjustments. In codegen, we only use the - // results of the high register, and we drop the low results. +GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod) + +// Returns high bits (top N bits of the 2N bit result of an NxN multiply) +// GT_MULHI is used in division by a constant (LowerUnsignedDivOrMod). We turn +// the div into a MULHI + some adjustments. In codegen, we only use the +// results of the high register, and we drop the low results. +GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) + +// A mul that returns the 2N bit result of an NxN multiply. This op is used for +// multiplies that take two ints and return a long result. For 32 bit targets, +// all other multiplies with long results are morphed into helper calls. +// It is similar to GT_MULHI, the difference being that GT_MULHI drops the lo +// part of the result, whereas GT_MUL_LONG keeps both parts of the result. +// MUL_LONG is also used on ARM64, where 64 bit multiplication is more expensive. +#if !defined(TARGET_64BIT) +GTNODE(MUL_LONG , GenTreeMultiRegOp ,1,GTK_BINOP) +#elif defined(TARGET_ARM64) +GTNODE(MUL_LONG , GenTreeOp ,1,GTK_BINOP) +#endif GTNODE(ASG , GenTreeOp ,0,(GTK_BINOP|GTK_NOTLIR)) GTNODE(EQ , GenTreeOp ,0,(GTK_BINOP|GTK_RELOP)) @@ -187,16 +201,6 @@ GTNODE(ADD_HI , GenTreeOp ,1,GTK_BINOP) GTNODE(SUB_LO , GenTreeOp ,0,GTK_BINOP) GTNODE(SUB_HI , GenTreeOp ,0,GTK_BINOP) -// A mul that returns the 2N bit result of an NxN multiply. This op is used for -// multiplies that take two ints and return a long result. On 32 bit targets, -// all other multiplies with long results are morphed into helper calls. -// It is similar to GT_MULHI, the difference being that GT_MULHI drops the lo -// part of the result, whereas GT_MUL_LONG keeps both parts of the result. -// MUL_LONG is also used on AMR64, where 64 bit multiplication is more expensive. -#if !defined(TARGET_64BIT) || defined(TARGET_ARM64) -GTNODE(MUL_LONG , GenTreeMultiRegOp ,1,GTK_BINOP) -#endif - // The following are nodes that specify shifts that take a GT_LONG op1. The GT_LONG // contains the hi and lo parts of three operand shift form where one op will be // shifted into the other op as part of the operation (LSH_HI will shift From 3fcc830a93a670302e5460b34b5bf03f700250d7 Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Mon, 23 Aug 2021 00:12:14 +0300 Subject: [PATCH 3/6] Support MUL_LONG for ARM64 in codegen and LSRA Note: this commit breaks magic division. 
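For reference, the shape this enables is the single-instruction widening multiply. A minimal C++ sketch of the semantics GT_MUL_LONG carries (the function and parameter names are illustrative, not taken from the JIT sources):

    #include <cstdint>

    // 32x32->64 bit multiply: both operands come from 32-bit values and the
    // full 64-bit product is kept. On ARM64 this maps to a single SMULL
    // (UMULL for the unsigned case) instead of two extends plus a 64-bit MUL,
    // which is the "twice as fast" case codegen previously special-cased and
    // which GT_MUL_LONG now represents explicitly.
    int64_t WideningMul(int32_t a, int32_t b)
    {
        return static_cast<int64_t>(a) * static_cast<int64_t>(b);
    }
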
--- src/coreclr/jit/codegen.h | 5 +---- src/coreclr/jit/codegenarm.cpp | 21 -------------------- src/coreclr/jit/codegenarm64.cpp | 10 +--------- src/coreclr/jit/codegenarmarch.cpp | 31 +++++++++++++++++++++++++++--- src/coreclr/jit/lsraarm64.cpp | 1 + src/coreclr/jit/lsrabuild.cpp | 2 +- 6 files changed, 32 insertions(+), 38 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 92f3d975073f0f..30793a7d19b93e 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -841,6 +841,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genCodeForMul(GenTreeOp* treeNode); void genCodeForIncSaturate(GenTree* treeNode); void genCodeForMulHi(GenTreeOp* treeNode); + void genCodeForMulLong(GenTreeOp* mul); void genLeaInstruction(GenTreeAddrMode* lea); void genSetRegToCond(regNumber dstReg, GenTree* tree); @@ -848,10 +849,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genScaledAdd(emitAttr attr, regNumber targetReg, regNumber baseReg, regNumber indexReg, int scale); #endif // TARGET_ARMARCH -#if defined(TARGET_ARM) - void genCodeForMulLong(GenTreeMultiRegOp* treeNode); -#endif // TARGET_ARM - #if !defined(TARGET_64BIT) void genLongToIntCast(GenTree* treeNode); #endif diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index 140c86a97dcb9d..0ae592f0bdbc23 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -1642,27 +1642,6 @@ void CodeGen::genEmitHelperCall(unsigned helper, int argSize, emitAttr retSize, regSet.verifyRegistersUsed(RBM_CALLEE_TRASH); } -//------------------------------------------------------------------------ -// genCodeForMulLong: Generates code for int*int->long multiplication -// -// Arguments: -// node - the GT_MUL_LONG node -// -// Return Value: -// None. -// -void CodeGen::genCodeForMulLong(GenTreeMultiRegOp* node) -{ - assert(node->OperGet() == GT_MUL_LONG); - genConsumeOperands(node); - GenTree* src1 = node->gtOp1; - GenTree* src2 = node->gtOp2; - instruction ins = node->IsUnsigned() ? INS_umull : INS_smull; - GetEmitter()->emitIns_R_R_R_R(ins, EA_4BYTE, node->GetRegNum(), node->gtOtherReg, src1->GetRegNum(), - src2->GetRegNum()); - genProduceReg(node); -} - #ifdef PROFILING_SUPPORTED //----------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 90ad7f5ded5110..0d4d9e9da2c20c 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -1849,16 +1849,8 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) // The arithmetic node must be sitting in a register (since it's not contained) assert(targetReg != REG_NA); - emitAttr attr = emitActualTypeSize(treeNode); - // UMULL/SMULL is twice as fast for 32*32->64bit MUL - if ((oper == GT_MUL) && (targetType == TYP_LONG) && genActualTypeIsInt(op1) && genActualTypeIsInt(op2)) - { - ins = treeNode->IsUnsigned() ? 
INS_umull : INS_smull; - attr = EA_4BYTE; - } - - regNumber r = emit->emitInsTernary(ins, attr, treeNode, op1, op2); + regNumber r = emit->emitInsTernary(ins, emitActualTypeSize(treeNode), treeNode, op1, op2); assert(r == targetReg); genProduceReg(treeNode); diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index d540c522af52f7..8b9cf4b6a5f6d8 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -295,11 +295,9 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) genCodeForIndir(treeNode->AsIndir()); break; -#ifdef TARGET_ARM case GT_MUL_LONG: - genCodeForMulLong(treeNode->AsMultiRegOp()); + genCodeForMulLong(treeNode->AsOp()); break; -#endif // TARGET_ARM #ifdef TARGET_ARM64 @@ -3466,6 +3464,33 @@ void CodeGen::genScaledAdd(emitAttr attr, regNumber targetReg, regNumber baseReg } } +//------------------------------------------------------------------------ +// genCodeForMulLong: Generates code for int*int->long multiplication. +// +// Arguments: +// mul - the GT_MUL_LONG node +// +// Return Value: +// None. +// +void CodeGen::genCodeForMulLong(GenTreeOp* mul) +{ + assert(mul->OperIs(GT_MUL_LONG)); + + genConsumeOperands(mul); + + regNumber srcReg1 = mul->gtGetOp1()->GetRegNum(); + regNumber srcReg2 = mul->gtGetOp2()->GetRegNum(); + instruction ins = mul->IsUnsigned() ? INS_umull : INS_smull; +#ifdef TARGET_ARM + GetEmitter()->emitIns_R_R_R_R(ins, EA_4BYTE, mul->GetRegNum(), mul->AsMultiRegOp()->gtOtherReg, srcReg1, srcReg2); +#else + GetEmitter()->emitIns_R_R_R(ins, EA_4BYTE, mul->GetRegNum(), srcReg1, srcReg2); +#endif + + genProduceReg(mul); +} + //------------------------------------------------------------------------ // genLeaInstruction: Produce code for a GT_LEA node. // diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 4e3fc015ae98a7..3bb42a1269d57e 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -308,6 +308,7 @@ int LinearScan::BuildNode(GenTree* tree) case GT_DIV: case GT_MULHI: + case GT_MUL_LONG: case GT_UDIV: { srcCount = BuildBinaryUses(tree->AsOp()); diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 0287746e452dab..38e7683f431bb9 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1049,7 +1049,7 @@ regMaskTP LinearScan::getKillSetForNode(GenTree* tree) case GT_MUL: case GT_MULHI: -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) || defined(TARGET_ARM64) case GT_MUL_LONG: #endif killMask = getKillSetForMul(tree->AsOp()); From 537f63284e240d65e91ceb28ee4c9d61c31a1660 Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Mon, 23 Aug 2021 01:51:12 +0300 Subject: [PATCH 4/6] Fix magic division To use the new GT_MUL_LONG support. Plus some tweaks to make it conform to the coding guide. Credit to @pentp for the substantive changes. 
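For readers less familiar with the transform: "magic division" here is the lowering that replaces an unsigned division by a constant with a multiply by a precomputed reciprocal followed by a shift, and on ARM64 the 32-bit case wants that multiply to be a 32x32->64 GT_MUL_LONG rather than a 64x64 MUL. A standalone C++ sketch of the arithmetic (the constants are for division by 10 specifically; the helper name is illustrative):

    #include <cstdint>

    // n / 10 for any uint32_t n, without a divide instruction: multiply by
    // ceil(2^35 / 10) = 0xCCCCCCCD, keep the full 64-bit product, then shift
    // right by 35. The 32x32->64 multiply is the step that the ARM64 lowering
    // in this series can now express as GT_MUL_LONG (umull).
    uint32_t DivideBy10(uint32_t n)
    {
        return static_cast<uint32_t>((static_cast<uint64_t>(n) * 0xCCCCCCCDu) >> 35);
    }

This roughly mirrors what LowerUnsignedDivOrMod builds in the diff below: a widened multiply by the magic constant, an optional post-shift, and a conversion back to the 32-bit result type.
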
--- src/coreclr/jit/gentree.h | 6 ++-- src/coreclr/jit/lower.cpp | 59 ++++++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 90681f1daf86b4..52d8da22db3887 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1428,7 +1428,7 @@ struct GenTree static bool OperIsMul(genTreeOps gtOper) { return (gtOper == GT_MUL) || (gtOper == GT_MULHI) -#if !defined(TARGET_64BIT) +#if !defined(TARGET_64BIT) || defined(TARGET_ARM64) || (gtOper == GT_MUL_LONG) #endif ; @@ -2112,13 +2112,13 @@ struct GenTree void SetUnsigned() { - assert(OperIs(GT_ADD, GT_SUB, GT_MUL, GT_CAST)); + assert(OperIs(GT_ADD, GT_SUB, GT_CAST) || OperIsMul()); gtFlags |= GTF_UNSIGNED; } void ClearUnsigned() { - assert(OperIs(GT_ADD, GT_SUB, GT_MUL, GT_CAST)); + assert(OperIs(GT_ADD, GT_SUB, GT_CAST) || OperIsMul()); gtFlags &= ~GTF_UNSIGNED; } diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 92388eb0ea8864..34f72b5c0c2a30 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5300,6 +5300,14 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) GenTree* firstNode = nullptr; GenTree* adjustedDividend = dividend; +#ifdef TARGET_ARM64 + // On ARM64 we will use a 32x32->64 bit multiply instead of a 64x64->64 one. + bool widenToNativeIntForMul = (type != TYP_I_IMPL) && !simpleMul; +#else + CLANG_FORMAT_COMMENT_ANCHOR; + bool widenToNativeIntForMul = (type != TYP_I_IMPL); +#endif + // If "increment" flag is returned by GetUnsignedMagic we need to do Saturating Increment first if (increment) { @@ -5316,13 +5324,9 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend); firstNode = preShiftBy; } - else if (type != TYP_I_IMPL -#ifdef TARGET_ARM64 - && !simpleMul // On ARM64 we will use a 32x32->64 bit multiply as that's faster. -#endif - ) + else if (widenToNativeIntForMul) { - adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL); + adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_I_IMPL); BlockRange().InsertBefore(divMod, adjustedDividend); firstNode = adjustedDividend; } @@ -5331,44 +5335,50 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) // force input transformation to RAX because the following MULHI will kill RDX:RAX anyway and LSRA often causes // reduntant copies otherwise if (firstNode && !simpleMul) + { adjustedDividend->SetRegNum(REG_RAX); + } #endif - divisor->gtType = TYP_I_IMPL; - -#ifdef TARGET_ARM64 - if (simpleMul) + if (widenToNativeIntForMul) { - divisor->gtType = TYP_INT; + divisor->gtType = TYP_I_IMPL; } -#endif - divisor->AsIntCon()->SetIconValue(magic); - if (isDiv && !postShift && type == TYP_I_IMPL) + if (isDiv && !postShift && (type == TYP_I_IMPL)) { divMod->SetOper(GT_MULHI); divMod->gtOp1 = adjustedDividend; - divMod->gtFlags |= GTF_UNSIGNED; + divMod->SetUnsigned(); } else { - // Insert a new GT_MULHI node before the existing GT_UDIV/GT_UMOD node. +#ifdef TARGET_ARM64 + // 64-bit MUL is more expensive than UMULL on ARM64. + genTreeOps mulOper = simpleMul ? GT_MUL_LONG : GT_MULHI; +#else + // 64-bit IMUL is less expensive than MUL eax:edx on x64. + genTreeOps mulOper = simpleMul ? GT_MUL : GT_MULHI; +#endif + // Insert a new multiplication node before the existing GT_UDIV/GT_UMOD node. // The existing node will later be transformed into a GT_RSZ/GT_SUB that // computes the final result. 
This way don't need to find and change the use // of the existing node. - GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor); - mulhi->gtFlags |= GTF_UNSIGNED; + GenTree* mulhi = comp->gtNewOperNode(mulOper, TYP_I_IMPL, adjustedDividend, divisor); + mulhi->SetUnsigned(); BlockRange().InsertBefore(divMod, mulhi); - if (!firstNode) + if (firstNode == nullptr) + { firstNode = mulhi; + } if (postShift) { GenTree* shiftBy = comp->gtNewIconNode(postShift, TYP_INT); BlockRange().InsertBefore(divMod, shiftBy); - if (isDiv && type == TYP_I_IMPL) + if (isDiv && (type == TYP_I_IMPL)) { divMod->SetOper(GT_RSZ); divMod->gtOp1 = mulhi; @@ -5398,8 +5408,8 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) { #ifdef TARGET_ARMARCH divMod->SetOper(GT_CAST); - divMod->gtFlags |= GTF_UNSIGNED; - divMod->AsCast()->gtCastType = TYP_UINT; + divMod->SetUnsigned(); + divMod->AsCast()->gtCastType = TYP_INT; #else divMod->SetOper(GT_BITCAST); #endif @@ -5408,8 +5418,11 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod) } } - if (firstNode) + if (firstNode != nullptr) + { ContainCheckRange(firstNode, divMod); + } + return true; } #endif From 5336b7249a6bdacfd1a91ccd56a5fbc313b38884 Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Mon, 23 Aug 2021 12:08:30 +0300 Subject: [PATCH 5/6] Recognize MUL_LONG in lowering for ARM64 --- src/coreclr/jit/lower.cpp | 5 ++- src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerarmarch.cpp | 54 ++++++++++++++++++++++++++++++++ src/coreclr/jit/lowerxarch.cpp | 20 ++++++++++++ 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 34f72b5c0c2a30..fdc29db7f8c319 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -144,11 +144,10 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_MUL: case GT_MULHI: -#if defined(TARGET_X86) +#if defined(TARGET_X86) || defined(TARGET_ARM64) case GT_MUL_LONG: #endif - ContainCheckMul(node->AsOp()); - break; + return LowerMul(node->AsOp()); case GT_UDIV: case GT_UMOD: diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 1cc2be0f66a763..65e936b8b21671 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -296,6 +296,7 @@ class Lowering final : public Phase void LowerIndir(GenTreeIndir* ind); void LowerStoreIndir(GenTreeStoreInd* node); GenTree* LowerAdd(GenTreeOp* node); + GenTree* LowerMul(GenTreeOp* mul); bool LowerUnsignedDivOrMod(GenTreeOp* divMod); GenTree* LowerConstIntDivOrMod(GenTree* node); GenTree* LowerSignedDivOrMod(GenTree* node); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 5faae030f800bc..c1691ea6b508ad 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -223,6 +223,60 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) ContainCheckStoreIndir(node); } +//------------------------------------------------------------------------ +// LowerMul: Lower a GT_MUL/GT_MULHI/GT_MUL_LONG node. +// +// For ARM64 recognized GT_MULs that can be turned into GT_MUL_LONGs, as +// those are cheaper. Performs contaiment checks. +// +// Arguments: +// mul - The node to lower +// +// Return Value: +// The next node to lower. 
+// +GenTree* Lowering::LowerMul(GenTreeOp* mul) +{ + assert(mul->OperIsMul()); + +#ifdef TARGET_ARM64 + if (comp->opts.OptimizationEnabled() && mul->OperIs(GT_MUL) && mul->IsValidLongMul()) + { + GenTreeCast* op1 = mul->gtGetOp1()->AsCast(); + GenTree* op2 = mul->gtGetOp2(); + + mul->ClearOverflow(); + mul->ClearUnsigned(); + if (op1->IsUnsigned()) + { + mul->SetUnsigned(); + } + + mul->gtOp1 = op1->CastOp(); + BlockRange().Remove(op1); + + if (op2->OperIs(GT_CAST)) + { + mul->gtOp2 = op2->AsCast()->CastOp(); + BlockRange().Remove(op2); + } + else + { + assert(op2->IsIntegralConst()); + assert(FitsIn(op2->AsIntConCommon()->IntegralValue())); + + op2->ChangeType(TYP_INT); + } + + mul->ChangeOper(GT_MUL_LONG); + } +#endif // TARGET_ARM64 + + ContainCheckMul(mul); + + return mul->gtNext; +} + //------------------------------------------------------------------------ // LowerBlockStore: Lower a block store node // diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 51d19542e47552..9f4909aa240494 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -170,6 +170,26 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) ContainCheckStoreIndir(node); } +//------------------------------------------------------------------------ +// LowerMul: Lower a GT_MUL/GT_MULHI/GT_MUL_LONG node. +// +// Currently only performs containment checks. +// +// Arguments: +// mul - The node to lower +// +// Return Value: +// The next node to lower. +// +GenTree* Lowering::LowerMul(GenTreeOp* mul) +{ + assert(mul->OperIsMul()); + + ContainCheckMul(mul); + + return mul->gtNext; +} + //------------------------------------------------------------------------ // LowerBlockStore: Lower a block store node // From 8a7932e06c2b5de894e67cfa36ba3bf5a9b1ed7f Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Tue, 5 Oct 2021 17:52:53 +0300 Subject: [PATCH 6/6] Fix #ifdef's --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/gentree.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 30793a7d19b93e..7fb88ab2b5806b 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -841,12 +841,12 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genCodeForMul(GenTreeOp* treeNode); void genCodeForIncSaturate(GenTree* treeNode); void genCodeForMulHi(GenTreeOp* treeNode); - void genCodeForMulLong(GenTreeOp* mul); void genLeaInstruction(GenTreeAddrMode* lea); void genSetRegToCond(regNumber dstReg, GenTree* tree); #if defined(TARGET_ARMARCH) void genScaledAdd(emitAttr attr, regNumber targetReg, regNumber baseReg, regNumber indexReg, int scale); + void genCodeForMulLong(GenTreeOp* mul); #endif // TARGET_ARMARCH #if !defined(TARGET_64BIT) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 52d8da22db3887..a51ca884484b1b 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -3018,10 +3018,11 @@ struct GenTreeOp : public GenTreeUnOp #if !defined(TARGET_64BIT) || defined(TARGET_ARM64) bool IsValidLongMul(); +#endif + #if !defined(TARGET_64BIT) && defined(DEBUG) void DebugCheckLongMul(); #endif -#endif #if DEBUGGABLE_GENTREE GenTreeOp() : GenTreeUnOp(), gtOp2(nullptr)