Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Fix ARM64 unsigned div by const perf regression
  • Loading branch information
pentp committed Aug 13, 2021
commit 976bf2a74a6f1b3d82df4edf20f82f901b2f21c5
5 changes: 4 additions & 1 deletion src/coreclr/jit/assertionprop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5270,7 +5270,10 @@ Compiler::fgWalkResult Compiler::optVNConstantPropCurStmt(BasicBlock* block, Sta

case GT_INC_SATURATE:
case GT_MULHI:
assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI node encountered before lowering");
#ifdef TARGET_ARM64
case GT_MULWIDE:
#endif
assert(false && "Unexpected GT_INC_SATURATE/GT_MULHI/GT_MULWIDE node encountered before lowering");
break;

case GT_JTRUE:
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void genCodeForMul(GenTreeOp* treeNode);
void genCodeForIncSaturate(GenTree* treeNode);
void genCodeForMulHi(GenTreeOp* treeNode);
void genCodeForMulWide(GenTreeOp* treeNode);
void genLeaInstruction(GenTreeAddrMode* lea);
void genSetRegToCond(regNumber dstReg, GenTree* tree);

Expand Down
20 changes: 20 additions & 0 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1817,6 +1817,26 @@ void CodeGen::genCodeForMulHi(GenTreeOp* treeNode)
genProduceReg(treeNode);
}

// Generate code to get the 64-bits of a 32*32 bit multiplication result
void CodeGen::genCodeForMulWide(GenTreeOp* treeNode)
Comment on lines +1820 to +1821
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this a GT_MUL_LONG? We only use for 32 bit targets now, but it should be straightforward to use it on ARM64 too.

{
assert(!treeNode->gtOverflowEx());
assert(!varTypeIsFloating(treeNode->TypeGet()));

genConsumeOperands(treeNode);

// The arithmetic node must be sitting in a register (since it's not contained)
regNumber targetReg = treeNode->GetRegNum();
assert(targetReg != REG_NA);

assert(EA_SIZE(emitActualTypeSize(treeNode)) == EA_8BYTE);
instruction ins = (treeNode->gtFlags & GTF_UNSIGNED) ? INS_umull : INS_smull;
regNumber r = GetEmitter()->emitInsTernary(ins, EA_4BYTE, treeNode, treeNode->gtGetOp1(), treeNode->gtGetOp2());
assert(r == targetReg);

genProduceReg(treeNode);
}

// Generate code for ADD, SUB, MUL, DIV, UDIV, AND, OR and XOR
// This method is expected to have called genConsumeOperands() before calling it.
void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
genCodeForMulHi(treeNode->AsOp());
break;

case GT_MULWIDE:
genCodeForMulWide(treeNode->AsOp());
break;

case GT_SWAP:
genCodeForSwap(treeNode->AsOp());
break;
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1435,6 +1435,9 @@ struct GenTree
static bool OperIsMul(genTreeOps gtOper)
{
return (gtOper == GT_MUL) || (gtOper == GT_MULHI)
#ifdef TARGET_ARM64
|| (gtOper == GT_MULWIDE)
#endif
#if !defined(TARGET_64BIT)
|| (gtOper == GT_MUL_LONG)
#endif
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/gtlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROL , GenTreeOp ,0,GTK_BINOP)
GTNODE(ROR , GenTreeOp ,0,GTK_BINOP)
GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod)
#ifdef TARGET_ARM64
GTNODE(MULWIDE , GenTreeOp ,1,GTK_BINOP) // widening multiply, used in division by a constant on ARM64 (LowerUnsignedDivOrMod)
#endif
GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) // returns high bits (top N bits of the 2N bit result of an NxN multiply)
// GT_MULHI is used in division by a constant (fgMorphDivByConst). We turn
// the div into a MULHI + some adjustments. In codegen, we only use the
Expand Down
23 changes: 19 additions & 4 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ GenTree* Lowering::LowerNode(GenTree* node)

case GT_MUL:
case GT_MULHI:
#ifdef TARGET_ARM64
case GT_MULWIDE:
#endif
#if defined(TARGET_X86)
case GT_MUL_LONG:
#endif
Expand Down Expand Up @@ -5254,7 +5257,11 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
BlockRange().InsertBefore(divMod, preShiftBy, adjustedDividend);
firstNode = preShiftBy;
}
else if (type != TYP_I_IMPL)
else if (type != TYP_I_IMPL
#ifdef TARGET_ARM64
&& !simpleMul
#endif
)
{
adjustedDividend = comp->gtNewCastNode(TYP_I_IMPL, adjustedDividend, true, TYP_U_IMPL);
BlockRange().InsertBefore(divMod, adjustedDividend);
Expand All @@ -5268,7 +5275,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
adjustedDividend->SetRegNum(REG_RAX);
#endif

divisor->gtType = TYP_I_IMPL;
divisor->gtType = simpleMul ? TYP_INT : TYP_I_IMPL;
divisor->AsIntCon()->SetIconValue(magic);

if (isDiv && !postShift && type == TYP_I_IMPL)
Expand All @@ -5283,7 +5290,12 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
// The existing node will later be transformed into a GT_RSZ/GT_SUB that
// computes the final result. This way don't need to find and change the use
// of the existing node.
GenTree* mulhi = comp->gtNewOperNode(simpleMul ? GT_MUL : GT_MULHI, TYP_I_IMPL, adjustedDividend, divisor);
#ifdef TARGET_ARM64
genTreeOps mulOp = simpleMul ? GT_MULWIDE : GT_MULHI; // 64-bit MUL is more expensive than UMULL on ARM64
#else
genTreeOps mulOp = simpleMul ? GT_MUL : GT_MULHI; // 64-bit IMUL is less expensive than MUL eax:edx on x64
#endif
GenTree* mulhi = comp->gtNewOperNode(mulOp, TYP_I_IMPL, adjustedDividend, divisor);
mulhi->gtFlags |= GTF_UNSIGNED;
BlockRange().InsertBefore(divMod, mulhi);
if (!firstNode)
Expand Down Expand Up @@ -5322,7 +5334,7 @@ bool Lowering::LowerUnsignedDivOrMod(GenTreeOp* divMod)
}
else if (type != TYP_I_IMPL)
{
#ifdef TARGET_ARMARCH
#ifdef TARGET_ARM64
divMod->SetOper(GT_CAST);
divMod->gtFlags |= GTF_UNSIGNED;
divMod->AsCast()->gtCastType = TYP_UINT;
Expand Down Expand Up @@ -6394,6 +6406,9 @@ void Lowering::ContainCheckNode(GenTree* node)
#endif
case GT_MUL:
case GT_MULHI:
#ifdef TARGET_ARM64
case GT_MULWIDE:
#endif
ContainCheckMul(node->AsOp());
break;
case GT_DIV:
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ int LinearScan::BuildNode(GenTree* tree)

case GT_DIV:
case GT_MULHI:
case GT_MULWIDE:
case GT_UDIV:
{
srcCount = BuildBinaryUses(tree->AsOp());
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/lsrabuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1049,6 +1049,9 @@ regMaskTP LinearScan::getKillSetForNode(GenTree* tree)

case GT_MUL:
case GT_MULHI:
#ifdef TARGET_ARM64
case GT_MULWIDE:
#endif
#if !defined(TARGET_64BIT)
case GT_MUL_LONG:
#endif
Expand Down