Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b88ff31
Replace successive "ldr" and "str" instructions with "ldp" and "stp"
AndyJGraham Sep 6, 2022
f0c918c
No longer use a temporary buffer to build the optimized instruction.
AndyJGraham Oct 31, 2022
f1b236e
Addressed assorted review comments.
AndyJGraham Nov 1, 2022
c0533bd
Now optimizes ascending locations and decending locations with
AndyJGraham Nov 3, 2022
372ee97
Modification to remove last instructions.
AndyJGraham Nov 14, 2022
12fc291
Merge branch 'main'
AndyJGraham Nov 15, 2022
0b377ed
Ongoing improvements to remove previously-emitted instruction
AndyJGraham Nov 29, 2022
46b85f8
Stopped optimization of consecutive instructions that straddled an in…
AndyJGraham Dec 1, 2022
e4741f9
Addressed code change requests in GitHub.
AndyJGraham Dec 1, 2022
2822f64
Merge branch 'main'
AndyJGraham Dec 1, 2022
10a4510
Various fixes to ldp/stp optimization
BruceForstall Dec 2, 2022
d80a69a
Merge pull request #1 from BruceForstall/LdpStp_Modifications_Fixes
AndyJGraham Dec 5, 2022
f6a49bf
Delete unnecessary and incorrect assert
BruceForstall Dec 7, 2022
ed4d070
Merge pull request #2 from BruceForstall/LdpStp_Modifications_FixAsse…
AndyJGraham Dec 7, 2022
4b0e51e
Diagnostic change only, to confirm whether a theory is correct or
AndyJGraham Dec 9, 2022
2997a8e
Revert "Diagnostic change only, to confirm whether a theory is correc…
AndyJGraham Dec 14, 2022
f0907cc
Do not merge. Temporarily removed calls to
AndyJGraham Dec 14, 2022
c5c4234
Modifications to better update the IP mapping table for a replaced in…
AndyJGraham Dec 15, 2022
bb8fdea
Merge branch 'main' of ssh://gerrit.oss.arm.com/enterprise-llt/dotnet…
AndyJGraham Dec 16, 2022
65eed90
Minor formatting change.
AndyJGraham Dec 16, 2022
e03b375
Check for out of range offsets
a74nh Jan 10, 2023
2cef6fc
Don't optimise during prolog/epilog
a74nh Jan 16, 2023
41a9828
Merge branch 'dotnet:main' into LdpStp_Modifications
a74nh Jan 16, 2023
ba89fd3
Fix windows build error
a74nh Jan 16, 2023
1fbf423
Merge branch main
a74nh Jan 19, 2023
ca9a325
IGF_HAS_REMOVED_INSTR is ARM64 only
a74nh Jan 20, 2023
e66ad66
Add OptimizeLdrStr function
a74nh Jan 20, 2023
8b44843
Fix formatting
a74nh Jan 20, 2023
2e7aaf6
Ensure local variables are tracked
a74nh Jan 24, 2023
fe76782
Don't peephole local variables
a74nh Jan 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Replace successive "ldr" and "str" instructions with "ldp" and "stp"
This change serves to address the following four Github tickets:

    1. ARM64: Optimize pair of "ldr reg, [fp]" to ldp #35130
    2. ARM64: Optimize pair of "ldr reg, [reg]" to ldp #35132
    3. ARM64: Optimize pair of "str reg, [reg]" to stp #35133
    4. ARM64: Optimize pair of "str reg, [fp]" to stp  #35134

A technique was employed that involved detecting an optimisation
opportunity as instruction sequences were being generated.
The optimised instruction was then generated on top of the previous
instruction, with no second instruction generated. Thus, there were no
changes to instruction group size at “emission time” and no changes to
jump instructions.
  • Loading branch information
AndyJGraham committed Oct 27, 2022
commit b88ff31656035dee297bf90363b38f99dbd3b8a9
221 changes: 215 additions & 6 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5676,6 +5676,22 @@ void emitter::emitIns_R_R_I(
{
return;
}

if (emitComp->opts.OptimizationEnabled() && IsOptimisableLdrStr(ins, reg1, reg2, imm, size, fmt))
{
regNumber oldReg1 = emitLastIns->idReg1();
ssize_t oldImm =
emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
ssize_t scaledOldImm = oldImm * size;

// Overwrite the "sub-optimal" instruction with the *optimised* instruction, directly
// into the output buffer.
emitIns_R_R_R_I(optIns, attr, oldReg1, reg1, reg2, scaledOldImm, INS_OPTS_NONE, EA_UNKNOWN, emitLastIns);

// And now stop here, as the second instruction descriptor is no longer emitted.
return;
}
}
else if (isAddSub)
{
Expand Down Expand Up @@ -6491,7 +6507,8 @@ void emitter::emitIns_R_R_R_I(instruction ins,
regNumber reg3,
ssize_t imm,
insOpts opt /* = INS_OPTS_NONE */,
emitAttr attrReg2 /* = EA_UNKNOWN */)
emitAttr attrReg2 /* = EA_UNKNOWN */,
instrDesc* reuseInstr /* = nullptr */)
{
emitAttr size = EA_SIZE(attr);
emitAttr elemsize = EA_UNKNOWN;
Expand Down Expand Up @@ -6626,6 +6643,7 @@ void emitter::emitIns_R_R_R_I(instruction ins,
scale = (size == EA_8BYTE) ? 3 : 2;
}
isLdSt = true;
fmt = IF_LS_3C;
break;

case INS_ld1:
Expand Down Expand Up @@ -6906,7 +6924,58 @@ void emitter::emitIns_R_R_R_I(instruction ins,
}
assert(fmt != IF_NONE);

instrDesc* id = emitNewInstrCns(attr, imm);
// An "instrDesc" will *always* be required.
// Under normal circumstances the instruction
// will be added to the emitted group. However,
// this is not correct for instructions that
// are going to overwrite already-emitted
// instructions and we therefore need space to
// hold the new instruction descriptor.
instrDesc* id;

// One cannot simply instantiate an instruction
// descriptor, so this array will be used to
// hold the instruction being built.
unsigned char tempInstrDesc[sizeof(instrDesc)];

// Now the instruction is either emitted OR
// used to overwrite the previously-emitted
// instruction.
if (reuseInstr == nullptr)
{
id = emitNewInstrCns(attr, imm);
}
else
{
id = (instrDesc*)tempInstrDesc;

memset(id, 0, sizeof(tempInstrDesc));

// Store the size and handle the two special
// values that indicate GCref and ByRef

if (EA_IS_GCREF(attr))
{
// A special value indicates a GCref pointer value

id->idGCref(GCT_GCREF);
id->idOpSize(EA_PTRSIZE);
}
else if (EA_IS_BYREF(attr))
{
// A special value indicates a Byref pointer value

id->idGCref(GCT_BYREF);
id->idOpSize(EA_PTRSIZE);
}
else
{
id->idGCref(GCT_NONE);
id->idOpSize(EA_SIZE(attr));
}

id->idSmallCns(imm);
}

id->idIns(ins);
id->idInsFmt(fmt);
Expand All @@ -6932,8 +7001,18 @@ void emitter::emitIns_R_R_R_I(instruction ins,
}
}

dispIns(id);
appendToCurIG(id);
// Now the instruction is EITHER emitted OR used to overwrite the previously-emitted instruction.
if (reuseInstr == nullptr)
{
// Then this is the standard exit path and the instruction is to be appended to the instruction group.
dispIns(id);
appendToCurIG(id);
}
else
{
// The instruction is copied over the last emitted instruction.
memcpy(reuseInstr, id, sizeof(tempInstrDesc));
}
}

/*****************************************************************************
Expand Down Expand Up @@ -7623,8 +7702,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
{
bool useRegForImm = false;
ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate

imm = disp;
imm = disp;
if (imm == 0)
{
fmt = IF_LS_2A;
Expand Down Expand Up @@ -7670,6 +7748,25 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va

assert(fmt != IF_NONE);

// This handles LDR duplicate instructions
if (emitComp->opts.OptimizationEnabled() && IsOptimisableLdrStr(ins, reg1, reg2, imm, size, fmt))
{
regNumber oldReg1 = emitLastIns->idReg1();
ssize_t oldImm =
emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
ssize_t scaledOldImm = oldImm * size;

// Overwrite the "sub-optimal" instruction with the *optimised* instruction, directly
// into the output buffer.
emitIns_R_R_R_I(optIns, attr, oldReg1, reg1, reg2, scaledOldImm, INS_OPTS_NONE, EA_UNKNOWN, emitLastIns);

// And now stop here, as the second instruction descriptor is no longer emitted.
return;
}

// We need to simply emit the instruction unchanged

instrDesc* id = emitNewInstrCns(attr, imm);

id->idIns(ins);
Expand Down Expand Up @@ -7901,6 +7998,22 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va

assert(fmt != IF_NONE);

if (emitComp->opts.OptimizationEnabled() && IsOptimisableLdrStr(ins, reg1, reg2, imm, size, fmt))
{
regNumber oldReg1 = emitLastIns->idReg1();
ssize_t oldImm =
emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();
instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
ssize_t scaledOldImm = oldImm * size;

// Overwrite the "sub-optimal" instruction with the *optimised* instruction, directly
// into the output buffer.
emitIns_R_R_R_I(optIns, attr, oldReg1, reg1, reg2, scaledOldImm, INS_OPTS_NONE, EA_UNKNOWN, emitLastIns);

// And now stop here, as the second instruction descriptor is no longer emitted.
return;
}

instrDesc* id = emitNewInstrCns(attr, imm);

id->idIns(ins);
Expand Down Expand Up @@ -16128,4 +16241,100 @@ bool emitter::IsRedundantLdStr(

return false;
}

//-----------------------------------------------------------------------------------
// IsOptimisableLdrStr: Check if it is possible to optimise two "ldr" or "str"
//                      instructions into a single "ldp" or "stp" instruction.
//
// Arguments:
//     ins  - The instruction code
//     reg1 - Register 1 number (the register being loaded or stored)
//     reg2 - Register 2 number (the base address register)
//     imm  - Immediate offset, prior to scaling by operand size
//     size - Operand size
//     fmt  - Instruction format
//
// Return Value:
//     true if the previously-emitted instruction (emitLastIns) and the instruction
//     described by the arguments can be replaced by one "ldp"/"stp"; false otherwise.
//
// Notes:
//     Only the ascending case is recognised: the new instruction must address the
//     slot immediately after the one addressed by the previous instruction.
//

bool emitter::IsOptimisableLdrStr(
    instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt)
{
    if ((ins != INS_ldr) && (ins != INS_str))
    {
        // Only "ldr" and "str" have a pair form that we generate here.
        return false;
    }

    const bool isFirstInstrInBlock = (emitCurIGinsCnt == 0) && ((emitCurIG->igFlags & IGF_EXTEND) == 0);

    if (isFirstInstrInBlock || (emitLastIns == nullptr))
    {
        // There is no previous instruction in this block to pair with.
        return false;
    }

    // Validate the kind of the previous instruction before interpreting any of
    // its operand fields.
    if (emitLastIns->idIns() != ins)
    {
        // Not successive ldr or str instructions.
        return false;
    }

    if (emitLastIns->idInsFmt() != fmt)
    {
        // The formats of the two instructions differ.
        return false;
    }

    if (emitSizeOfInsDsc(emitLastIns) != sizeof(instrDesc))
    {
        // The previous descriptor is not of the standard size, so it
        // cannot be overwritten in place by the pair instruction.
        return false;
    }

    if (emitLastIns->idOpSize() != size)
    {
        // Operand sizes differ.
        return false;
    }

    const regNumber prevReg1 = emitLastIns->idReg1();
    const regNumber prevReg2 = emitLastIns->idReg2();
    const ssize_t   prevImm =
        emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns();

    // The LDP/STP immediate is a signed 7-bit field, so the raw (unscaled)
    // value must lie in [-64, +63]. LDR/STR accept a wider range, so the
    // limit has to be enforced explicitly here.
    if ((imm < -64) || (imm > 63) || (prevImm < -64) || (prevImm > 63))
    {
        // One or more of the immediate values is out of range.
        return false;
    }

    if (imm != (prevImm + 1))
    {
        // Not consecutive immediate values.
        return false;
    }

    if ((!isGeneralRegisterOrZR(reg1)) || (!isGeneralRegisterOrZR(prevReg1)))
    {
        // Either register 1 or the previous register 1 is neither a
        // general register nor the zero register.
        return false;
    }

    if (!((reg2 == prevReg2) && isGeneralRegisterOrSP(reg2)))
    {
        // The "register 2" numbers need to be the same AND be a general
        // register or the stack pointer.
        return false;
    }

    if (emitInsIsLoad(ins))
    {
        if (reg1 == prevReg1)
        {
            // Cannot load to the same register twice.
            return false;
        }

        if (prevReg1 == prevReg2)
        {
            // The previous load overwrote the base register, so this load
            // addresses memory relative to the newly-loaded value. A single
            // "ldp" would compute both addresses from the original base,
            // which changes the meaning of the second load.
            return false;
        }
    }

    return true;
}

#endif // defined(TARGET_ARM64)
7 changes: 4 additions & 3 deletions src/coreclr/jit/emitarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ static UINT64 Replicate_helper(UINT64 value, unsigned width, emitAttr size);
static bool IsMovInstruction(instruction ins);
bool IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regNumber src, bool canSkip);
bool IsRedundantLdStr(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);

bool IsOptimisableLdrStr(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
/************************************************************************
*
* This union is used to to encode/decode the special ARM64 immediate values
Expand Down Expand Up @@ -775,8 +775,9 @@ void emitIns_R_R_R_I(instruction ins,
regNumber reg2,
regNumber reg3,
ssize_t imm,
insOpts opt = INS_OPTS_NONE,
emitAttr attrReg2 = EA_UNKNOWN);
insOpts opt = INS_OPTS_NONE,
emitAttr attrReg2 = EA_UNKNOWN,
instrDesc* reuseInstr = nullptr);

void emitIns_R_R_R_Ext(instruction ins,
emitAttr attr,
Expand Down