Use "push" for structs less than 16 bytes in size
It is smaller and a little faster than using an XMM
register to first load and then store the struct.
SingleAccretion committed Feb 15, 2022
commit 1b59ae9597ea79c6027ed88ee3cc084343585c1d
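
To make the motivation concrete, here is a hand-written sketch of the two strategies for a hypothetical 12-byte struct argument on x86 (illustrative only, not actual JIT output; the register choices are arbitrary):

// XMM load/store (before):             push (after):
//   movq xmm0, qword ptr [eax]           push dword ptr [eax+8]
//   movq qword ptr [esp], xmm0           push dword ptr [eax+4]
//   mov  ecx, dword ptr [eax+8]          push dword ptr [eax]
//   mov  dword ptr [esp+8], ecx

The pushes need no temporary register, adjust the stack pointer as they go, and have smaller encodings than the XMM load/store pair, which is where the size and speed win comes from.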
src/coreclr/jit/codegenxarch.cpp (149 changes: 50 additions & 99 deletions)
@@ -3337,108 +3337,64 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     GenTree* src = putArgNode->AsOp()->gtOp1;
     // We will never call this method for SIMD types, which are stored directly
     // in genPutStructArgStk().
-    noway_assert(src->TypeGet() == TYP_STRUCT);
+    assert(src->isContained() && src->OperIs(GT_OBJ) && src->TypeIs(TYP_STRUCT));
+    assert(!src->AsObj()->GetLayout()->HasGCPtr());
+#ifdef TARGET_X86
+    assert(!m_pushStkArg);
+#endif
 
     unsigned size = putArgNode->GetStackByteSize();
-    assert(size <= CPBLK_UNROLL_LIMIT);
-
-    emitter* emit         = GetEmitter();
-    unsigned putArgOffset = putArgNode->getArgOffset();
-
-    assert(src->isContained());
-
-    assert(src->gtOper == GT_OBJ);
+    assert((XMM_REGSIZE_BYTES <= size) && (size <= CPBLK_UNROLL_LIMIT));
 
     if (src->AsOp()->gtOp1->isUsedFromReg())
     {
         genConsumeReg(src->AsOp()->gtOp1);
     }
 
-    unsigned offset = 0;
-
+    unsigned  offset     = 0;
     regNumber xmmTmpReg  = REG_NA;
     regNumber intTmpReg  = REG_NA;
     regNumber longTmpReg = REG_NA;
-#ifdef TARGET_X86
-    // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's
-    // less than 16 bytes, we will just be using pushes
-    if (size >= 8)
-    {
-        xmmTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
-        longTmpReg = xmmTmpReg;
-    }
-    if ((size & 0x7) != 0)
-    {
-        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
-    }
-#else // !TARGET_X86
-    // On x64 we use an XMM register only for 16-byte chunks.
 
     if (size >= XMM_REGSIZE_BYTES)
     {
         xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
     }
-    if ((size & 0xf) != 0)
+    if ((size % XMM_REGSIZE_BYTES) != 0)
     {
-        intTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLINT);
-        longTmpReg = intTmpReg;
+        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
     }
-#endif // !TARGET_X86
 
-    // If the size of this struct is larger than 16 bytes
-    // let's use SSE2 to be able to do 16 byte at a time
-    // loads and stores.
-    if (size >= XMM_REGSIZE_BYTES)
-    {
 #ifdef TARGET_X86
-        assert(!m_pushStkArg);
-#endif // TARGET_X86
-        size_t slots = size / XMM_REGSIZE_BYTES;
-
-        assert(putArgNode->gtGetOp1()->isContained());
-        assert(putArgNode->gtGetOp1()->AsOp()->gtOper == GT_OBJ);
+    longTmpReg = xmmTmpReg;
+#else
+    longTmpReg = intTmpReg;
+#endif
 
-        // TODO: In the below code the load and store instructions are for 16 bytes, but the
-        //       type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
-        //       this probably needs to be changed.
-        while (slots-- > 0)
-        {
-            // Load
-            genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
-
-            // Store
-            genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
-
-            offset += XMM_REGSIZE_BYTES;
-        }
-    }
+    // Let's use SSE2 to be able to do 16 byte at a time with loads and stores.
+    size_t slots = size / XMM_REGSIZE_BYTES;
+    while (slots-- > 0)
+    {
+        // TODO: In the below code the load and store instructions are for 16 bytes, but the
+        // type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
+        // this probably needs to be changed.
+
+        // Load
+        genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
+        // Store
+        genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
+
+        offset += XMM_REGSIZE_BYTES;
+    }
 
     // Fill the remainder (15 bytes or less) if there's one.
-    if ((size & 0xf) != 0)
+    if ((size % XMM_REGSIZE_BYTES) != 0)
     {
-#ifdef TARGET_X86
-        if (m_pushStkArg)
-        {
-            // This case is currently supported only for the case where the total size is
-            // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse
-            // order. However, morph has ensured that we have a struct that is an even
-            // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment.
-            assert(((size & 0xc) == size) && (offset == 0));
-
-            // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on
-            // whether we've got an 8 byte chunk, and then push it on the stack.
-            unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, size & 0x8);
-
-            // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk)
-            // and push it on the stack.
-            pushedBytes += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, 0);
-        }
-        else
-#endif // TARGET_X86
-        {
-            offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            assert(offset == size);
-        }
+        offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        assert(offset == size);
     }
 }
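
For reference, the remainder chain above works because each genMove{8,4,2,1}IfNeeded call emits one power-of-two-sized move when the corresponding bit of "size" is set. A minimal standalone C++ sketch of that decomposition (hypothetical helper, not the JIT's code):

#include <cstdio>
#include <initializer_list>

// Hypothetical stand-in for the genMove{8,4,2,1}IfNeeded chain: emit one move
// when the power-of-two chunk is part of "size", and report the bytes handled.
unsigned moveIfNeeded(unsigned size, unsigned chunk, unsigned offset)
{
    if ((size & chunk) == 0)
        return 0; // this chunk does not appear in the size's low bits
    printf("move %u bytes at offset %u\n", chunk, offset);
    return chunk;
}

int main()
{
    unsigned size   = 29; // one 16-byte XMM chunk plus a 13-byte tail
    unsigned offset = 16; // the XMM loop has already copied bytes [0..16)
    for (unsigned chunk : {8u, 4u, 2u, 1u})
        offset += moveIfNeeded(size, chunk, offset);
    // Emits moves of 8 bytes at 16, 4 bytes at 24 and 1 byte at 28; offset ends
    // equal to size, mirroring the "assert(offset == size)" in the real code.
}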

@@ -3474,15 +3430,16 @@ void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
 //    putArgNode - the PutArgStk tree.
 //
 // Notes:
-//    Used only on x86, for structs that contain GC pointers - they are guaranteed to be sized
-//    correctly (to a multiple of TARGET_POINTER_SIZE) by the VM.
+//    Used only on x86, in two cases:
+//    - Structs 4, 8, or 12 bytes in size (less than XMM_REGSIZE_BYTES, multiple of TARGET_POINTER_SIZE).
+//    - Structs that contain GC pointers - they are guaranteed to be sized correctly by the VM.
 //
 void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
 {
-    // On x86, any struct that has contains GC references must be stored to the stack using `push` instructions so
+    // On x86, any struct that contains GC references must be stored to the stack using `push` instructions so
     // that the emitter properly detects the need to update the method's GC information.
     //
-    // Strictly speaking, it is only necessary to use `push` to store the GC references themselves, so for structs
+    // Strictly speaking, it is only necessary to use "push" to store the GC references themselves, so for structs
     // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
     // future.
     assert(m_pushStkArg);
@@ -7574,40 +7531,30 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
     }
 #endif // FEATURE_SIMD
 
-    // Determine if we need to pre-adjust the stack, that is, we're not
-    // going to use "push"es. We'll be using those if and only if:
-    //
-    // - We have a "push" kind set in Lowering (see "LowerPutArgStk" for details).
-    // - We have an "Unroll" kind set for a struct less than 16 bytes in size.
-    //   This is a special case in which we're going to call "genStructPutArgUnroll"
-    //   and use "push".
-
-    bool preAdjustStack;
+#ifdef DEBUG
     switch (putArgStk->gtPutArgStkKind)
     {
         case GenTreePutArgStk::Kind::RepInstr:
-            assert(!source->AsObj()->GetLayout()->HasGCPtr());
-            preAdjustStack = true;
-            break;
-
         case GenTreePutArgStk::Kind::Unroll:
             assert(!source->AsObj()->GetLayout()->HasGCPtr());
-            preAdjustStack = argSize >= XMM_REGSIZE_BYTES;
            break;
 
         case GenTreePutArgStk::Kind::Push:
         case GenTreePutArgStk::Kind::PushAllSlots:
-            assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr());
-            preAdjustStack = false;
+            assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() ||
+                   (argSize < XMM_REGSIZE_BYTES));
            break;
 
         default:
            unreached();
     }
+#endif // DEBUG
 
-    m_pushStkArg = !preAdjustStack;
-
-    if (preAdjustStack)
+    // In lowering (see "LowerPutArgStk") we have determined what sort of instructions
+    // are going to be used for this node. If we'll not be using "push"es, the stack
+    // needs to be adjusted first (s. t. the SP points to the base of the outgoing arg).
+    //
+    if (!putArgStk->isPushKind())
    {
        // If argSize is large, we need to probe the stack like we do in the prolog (genAllocLclFrame)
        // or for localloc (genLclHeap), to ensure we touch the stack pages sequentially, and don't miss
@@ -7629,9 +7576,13 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
         }
 
         AddStackLevel(argSize);
+        m_pushStkArg = false;
+        return true;
     }
 
-    return preAdjustStack;
+    // Otherwise, "push" will be adjusting the stack for us.
+    m_pushStkArg = true;
+    return false;
 }
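
The return-value contract can be modeled in a few lines: a hedged C++ sketch (plain arithmetic standing in for emitted instructions) of why both disciplines leave SP at the base of the outgoing argument, assuming a 12-byte argument on x86:

#include <cassert>

// Hedged model of the two stack disciplines chosen above (illustrative only;
// the real function emits instructions rather than doing arithmetic).
int main()
{
    const unsigned argSize = 12;     // hypothetical outgoing struct argument
    const unsigned esp     = 0x1000; // pretend initial stack pointer

    // Non-push kinds: one up-front "sub esp, argSize", then plain stores at
    // [esp+0], [esp+4], [esp+8]; returning true tells the caller to pop the
    // pre-adjustment once the call is done.
    unsigned preAdjusted = esp - argSize;

    // Push kinds: no explicit adjustment; each 4-byte "push" both stores and
    // decrements ESP, hence the function returns false.
    unsigned pushed = esp;
    for (int i = 0; i < 3; ++i)
        pushed -= 4;

    // Either way, SP ends at the base of the outgoing argument area.
    assert(preAdjusted == pushed);
    return 0;
}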
src/coreclr/jit/gentree.h (7 changes: 1 addition & 6 deletions)
@@ -6775,12 +6775,7 @@ struct GenTreePutArgStk : public GenTreeUnOp
     // block node.
 
     enum class Kind : __int8{
-        Invalid,
-        RepInstr,
-        PartialRepInstr,
-        Unroll,
-        Push,
-        PushAllSlots,
+        Invalid, RepInstr, PartialRepInstr, Unroll, Push, PushAllSlots,
     };
     Kind gtPutArgStkKind;
 #endif
src/coreclr/jit/lowerxarch.cpp (18 changes: 15 additions & 3 deletions)
@@ -599,11 +599,23 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
 
         // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
         // (I don't know which).
+        CLANG_FORMAT_COMMENT_ANCHOR;
 
         if (!layout->HasGCPtr())
         {
-            if (size <= CPBLK_UNROLL_LIMIT)
+#ifdef TARGET_X86
+            if (size < XMM_REGSIZE_BYTES)
+            {
+                // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
+                // chunks. As such, the correctness of this code depends on the fact that
+                // morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
+                // thus preventing any possible out-of-bounds memory reads.
+                assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
+                       (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
+                putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+            }
+            else
+#endif // TARGET_X86
+                if (size <= CPBLK_UNROLL_LIMIT)
             {
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
             }
@@ -618,7 +630,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
             // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
             // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
             putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
-#else // !TARGET_X86
+#else  // !TARGET_X86
             putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
 #endif // !TARGET_X86
         }
src/coreclr/jit/lsraxarch.cpp (11 changes: 4 additions & 7 deletions)
@@ -1587,22 +1587,19 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
             }
 
-#ifdef TARGET_X86
-            if (size >= 8)
-#else // !TARGET_X86
             if (size >= XMM_REGSIZE_BYTES)
-#endif // !TARGET_X86
             {
-                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
-                // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
-                // series of 16-byte loads and stores.
+                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES, reserve
+                // an XMM register to use it for a series of 16-byte loads and stores.
                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
                 SetContainsAVXFlags();
             }
             break;
 
         case GenTreePutArgStk::Kind::RepInstr:
+#ifndef TARGET_X86
         case GenTreePutArgStk::Kind::PartialRepInstr:
+#endif
             buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
             buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
             buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);