diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index bb1206bfb5c5bf..4435e2aea5f691 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3471,36 +3471,34 @@ unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree*
 void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 {
     GenTree* src = putArgNode->AsOp()->gtOp1;
-    // We will never call this method for SIMD types, which are stored directly
-    // in genPutStructArgStk().
+    // We will never call this method for SIMD types, which are stored directly in genPutStructArgStk().
     assert(src->isContained() && src->OperIs(GT_OBJ) && src->TypeIs(TYP_STRUCT));
-    assert(!src->AsObj()->GetLayout()->HasGCPtr());
 #ifdef TARGET_X86
     assert(!m_pushStkArg);
 #endif
 
-    unsigned size = putArgNode->GetStackByteSize();
-#ifdef TARGET_X86
-    assert((XMM_REGSIZE_BYTES <= size) && (size <= CPBLK_UNROLL_LIMIT));
-#else // !TARGET_X86
-    assert(size <= CPBLK_UNROLL_LIMIT);
-#endif // !TARGET_X86
-
     if (src->AsOp()->gtOp1->isUsedFromReg())
     {
         genConsumeReg(src->AsOp()->gtOp1);
     }
 
+    unsigned loadSize = putArgNode->GetArgLoadSize();
+    assert(!src->AsObj()->GetLayout()->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
+
     unsigned offset = 0;
 
     regNumber xmmTmpReg  = REG_NA;
     regNumber intTmpReg  = REG_NA;
     regNumber longTmpReg = REG_NA;
 
-    if (size >= XMM_REGSIZE_BYTES)
+#ifdef TARGET_X86
+    if (loadSize >= 8)
+#else
+    if (loadSize >= XMM_REGSIZE_BYTES)
+#endif
     {
         xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
     }
 
-    if ((size % XMM_REGSIZE_BYTES) != 0)
+    if ((loadSize % XMM_REGSIZE_BYTES) != 0)
     {
         intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
     }
@@ -3512,7 +3510,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 #endif
 
     // Let's use SSE2 to be able to do 16 byte at a time with loads and stores.
-    size_t slots = size / XMM_REGSIZE_BYTES;
+    size_t slots = loadSize / XMM_REGSIZE_BYTES;
     while (slots-- > 0)
     {
         // TODO: In the below code the load and store instructions are for 16 bytes, but the
@@ -3528,13 +3526,13 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     }
 
     // Fill the remainder (15 bytes or less) if there's one.
-    if ((size % XMM_REGSIZE_BYTES) != 0)
+    if ((loadSize % XMM_REGSIZE_BYTES) != 0)
     {
-        offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
-        offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-        offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-        offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-        assert(offset == size);
+        offset += genMove8IfNeeded(loadSize, longTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove4IfNeeded(loadSize, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove2IfNeeded(loadSize, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove1IfNeeded(loadSize, intTmpReg, src->AsOp()->gtOp1, offset);
+        assert(offset == loadSize);
     }
 }
 
@@ -3570,8 +3568,9 @@ void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
 //    putArgNode  - the PutArgStk tree.
 //
 // Notes:
-//    Used only on x86, in two cases:
+//    Used (only) on x86 for:
 //    - Structs 4, 8, or 12 bytes in size (less than XMM_REGSIZE_BYTES, multiple of TARGET_POINTER_SIZE).
+//    - Local structs less than 16 bytes in size (it is ok to load "too much" from our stack frame).
 //    - Structs that contain GC pointers - they are guaranteed to be sized correctly by the VM.
 //
 void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
@@ -3605,9 +3604,9 @@ void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
     }
 
     ClassLayout* layout = src->AsObj()->GetLayout();
 
-    const unsigned byteSize = putArgNode->GetStackByteSize();
-    assert((byteSize % TARGET_POINTER_SIZE == 0) && ((byteSize < XMM_REGSIZE_BYTES) || layout->HasGCPtr()));
-    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    const unsigned loadSize = putArgNode->GetArgLoadSize();
+    assert(((loadSize < XMM_REGSIZE_BYTES) || layout->HasGCPtr()) && ((loadSize % TARGET_POINTER_SIZE) == 0));
+    const unsigned numSlots = loadSize / TARGET_POINTER_SIZE;
 
     for (int i = numSlots - 1; i >= 0; --i)
     {
@@ -3654,9 +3653,9 @@ void CodeGen::genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgNode)
 #endif // DEBUG
 
     assert(layout->HasGCPtr());
-    const unsigned byteSize = putArgNode->GetStackByteSize();
-    assert(byteSize % TARGET_POINTER_SIZE == 0);
-    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    const unsigned argSize = putArgNode->GetStackByteSize();
+    assert(argSize % TARGET_POINTER_SIZE == 0);
+    const unsigned numSlots = argSize / TARGET_POINTER_SIZE;
 
     // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always.
     for (unsigned i = 0; i < numSlots;)
@@ -5525,23 +5524,17 @@ void CodeGen::genCall(GenTreeCall* call)
         GenTree* argNode = arg.GetEarlyNode();
         if (argNode->OperIs(GT_PUTARG_STK) && (arg.GetLateNode() == nullptr))
         {
-            GenTree* source = argNode->AsPutArgStk()->gtGetOp1();
-            unsigned size   = argNode->AsPutArgStk()->GetStackByteSize();
-            stackArgBytes += size;
+            GenTree* source  = argNode->AsPutArgStk()->gtGetOp1();
+            unsigned argSize = argNode->AsPutArgStk()->GetStackByteSize();
+            stackArgBytes += argSize;
+
 #ifdef DEBUG
-            assert(size == arg.AbiInfo.ByteSize);
+            assert(argSize == arg.AbiInfo.ByteSize);
 #ifdef FEATURE_PUT_STRUCT_ARG_STK
-            if (!source->OperIs(GT_FIELD_LIST) && (source->TypeGet() == TYP_STRUCT))
+            if (source->TypeIs(TYP_STRUCT) && !source->OperIs(GT_FIELD_LIST))
             {
-                GenTreeObj* obj      = source->AsObj();
-                unsigned    argBytes = roundUp(obj->GetLayout()->GetSize(), TARGET_POINTER_SIZE);
-#ifdef TARGET_X86
-                // If we have an OBJ, we must have created a copy if the original arg was not a
-                // local and was not a multiple of TARGET_POINTER_SIZE.
-                // Note that on x64/ux this will be handled by unrolling in genStructPutArgUnroll.
-                assert((argBytes == obj->GetLayout()->GetSize()) || obj->Addr()->IsLocalAddrExpr());
-#endif // TARGET_X86
-                assert(arg.AbiInfo.ByteSize == argBytes);
+                unsigned loadSize = source->AsObj()->GetLayout()->GetSize();
+                assert(argSize == roundUp(loadSize, TARGET_POINTER_SIZE));
             }
 #endif // FEATURE_PUT_STRUCT_ARG_STK
 #endif // DEBUG
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 6ef9cb3032e0af..1f249d4607e5d5 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -7442,12 +7442,18 @@ struct GenTreePutArgStk : public GenTreeUnOp
     // TODO-Throughput: The following information should be obtained from the child
     // block node.
 
-    enum class Kind : __int8{
+    enum class Kind : int8_t{
         Invalid, RepInstr, PartialRepInstr, Unroll, Push,
     };
     Kind gtPutArgStkKind;
-#endif
+
+#ifdef TARGET_XARCH
+private:
+    uint8_t m_argLoadSizeDelta;
+#endif // TARGET_XARCH
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+public:
 
     GenTreePutArgStk(genTreeOps oper,
                      var_types  type,
                      GenTree*   op1,
@@ -7473,7 +7479,10 @@
 #endif // FEATURE_FASTTAILCALL
 #if defined(FEATURE_PUT_STRUCT_ARG_STK)
         , gtPutArgStkKind(Kind::Invalid)
+#if defined(TARGET_XARCH)
+        , m_argLoadSizeDelta(UINT8_MAX)
 #endif
+#endif // FEATURE_PUT_STRUCT_ARG_STK
     {
     }
 
@@ -7520,6 +7529,36 @@ struct GenTreePutArgStk : public GenTreeUnOp
         return m_byteSize;
     }
 
+#ifdef TARGET_XARCH
+    //------------------------------------------------------------------------
+    // SetArgLoadSize: Set the optimal number of bytes to load for this argument.
+    //
+    // On XARCH, it is profitable to use wider loads when our source is a local
+    // variable. To not duplicate the logic between lowering, LSRA and codegen,
+    // we do the legality check once, in lowering, and save the result here, as
+    // a negative delta relative to the size of the argument with padding.
+    //
+    // Arguments:
+    //    loadSize - The optimal number of bytes to load
+    //
+    void SetArgLoadSize(unsigned loadSize)
+    {
+        unsigned argSize = GetStackByteSize();
+        assert(roundUp(loadSize, TARGET_POINTER_SIZE) == argSize);
+
+        m_argLoadSizeDelta = static_cast<uint8_t>(argSize - loadSize);
+    }
+
+    //------------------------------------------------------------------------
+    // GetArgLoadSize: Get the optimal number of bytes to load for this argument.
+    //
+    unsigned GetArgLoadSize() const
+    {
+        assert(m_argLoadSizeDelta != UINT8_MAX);
+        return GetStackByteSize() - m_argLoadSizeDelta;
+    }
+#endif // TARGET_XARCH
+
     // Return true if this is a PutArgStk of a SIMD12 struct.
     // This is needed because such values are re-typed to SIMD16, and the type of PutArgStk is VOID.
     unsigned isSIMD12() const
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 46990d9f6dbc64..abe5043f59d76e 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -545,30 +545,24 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
         // The cpyXXXX code is rather complex and this could cause it to be more complex, but
         // it might be the right thing to do.
 
-        unsigned size     = putArgStk->GetStackByteSize();
-        unsigned loadSize = layout->GetSize();
-
-        assert(loadSize <= size);
+        // If possible, widen the load, this results in more compact code.
+        unsigned loadSize = srcIsLocal ? roundUp(layout->GetSize(), TARGET_POINTER_SIZE) : layout->GetSize();
+        putArgStk->SetArgLoadSize(loadSize);
 
         // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
         // (I don't know which).
-
         if (!layout->HasGCPtr())
        {
 #ifdef TARGET_X86
             // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
-            // chunks. As such, the correctness of this code depends on the fact that
-            // morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
-            // thus preventing any possible out-of-bounds memory reads.
-            assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
-                   (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
-            if (size < XMM_REGSIZE_BYTES)
+            // chunks. As such, we'll only use this path for correctly-sized sources.
+            if ((loadSize < XMM_REGSIZE_BYTES) && ((loadSize % TARGET_POINTER_SIZE) == 0))
             {
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
             }
             else
 #endif // TARGET_X86
-                if (size <= CPBLK_UNROLL_LIMIT)
+                if (loadSize <= CPBLK_UNROLL_LIMIT)
             {
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
             }
@@ -604,7 +598,7 @@
         // so if possible, widen the load to avoid the sign/zero-extension.
         if (varTypeIsSmall(regType) && srcIsLocal)
         {
-            assert(putArgStk->GetStackByteSize() <= genTypeSize(TYP_INT));
+            assert(genTypeSize(TYP_INT) <= putArgStk->GetStackByteSize());
             regType = TYP_INT;
         }
 
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 5386ed5769558c..5be7b32972da62 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1561,21 +1561,31 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
         return BuildOperandUses(src);
     }
 
-    ssize_t size = putArgStk->GetStackByteSize();
+    unsigned loadSize = putArgStk->GetArgLoadSize();
     switch (putArgStk->gtPutArgStkKind)
     {
         case GenTreePutArgStk::Kind::Unroll:
             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-            if ((size % XMM_REGSIZE_BYTES) != 0)
+            if ((loadSize % XMM_REGSIZE_BYTES) != 0)
            {
                 regMaskTP regMask = allRegs(TYP_INT);
+#ifdef TARGET_X86
+                // Storing at byte granularity requires a byteable register.
+                if ((loadSize & 1) != 0)
+                {
+                    regMask &= allByteRegs();
+                }
+#endif // TARGET_X86
                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
             }
 
-            if (size >= XMM_REGSIZE_BYTES)
+#ifdef TARGET_X86
+            if (loadSize >= 8)
+#else
+            if (loadSize >= XMM_REGSIZE_BYTES)
+#endif
             {
-                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES, reserve
-                // an XMM register to use it for a series of 16-byte loads and stores.
+                // See "genStructPutArgUnroll" -- we will use this XMM register for wide stores.
                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
                 SetContainsAVXFlags();
             }
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 4e65b06d9a75fc..8daedf2b0c7f78 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3435,36 +3435,6 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                     assert(varTypeIsEnregisterable(argObj->TypeGet()) ||
                            (makeOutArgCopy && varTypeIsEnregisterable(structBaseType)));
                 }
-
-#if !defined(UNIX_AMD64_ABI) && !defined(TARGET_ARMARCH) && !defined(TARGET_LOONGARCH64)
-                // TODO-CQ-XARCH: there is no need for a temp copy if we improve our code generation in
-                // `genPutStructArgStk` for xarch like we did it for Arm/Arm64.
-
-                // We still have a struct unless we converted the GT_OBJ into a GT_IND above...
-                if (isHfaArg && passUsingFloatRegs)
-                {
-                }
-                else if (structBaseType == TYP_STRUCT)
-                {
-                    // If the valuetype size is not a multiple of TARGET_POINTER_SIZE,
-                    // we must copyblk to a temp before doing the obj to avoid
-                    // the obj reading memory past the end of the valuetype
-                    if (roundupSize > originalSize)
-                    {
-                        makeOutArgCopy = true;
-
-                        // There are a few special cases where we can omit using a CopyBlk
-                        // where we normally would need to use one.
-
-                        if (argObj->OperIs(GT_OBJ) &&
-                            argObj->AsObj()->gtGetOp1()->IsLocalAddrExpr() != nullptr) // Is the source a LclVar?
-                        {
-                            makeOutArgCopy = false;
-                        }
-                    }
-                }
-
-#endif // !UNIX_AMD64_ABI
             }
         }
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_65937/Runtime_65937.cs b/src/tests/JIT/Regression/JitBlue/Runtime_65937/Runtime_65937.cs
new file mode 100644
index 00000000000000..2591dd373a0fe5
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_65937/Runtime_65937.cs
@@ -0,0 +1,71 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+public unsafe class Runtime_65937
+{
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    public static int Main()
+    {
+        if (!OperatingSystem.IsLinux())
+        {
+            return 100;
+        }
+
+        const int PROT_NONE = 0x0;
+        const int PROT_READ = 0x1;
+        const int PROT_WRITE = 0x2;
+        const int MAP_PRIVATE = 0x02;
+        const int MAP_ANONYMOUS = 0x20;
+        const int PAGE_SIZE = 0x1000;
+
+        byte* pages = (byte*)mmap(null, 2 * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+        if (pages == (byte*)-1)
+        {
+            Console.WriteLine("Failed to allocate two pages, errno is {0}, giving up on the test", Marshal.GetLastSystemError());
+            return 100;
+        }
+
+        if (mprotect(pages + PAGE_SIZE, PAGE_SIZE, PROT_NONE) != 0)
+        {
+            Console.WriteLine("Failed to protect the second page, errno is {0}, giving up on the test", Marshal.GetLastSystemError());
+            munmap(pages, 2 * PAGE_SIZE);
+            return 100;
+        }
+
+        CallWithStkArg(0, 0, 0, 0, 0, 0, *(StructWithNineBytes*)(pages + PAGE_SIZE - sizeof(StructWithNineBytes)));
+
+        munmap(pages, 2 * PAGE_SIZE);
+
+        return 100;
+    }
+
+    struct StructWithNineBytes
+    {
+        byte ByteOne;
+        byte ByteTwo;
+        byte ByteThree;
+        byte ByteFour;
+        byte ByteFive;
+        byte ByteSix;
+        byte ByteSeven;
+        byte ByteEight;
+        byte ByteNine;
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static void CallWithStkArg(int a, int b, int c, int d, int e, int f, StructWithNineBytes stkArg) { }
+
+    [DllImport("libc")]
+    private static extern void* mmap(void* addr, nuint length, int prot, int flags, int fd, nuint offset);
+
+    [DllImport("libc")]
+    private static extern int mprotect(void* addr, nuint len, int prot);
+
+    [DllImport("libc")]
+    private static extern int munmap(void* addr, nuint length);
+}
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_65937/Runtime_65937.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_65937/Runtime_65937.csproj
new file mode 100644
index 00000000000000..9379af57e1c721
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_65937/Runtime_65937.csproj
@@ -0,0 +1,11 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+    <OutputType>Exe</OutputType>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
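
Reviewer note, not part of the patch: the new SetArgLoadSize/GetArgLoadSize pair avoids growing GenTreePutArgStk by storing the widened load size as a small delta below the padded argument size in a spare uint8_t. Below is a minimal standalone C++ sketch of that encoding under the same assumptions; the names (ArgNode, kPointerSize, RoundUp) are illustrative stand-ins, not JIT identifiers.

// Standalone sketch of the delta-based load-size encoding (assumed semantics,
// modeled on GenTreePutArgStk::SetArgLoadSize/GetArgLoadSize above).
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr unsigned kPointerSize = 8; // stand-in for TARGET_POINTER_SIZE

constexpr unsigned RoundUp(unsigned size, unsigned alignment)
{
    return (size + alignment - 1) & ~(alignment - 1);
}

struct ArgNode
{
    unsigned stackByteSize;             // padded outgoing-arg size, a multiple of kPointerSize
    uint8_t  loadSizeDelta = UINT8_MAX; // UINT8_MAX means "load size not set yet"

    void SetArgLoadSize(unsigned loadSize)
    {
        // The load must stay within the same padded slot: rounding it up
        // has to reproduce the padded size.
        assert(RoundUp(loadSize, kPointerSize) == stackByteSize);
        loadSizeDelta = static_cast<uint8_t>(stackByteSize - loadSize);
    }

    unsigned GetArgLoadSize() const
    {
        assert(loadSizeDelta != UINT8_MAX);
        return stackByteSize - loadSizeDelta;
    }
};

int main()
{
    // A 9-byte struct occupies a 16-byte stack slot; a non-local source may
    // only load the 9 bytes that actually exist (cf. the Runtime_65937 test).
    ArgNode arg{RoundUp(9, kPointerSize)};
    arg.SetArgLoadSize(9);
    printf("padded size: %u, load size: %u\n", arg.stackByteSize, arg.GetArgLoadSize());
    return 0;
}

The UINT8_MAX sentinel mirrors the patch's m_argLoadSizeDelta initializer, so querying the load size before lowering has set it trips an assert instead of returning a stale value.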