diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp
index 68d9c3baa163df..64fb68cba56c7c 100644
--- a/src/coreclr/jit/lsra.cpp
+++ b/src/coreclr/jit/lsra.cpp
@@ -363,6 +363,45 @@ void LinearScan::updateSpillCost(regNumber reg, Interval* interval)
 #endif
 }
 
+//------------------------------------------------------------------------
+// updateRegsFreeBusyState: Update various register masks and global state to track
+//    which registers are free and which are busy.
+//
+// Arguments:
+//    refPosition     - RefPosition for which we need to update the state.
+//    regsBusy        - Mask of registers that are busy.
+//    regsToFree      - Mask of registers that are set to be freed.
+//    delayRegsToFree - Mask of registers that are set to be freed after a delay.
+//    interval        - Interval of the RefPosition.
+//    assignedReg     - Register assigned to this RefPosition.
+//
+void LinearScan::updateRegsFreeBusyState(RefPosition& refPosition,
+                                         regMaskTP    regsBusy,
+                                         regMaskTP*   regsToFree,
+                                         regMaskTP*   delayRegsToFree DEBUG_ARG(Interval* interval)
+                                             DEBUG_ARG(regNumber assignedReg))
+{
+    regsInUseThisLocation |= regsBusy;
+    if (refPosition.lastUse)
+    {
+        if (refPosition.delayRegFree)
+        {
+            INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, interval, assignedReg));
+            *delayRegsToFree |= regsBusy;
+            regsInUseNextLocation |= regsBusy;
+        }
+        else
+        {
+            INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, interval, assignedReg));
+            *regsToFree |= regsBusy;
+        }
+    }
+    else if (refPosition.delayRegFree)
+    {
+        regsInUseNextLocation |= regsBusy;
+    }
+}
+
 //------------------------------------------------------------------------
 // internalFloatRegCandidates: Return the set of registers that are appropriate
 //                             for use as internal float registers.
@@ -445,6 +484,17 @@ regMaskTP LinearScan::getConstrainedRegMask(regMaskTP regMaskActual, regMaskTP r
 
 regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask)
 {
+#ifdef TARGET_ARM64
+    if ((refPosition != nullptr) && refPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation))
+    {
+        // If we are assigning to refPositions that have consecutive register requirements, skip the
+        // limit stress for them, because there is a high chance that many registers are busy for the
+        // consecutive requirement and we do not have enough remaining to assign other refpositions
+        // (like operands). Likewise, skip for the definition node that comes after that, for which
+        // all the registers are in the "delayRegFree" state.
+        return mask;
+    }
+#endif
     if (getStressLimitRegs() != LSRA_LIMIT_NONE)
     {
         // The refPosition could be null, for example when called
@@ -5394,34 +5444,18 @@ void LinearScan::allocateRegisters()
 
                     regMaskTP copyRegMask     = getRegMask(copyReg, currentInterval->registerType);
                     regMaskTP assignedRegMask = getRegMask(assignedRegister, currentInterval->registerType);
 
-                    // For consecutive register, it doesn't matter what the assigned register was.
-                    // We have just assigned it `copyRegMask` and that's the one in-use, and not the
-                    // one that was assigned previously.
+                    // For consecutive registers, it shouldn't matter what the assigned register was,
+                    // because we have just assigned `copyReg` and that's the one in use, not the one
+                    // that was assigned previously. However, if an upper-vector restore happened to be
+                    // restored into assignedReg, we need assignedReg to stay alive, because we will
+                    // copy the entire vector value from it to `copyReg`.
-                    regsInUseThisLocation |= copyRegMask;
-                    if (currentRefPosition.lastUse)
-                    {
-                        if (currentRefPosition.delayRegFree)
-                        {
-                            INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, currentInterval,
-                                                            assignedRegister));
-                            delayRegsToFree |= copyRegMask | assignedRegMask;
-                            regsInUseNextLocation |= copyRegMask | assignedRegMask;
-                        }
-                        else
-                        {
-                            INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, currentInterval,
-                                                            assignedRegister));
-                            regsToFree |= copyRegMask | assignedRegMask;
-                        }
-                    }
-                    else
+                    updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, &regsToFree,
+                                            &delayRegsToFree DEBUG_ARG(currentInterval)
+                                                DEBUG_ARG(assignedRegister));
+                    if (!currentRefPosition.lastUse)
                     {
                         copyRegsToFree |= copyRegMask;
-                        if (currentRefPosition.delayRegFree)
-                        {
-                            regsInUseNextLocation |= copyRegMask | assignedRegMask;
-                        }
                     }
 
                     // If this is a tree temp (non-localVar) interval, we will need an explicit move.
@@ -5508,40 +5542,18 @@ void LinearScan::allocateRegisters()
                         // registers.
                         assignConsecutiveRegisters(&currentRefPosition, copyReg);
                     }
-
-                    // For consecutive register, it doesn't matter what the assigned register was.
-                    // We have just assigned it `copyRegMask` and that's the one in-use, and not the
-                    // one that was assigned previously.
-
-                    regsInUseThisLocation |= copyRegMask;
                 }
-                else
 #endif
-                {
-                    regsInUseThisLocation |= copyRegMask | assignedRegMask;
-                }
-                if (currentRefPosition.lastUse)
-                {
-                    if (currentRefPosition.delayRegFree)
-                    {
-                        INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, currentInterval,
-                                                        assignedRegister));
-                        delayRegsToFree |= copyRegMask | assignedRegMask;
-                        regsInUseNextLocation |= copyRegMask | assignedRegMask;
-                    }
-                    else
-                    {
-                        INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, currentInterval, assignedRegister));
-                        regsToFree |= copyRegMask | assignedRegMask;
-                    }
-                }
-                else
+                // For consecutive registers, it shouldn't matter what the assigned register was,
+                // because we have just assigned `copyReg` and that's the one in use, not the one
+                // that was assigned previously. However, if an upper-vector restore happened to be
+                // restored into assignedReg, we need assignedReg to stay alive, because we will
+                // copy the entire vector value from it to `copyReg`.
+                updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, &regsToFree,
+                                        &delayRegsToFree DEBUG_ARG(currentInterval) DEBUG_ARG(assignedRegister));
+                if (!currentRefPosition.lastUse)
                 {
                     copyRegsToFree |= copyRegMask;
-                    if (currentRefPosition.delayRegFree)
-                    {
-                        regsInUseNextLocation |= copyRegMask | assignedRegMask;
-                    }
                 }
 
                 // If this is a tree temp (non-localVar) interval, we will need an explicit move.
@@ -6675,12 +6687,30 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree,
     LIR::Range& blockRange = LIR::AsRange(block);
     if (tree != nullptr)
     {
-        JITDUMP("before %d.%s:\n", tree->gtTreeID, GenTree::OpName(tree->gtOper));
         LIR::Use treeUse;
+        GenTree* useNode  = nullptr;
         bool     foundUse = blockRange.TryGetUse(tree, &treeUse);
+        useNode           = treeUse.User();
+
+#ifdef TARGET_ARM64
+        if (refPosition->needsConsecutive && useNode->OperIs(GT_FIELD_LIST))
+        {
+            // A tree node that requires consecutive registers is represented as a GT_FIELD_LIST.
+            // When restoring the upper vector, make sure to restore it at the point where the
+            // GT_FIELD_LIST is consumed, rather than where the individual field is consumed (which
+            // will always be at GT_FIELD_LIST creation time). That way, we restore the upper
+            // vector just before the intrinsic uses it.
+            LIR::Use fieldListUse;
+            foundUse = blockRange.TryGetUse(useNode, &fieldListUse);
+            treeUse  = fieldListUse;
+            useNode  = treeUse.User();
+        }
+#endif
         assert(foundUse);
+        JITDUMP("before %d.%s:\n", useNode->gtTreeID, GenTree::OpName(useNode->gtOper));
+
         // We need to insert the restore prior to the use, not (necessarily) immediately after the lclVar.
-        blockRange.InsertBefore(treeUse.User(), LIR::SeqTree(compiler, simdUpperRestore));
+        blockRange.InsertBefore(useNode, LIR::SeqTree(compiler, simdUpperRestore));
     }
     else
     {
@@ -12066,19 +12096,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval,
     }
 
 #ifdef DEBUG
-#ifdef TARGET_ARM64
-    if (!refPosition->needsConsecutive && (linearScan->consecutiveRegistersLocation == refPosition->nodeLocation))
-    {
-        // If a method has consecutive registers and we are assigning to refPositions that are not part
-        // of consecutive registers, but are live at same location, skip the limit stress for them, because
-        // there are high chances that many registers are busy for consecutive requirements and we don't
-        // have enough remaining for other refpositions (like operands).
-    }
-    else
-#endif
-    {
-        candidates = linearScan->stressLimitRegs(refPosition, candidates);
-    }
+    candidates = linearScan->stressLimitRegs(refPosition, candidates);
 #endif
 
     assert(candidates != RBM_NONE);
diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index 0aef405f35a9f9..f9de491dfc199d 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -1784,6 +1784,12 @@ class LinearScan : public LinearScanInterface
     void clearSpillCost(regNumber reg, var_types regType);
     void updateSpillCost(regNumber reg, Interval* interval);
 
+    FORCEINLINE void updateRegsFreeBusyState(RefPosition& refPosition,
+                                             regMaskTP    regsBusy,
+                                             regMaskTP*   regsToFree,
+                                             regMaskTP*   delayRegsToFree DEBUG_ARG(Interval* interval)
+                                                 DEBUG_ARG(regNumber assignedReg));
+
     regMaskTP m_RegistersWithConstants;
     void      clearConstantReg(regNumber reg, var_types regType)
     {
@@ -2652,6 +2658,10 @@ class RefPosition
         }
         return false;
     }
+
+#ifdef DEBUG
+    bool isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation);
+#endif
 #endif // TARGET_ARM64
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 5e0385b7fc1b20..ebb0588b62b355 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -417,6 +417,15 @@ regMaskTP LinearScan::getConsecutiveCandidates(regMaskTP allCandidates,
     assert(refPosition->isFirstRefPositionOfConsecutiveRegisters());
     regMaskTP freeCandidates = allCandidates & m_AvailableRegs;
 
+#ifdef DEBUG
+    if (getStressLimitRegs() != LSRA_LIMIT_NONE)
+    {
+        // For stress, make only alternate registers available so we can stress the selection of free/busy registers.
+        freeCandidates &= (RBM_V0 | RBM_V2 | RBM_V4 | RBM_V6 | RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14 | RBM_V16 |
+                           RBM_V18 | RBM_V20 | RBM_V22 | RBM_V24 | RBM_V26 | RBM_V28 | RBM_V30);
+    }
+#endif
+
     *busyCandidates = RBM_NONE;
     regMaskTP    overallResult;
     unsigned int registersNeeded = refPosition->regCount;
@@ -1802,6 +1811,48 @@ int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwN
 
     return srcCount;
 }
+
+#ifdef DEBUG
+//------------------------------------------------------------------------
+// isLiveAtConsecutiveRegistersLoc: Check if the refPosition is live at the location
+//    where consecutive registers are needed. This is used during JitStressRegs so that we do
+//    not constrain the register requirements for such refPositions, because a lot of
+//    registers will be busy. For RefTypeUse, it just checks whether the nodeLocation matches
+//    the tracked `consecutiveRegistersLocation`. For RefTypeDef, it checks the underlying
+//    `GenTree*` to see if the tree that produced it had a consecutive registers requirement.
+//
+//
+// Arguments:
+//    consecutiveRegistersLocation - The most recent location where consecutive
+//       registers were needed.
+//
+// Returns:
+//    True if the refPosition is live at the same location that has the consecutive
+//    registers requirement.
+//
+bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation)
+{
+    if (needsConsecutive)
+    {
+        return true;
+    }
+
+    if (refType == RefTypeDef)
+    {
+        if (treeNode->OperIsHWIntrinsic())
+        {
+            const HWIntrinsic intrin(treeNode->AsHWIntrinsic());
+            return HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id);
+        }
+    }
+    else if ((refType == RefTypeUse) || (refType == RefTypeUpperVectorRestore))
+    {
+        return consecutiveRegistersLocation == nodeLocation;
+    }
+    return false;
+}
+#endif
+
 #endif
 #endif // TARGET_ARM64
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 9ff7f83b3ab6fa..78b06dae9d6468 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -1811,8 +1811,15 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc
             assert(newRefPosition->refType == RefTypeUpperVectorRestore);
             minRegCount++;
         }
-#endif
-#endif
+#endif // TARGET_ARM64
+#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE
+
+#ifdef TARGET_ARM64
+        if (newRefPosition->needsConsecutive)
+        {
+            consecutiveRegistersLocation = newRefPosition->nodeLocation;
+        }
+#endif // TARGET_ARM64
         if (newRefPosition->getInterval()->isSpecialPutArg)
         {
             minRegCount++;
@@ -1864,16 +1871,45 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc
                 Interval*       interval       = newRefPosition->getInterval();
                 regMaskTP       oldAssignment  = newRefPosition->registerAssignment;
                 regMaskTP       calleeSaveMask = calleeSaveRegs(interval->registerType);
-                newRefPosition->registerAssignment =
-                    getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef);
+#ifdef TARGET_ARM64
+                if (newRefPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation))
+                {
+                    // If we are assigning to refPositions that have consecutive register requirements,
+                    // skip the limit stress for them, because there is a high chance that many registers
+                    // are busy for the consecutive requirement and we do not have enough remaining for
+                    // other refpositions (like operands). Likewise, skip for the definition node that
+                    // comes after that, for which all the registers are in the "delayRegFree" state.
+                }
+                else
+#endif // TARGET_ARM64
+                {
+                    newRefPosition->registerAssignment =
+                        getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef);
+                }
                 if ((newRefPosition->registerAssignment != oldAssignment) && (newRefPosition->refType == RefTypeUse) &&
                     !interval->isLocalVar)
                 {
-                    checkConflictingDefUse(newRefPosition);
+#ifdef TARGET_ARM64
+                    RefPosition* defRefPos = interval->firstRefPosition;
+                    assert(defRefPos->treeNode != nullptr);
+                    if (defRefPos->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation))
+                    {
+                        // If a method has consecutive registers and we are assigning a use refPosition
+                        // whose definition was at a location that needs consecutive registers, skip the
+                        // limit stress for it, because there is a high chance that many registers are
+                        // busy for the consecutive requirement and are marked "delayRegFree", leaving
+                        // not enough remaining for other refpositions.
+                    }
+                    else
+#endif // TARGET_ARM64
+                    {
+                        checkConflictingDefUse(newRefPosition);
+                    }
                 }
             }
         }
+        consecutiveRegistersLocation = MinLocation;
     }
 #endif // DEBUG
 
     JITDUMP("\n");
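
For reference, here is a minimal standalone sketch (not part of the patch) of the free/busy bookkeeping that the new updateRegsFreeBusyState helper consolidates. RefPos, regMask_t, and the global masks below are simplified stand-ins for the JIT's RefPosition, regMaskTP, and LinearScan state; the branching mirrors the helper added in lsra.cpp above.

// Simplified model of LinearScan's free/busy tracking (illustrative types only).
#include <cstdint>
#include <cstdio>

typedef uint64_t regMask_t;

struct RefPos
{
    bool lastUse;      // this is the last use of the interval at this location
    bool delayRegFree; // the register must stay busy through the next location
};

static regMask_t regsInUseThisLocation = 0;
static regMask_t regsInUseNextLocation = 0;

static void updateRegsFreeBusyState(const RefPos& ref, regMask_t regsBusy,
                                    regMask_t* regsToFree, regMask_t* delayRegsToFree)
{
    regsInUseThisLocation |= regsBusy;
    if (ref.lastUse)
    {
        if (ref.delayRegFree)
        {
            // Freed only after the next location, so it is also busy there.
            *delayRegsToFree |= regsBusy;
            regsInUseNextLocation |= regsBusy;
        }
        else
        {
            *regsToFree |= regsBusy;
        }
    }
    else if (ref.delayRegFree)
    {
        regsInUseNextLocation |= regsBusy;
    }
}

int main()
{
    regMask_t regsToFree = 0, delayRegsToFree = 0;
    RefPos    ref{true, true}; // a last use with a delayed free
    updateRegsFreeBusyState(ref, 0x5, &regsToFree, &delayRegsToFree);
    printf("toFree=0x%llx delayed=0x%llx\n", (unsigned long long)regsToFree,
           (unsigned long long)delayRegsToFree); // prints toFree=0x0 delayed=0x5
    return 0;
}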