diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 8cd4ee9bf87b01..68d9c3baa163df 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -496,13 +496,6 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) { mask |= refPosition->registerAssignment; } - -#ifdef TARGET_ARM64 - if ((refPosition != nullptr) && refPosition->isFirstRefPositionOfConsecutiveRegisters()) - { - mask |= LsraLimitFPSetForConsecutive; - } -#endif } return mask; @@ -662,7 +655,9 @@ LinearScan::LinearScan(Compiler* theCompiler) firstColdLoc = MaxLocation; #ifdef DEBUG - maxNodeLocation = 0; + maxNodeLocation = 0; + consecutiveRegistersLocation = 0; + activeRefPosition = nullptr; currBuildNode = nullptr; @@ -4901,6 +4896,24 @@ void LinearScan::allocateRegisters() } } prevLocation = currentLocation; +#ifdef TARGET_ARM64 + +#ifdef DEBUG + if (hasConsecutiveRegister) + { + if (currentRefPosition.needsConsecutive) + { + // track all the refpositions around the location that is also + // allocating consecutive registers. + consecutiveRegistersLocation = currentLocation; + } + else if (consecutiveRegistersLocation < currentLocation) + { + consecutiveRegistersLocation = MinLocation; + } + } +#endif // DEBUG +#endif // TARGET_ARM64 // get previous refposition, then current refpos is the new previous if (currentReferent != nullptr) @@ -11683,49 +11696,53 @@ void LinearScan::RegisterSelection::try_SPILL_COST() Interval* assignedInterval = spillCandidateRegRecord->assignedInterval; RefPosition* recentRefPosition = assignedInterval != nullptr ? assignedInterval->recentRefPosition : nullptr; -// Can and should the interval in this register be spilled for this one, -// if we don't find a better alternative? + // Can and should the interval in this register be spilled for this one, + // if we don't find a better alternative? + weight_t currentSpillWeight = 0; #ifdef TARGET_ARM64 - if (assignedInterval == nullptr) - { - // Ideally we should not be seeing this candidate because it is not assigned to - // any interval. But based on that, we cannot determine if it is a good spill - // candidate or not. Skip processing it. - continue; - } - if ((recentRefPosition != nullptr) && linearScan->isRefPositionActive(recentRefPosition, thisLocation) && (recentRefPosition->needsConsecutive)) { continue; } -#endif // TARGET_ARM64 - - if ((linearScan->getNextIntervalRef(spillCandidateRegNum, regType) == thisLocation) && - !assignedInterval->getNextRefPosition()->RegOptional()) - { - continue; - } - if (!linearScan->isSpillCandidate(currentInterval, refPosition, spillCandidateRegRecord)) + else if (assignedInterval != nullptr) +#endif { - continue; - } + if ((linearScan->getNextIntervalRef(spillCandidateRegNum, regType) == thisLocation) && + !assignedInterval->getNextRefPosition()->RegOptional()) + { + continue; + } + if (!linearScan->isSpillCandidate(currentInterval, refPosition, spillCandidateRegRecord)) + { + continue; + } - weight_t currentSpillWeight = 0; - if ((recentRefPosition != nullptr) && - (recentRefPosition->RegOptional() && !(assignedInterval->isLocalVar && recentRefPosition->IsActualRef()))) - { - // We do not "spillAfter" if previous (recent) refPosition was regOptional or if it - // is not an actual ref. In those cases, we will reload in future (next) refPosition. - // For such cases, consider the spill cost of next refposition. - // See notes in "spillInterval()". - RefPosition* reloadRefPosition = assignedInterval->getNextRefPosition(); - if (reloadRefPosition != nullptr) + if ((recentRefPosition != nullptr) && (recentRefPosition->RegOptional() && + !(assignedInterval->isLocalVar && recentRefPosition->IsActualRef()))) { - currentSpillWeight = linearScan->getWeight(reloadRefPosition); + // We do not "spillAfter" if previous (recent) refPosition was regOptional or if it + // is not an actual ref. In those cases, we will reload in future (next) refPosition. + // For such cases, consider the spill cost of next refposition. + // See notes in "spillInterval()". + RefPosition* reloadRefPosition = assignedInterval->getNextRefPosition(); + if (reloadRefPosition != nullptr) + { + currentSpillWeight = linearScan->getWeight(reloadRefPosition); + } } } +#ifdef TARGET_ARM64 + else + { + // Ideally we should not be seeing this candidate because it is not assigned to + // any interval. But it is possible for certain scenarios. One of them is that + // `refPosition` needs consecutive registers and we decided to pick a mix of free+busy + // registers. This candidate is part of that set and is free and hence is not assigned + // to any interval. + } +#endif // TARGET_ARM64 // Only consider spillCost if we were not able to calculate weight of reloadRefPosition. if (currentSpillWeight == 0) @@ -11875,7 +11892,16 @@ void LinearScan::RegisterSelection::try_PREV_REG_OPT() #ifdef DEBUG // The assigned should be non-null, and should have a recentRefPosition, however since // this is a heuristic, we don't want a fatal error, so we just assert (not noway_assert). - if (!hasAssignedInterval) + if (!hasAssignedInterval +#ifdef TARGET_ARM64 + // We could see a candidate that doesn't have assignedInterval because allocation is + // happening for `refPosition` that needs consecutive registers and we decided to pick + // a mix of free+busy registers. This candidate is part of that set and is free and hence + // is not assigned to any interval. + + && !refPosition->needsConsecutive +#endif + ) { assert(!"Spill candidate has no assignedInterval recentRefPosition"); } @@ -11988,6 +12014,10 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, *registerScore = NONE; #endif +#ifdef TARGET_ARM64 + assert(!needsConsecutiveRegisters || refPosition->needsConsecutive); +#endif + reset(currentInterval, refPosition); // process data-structures @@ -12036,7 +12066,19 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } #ifdef DEBUG - candidates = linearScan->stressLimitRegs(refPosition, candidates); +#ifdef TARGET_ARM64 + if (!refPosition->needsConsecutive && (linearScan->consecutiveRegistersLocation == refPosition->nodeLocation)) + { + // If a method has consecutive registers and we are assigning to refPositions that are not part + // of consecutive registers, but are live at same location, skip the limit stress for them, because + // there are high chances that many registers are busy for consecutive requirements and we don't + // have enough remaining for other refpositions (like operands). + } + else +#endif + { + candidates = linearScan->stressLimitRegs(refPosition, candidates); + } #endif assert(candidates != RBM_NONE); @@ -12186,6 +12228,10 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } } +#ifdef DEBUG + regMaskTP inUseOrBusyRegsMask = RBM_NONE; +#endif + // Eliminate candidates that are in-use or busy. if (!found) { @@ -12195,6 +12241,10 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, regMaskTP busyRegs = linearScan->regsBusyUntilKill | linearScan->regsInUseThisLocation; candidates &= ~busyRegs; +#ifdef DEBUG + inUseOrBusyRegsMask |= busyRegs; +#endif + // Also eliminate as busy any register with a conflicting fixed reference at this or // the next location. // Note that this will eliminate the fixedReg, if any, but we'll add it back below. @@ -12210,6 +12260,9 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, (refPosition->delayRegFree && (checkConflictLocation == (refPosition->nodeLocation + 1)))) { candidates &= ~checkConflictBit; +#ifdef DEBUG + inUseOrBusyRegsMask |= checkConflictBit; +#endif } } candidates |= fixedRegMask; @@ -12226,12 +12279,10 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, prevRegBit = genRegMask(prevRegRec->regNum); if ((prevRegRec->assignedInterval == currentInterval) && ((candidates & prevRegBit) != RBM_NONE)) { -#ifdef TARGET_ARM64 - // If this is allocating for consecutive register, we need to make sure that - // we allocate register, whose consecutive registers are also free. if (!needsConsecutiveRegisters) -#endif { + // If this is allocating for consecutive register, we need to make sure that + // we allocate register, whose consecutive registers are also free. candidates = prevRegBit; found = true; #ifdef DEBUG @@ -12245,13 +12296,6 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, prevRegBit = RBM_NONE; } - if (!found && (candidates == RBM_NONE)) - { - assert(refPosition->RegOptional()); - currentInterval->assignedReg = nullptr; - return RBM_NONE; - } - // TODO-Cleanup: Previously, the "reverseSelect" stress mode reversed the order of the heuristics. // It needs to be re-engineered with this refactoring. // In non-debug builds, this will simply get optimized away @@ -12260,9 +12304,9 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, reverseSelect = linearScan->doReverseSelect(); #endif // DEBUG -#ifdef TARGET_ARM64 if (needsConsecutiveRegisters) { +#ifdef TARGET_ARM64 regMaskTP busyConsecutiveCandidates = RBM_NONE; if (refPosition->isFirstRefPositionOfConsecutiveRegisters()) { @@ -12287,12 +12331,46 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, if ((freeCandidates == RBM_NONE) && (candidates == RBM_NONE)) { - noway_assert(!"Not sufficient consecutive registers available."); +#ifdef DEBUG + // Need to make sure that candidates has N consecutive registers to assign + if (linearScan->getStressLimitRegs() != LSRA_LIMIT_NONE) + { + // If the refPosition needs consecutive registers, then we want to make sure that + // the candidates have atleast one range of N registers that are consecutive, where N + // is the number of consecutive registers needed. + // Remove the `inUseOrBusyRegsMask` from the original candidates list and find one + // such range that is consecutive. Next, append that range to the `candidates`. + // + regMaskTP limitCandidatesForConsecutive = refPosition->registerAssignment & ~inUseOrBusyRegsMask; + regMaskTP overallLimitCandidates; + regMaskTP limitConsecutiveResult = + linearScan->filterConsecutiveCandidates(limitCandidatesForConsecutive, refPosition->regCount, + &overallLimitCandidates); + assert(limitConsecutiveResult != RBM_NONE); + + unsigned startRegister = BitOperations::BitScanForward(limitConsecutiveResult); + + regMaskTP registersNeededMask = (1ULL << refPosition->regCount) - 1; + candidates |= (registersNeededMask << startRegister); + } + + if (candidates == RBM_NONE) +#endif // DEBUG + { + noway_assert(!"Not sufficient consecutive registers available."); + } } +#endif // TARGET_ARM64 } else -#endif // TARGET_ARM64 { + if (!found && (candidates == RBM_NONE)) + { + assert(refPosition->RegOptional()); + currentInterval->assignedReg = nullptr; + return RBM_NONE; + } + freeCandidates = linearScan->getFreeCandidates(candidates ARM_ARG(regType)); } diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index bf72219e755c2d..0aef405f35a9f9 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -792,9 +792,6 @@ class LinearScan : public LinearScanInterface #elif defined(TARGET_ARM64) static const regMaskTP LsraLimitSmallIntSet = (RBM_R0 | RBM_R1 | RBM_R2 | RBM_R19 | RBM_R20); static const regMaskTP LsraLimitSmallFPSet = (RBM_V0 | RBM_V1 | RBM_V2 | RBM_V8 | RBM_V9); - // LsraLimitFPSetForConsecutive is used for stress mode and gives few extra registers to satisfy - // the requirements for allocating consecutive registers. - static const regMaskTP LsraLimitFPSetForConsecutive = (RBM_V3 | RBM_V5 | RBM_V7); #elif defined(TARGET_X86) static const regMaskTP LsraLimitSmallIntSet = (RBM_EAX | RBM_ECX | RBM_EDI); static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); @@ -2006,9 +2003,13 @@ class LinearScan : public LinearScanInterface int BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount); #ifdef TARGET_ARM64 int BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwNode = nullptr); -#endif +#endif // TARGET_ARM64 #endif // FEATURE_HW_INTRINSICS +#ifdef DEBUG + LsraLocation consecutiveRegistersLocation; +#endif // DEBUG + int BuildPutArgStk(GenTreePutArgStk* argNode); #if FEATURE_ARG_SPLIT int BuildPutArgSplit(GenTreePutArgSplit* tree); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 72047f67a56b35..5e0385b7fc1b20 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -196,18 +196,13 @@ regMaskTP LinearScan::filterConsecutiveCandidates(regMaskTP candidates, consecutiveResult |= availableRegistersMask & (selectionEndMask & ~selectionStartMask); \ overallResult |= availableRegistersMask; - DWORD regAvailableStartIndex = 0, regAvailableEndIndex = 0; - - // If we don't find consecutive registers, also track which registers we can pick so - // as to reduce the number of registers we will have to spill, to accomodate the - // request of the consecutive registers. - regMaskTP registersNeededMask = (1ULL << registersNeeded) - 1; + unsigned regAvailableStartIndex = 0, regAvailableEndIndex = 0; do { // From LSB, find the first available register (bit `1`) - BitScanForward64(®AvailableStartIndex, static_cast(currAvailableRegs)); - regMaskTP startMask = (1ULL << regAvailableStartIndex) - 1; + regAvailableStartIndex = BitOperations::BitScanForward(static_cast(currAvailableRegs)); + regMaskTP startMask = (1ULL << regAvailableStartIndex) - 1; // Mask all the bits that are processed from LSB thru regAvailableStart until the last `1`. regMaskTP maskProcessed = ~(currAvailableRegs | startMask); @@ -224,7 +219,7 @@ regMaskTP LinearScan::filterConsecutiveCandidates(regMaskTP candidates, } else { - BitScanForward64(®AvailableEndIndex, static_cast(maskProcessed)); + regAvailableEndIndex = BitOperations::BitScanForward(static_cast(maskProcessed)); } regMaskTP endMask = (1ULL << regAvailableEndIndex) - 1; @@ -330,13 +325,13 @@ regMaskTP LinearScan::filterConsecutiveCandidatesForSpill(regMaskTP consecutiveC assert((registersNeeded >= 2) && (registersNeeded <= 4)); regMaskTP consecutiveResultForBusy = RBM_NONE; regMaskTP unprocessedRegs = consecutiveCandidates; - DWORD regAvailableStartIndex = 0, regAvailableEndIndex = 0; + unsigned regAvailableStartIndex = 0, regAvailableEndIndex = 0; int maxSpillRegs = registersNeeded; regMaskTP registersNeededMask = (1ULL << registersNeeded) - 1; do { // From LSB, find the first available register (bit `1`) - BitScanForward64(®AvailableStartIndex, static_cast(unprocessedRegs)); + regAvailableStartIndex = BitOperations::BitScanForward(static_cast(unprocessedRegs)); // For the current range, find how many registers are free vs. busy regMaskTP maskForCurRange = RBM_NONE; @@ -421,77 +416,77 @@ regMaskTP LinearScan::getConsecutiveCandidates(regMaskTP allCandidates, assert(compiler->info.compNeedsConsecutiveRegisters); assert(refPosition->isFirstRefPositionOfConsecutiveRegisters()); regMaskTP freeCandidates = allCandidates & m_AvailableRegs; - if (freeCandidates == RBM_NONE) - { - return freeCandidates; - } *busyCandidates = RBM_NONE; regMaskTP overallResult; unsigned int registersNeeded = refPosition->regCount; - regMaskTP consecutiveResultForFree = filterConsecutiveCandidates(freeCandidates, registersNeeded, &overallResult); - if (consecutiveResultForFree != RBM_NONE) + if (freeCandidates != RBM_NONE) { - // One last time, check if subsequent RefPositions (all RefPositions except the first for which - // we assigned above) already have consecutive registers assigned. If yes, and if one of the - // register out of the `consecutiveResult` is available for the first RefPosition, then just use - // that. This will avoid unnecessary copies. + regMaskTP consecutiveResultForFree = + filterConsecutiveCandidates(freeCandidates, registersNeeded, &overallResult); - regNumber firstRegNum = REG_NA; - regNumber prevRegNum = REG_NA; - int foundCount = 0; - regMaskTP foundRegMask = RBM_NONE; + if (consecutiveResultForFree != RBM_NONE) + { + // One last time, check if subsequent RefPositions (all RefPositions except the first for which + // we assigned above) already have consecutive registers assigned. If yes, and if one of the + // register out of the `consecutiveResult` is available for the first RefPosition, then just use + // that. This will avoid unnecessary copies. - RefPosition* consecutiveRefPosition = getNextConsecutiveRefPosition(refPosition); - assert(consecutiveRefPosition != nullptr); + regNumber firstRegNum = REG_NA; + regNumber prevRegNum = REG_NA; + int foundCount = 0; + regMaskTP foundRegMask = RBM_NONE; - for (unsigned int i = 1; i < registersNeeded; i++) - { - Interval* interval = consecutiveRefPosition->getInterval(); - consecutiveRefPosition = getNextConsecutiveRefPosition(consecutiveRefPosition); + RefPosition* consecutiveRefPosition = getNextConsecutiveRefPosition(refPosition); + assert(consecutiveRefPosition != nullptr); - if (!interval->isActive) + for (unsigned int i = 1; i < registersNeeded; i++) { + Interval* interval = consecutiveRefPosition->getInterval(); + consecutiveRefPosition = getNextConsecutiveRefPosition(consecutiveRefPosition); + + if (!interval->isActive) + { + foundRegMask = RBM_NONE; + foundCount = 0; + continue; + } + + regNumber currRegNum = interval->assignedReg->regNum; + if ((prevRegNum == REG_NA) || (prevRegNum == REG_PREV(currRegNum)) || + ((prevRegNum == REG_FP_LAST) && (currRegNum == REG_FP_FIRST))) + { + foundRegMask |= genRegMask(currRegNum); + if (prevRegNum == REG_NA) + { + firstRegNum = currRegNum; + } + prevRegNum = currRegNum; + foundCount++; + continue; + } + foundRegMask = RBM_NONE; foundCount = 0; - continue; + break; } - regNumber currRegNum = interval->assignedReg->regNum; - if ((prevRegNum == REG_NA) || (prevRegNum == REG_PREV(currRegNum)) || - ((prevRegNum == REG_FP_LAST) && (currRegNum == REG_FP_FIRST))) + if (foundCount != 0) { - foundRegMask |= genRegMask(currRegNum); - if (prevRegNum == REG_NA) + assert(firstRegNum != REG_NA); + regMaskTP remainingRegsMask = ((1ULL << (registersNeeded - foundCount)) - 1) << (firstRegNum - 1); + + if ((overallResult & remainingRegsMask) != RBM_NONE) { - firstRegNum = currRegNum; + // If remaining registers are available, then just set the firstRegister mask + consecutiveResultForFree = 1ULL << (firstRegNum - 1); } - prevRegNum = currRegNum; - foundCount++; - continue; } - foundRegMask = RBM_NONE; - foundCount = 0; - break; - } - - if (foundCount != 0) - { - assert(firstRegNum != REG_NA); - regMaskTP remainingRegsMask = ((1ULL << (registersNeeded - foundCount)) - 1) << (firstRegNum - 1); - - if ((overallResult & remainingRegsMask) != RBM_NONE) - { - // If remaining registers are available, then just set the firstRegister mask - consecutiveResultForFree = 1ULL << (firstRegNum - 1); - } + return consecutiveResultForFree; } - - return consecutiveResultForFree; } - // There are registers available but they are not consecutive. // Here are some options to address them: // diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 7ed53b7920a7d5..9ff7f83b3ab6fa 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1805,6 +1805,13 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc { minRegCount++; } +#ifdef TARGET_ARM64 + else if (newRefPosition->needsConsecutive) + { + assert(newRefPosition->refType == RefTypeUpperVectorRestore); + minRegCount++; + } +#endif #endif if (newRefPosition->getInterval()->isSpecialPutArg) { @@ -1859,12 +1866,7 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc regMaskTP calleeSaveMask = calleeSaveRegs(interval->registerType); newRefPosition->registerAssignment = getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef); -#ifdef TARGET_ARM64 - if (newRefPosition->isFirstRefPositionOfConsecutiveRegisters()) - { - newRefPosition->registerAssignment |= LsraLimitFPSetForConsecutive; - } -#endif + if ((newRefPosition->registerAssignment != oldAssignment) && (newRefPosition->refType == RefTypeUse) && !interval->isLocalVar) {