-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Arm64: Implement VectorTableLookup/VectorTableLookupExtension intrinsic + Consecutive registers support #80297
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
c92fbdf
bee0f8c
426f68a
83eb54c
d06c4a2
99adc17
7b10969
87f66f7
98e7bd2
786e350
e0a82b3
47848b0
25a738d
e492829
385abf1
2f4a4e3
8d3744b
8d66d45
85d90f5
0a0faed
036a273
791563a
db6036b
fdc94ed
cf84fda
06a78d4
2afe249
a086ab7
450a08d
5bb9302
8027c5a
6b1ba8a
723477b
c6e77e4
5696a6e
a9e1a7a
2617b77
2d4fd5c
6a21205
c6d338f
db4f846
bce8c5a
3d15fcb
75f142b
e4cbad9
96de024
11b345a
4526b41
46f0abd
6ef7c68
b0b6a5e
0197b73
2734023
1cb22d0
6e30b3a
721823b
5b9fac5
cb29aee
53b07b5
302d3ba
5e828f1
fc93cc2
78e87cd
60d383e
05f9fc6
b52059e
0721ad4
8a5c696
e64527b
22270c5
f3884fd
f2a1f19
985fe25
ab043fd
13601eb
7bf9105
40aa7c7
1f95637
e7bb069
6ebb12a
2d75291
68cd4d7
7b83053
903c3de
a8ec819
961e9c2
b9d0f15
cbe999f
6665536
2b9f49e
5fec6e1
5371c30
4875925
597e6de
4a1171d
1f124a4
9686773
55071f6
757c682
79e0bd5
a44cf60
5fefae6
2a5e52c
a17b44f
3c390d8
a9995e6
ae2e633
02f8ad2
984c6ee
8fe130a
7f8e77f
090bf26
c91bc77
1591deb
0c4d71f
0c56514
35a7550
ff587ac
dab2121
e94cfcf
ab007d0
5d6cc2d
24e6158
4026aa6
53c91f0
7d168b2
7cffe7a
dd10bbe
b4ea77e
0dc4ea6
6d9e136
f247b3c
e8d3ee5
524d983
289110d
d778833
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
… less spilling
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -168,15 +168,17 @@ regMaskTP LinearScan::getFreeCandidates(regMaskTP candidates, RefPosition* refPo | |
| return result; | ||
| } | ||
|
|
||
| unsigned int registersNeeded = refPosition->regCount; | ||
| regMaskTP currAvailableRegs = result; | ||
| if (BitOperations::PopCount(currAvailableRegs) < registersNeeded) | ||
| { | ||
| // If the number of free registers is less than what we need, there is no point in scanning | ||
| // for them. | ||
| return RBM_NONE; | ||
| } | ||
| unsigned int registersNeeded = refPosition->regCount; | ||
|
|
||
|
|
||
| regMaskTP currAvailableRegs = result; | ||
| regMaskTP overallResult = RBM_NONE; | ||
| regMaskTP consecutiveResult = RBM_NONE; | ||
| regMaskTP consecutiveResultForBusy = RBM_NONE; | ||
| regMaskTP busyRegsInThisLocation = regsBusyUntilKill | regsInUseThisLocation; | ||
|
|
||
| if (BitOperations::PopCount(currAvailableRegs) >= registersNeeded) | ||
| { | ||
| // At this point, for 'n' registers requirement, if Rm+1, Rm+2, Rm+3, ..., Rm+k are | ||
| // available, create the mask only for Rm+1, Rm+2, ..., Rm+(k-n+1) to convey that it | ||
| // is safe to assign any of those registers, but not beyond that. | ||
|
|
@@ -186,44 +188,111 @@ regMaskTP LinearScan::getFreeCandidates(regMaskTP candidates, RefPosition* refPo | |
| consecutiveResult |= availableRegistersMask & (selectionEndMask & ~selectionStartMask); \ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't it the case here that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's right, and we want to just extract the 1st of those bits to be set in |
||
| overallResult |= availableRegistersMask; | ||
|
|
||
| regMaskTP overallResult = RBM_NONE; | ||
| regMaskTP consecutiveResult = RBM_NONE; | ||
| DWORD regAvailableStartIndex = 0, regAvailableEndIndex = 0; | ||
| do | ||
| { | ||
| // From LSB, find the first available register (bit `1`) | ||
| BitScanForward64(&regAvailableStartIndex, static_cast<DWORD64>(currAvailableRegs)); | ||
| regMaskTP startMask = (1ULL << regAvailableStartIndex) - 1; | ||
|
|
||
| // Mask all the bits that are processed from LSB thru regAvailableStart until the last `1`. | ||
| regMaskTP maskProcessed = ~(currAvailableRegs | startMask); | ||
| DWORD regAvailableStartIndex = 0, regAvailableEndIndex = 0; | ||
|
|
||
| // If we don't find consecutive registers, also track which registers we can pick so | ||
| // as to reduce the number of registers we will have to spill, to accommodate the | ||
| // request of the consecutive registers. | ||
| bool trackForBusyCandidates = true; | ||
| int maxSpillRegs = registersNeeded; | ||
| regMaskTP registersNeededMask = (1ULL << registersNeeded) - 1; | ||
|
|
||
| // From regAvailableStart, find the first unavailable register (bit `0`). | ||
| if (maskProcessed == 0) | ||
| do | ||
| { | ||
| regAvailableEndIndex = 64; | ||
| // From LSB, find the first available register (bit `1`) | ||
| BitScanForward64(&regAvailableStartIndex, static_cast<DWORD64>(currAvailableRegs)); | ||
| regMaskTP startMask = (1ULL << regAvailableStartIndex) - 1; | ||
|
|
||
| // Mask all the bits that are processed from LSB thru regAvailableStart until the last `1`. | ||
| regMaskTP maskProcessed = ~(currAvailableRegs | startMask); | ||
|
|
||
| // From regAvailableStart, find the first unavailable register (bit `0`). | ||
| if (maskProcessed == 0) | ||
| { | ||
| regAvailableEndIndex = 64; | ||
| if ((regAvailableEndIndex - regAvailableStartIndex) >= registersNeeded) | ||
| { | ||
| AppendConsecutiveMask(regAvailableStartIndex, regAvailableEndIndex, currAvailableRegs); | ||
| trackForBusyCandidates = false; | ||
| consecutiveResultForBusy = RBM_NONE; | ||
| } | ||
| else | ||
| { | ||
| // We reached a set of registers where there are not enough consecutive registers. | ||
| // Move a registersNeeded size window for all the available registers and track for which | ||
| // one we can spill least number of registers. | ||
|
|
||
| for (DWORD i = regAvailableStartIndex; i < regAvailableEndIndex; i++) | ||
| { | ||
| regMaskTP maskForCurRange = registersNeededMask << i; | ||
| if ((maskForCurRange & busyRegsInThisLocation) != RBM_NONE) | ||
| { | ||
| // If the range between i and (i + registersNeeded) contains one or more | ||
| // registers that are busy, then we cannot use that entire range. | ||
| continue; | ||
| } | ||
| int curSpillRegs = registersNeeded - BitOperations::PopCount(maskForCurRange) + 1; | ||
|
|
||
| if (curSpillRegs < maxSpillRegs) | ||
| { | ||
| // We found a series that will need fewer registers to be spilled. | ||
| // Reset whatever we found so far and start accumulating the result again. | ||
| consecutiveResultForBusy = RBM_NONE; | ||
| maxSpillRegs = curSpillRegs; | ||
| } | ||
|
|
||
| consecutiveResultForBusy |= 1ULL << i; | ||
| } | ||
| } | ||
| break; | ||
| } | ||
| else | ||
| { | ||
| BitScanForward64(&regAvailableEndIndex, static_cast<DWORD64>(maskProcessed)); | ||
| } | ||
| regMaskTP endMask = (1ULL << regAvailableEndIndex) - 1; | ||
|
|
||
| // Anything between regAvailableStart and regAvailableEnd is the range of consecutive registers available | ||
| // If they are equal to or greater than our register requirements, then add all of them to the result. | ||
| if ((regAvailableEndIndex - regAvailableStartIndex) >= registersNeeded) | ||
| { | ||
| AppendConsecutiveMask(regAvailableStartIndex, regAvailableEndIndex, currAvailableRegs); | ||
| AppendConsecutiveMask(regAvailableStartIndex, regAvailableEndIndex, (endMask & ~startMask)); | ||
| trackForBusyCandidates = false; | ||
| consecutiveResultForBusy = RBM_NONE; | ||
| } | ||
| break; | ||
| } | ||
| else | ||
| { | ||
| BitScanForward64(&regAvailableEndIndex, static_cast<DWORD64>(maskProcessed)); | ||
| } | ||
| regMaskTP endMask = (1ULL << regAvailableEndIndex) - 1; | ||
| else if (trackForBusyCandidates) | ||
| { | ||
| // We reached a set of registers where there are not enough consecutive registers. | ||
| // Move a registersNeeded size window for all the available registers and track for which | ||
| // one we can spill least number of registers. | ||
|
|
||
| // Anything between regAvailableStart and regAvailableEnd is the range of consecutive registers available | ||
| // If they are equal to or greater than our register requirements, then add all of them to the result. | ||
| if ((regAvailableEndIndex - regAvailableStartIndex) >= registersNeeded) | ||
| { | ||
| AppendConsecutiveMask(regAvailableStartIndex, regAvailableEndIndex, (endMask & ~startMask)); | ||
| } | ||
| currAvailableRegs &= ~endMask; | ||
| } while (currAvailableRegs != RBM_NONE); | ||
| for (DWORD i = regAvailableStartIndex; i < regAvailableEndIndex; i++) | ||
| { | ||
| regMaskTP maskForCurRange = registersNeededMask << i; | ||
| if ((maskForCurRange & busyRegsInThisLocation) != RBM_NONE) | ||
| { | ||
| // If the range between i and (i + registersNeeded) contains one or more | ||
| // registers that are busy, then we cannot use that entire range. | ||
| continue; | ||
| } | ||
| int curSpillRegs = registersNeeded - BitOperations::PopCount(maskForCurRange) + 1; | ||
| if (curSpillRegs < maxSpillRegs) | ||
| { | ||
| // We found a series that will need fewer registers to be spilled. | ||
| // Reset whatever we found so far and start accumulating the result again. | ||
| consecutiveResultForBusy = RBM_NONE; | ||
| maxSpillRegs = curSpillRegs; | ||
| } | ||
|
|
||
| consecutiveResultForBusy |= 1ULL << i; | ||
| } | ||
| } | ||
| currAvailableRegs &= ~endMask; | ||
| } while (currAvailableRegs != RBM_NONE); | ||
| } | ||
|
|
||
| if (compiler->opts.OptimizationEnabled() && (overallResult != RBM_NONE)) | ||
| if (overallResult != RBM_NONE) | ||
| { | ||
| // One last time, check if subsequent refpositions (all refpositions except the first for which | ||
| // we assigned above) already have consecutive registers assigned. If yes, and if one of the | ||
|
|
@@ -281,6 +350,39 @@ regMaskTP LinearScan::getFreeCandidates(regMaskTP candidates, RefPosition* refPo | |
| } | ||
| } | ||
| } | ||
| else | ||
| { | ||
| // There are enough registers available but they are not consecutive. | ||
| // Here are some options to address them: | ||
| // | ||
| // 1. Scan once again the available registers and find a set which has the maximum number of registers available. | ||
| // In other words, try to find register sequence that needs fewer registers to be spilled. This | ||
| // will give optimal CQ. | ||
| // | ||
| // 2. Check if some of the refpositions in the series are already in *somewhat* consecutive registers | ||
| // and if yes, assign that register sequence. That way, we will avoid copying values of | ||
| // refpositions that are already positioned in the desired registers. Checking this is beneficial | ||
| // only if it can happen frequently. So for RefPositions <RP# 5, RP# 6, RP# 7, RP# 8>, it should | ||
| // be that, RP# 6 is already in V14 and RP# 7 is already in V16. But this can be rare (not tested). | ||
| // In future, if we see such cases being hit, we could use this heuristics. | ||
| // | ||
| // 3. Give one of the free registers to the first position and the algorithm will | ||
| // give the subsequent consecutive registers (free or busy) to the remaining refpositions | ||
| // of the series. This may not give optimal CQ however. | ||
| // | ||
| // 4. Return the set of available registers and let selection heuristics pick one of them to get | ||
| // assigned to the first refposition. Remaining refpositions will be assigned to the subsequent | ||
| // registers (if busy, they will be spilled), similar to #3 above and will not give optimal CQ. | ||
| // | ||
| // | ||
| // Among `consecutiveResultForBusy`, we could shortlist the registers that are beneficial from "busy register | ||
| // selection" heuristics perspective. However, we would need to add logic of try_SPILL_COST(), try_FAR_NEXT_REF(), | ||
| // etc. here which would complicate things. Instead, we just go with option# 1 and select registers based on the smallest | ||
| // number of registers that have to be spilled. | ||
| // | ||
|
|
||
| consecutiveResult = consecutiveResultForBusy; | ||
| } | ||
|
|
||
| return consecutiveResult; | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.