diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext-opt.png b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext-opt.png
new file mode 100644
index 00000000000000..c795c7d067b674
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext-opt.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext.png b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext.png
new file mode 100644
index 00000000000000..780227d2a3f483
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-ilsize-histogram1.png b/docs/design/features/DynamicPgo-InstrumentedTiers-ilsize-histogram1.png
new file mode 100644
index 00000000000000..9eb74ee70a2414
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-ilsize-histogram1.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-msft-service.png b/docs/design/features/DynamicPgo-InstrumentedTiers-msft-service.png
new file mode 100644
index 00000000000000..be6e94e8d826f4
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-msft-service.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers.md b/docs/design/features/DynamicPgo-InstrumentedTiers.md
new file mode 100644
index 00000000000000..b7786f37d6575a
--- /dev/null
+++ b/docs/design/features/DynamicPgo-InstrumentedTiers.md
@@ -0,0 +1,663 @@
+# Instrumented Tiers
+
+[#70941](https://github.com/dotnet/runtime/pull/70941) introduced separate tiers that focus instrumentation on hot code only. This addresses the following problems:
+1) R2R code should still benefit from Dynamic PGO despite not being instrumented in the first place
+2) Instrumentation overhead in Tier0 should not slow down startup
+
+To address these problems the following workflow was introduced:
+
+```mermaid
+flowchart
+ prestub(.NET Function) -->|Compilation| hasAO{"Marked with
[AggressiveOpts]?"}
+ hasAO-->|Yes|tier1ao["JIT to Tier1
(no dynamic profile data)"]
+ hasAO-->|No|hasR2R
+ hasR2R{"Is prejitted (R2R)?"} -->|No| tier000
+
+ tier000["JIT to Tier0
(not optimized, not instrumented,
with patchpoints)"]-->|Running...|ishot555
+ ishot555{"Is hot?
(called >30 times)"}
+ ishot555-.->|No,
keep running...|ishot555
+ ishot555-->|Yes|tier0
+
+ hasR2R -->|Yes| R2R
+ R2R["Use R2R code
(optimized, not instrumented,
no patchpoints)"] -->|Running...|ishot1
+ ishot1{"Is hot?
(called >30 times)"}-.->|No,
keep running...|ishot1
+ ishot1--->|"Yes"|tier1inst
+
+ tier0["JIT to Tier0Instrumented
(not optimized, instrumented,
with patchpoints)"]-->|Running...|ishot5
+ tier1pgo2["JIT to Tier1
(optimized with profile data)"]
+
+ tier1inst["JIT to Tier1Instrumented
(optimized, instrumented,
no patchpoints)"]
+ tier1inst-->|Running...|ishot5
+ ishot5{"Is hot?
(called >30 times)"}-->|Yes|tier1pgo2
+ ishot5-.->|No,
keep running...|ishot5
+```
+(_VS Code doesn't render mermaid diagrams out of the box; consider installing an extension_)
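+
+The `[AggressiveOpts]` branch above refers to methods marked with `MethodImplOptions.AggressiveOptimization`: such methods bypass tiering entirely, so they are never instrumented and never receive a dynamic profile. A minimal example:
+
+```csharp
+using System.Runtime.CompilerServices;
+
+class Hashing
+{
+    // Jitted straight to optimized code on first call: no Tier0, no call
+    // counting, no instrumentation - and therefore no Dynamic PGO data.
+    [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+    public static int Combine(int hash, int value) => (hash * 31) ^ value;
+}
+```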
+
+Now, any code is eligible for Dynamic PGO if it's hot enough. It's easiest to explain with a concrete example:
+
+```csharp
+class Program : IDisposable
+{
+ static int Main()
+ {
+ Program p = new();
+ for (int i = 0; i < 500; i++)
+ {
+ HotLoop(p);
+ Thread.Sleep(40); // cold loop
+ }
+
+ Console.ReadKey();
+ return 100;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void HotLoop(IDisposable d)
+ {
+ for (int i = 0; i < 500000; i++) // hot loop
+ d?.Dispose();
+ }
+
+ public void Dispose() => Test();
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void Test() { }
+}
+```
+
+The method we'll be looking at is `HotLoop`. The method itself contains a hot loop (to show how this feature interacts with OSR), but the whole method is also expected to be promoted to Tier1, since it too is invoked in a loop (the cold loop). The method also makes a virtual call to showcase GDV (guarded devirtualization).
+
+# Case 1: `HotLoop` is prejitted (R2R)
+
+Let's see what happens when the method we're inspecting has an AOT version on start:
+
+1) When we start the app, the VM picks up the R2R'd version of `HotLoop`, which looks like this:
+
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; ReadyToRun compilation
+; optimized code
+; No PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ 33FF xor edi, edi
+ ;; size=2 bbWeight=1 PerfScore 0.25
+G_M43040_IG03: ;; offset=000BH
+ 4885F6 test rsi, rsi
+ 740D je SHORT G_M43040_IG05
+ ;; size=5 bbWeight=4 PerfScore 5.00
+G_M43040_IG04: ;; offset=0010H
+ 488BCE mov rcx, rsi
+ 4C8D1D00000000 lea r11, [(reloc 0x4000000000420270)]
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=13 bbWeight=2 PerfScore 7.50
+G_M43040_IG05: ;; offset=001DH
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CE4 jl SHORT G_M43040_IG03
+ ;; size=10 bbWeight=4 PerfScore 6.00
+G_M43040_IG06: ;; offset=0027H
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=1 PerfScore 2.25
+; Total bytes of code 46
+```
+
+As we can see from the codegen: it's optimized, it's not instrumented (we never instrument R2R'd code - it would increase the binary size by quite a lot), and it has no patchpoints for OSR (since it's already optimized). Technically, it could have been optimized with Static PGO, but that is presumably rare in the real world due to its complexity, so the virtual call here is left non-devirtualized.
+
+2) `HotLoop` is invoked >30 times, meaning it's likely a hot method, so the VM "promotes" it to Tier1Instrumented:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; optimized code
+; instrumented for collecting profile data
+; No PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ FF05F9FE5500 inc dword ptr [(reloc 0x7ffd5edb4948)]
+ 33FF xor edi, edi
+ EB3B jmp SHORT G_M43040_IG05
+ ;; size=10 bbWeight=1 PerfScore 5.25
+G_M43040_IG03: ;; offset=0013H
+ FF05F3FE5500 inc dword ptr [(reloc 0x7ffd5edb494c)]
+ 4885F6 test rsi, rsi
+ 7428 je SHORT G_M43040_IG04
+ FF05ECFE5500 inc dword ptr [(reloc 0x7ffd5edb4950)]
+ 488BCE mov rcx, rsi
+ 48BA5849DB5EFD7F0000 mov rdx, 0x7FFD5EDB4958
+ E81ACB105F call CORINFO_HELP_CLASSPROFILE32
+ 488BCE mov rcx, rsi
+ 49BB5000595EFD7F0000 mov r11, 0x7FFD5E590050 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=51 bbWeight=2 PerfScore 24.50
+G_M43040_IG04: ;; offset=0046H
+ FF0514FF5500 inc dword ptr [(reloc 0x7ffd5edb49a0)]
+ FFC7 inc edi
+ ;; size=8 bbWeight=2 PerfScore 6.50
+G_M43040_IG05: ;; offset=004EH
+ FF0510FF5500 inc dword ptr [(reloc 0x7ffd5edb49a4)]
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CB7 jl SHORT G_M43040_IG03
+ ;; size=14 bbWeight=8 PerfScore 34.00
+G_M43040_IG06: ;; offset=005CH
+ FF0506FF5500 inc dword ptr [(reloc 0x7ffd5edb49a8)]
+ ;; size=6 bbWeight=1 PerfScore 3.00
+G_M43040_IG07: ;; offset=0062H
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=1 PerfScore 2.25
+; Total bytes of code 105
+```
+
+We had to instrument **optimized** code here to mitigate two issues:
+1) We don't want to see a significant (even if temporary) performance degradation after fast R2R code
+2) Unoptimized code tends to spawn a lot of unnecessary new jit compilations because it doesn't inline anything, even simple properties
+
+The downside is that the profile is less accurate and inlinees are not instrumented.
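+
+The listing above contains the two kinds of probes the JIT emits: plain block counters (the `inc dword ptr [(reloc ...)]` instructions) and a class profile probe (`CORINFO_HELP_CLASSPROFILE32`) in front of the virtual call. Conceptually, the class probe behaves like the following sketch (the type names and fixed-size table are illustrative; the real helper records class handles into a per-call-site table that the JIT reads back when building the Tier1 version):
+
+```csharp
+using System;
+using System.Linq;
+
+// Illustrative model of a class-profile probe at one virtual call site.
+sealed class ClassProfile
+{
+    private readonly Type?[] _table = new Type?[8]; // small sample table
+    private int _index;
+
+    // Called on every execution of the probed call site.
+    public void Record(object receiver)
+    {
+        // Cheap, racy sampling is acceptable for profiling purposes.
+        _table[_index] = receiver.GetType();
+        _index = (_index + 1) % _table.Length;
+    }
+
+    // At Tier1 the JIT picks the dominant type (if any) and guards on it (GDV).
+    public Type? DominantType() => _table
+        .Where(t => t != null)
+        .GroupBy(t => t)
+        .OrderByDescending(g => g.Count())
+        .Select(g => g.Key)
+        .FirstOrDefault();
+}
+```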
+
+3) The new code version of `HotLoop` is also invoked >30 times, leading to the final promotion to Tier1:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; optimized code
+; optimized using profile data
+; with Dynamic PGO: edge weights are invalid, and fgCalledCount is 48
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ 33FF xor edi, edi
+ 4885F6 test rsi, rsi
+ 7424 je SHORT G_M43040_IG05
+ 48B9A023C861FD7F0000 mov rcx, 0x7FFD61C823A0 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 7515 jne SHORT G_M43040_IG05
+ ;; size=22 bbWeight=1 PerfScore 5.75
+G_M43040_IG03: ;; offset=001FH
+ 488BCE mov rcx, rsi
+ FF1550771B00 call [Program:Test():this]
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CED jl SHORT G_M43040_IG03
+ ;; size=19 bbWeight=484693.69 PerfScore 2302295.02
+G_M43040_IG04: ;; offset=0032H
+ EB27 jmp SHORT G_M43040_IG07
+ ;; size=2 bbWeight=1 PerfScore 2.00
+G_M43040_IG05: ;; offset=0034H
+ 4885F6 test rsi, rsi
+ 7418 je SHORT G_M43040_IG06
+ 48B9A023C861FD7F0000 mov rcx, 0x7FFD61C823A0 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 751A jne SHORT G_M43040_IG08
+ 488BCE mov rcx, rsi
+ FF1527771B00 call [Program:Test():this]
+ ;; size=29 bbWeight=4895.90 PerfScore 42839.09
+G_M43040_IG06: ;; offset=0051H
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CD9 jl SHORT G_M43040_IG05
+ ;; size=10 bbWeight=4895.90 PerfScore 7343.84
+G_M43040_IG07: ;; offset=005BH
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=0.98 PerfScore 2.20
+G_M43040_IG08: ;; offset=0062H
+ 488BCE mov rcx, rsi
+ 49BB10007E61FD7F0000 mov r11, 0x7FFD617E0010 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ EBDD jmp SHORT G_M43040_IG06
+; Total bytes of code 116
+```
+The codegen looks a bit bulky, but if we look closer we'll see that the loop was cloned to get a fast version with a devirtualized call inside (see `G_M43040_IG03`) and with the type guard hoisted out of the loop; a C# sketch of the equivalent logic follows the diagram below. To summarize what happened with `HotLoop`, we can take a look at this part of the diagram:
+```mermaid
+flowchart
+ hasR2R("...") -->|Yes| R2R
+ R2R["Use R2R code
(optimized, not instrumented,
no patchpoints)"] -->|Running...|ishot1
+ ishot1{"Is hot?
(called >30 times)"}-.->|No,
keep running...|ishot1
+ ishot1--->|"Yes"|tier1inst
+ tier1pgo2["JIT to Tier1
(optimized with profile data)"]
+ tier1inst["JIT to Tier1Instrumented
(optimized, instrumented,
no patchpoints)"]
+ tier1inst-->|Running...|ishot5
+ ishot5{"Is hot?
(called >30 times)"}-->|Yes|tier1pgo2
+ ishot5-.->|No,
keep running...|ishot5
+```
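+
+For intuition, the cloned code behaves roughly like the C# below (a sketch of the shape, not the JIT's exact output; it reuses the `Program` type from the sample at the top of this document):
+
+```csharp
+static void HotLoop_Tier1(IDisposable d)
+{
+    if (d != null && d.GetType() == typeof(Program))
+    {
+        Program p = (Program)d;
+        // Fast cloned loop: the guard was checked once, outside the loop,
+        // so each iteration makes a direct call (further inlined into Test()).
+        for (int i = 0; i < 500000; i++)
+            p.Dispose();
+    }
+    else
+    {
+        // Fallback loop: the guard is re-checked on every iteration.
+        for (int i = 0; i < 500000; i++)
+        {
+            if (d != null && d.GetType() == typeof(Program))
+                ((Program)d).Dispose();
+            else
+                d?.Dispose();
+        }
+    }
+}
+```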
+
+
+# Case 2: `HotLoop` is not initially prejitted
+
+This case is a bit more complicated since it involves OSR.
+
+1) Since no R2R version of `HotLoop` exists, the VM has to ask the JIT to compile a Tier0 version of it as fast as it can:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-0 compilation
+; MinOpts code
+G_M43040_IG01: ;; offset=0000H
+ 55 push rbp
+ 4883EC70 sub rsp, 112
+ 488D6C2470 lea rbp, [rsp+70H]
+ 33C0 xor eax, eax
+ 8945C4 mov dword ptr [rbp-3CH], eax
+ 48894D10 mov gword ptr [rbp+10H], rcx
+ ;; size=19 bbWeight=1 PerfScore 4.00
+G_M43040_IG02: ;; offset=0013H
+ 33C9 xor ecx, ecx
+ 894DC4 mov dword ptr [rbp-3CH], ecx
+ C745B8E8030000 mov dword ptr [rbp-48H], 0x3E8
+ EB20 jmp SHORT G_M43040_IG05
+ ;; size=14 bbWeight=1 PerfScore 4.25
+G_M43040_IG03: ;; offset=0021H
+ 48837D1000 cmp gword ptr [rbp+10H], 0
+ 7411 je SHORT G_M43040_IG04
+ 488B4D10 mov rcx, gword ptr [rbp+10H]
+ 49BB90027E61FD7F0000 mov r11, 0x7FFD617E0290 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=24 bbWeight=1 PerfScore 7.25
+G_M43040_IG04: ;; offset=0039H
+ 8B45C4 mov eax, dword ptr [rbp-3CH]
+ FFC0 inc eax
+ 8945C4 mov dword ptr [rbp-3CH], eax
+ ;; size=8 bbWeight=1 PerfScore 2.25
+G_M43040_IG05: ;; offset=0041H
+ 8B4DB8 mov ecx, dword ptr [rbp-48H]
+ FFC9 dec ecx
+ 894DB8 mov dword ptr [rbp-48H], ecx
+ 837DB800 cmp dword ptr [rbp-48H], 0
+ 7F0E jg SHORT G_M43040_IG07
+ ;; size=14 bbWeight=1 PerfScore 5.25
+G_M43040_IG06: ;; offset=004FH
+ 488D4DB8 lea rcx, [rbp-48H]
+ BA11000000 mov edx, 17
+ E8338F045F call CORINFO_HELP_PATCHPOINT
+ ;; size=14 bbWeight=0.01 PerfScore 0.02
+G_M43040_IG07: ;; offset=005DH
+ 817DC420A10700 cmp dword ptr [rbp-3CH], 0x7A120
+ 7CBB jl SHORT G_M43040_IG03
+ ;; size=9 bbWeight=1 PerfScore 3.00
+G_M43040_IG08: ;; offset=0066H
+ 4883C470 add rsp, 112
+ 5D pop rbp
+ C3 ret
+ ;; size=6 bbWeight=1 PerfScore 1.75
+; Total bytes of code 108
+```
+
+The codegen is unoptimized, has patchpoints for OSR, and carries no instrumentation (to avoid spending time on methods that will never make it to Tier1 - as practice shows, only 10-20% of methods do).
+
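+The counter-and-helper pattern in `G_M43040_IG05`/`G_M43040_IG06` above is the patchpoint mechanism. Logically, it boils down to the following sketch (the start value 0x3E8 = 1000 comes from `DOTNET_TC_OnStackReplacement_InitialCounter`; `OnPatchpointHit` is an illustrative stand-in for `CORINFO_HELP_PATCHPOINT`):
+
+```csharp
+static void HotLoop_Tier0(System.IDisposable d)
+{
+    int patchpointCounter = 1000; // DOTNET_TC_OnStackReplacement_InitialCounter
+    for (int i = 0; i < 500000; i++)
+    {
+        if (--patchpointCounter <= 0)
+        {
+            // The helper counts hits; once the patchpoint is hot enough it
+            // jits an OSR continuation and transfers control into it mid-loop.
+            OnPatchpointHit(ref patchpointCounter);
+        }
+        d?.Dispose();
+    }
+}
+
+// Illustrative stand-in; the real helper may never return to the Tier0 loop.
+static void OnPatchpointHit(ref int counter) => counter = 1000;
+```
+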
+2) Its loop body triggers OSR after `DOTNET_TC_OnStackReplacement_InitialCounter` iterations (see `jitconfigvalues.h`):
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; OSR variant for entry point 0x11
+; optimized code
+; No PGO data
+G_M43040_IG01: ;; offset=0000H
+ 4883EC38 sub rsp, 56
+ 4889BC24A8000000 mov qword ptr [rsp+A8H], rdi
+ 4889B424A0000000 mov qword ptr [rsp+A0H], rsi
+ 488BB424C0000000 mov rsi, gword ptr [rsp+C0H]
+ 8B7C2474 mov edi, dword ptr [rsp+74H]
+ ;; size=32 bbWeight=1 PerfScore 6.25
+G_M43040_IG02: ;; offset=0020H
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D1F jge SHORT G_M43040_IG06
+ ;; size=8 bbWeight=1 PerfScore 1.25
+G_M43040_IG03: ;; offset=0028H
+ 4885F6 test rsi, rsi
+ 7410 je SHORT G_M43040_IG05
+ ;; size=5 bbWeight=4 PerfScore 5.00
+G_M43040_IG04: ;; offset=002DH
+ 488BCE mov rcx, rsi
+ 49BB98027E61FD7F0000 mov r11, 0x7FFD617E0298 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=16 bbWeight=2 PerfScore 7.00
+G_M43040_IG05: ;; offset=003DH
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CE1 jl SHORT G_M43040_IG03
+ ;; size=10 bbWeight=4 PerfScore 6.00
+G_M43040_IG06: ;; offset=0047H
+ 4881C4A0000000 add rsp, 160
+ 5E pop rsi
+ 5F pop rdi
+ 5D pop rbp
+ C3 ret
+ ;; size=11 bbWeight=1 PerfScore 2.75
+; Total bytes of code 82
+```
+
+Now the loop is faster thanks to optimizations, but it's still neither instrumented nor devirtualized. In theory, we could start instrumenting at least the loop body at this stage, but it's left as is for now; see the notes below.
+
+3) `HotLoop` itself is invoked >30 times, which triggers promotion to Tier0Instrumented:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-0 compilation
+; MinOpts code
+; instrumented for collecting profile data
+G_M43040_IG01: ;; offset=0000H
+ 55 push rbp
+ 4881EC80000000 sub rsp, 128
+ 488DAC2480000000 lea rbp, [rsp+80H]
+ 33C0 xor eax, eax
+ 488945A8 mov qword ptr [rbp-58H], rax
+ C5D857E4 vxorps xmm4, xmm4
+ C5F97F65B0 vmovdqa xmmword ptr [rbp-50H], xmm4
+ 488945C0 mov qword ptr [rbp-40H], rax
+ 48894D10 mov gword ptr [rbp+10H], rcx
+ ;; size=39 bbWeight=1 PerfScore 7.33
+G_M43040_IG02: ;; offset=0027H
+ FF05A3846000 inc dword ptr [(reloc 0x7ffd6214fc10)]
+ 33C9 xor ecx, ecx
+ 894DC4 mov dword ptr [rbp-3CH], ecx
+ C745B8E8030000 mov dword ptr [rbp-48H], 0x3E8
+ EB55 jmp SHORT G_M43040_IG05
+ ;; size=20 bbWeight=1 PerfScore 7.25
+G_M43040_IG03: ;; offset=003BH
+ FF0593846000 inc dword ptr [(reloc 0x7ffd6214fc14)]
+ 48837D1000 cmp gword ptr [rbp+10H], 0
+ 743A je SHORT G_M43040_IG04
+ FF058A846000 inc dword ptr [(reloc 0x7ffd6214fc18)]
+ 488B4D10 mov rcx, gword ptr [rbp+10H]
+ 48894DB0 mov gword ptr [rbp-50H], rcx
+ 488B4DB0 mov rcx, gword ptr [rbp-50H]
+ 48BA20FC1462FD7F0000 mov rdx, 0x7FFD6214FC20
+ E8E79D045F call CORINFO_HELP_CLASSPROFILE32
+ 488B4DB0 mov rcx, gword ptr [rbp-50H]
+ 48894DA8 mov gword ptr [rbp-58H], rcx
+ 488B4DA8 mov rcx, gword ptr [rbp-58H]
+ 49BBA0027E61FD7F0000 mov r11, 0x7FFD617E02A0 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=71 bbWeight=1 PerfScore 19.50
+G_M43040_IG04: ;; offset=0082H
+ FF05A0846000 inc dword ptr [(reloc 0x7ffd6214fc68)]
+ 8B45C4 mov eax, dword ptr [rbp-3CH]
+ FFC0 inc eax
+ 8945C4 mov dword ptr [rbp-3CH], eax
+ ;; size=14 bbWeight=1 PerfScore 5.25
+G_M43040_IG05: ;; offset=0090H
+ 8B4DB8 mov ecx, dword ptr [rbp-48H]
+ FFC9 dec ecx
+ 894DB8 mov dword ptr [rbp-48H], ecx
+ 837DB800 cmp dword ptr [rbp-48H], 0
+ 7F0E jg SHORT G_M43040_IG07
+ ;; size=14 bbWeight=1 PerfScore 5.25
+G_M43040_IG06: ;; offset=009EH
+ 488D4DB8 lea rcx, [rbp-48H]
+ BA11000000 mov edx, 17
+ E8248C045F call CORINFO_HELP_PATCHPOINT
+ ;; size=14 bbWeight=0.01 PerfScore 0.02
+G_M43040_IG07: ;; offset=00ACH
+ FF057A846000 inc dword ptr [(reloc 0x7ffd6214fc6c)]
+ 817DC420A10700 cmp dword ptr [rbp-3CH], 0x7A120
+ 7C80 jl SHORT G_M43040_IG03
+ FF056F846000 inc dword ptr [(reloc 0x7ffd6214fc70)]
+ ;; size=21 bbWeight=1 PerfScore 9.00
+G_M43040_IG08: ;; offset=00C1H
+ 4881C480000000 add rsp, 128
+ 5D pop rbp
+ C3 ret
+ ;; size=9 bbWeight=1 PerfScore 1.75
+; Total bytes of code 202
+```
+Now the whole method is compiled to Tier0 with instrumentation and patchpoints, and with no optimizations.
+We decided to promote hot Tier0 code to Tier0Instrumented without optimizations for the following reasons:
+1) Going from Tier0 to Tier0Instrumented doesn't cause a noticeable performance regression
+2) Tier0Instrumented is faster to compile
+3) Its profile is more accurate
+
+In this specific case we could consider using Tier1Instrumented instead, since the previous code version already had a faster loop thanks to Tier1-OSR. But OSR events are rare, and we don't want to produce a less accurate profile than we had before https://github.com/dotnet/runtime/pull/70941, so it's left as is. We might reconsider this once instrumentation of optimized code produces a more accurate profile that includes inlinees.
+
+4) The loop of `HotLoop` triggered OSR once again:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; OSR variant for entry point 0x11
+; optimized code
+; optimized using profile data
+; with Dynamic PGO: edge weights are invalid, and fgCalledCount is 9999
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
+G_M43040_IG01: ;; offset=0000H
+ 4883EC38 sub rsp, 56
+ 4889BC24B8000000 mov qword ptr [rsp+B8H], rdi
+ 4889B424B0000000 mov qword ptr [rsp+B0H], rsi
+ 488BB424D0000000 mov rsi, gword ptr [rsp+D0H]
+ 8BBC2484000000 mov edi, dword ptr [rsp+84H]
+ ;; size=35 bbWeight=1 PerfScore 6.25
+G_M43040_IG02: ;; offset=0023H
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D50 jge SHORT G_M43040_IG06
+ 4885F6 test rsi, rsi
+ 7424 je SHORT G_M43040_IG04
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 7515 jne SHORT G_M43040_IG04
+ ;; size=28 bbWeight=1 PerfScore 6.75
+G_M43040_IG03: ;; offset=003FH
+ 488BCE mov rcx, rsi
+ FF15605A1500 call [Program:Test():this]
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D29 jge SHORT G_M43040_IG06
+ EBEB jmp SHORT G_M43040_IG03
+ ;; size=21 bbWeight=0.99 PerfScore 6.68
+G_M43040_IG04: ;; offset=0054H
+ 4885F6 test rsi, rsi
+ 7418 je SHORT G_M43040_IG05
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 751E jne SHORT G_M43040_IG07
+ 488BCE mov rcx, rsi
+ FF15375A1500 call [Program:Test():this]
+ ;; size=29 bbWeight=0.01 PerfScore 0.09
+G_M43040_IG05: ;; offset=0071H
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CD9 jl SHORT G_M43040_IG04
+ ;; size=10 bbWeight=0.01 PerfScore 0.02
+G_M43040_IG06: ;; offset=007BH
+ 4881C4B0000000 add rsp, 176
+ 5E pop rsi
+ 5F pop rdi
+ 5D pop rbp
+ C3 ret
+ ;; size=11 bbWeight=0 PerfScore 0.00
+G_M43040_IG07: ;; offset=0086H
+ 488BCE mov rcx, rsi
+ 49BBA8027E61FD7F0000 mov r11, 0x7FFD617E02A8 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ EBD9 jmp SHORT G_M43040_IG05
+ ;; size=18 bbWeight=0 PerfScore 0.00
+; Total bytes of code 152
+```
+We ended up with a very fast version of the method whose optimized loop `G_M43040_IG03` makes a devirtualized call on each iteration without any guards. The parts of the method outside the loop still run the unoptimized Tier0 codegen.
+
+5) The `HotLoop` method is invoked 30 more times and triggers the final promotion to the last tier:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; optimized code
+; optimized using profile data
+; with Dynamic PGO: edge weights are invalid, and fgCalledCount is 48
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ 33FF xor edi, edi
+ 4885F6 test rsi, rsi
+ 7424 je SHORT G_M43040_IG04
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 7515 jne SHORT G_M43040_IG04
+ ;; size=22 bbWeight=1 PerfScore 5.75
+G_M43040_IG03: ;; offset=001FH
+ 488BCE mov rcx, rsi
+ FF15C0591500 call [Program:Test():this]
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D29 jge SHORT G_M43040_IG06
+ EBEB jmp SHORT G_M43040_IG03
+ ;; size=21 bbWeight=1158.09 PerfScore 7817.13
+G_M43040_IG04: ;; offset=0034H
+ 4885F6 test rsi, rsi
+ 7418 je SHORT G_M43040_IG05
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 751A jne SHORT G_M43040_IG07
+ 488BCE mov rcx, rsi
+ FF1597591500 call [Program:Test():this]
+ ;; size=29 bbWeight=11.70 PerfScore 102.36
+G_M43040_IG05: ;; offset=0051H
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CD9 jl SHORT G_M43040_IG04
+ ;; size=10 bbWeight=11.70 PerfScore 17.55
+G_M43040_IG06: ;; offset=005BH
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=0 PerfScore 0.00
+G_M43040_IG07: ;; offset=0062H
+ 488BCE mov rcx, rsi
+ 49BBB0027E61FD7F0000 mov r11, 0x7FFD617E02B0 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ EBDD jmp SHORT G_M43040_IG05
+ ;; size=18 bbWeight=0 PerfScore 0.00
+; Total bytes of code 116
+```
+Again, to summarize the workflow for the non-prejitted case, let's take a look at this branch of the diagram (OSR details are omitted to showcase the most common path):
+
+```mermaid
+flowchart
+ hasR2R("...") -->tier000
+ tier000["JIT to Tier0
(not optimized, not instrumented,
with patchpoints)"]-->|Running...|ishot555
+ ishot555{"Is hot?
(called >30 times)"}
+ ishot555-.->|No,
keep running...|ishot555
+ ishot555-->|Yes|tier0
+ tier0["JIT to Tier0Instrumented
(not optimized, instrumented,
with patchpoints)"]-->|Running...|ishot5
+ tier1pgo2["JIT to Tier1
(optimized with profile data)"]
+ ishot5{"Is hot?
(called >30 times)"}-->|Yes|tier1pgo2
+ ishot5-.->|No,
keep running...|ishot5
+```
+
+It's worth noting that we analyzed the worst case (in terms of working set) involving OSR; normally (in 99.8% of cases) we end up with only three code versions for hot code:
+1) Tier0/R2R
+2) Instrumented Tier (with or without optimizations)
+3) Tier1 optimized with profile
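+
+In terms of the runtime's `NativeCodeVersion::OptimizationTier` values (extended in `codeversion.h` by this change), the full set of tiers now looks like this - mirrored here as C# purely for illustration, with informal comments:
+
+```csharp
+enum OptimizationTier
+{
+    OptimizationTier0,             // quick jitted, no probes
+    OptimizationTier1,             // final optimized tier
+    OptimizationTier1OSR,          // optimized on-stack-replacement variant
+    OptimizationTierOptimized,     // optimized, but may do less than Tier1
+    OptimizationTier0Instrumented, // new: unoptimized + profile probes
+    OptimizationTier1Instrumented, // new: optimized + profile probes
+}
+```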
+
+# Working Set Impact
+
+The general rule of thumb is that only 10-20% of methods make it to Tier1, and about 40-60% of all methods are less than 8 bytes of IL (e.g., getters/setters), so with this approach we effectively double the size of Tier1 code (including call counting stubs, etc.). How bad is that compared to the overall working set of various apps? Let's consider these two examples:
+
+## 1) A large web app (internal Microsoft service)
+
+| Metric | Number of methods | Share, % | Total size, MB | Share, % |
+|------------------|-------------------|----------|----------------|----------|
+| **Tier0** | 115862 | 59.36% | 60.06 | 83.89% |
+| **Tier1** | 30942 | 15.85% | 8.22 | 11.48% |
+| **FullOpts** | 48384 | 24.79% | 3.26 | 4.55% |
+| **Contains OSR** | 55 | 0.03% | 0.06 | 0.08% |
+| **Total jitted** | 195188 | 100.00% | 71.60 | 100.00% |
+
+![](DynamicPgo-InstrumentedTiers-ilsize-histogram1.png)
+
+In this app, Tier1 code occupies 8.22 MB in the loader heap (plus a few more megabytes for call counting stubs, jump-stubs, etc.), meaning that the instrumented tier is expected to add a similar amount (~13 MB). The total working set of the service is 10 GB, so instrumented tiers contribute ~0.1% of it. We're also adding ~30k new jit compilations, which we can fully compensate for with the https://github.com/dotnet/runtime/issues/76402 work, avoiding potential problems with overly long queues of methods pending call counting installation or promotion to Tier1.
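+
+As a back-of-envelope version of that estimate (the stub overhead value here is an assumption for illustration):
+
+```csharp
+using System;
+
+double tier1CodeMb  = 8.22;        // Tier1 code in the loader heap (table above)
+double stubsMb      = 4.5;         // assumed: call counting stubs, jump-stubs, etc.
+double instrTierMb  = tier1CodeMb + stubsMb;             // ~12.7 MB added
+double workingSetMb = 10 * 1024.0;                       // 10 GB total working set
+double sharePct     = instrTierMb / workingSetMb * 100;  // ~0.12%
+Console.WriteLine($"{instrTierMb:F1} MB ~= {sharePct:F2}% of working set");
+```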
+
+## 2) A desktop OSS application [AvaloniaILSpy](https://github.com/icsharpcode/AvaloniaILSpy)
+
+`ReadyToRun=0`:
+
+| Metric | Number of methods | % | Total size, MB | % |
+|------------------|-------------------|----------|----------------|----------|
+| **Tier0** | 19968 | 79.09% | 4.58 | 84.69% |
+| **Tier1** | 4978 | 19.72% | **0.75** | 13.90% |
+| **FullOpts** | 300 | 1.19% | 0.08 | 1.39% |
+| **OSR** | 2 | 0.01% | 0.00 | 0.02% |
+| | | | | |
+| **Total** | 25248 | 100.00% | 5.41 | 100.00% |
+
+`ReadyToRun=1`:
+
+| Metric | Number of methods | % | Total size, MB | % |
+|------------------|-------------------|----------|----------------|----------|
+| **Tier0** | 4713 | 62.45% | 0.84 | 58.34% |
+| **Tier1** | 2516 | 33.34% | 0.56 | 38.75% |
+| **FullOpts** | 318 | 4.21% | 0.04 | 2.92% |
+| **OSR** | 0 | 0.00% | 0.00 | 0.00% |
+| | | | | |
+| **Total** | 7547 | 100.00% | 1.44 | 100.00% |
+
+In the case of AvaloniaILSpy, instrumented tiers add around 1 MB (stubs included) to the total working set and around 5k new jit compilations.
+
+# Start Time and Performance Impact
+
+## TechEmpower
+
+Overall, instrumented tiers are expected to improve startup speed when Dynamic PGO is enabled and to improve steady-state performance (e.g. latency/throughput) for prejitted code. A good example demonstrating both is the following TechEmpower benchmark (plaintext-plaintext):
+
+![](DynamicPgo-InstrumentedTiers-Plaintext.png)
+
+Legend:
+* Red - `DOTNET_TieredPGO=0`, `DOTNET_ReadyToRun=1`
+* Black - `DOTNET_TieredPGO=1`, `DOTNET_ReadyToRun=1`
+* Yellow - `DOTNET_TieredPGO=1`, `DOTNET_ReadyToRun=0`
+
+The yellow line provides the highest level of performance (RPS) by sacrificing startup speed (and, hence, the time it takes to process the first request). This happens because the benchmark is quite simple and most of its code is prejitted, so we can only instrument it if we completely drop R2R and compile everything from scratch. It also explains why the black line (Dynamic PGO enabled while still relying on R2R) didn't show much improvement. With the separate instrumented tiers for hot R2R code we achieve the "yellow" level of performance while maintaining the same startup speed as before. Also, for the mode where we have to compile a lot of code to Tier0, switching to the "instrument only hot Tier0 code" strategy shows a ~8% time-to-first-request reduction across all TE benchmarks.
+
+![](DynamicPgo-InstrumentedTiers-Plaintext-opt.png)
+(_Predicted results according to local runs_)
+
+## AvaloniaILSpy
+
+For this experiment we modified the app's source code to emit an event once the view is completely loaded, in order to measure the real start time:
+
+| Mode | Start time |
+|----------------------------|------------|
+| R2R=0 | 2.03s |
+| R2R=0, PGO=1 | 2.26s |
+| R2R=0, PGO=1, Instr. Tiers | 2.03s |
+
+As we can see, instrumented tiers help mitigate the start-time regression from Dynamic PGO.
+
+## Microsoft internal service
+
+Throughput of the service after startup:
+
+![](DynamicPgo-InstrumentedTiers-msft-service.png)
+
+X axis - time in seconds after start; Y axis - throughput in MB/s.
+
+Here, Dynamic PGO without instrumented tiers (red line) can't show any benefit because the service is prejitted, and prejitted code doesn't benefit from Dynamic PGO. Instrumented tiers address that by instrumenting hot R2R code to achieve the best performance, hence the higher throughput (green line).
\ No newline at end of file
diff --git a/docs/design/features/DynamicPgo.md b/docs/design/features/DynamicPgo.md
index 464e87950cbece..2bd227d9618b8e 100644
--- a/docs/design/features/DynamicPgo.md
+++ b/docs/design/features/DynamicPgo.md
@@ -257,9 +257,9 @@ If we confidently could identify the top N% of methods (say 5%) then one could i
R2R methods bypass Tier0 and so don't get instrumentation in the current TieredPGO prototype. We probably don't want to instrument the code in the R2R image. And many of these R2R methods are key framework methods that are important for performance. So we need to find a way to get data for these methods.
There are a few basic ideas:
-* Leverage IBC. If there is IBC data in the R2R image then we can make that data available to the JIT. It may not be as relevant as in-process collected data, but it's quite likely better than synthetic data or no data.
-* Sampled instrumentation for R2R methods. Produce an instrumented version and run it every so often before the method gets promoted to Tier1. This may be costly, especially if we have to use unoptimized methods for instrumentation, as we'll do quite a bit of extra jitting.
-* Make R2R methods go through Tier0 on their way to Tier1. Likely introduces an unacceptable perf hit.
+1) Leverage IBC. If there is IBC data in the R2R image then we can make that data available to the JIT. It may not be as relevant as in-process collected data, but it's quite likely better than synthetic data or no data.
+2) Sampled instrumentation for R2R methods. Produce an instrumented version and run it every so often before the method gets promoted to Tier1. This may be costly, especially if we have to use unoptimized methods for instrumentation, as we'll do quite a bit of extra jitting.
+3) Make R2R methods go through a separate instrumentation tier on their way to Tier1, see [DynamicPgo-InstrumentedTiers.md](DynamicPgo-InstrumentedTiers.md) prototype.
#### Dynamic PGO, QuickJitForLoops, OSR
diff --git a/src/coreclr/debug/daccess/request.cpp b/src/coreclr/debug/daccess/request.cpp
index 88c1c03d695f9d..7f48fc6819c82a 100644
--- a/src/coreclr/debug/daccess/request.cpp
+++ b/src/coreclr/debug/daccess/request.cpp
@@ -1112,6 +1112,12 @@ HRESULT ClrDataAccess::GetTieredVersions(
case NativeCodeVersion::OptimizationTierOptimized:
nativeCodeAddrs[count].OptimizationTier = DacpTieredVersionData::OptimizationTier_Optimized;
break;
+ case NativeCodeVersion::OptimizationTier0Instrumented:
+ nativeCodeAddrs[count].OptimizationTier = DacpTieredVersionData::OptimizationTier_QuickJittedInstrumented;
+ break;
+ case NativeCodeVersion::OptimizationTier1Instrumented:
+ nativeCodeAddrs[count].OptimizationTier = DacpTieredVersionData::OptimizationTier_OptimizedTier1Instrumented;
+ break;
}
}
else if (pMD->IsJitOptimizationDisabled())
diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h
index 81cd74e76250a0..02c0d8487cfd4a 100644
--- a/src/coreclr/inc/clrconfigvalues.h
+++ b/src/coreclr/inc/clrconfigvalues.h
@@ -606,6 +606,12 @@ RETAIL_CONFIG_STRING_INFO(INTERNAL_PGODataPath, W("PGODataPath"), "Read/Write PG
RETAIL_CONFIG_DWORD_INFO(INTERNAL_ReadPGOData, W("ReadPGOData"), 0, "Read PGO data")
RETAIL_CONFIG_DWORD_INFO(INTERNAL_WritePGOData, W("WritePGOData"), 0, "Write PGO data")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TieredPGO, W("TieredPGO"), 0, "Instrument Tier0 code and make counts available to Tier1")
+
+// TieredPGO_InstrumentOnlyHotCode values:
+//
+// 0) Instrument all IL-only code, R2R'd code is never instrumented
+// 1) Instrument only hot IL-only and hot R2R code (use optimizations in the instrumented tier for hot R2R and no optimizations for hot IL-only)
+RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredPGO_InstrumentOnlyHotCode, W("TieredPGO_InstrumentOnlyHotCode"), 1, "Strategy for TieredPGO, see comments in clrconfigvalues.h")
#endif
///
diff --git a/src/coreclr/inc/dacprivate.h b/src/coreclr/inc/dacprivate.h
index 1e345810445c8a..5d920fd0da4905 100644
--- a/src/coreclr/inc/dacprivate.h
+++ b/src/coreclr/inc/dacprivate.h
@@ -610,6 +610,8 @@ struct MSLAYOUT DacpTieredVersionData
OptimizationTier_OptimizedTier1,
OptimizationTier_ReadyToRun,
OptimizationTier_OptimizedTier1OSR,
+ OptimizationTier_QuickJittedInstrumented,
+ OptimizationTier_OptimizedTier1Instrumented,
};
CLRDATA_ADDRESS NativeCodeAddr;
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 9184f102ce7e4e..1ad5cb944ad8f4 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -4101,13 +4101,13 @@ const char* Compiler::compGetTieringName(bool wantShortName) const
}
else if (tier1)
{
- if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_OSR))
+ if (opts.IsOSR())
{
return instrumenting ? "Instrumented Tier1-OSR" : "Tier1-OSR";
}
else
{
- return "Tier1";
+ return instrumenting ? "Instrumented Tier1" : "Tier1";
}
}
else if (opts.OptimizationEnabled())
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 05dfb0bfe30111..2564aee7644166 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9181,6 +9181,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
}
#endif
+ bool IsInstrumented() const
+ {
+ return jitFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR);
+ }
+
+ bool IsInstrumentedOptimized() const
+ {
+ return IsInstrumented() && jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1);
+ }
+
// true if we should use the PINVOKE_{BEGIN,END} helpers instead of generating
// PInvoke transitions inline. Normally used by R2R, but also used when generating a reverse pinvoke frame, as
// the current logic for frame setup initializes and pushes
diff --git a/src/coreclr/jit/fginline.cpp b/src/coreclr/jit/fginline.cpp
index 55443f8d0a2b70..e43c9ac980e8d8 100644
--- a/src/coreclr/jit/fginline.cpp
+++ b/src/coreclr/jit/fginline.cpp
@@ -561,7 +561,8 @@ class SubstitutePlaceholdersAndDevirtualizeWalker : public GenTreeVisitor<SubstitutePlaceholdersAndDevirtualizeWalker>
JITDUMP(" ... found foldable jtrue at [%06u] in " FMT_BB "\n", m_compiler->dspTreeID(tree),
block->bbNum);
- noway_assert((block->bbNext->countOfInEdges() > 0) && (block->bbJumpDest->countOfInEdges() > 0));
+
+ noway_assert(!m_compiler->fgComputePredsDone);
// We have a constant operand, and should have the all clear to optimize.
// Update side effects on the tree, assert there aren't any, and bash to nop.
@@ -570,36 +571,20 @@ class SubstitutePlaceholdersAndDevirtualizeWalker : public GenTreeVisitor<SubstitutePlaceholdersAndDevirtualizeWalker>
tree->gtBashToNOP();
m_madeChanges = true;
- BasicBlock* bNotTaken = nullptr;
-
- if (condTree->AsIntCon()->gtIconVal != 0)
+ if (!condTree->IsIntegralConst(0))
{
block->bbJumpKind = BBJ_ALWAYS;
- bNotTaken = block->bbNext;
}
else
{
block->bbJumpKind = BBJ_NONE;
- bNotTaken = block->bbJumpDest;
- }
-
- m_compiler->fgRemoveRefPred(bNotTaken, block);
-
- // If that was the last ref, a subsequent flow-opt pass
- // will clean up the now-unreachable bNotTaken, and any
- // other transitively unreachable blocks.
- if (bNotTaken->bbRefs == 0)
- {
- JITDUMP("... it looks like " FMT_BB " is now unreachable!\n", bNotTaken->bbNum);
}
}
}
else
{
- const var_types retType = tree->TypeGet();
- GenTree* foldedTree = m_compiler->gtFoldExpr(tree);
- *pTree = foldedTree;
- m_madeChanges = true;
+ *pTree = m_compiler->gtFoldExpr(tree);
+ m_madeChanges = true;
}
}
};
diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp
index a0f82765b6b40c..5c2b9fa3b96901 100644
--- a/src/coreclr/jit/fgprofile.cpp
+++ b/src/coreclr/jit/fgprofile.cpp
@@ -383,7 +383,7 @@ void BlockCountInstrumentor::Prepare(bool preImport)
//
// If we see any, we need to adjust our instrumentation pattern.
//
- if (m_comp->opts.IsOSR() && ((m_comp->optMethodFlags & OMF_HAS_TAILCALL_SUCCESSOR) != 0))
+ if (m_comp->opts.IsInstrumentedOptimized() && ((m_comp->optMethodFlags & OMF_HAS_TAILCALL_SUCCESSOR) != 0))
{
JITDUMP("OSR + PGO + potential tail call --- preparing to relocate block probes\n");
@@ -1887,8 +1887,11 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
(JitConfig.TC_PartialCompilation() > 0);
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && mayHavePatchpoints;
- const bool osrMethod = opts.IsOSR();
- const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod;
+ const bool isOptimized = opts.IsInstrumentedOptimized();
+ const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !isOptimized;
+
+ // TODO-TP: Don't give up on edge profiling for optimized code, currently it has issues
+ // such as unexpected trees near tail calls
if (useEdgeProfiles)
{
@@ -1899,7 +1902,7 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
JITDUMP("Using block profiling, because %s\n",
(JitConfig.JitEdgeProfiling() == 0)
? "edge profiles disabled"
- : prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints");
+ : prejit ? "prejitting" : isOptimized ? "tier1 instrumented" : "tier0 with patchpoints");
fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
}
diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index bfd042fd102e83..5c90d1efc81520 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1288,7 +1288,7 @@ var_types Compiler::impImportCall(OPCODE opcode,
// have to check for anything that might introduce a recursive tail call.
// * We only instrument root method blocks in OSR methods,
//
- if (opts.IsOSR() && !compIsForInlining())
+ if ((opts.IsInstrumentedOptimized() || opts.IsOSR()) && !compIsForInlining())
{
// If a root method tail call candidate block is not a BBJ_RETURN, it should have a unique
// BBJ_RETURN successor. Mark that successor so we can handle it specially during profile
@@ -1312,9 +1312,9 @@ var_types Compiler::impImportCall(OPCODE opcode,
// Only schedule importation if we're not currently importing.
//
- if (mustImportEntryBlock && (compCurBB != fgEntryBB))
+ if (opts.IsOSR() && mustImportEntryBlock && (compCurBB != fgEntryBB))
{
- JITDUMP("\nOSR: inlineable or recursive tail call [%06u] in the method, so scheduling " FMT_BB
+ JITDUMP("\ninlineable or recursive tail call [%06u] in the method, so scheduling " FMT_BB
" for importation\n",
dspTreeID(call), fgEntryBB->bbNum);
impImportBlockPending(fgEntryBB);
@@ -6290,7 +6290,7 @@ bool Compiler::impConsiderCallProbe(GenTreeCall* call, IL_OFFSET ilOffset)
return false;
}
- assert(opts.OptimizationDisabled() || opts.IsOSR());
+ assert(opts.OptimizationDisabled() || opts.IsInstrumentedOptimized());
assert(!compIsForInlining());
// During importation, optionally flag this block as one that
diff --git a/src/coreclr/vm/callcounting.cpp b/src/coreclr/vm/callcounting.cpp
index 671eb8f018d5bc..a49a2ace6032ca 100644
--- a/src/coreclr/vm/callcounting.cpp
+++ b/src/coreclr/vm/callcounting.cpp
@@ -574,7 +574,7 @@ bool CallCountingManager::SetCodeEntryPoint(
// For a default code version that is not tier 0, call counting will have been disabled by this time (checked
// below). Avoid the redundant and not-insignificant expense of GetOptimizationTier() on a default code version.
!activeCodeVersion.IsDefaultVersion() &&
- activeCodeVersion.GetOptimizationTier() != NativeCodeVersion::OptimizationTier0
+ activeCodeVersion.IsFinalTier()
) ||
!g_pConfig->TieredCompilation_CallCounting())
{
@@ -602,7 +602,7 @@ bool CallCountingManager::SetCodeEntryPoint(
return true;
}
- _ASSERTE(activeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!activeCodeVersion.IsFinalTier());
// If the tiering delay is active, postpone further work
if (GetAppDomain()
@@ -649,7 +649,7 @@ bool CallCountingManager::SetCodeEntryPoint(
}
else
{
- _ASSERTE(activeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!activeCodeVersion.IsFinalTier());
// If the tiering delay is active, postpone further work
if (GetAppDomain()
@@ -659,7 +659,7 @@ bool CallCountingManager::SetCodeEntryPoint(
return true;
}
- CallCount callCountThreshold = (CallCount)g_pConfig->TieredCompilation_CallCountThreshold();
+ CallCount callCountThreshold = g_pConfig->TieredCompilation_CallCountThreshold();
_ASSERTE(callCountThreshold != 0);
NewHolder callCountingInfoHolder = new CallCountingInfo(activeCodeVersion, callCountThreshold);
@@ -780,7 +780,7 @@ PCODE CallCountingManager::OnCallCountThresholdReached(TransitionBlock *transiti
// used going forward under appropriate locking to synchronize further with deletion.
GCX_PREEMP_THREAD_EXISTS(CURRENT_THREAD);
- _ASSERTE(codeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!codeVersion.IsFinalTier());
codeEntryPoint = codeVersion.GetNativeCode();
do
diff --git a/src/coreclr/vm/codeversion.cpp b/src/coreclr/vm/codeversion.cpp
index bd7fce5d9d7c4d..9bc971033915e0 100644
--- a/src/coreclr/vm/codeversion.cpp
+++ b/src/coreclr/vm/codeversion.cpp
@@ -151,7 +151,11 @@ NativeCodeVersion::OptimizationTier NativeCodeVersionNode::GetOptimizationTier()
void NativeCodeVersionNode::SetOptimizationTier(NativeCodeVersion::OptimizationTier tier)
{
LIMITED_METHOD_CONTRACT;
- _ASSERTE(tier >= m_optTier);
+
+ _ASSERTE(
+ tier == m_optTier ||
+ (m_optTier != NativeCodeVersion::OptimizationTier::OptimizationTier1 &&
+ m_optTier != NativeCodeVersion::OptimizationTier::OptimizationTierOptimized));
m_optTier = tier;
}
@@ -333,6 +337,13 @@ NativeCodeVersion::OptimizationTier NativeCodeVersion::GetOptimizationTier() con
}
}
+bool NativeCodeVersion::IsFinalTier() const
+{
+ LIMITED_METHOD_DAC_CONTRACT;
+ OptimizationTier tier = GetOptimizationTier();
+ return tier == OptimizationTier1 || tier == OptimizationTierOptimized;
+}
+
#ifndef DACCESS_COMPILE
void NativeCodeVersion::SetOptimizationTier(OptimizationTier tier)
{
@@ -808,7 +819,7 @@ bool ILCodeVersion::HasAnyOptimizedNativeCodeVersion(NativeCodeVersion tier0Nati
_ASSERTE(!tier0NativeCodeVersion.IsNull());
_ASSERTE(tier0NativeCodeVersion.GetILCodeVersion() == *this);
_ASSERTE(tier0NativeCodeVersion.GetMethodDesc()->IsEligibleForTieredCompilation());
- _ASSERTE(tier0NativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!tier0NativeCodeVersion.IsFinalTier());
NativeCodeVersionCollection nativeCodeVersions = GetNativeCodeVersions(tier0NativeCodeVersion.GetMethodDesc());
for (auto itEnd = nativeCodeVersions.End(), it = nativeCodeVersions.Begin(); it != itEnd; ++it)
@@ -1708,9 +1719,7 @@ PCODE CodeVersionManager::PublishVersionableCodeIfNecessary(
{
#ifdef FEATURE_TIERED_COMPILATION
_ASSERTE(!config->ShouldCountCalls() || pMethodDesc->IsEligibleForTieredCompilation());
- _ASSERTE(
- !config->ShouldCountCalls() ||
- activeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!config->ShouldCountCalls() || !activeVersion.IsFinalTier());
if (config->ShouldCountCalls()) // the generated code was at a tier that is call-counted
{
// This is the first call to a call-counted code version of the method
diff --git a/src/coreclr/vm/codeversion.h b/src/coreclr/vm/codeversion.h
index d83bfa29c2ea6f..66de4ba27257a4 100644
--- a/src/coreclr/vm/codeversion.h
+++ b/src/coreclr/vm/codeversion.h
@@ -71,15 +71,19 @@ class NativeCodeVersion
BOOL SetNativeCodeInterlocked(PCODE pCode, PCODE pExpected = NULL);
#endif
+ // NOTE: Don't change existing values to avoid breaking changes in event tracing
enum OptimizationTier
{
OptimizationTier0,
OptimizationTier1,
OptimizationTier1OSR,
OptimizationTierOptimized, // may do less optimizations than tier 1
+ OptimizationTier0Instrumented,
+ OptimizationTier1Instrumented,
};
#ifdef FEATURE_TIERED_COMPILATION
OptimizationTier GetOptimizationTier() const;
+ bool IsFinalTier() const;
#ifndef DACCESS_COMPILE
void SetOptimizationTier(OptimizationTier tier);
#endif
diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp
index 883d602b1f2309..f438a31838fa14 100644
--- a/src/coreclr/vm/eeconfig.cpp
+++ b/src/coreclr/vm/eeconfig.cpp
@@ -239,6 +239,7 @@ HRESULT EEConfig::Init()
#if defined(FEATURE_PGO)
fTieredPGO = false;
+ tieredPGO_InstrumentOnlyHotCode = false;
#endif
#if defined(FEATURE_READYTORUN)
@@ -699,10 +700,6 @@ HRESULT EEConfig::sync()
dwSleepOnExit = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_SleepOnExit);
-#if defined(FEATURE_PGO)
- fTieredPGO = Configuration::GetKnobBooleanValue(W("System.Runtime.TieredPGO"), CLRConfig::EXTERNAL_TieredPGO);
-#endif
-
#if defined(FEATURE_TIERED_COMPILATION)
fTieredCompilation = Configuration::GetKnobBooleanValue(W("System.Runtime.TieredCompilation"), CLRConfig::EXTERNAL_TieredCompilation);
if (fTieredCompilation)
@@ -784,6 +781,20 @@ HRESULT EEConfig::sync()
}
#endif
+#if defined(FEATURE_PGO)
+ fTieredPGO = Configuration::GetKnobBooleanValue(W("System.Runtime.TieredPGO"), CLRConfig::EXTERNAL_TieredPGO);
+
+ // Also, consider DynamicPGO enabled if WritePGOData is set
+ fTieredPGO |= CLRConfig::GetConfigValue(CLRConfig::INTERNAL_WritePGOData) != 0;
+ tieredPGO_InstrumentOnlyHotCode = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredPGO_InstrumentOnlyHotCode) == 1;
+
+ // We need quick jit for TieredPGO
+ if (!fTieredCompilation_QuickJit)
+ {
+ fTieredPGO = false;
+ }
+#endif
+
#if defined(FEATURE_ON_STACK_REPLACEMENT)
dwOSR_HitLimit = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_OSR_HitLimit);
dwOSR_CounterBump = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_OSR_CounterBump);
diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h
index 4651cf1bc84e0f..684f04181fa3b9 100644
--- a/src/coreclr/vm/eeconfig.h
+++ b/src/coreclr/vm/eeconfig.h
@@ -92,6 +92,7 @@ class EEConfig
#if defined(FEATURE_PGO)
bool TieredPGO(void) const { LIMITED_METHOD_CONTRACT; return fTieredPGO; }
+ bool TieredPGO_InstrumentOnlyHotCode(void) const { LIMITED_METHOD_CONTRACT; return tieredPGO_InstrumentOnlyHotCode; }
#endif
#if defined(FEATURE_READYTORUN)
@@ -658,6 +659,7 @@ class EEConfig
#if defined(FEATURE_PGO)
bool fTieredPGO;
+ bool tieredPGO_InstrumentOnlyHotCode;
#endif
#if defined(FEATURE_READYTORUN)
diff --git a/src/coreclr/vm/interpreter.cpp b/src/coreclr/vm/interpreter.cpp
index d30b9934e10d70..1210ed5453a390 100644
--- a/src/coreclr/vm/interpreter.cpp
+++ b/src/coreclr/vm/interpreter.cpp
@@ -1802,7 +1802,7 @@ void Interpreter::JitMethodIfAppropriate(InterpreterMethodInfo* interpMethInfo,
CodeVersionManager::LockHolder _lockHolder;
NativeCodeVersion activeCodeVersion = md->GetCodeVersionManager()->GetActiveILCodeVersion(md).GetActiveNativeCodeVersion(md);
ILCodeVersion ilCodeVersion = activeCodeVersion.GetILCodeVersion();
- if (activeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0 &&
+ if (!activeCodeVersion.IsFinalTier() &&
!ilCodeVersion.HasAnyOptimizedNativeCodeVersion(activeCodeVersion))
{
tieredCompilationManager->AsyncPromoteToTier1(activeCodeVersion, &scheduleTieringBackgroundWork);
diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp
index 7f9eb048926049..3e0cd970d97969 100644
--- a/src/coreclr/vm/jitinterface.cpp
+++ b/src/coreclr/vm/jitinterface.cpp
@@ -12823,23 +12823,6 @@ CORJIT_FLAGS GetCompileFlags(MethodDesc * ftn, CORJIT_FLAGS flags, CORINFO_METHO
#ifdef FEATURE_PGO
- // Instrument, if
- //
- // * We're writing pgo data and we're jitting at Tier0.
- // * Tiered PGO is enabled and we're jitting at Tier0.
- // * Tiered PGO is enabled and we are jitting an OSR method.
- //
- if ((CLRConfig::GetConfigValue(CLRConfig::INTERNAL_WritePGOData) > 0)
- && flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0))
- {
- flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
- }
- else if ((g_pConfig->TieredPGO())
- && (flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0) || flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_OSR)))
- {
- flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
- }
-
if (CLRConfig::GetConfigValue(CLRConfig::INTERNAL_ReadPGOData) > 0)
{
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBOPT);
diff --git a/src/coreclr/vm/method.hpp b/src/coreclr/vm/method.hpp
index 0edff0f4ff3650..6df3d0042e3343 100644
--- a/src/coreclr/vm/method.hpp
+++ b/src/coreclr/vm/method.hpp
@@ -1993,6 +1993,8 @@ class PrepareCodeConfig
QuickJitted,
OptimizedTier1,
OptimizedTier1OSR,
+ InstrumentedTier,
+ InstrumentedTierOptimized,
Count
};
diff --git a/src/coreclr/vm/prestub.cpp b/src/coreclr/vm/prestub.cpp
index 0aba1852ab40f6..4b4373ac40e818 100644
--- a/src/coreclr/vm/prestub.cpp
+++ b/src/coreclr/vm/prestub.cpp
@@ -364,9 +364,9 @@ PCODE MethodDesc::PrepareILBasedCode(PrepareCodeConfig* pConfig)
if (codeVersion.IsDefaultVersion())
{
pConfig->GetMethodDesc()->GetLoaderAllocator()->GetCallCountingManager()->DisableCallCounting(codeVersion);
- _ASSERTE(codeVersion.GetOptimizationTier() != NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(codeVersion.IsFinalTier());
}
- else if (codeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0)
+ else if (!codeVersion.IsFinalTier())
{
codeVersion.SetOptimizationTier(NativeCodeVersion::OptimizationTierOptimized);
}
@@ -457,7 +457,7 @@ PCODE MethodDesc::GetPrecompiledCode(PrepareCodeConfig* pConfig, bool shouldTier
#ifdef FEATURE_TIERED_COMPILATION
if (shouldCountCalls)
{
- _ASSERTE(pConfig->GetCodeVersion().GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!pConfig->GetCodeVersion().IsFinalTier());
pConfig->SetShouldCountCalls();
}
#endif
@@ -1225,6 +1225,12 @@ PrepareCodeConfig::JitOptimizationTier PrepareCodeConfig::GetJitOptimizationTier
case NativeCodeVersion::OptimizationTierOptimized:
return JitOptimizationTier::Optimized;
+ case NativeCodeVersion::OptimizationTier0Instrumented:
+ return JitOptimizationTier::InstrumentedTier;
+
+ case NativeCodeVersion::OptimizationTier1Instrumented:
+ return JitOptimizationTier::InstrumentedTierOptimized;
+
default:
UNREACHABLE();
}
@@ -1247,6 +1253,8 @@ const char *PrepareCodeConfig::GetJitOptimizationTierStr(PrepareCodeConfig *conf
case JitOptimizationTier::QuickJitted: return "QuickJitted";
case JitOptimizationTier::OptimizedTier1: return "OptimizedTier1";
case JitOptimizationTier::OptimizedTier1OSR: return "OptimizedTier1OSR";
+ case JitOptimizationTier::InstrumentedTier: return "InstrumentedTier";
+ case JitOptimizationTier::InstrumentedTierOptimized: return "InstrumentedTierOptimized";
default:
UNREACHABLE();
@@ -1296,6 +1304,7 @@ bool PrepareCodeConfig::FinalizeOptimizationTierForTier0LoadOrJit()
NativeCodeVersion::OptimizationTier previousOptimizationTier = GetCodeVersion().GetOptimizationTier();
_ASSERTE(
previousOptimizationTier == NativeCodeVersion::OptimizationTier0 ||
+ previousOptimizationTier == NativeCodeVersion::OptimizationTier0Instrumented ||
previousOptimizationTier == NativeCodeVersion::OptimizationTierOptimized);
#endif // _DEBUG
diff --git a/src/coreclr/vm/tieredcompilation.cpp b/src/coreclr/vm/tieredcompilation.cpp
index 3b15100a34fb66..dcff50459c8b4a 100644
--- a/src/coreclr/vm/tieredcompilation.cpp
+++ b/src/coreclr/vm/tieredcompilation.cpp
@@ -111,6 +111,22 @@ NativeCodeVersion::OptimizationTier TieredCompilationManager::GetInitialOptimiza
return NativeCodeVersion::OptimizationTierOptimized;
}
+#ifdef FEATURE_PGO
+ if (g_pConfig->TieredPGO())
+ {
+ // Initial tier for R2R is always just OptimizationTier0
+ // For ILOnly it depends on TieredPGO_InstrumentOnlyHotCode:
+ // 1 - OptimizationTier0, as we don't want to instrument the initial version (only hot Tier0 code is instrumented)
+ // 0 - OptimizationTier0Instrumented - instrument all ILOnly code
+ if (g_pConfig->TieredPGO_InstrumentOnlyHotCode() ||
+ ExecutionManager::IsReadyToRunCode(pMethodDesc->GetNativeCode()))
+ {
+ return NativeCodeVersion::OptimizationTier0;
+ }
+ return NativeCodeVersion::OptimizationTier0Instrumented;
+ }
+#endif
+
return NativeCodeVersion::OptimizationTier0;
#else
return NativeCodeVersion::OptimizationTierOptimized;
@@ -237,7 +253,7 @@ bool TieredCompilationManager::TrySetCodeEntryPointAndRecordMethodForCallCountin
}
void TieredCompilationManager::AsyncPromoteToTier1(
- NativeCodeVersion tier0NativeCodeVersion,
+ NativeCodeVersion currentNativeCodeVersion,
bool *createTieringBackgroundWorkerRef)
{
CONTRACTL
@@ -249,8 +265,8 @@ void TieredCompilationManager::AsyncPromoteToTier1(
CONTRACTL_END;
_ASSERTE(CodeVersionManager::IsLockOwnedByCurrentThread());
- _ASSERTE(!tier0NativeCodeVersion.IsNull());
- _ASSERTE(tier0NativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!currentNativeCodeVersion.IsNull());
+ _ASSERTE(!currentNativeCodeVersion.IsFinalTier());
_ASSERTE(createTieringBackgroundWorkerRef != nullptr);
NativeCodeVersion t1NativeCodeVersion;
@@ -261,10 +277,41 @@ void TieredCompilationManager::AsyncPromoteToTier1(
// particular version of the IL code regardless of any changes that may
// occur between now and when jitting completes. If the IL does change in that
// interval the new code entry won't be activated.
- MethodDesc *pMethodDesc = tier0NativeCodeVersion.GetMethodDesc();
- ILCodeVersion ilCodeVersion = tier0NativeCodeVersion.GetILCodeVersion();
- _ASSERTE(!ilCodeVersion.HasAnyOptimizedNativeCodeVersion(tier0NativeCodeVersion));
- hr = ilCodeVersion.AddNativeCodeVersion(pMethodDesc, NativeCodeVersion::OptimizationTier1, &t1NativeCodeVersion);
+ MethodDesc *pMethodDesc = currentNativeCodeVersion.GetMethodDesc();
+
+ NativeCodeVersion::OptimizationTier nextTier = NativeCodeVersion::OptimizationTier1;
+
+#ifdef FEATURE_PGO
+ if (g_pConfig->TieredPGO())
+ {
+ if (currentNativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0 &&
+ g_pConfig->TieredPGO_InstrumentOnlyHotCode())
+ {
+ if (ExecutionManager::IsReadyToRunCode(currentNativeCodeVersion.GetNativeCode()))
+ {
+ // We definitely don't want to use unoptimized instrumentation tier for hot R2R:
+ // 1) It will produce a lot of new compilations for small methods which were inlined in R2R
+ // 2) Noticeable performance regression from fast R2R to slow instrumented Tier0
+ nextTier = NativeCodeVersion::OptimizationTier1Instrumented;
+ }
+ else
+ {
+ // For ILOnly it's fine to use unoptimized instrumented tier:
+ // 1) No new compilations since previous tier already triggered them
+ // 2) Better profile since we'll be able to instrument inlinees
+ // 3) Unoptimized instrumented tier is faster to produce and wire up
+ nextTier = NativeCodeVersion::OptimizationTier0Instrumented;
+
+ // NOTE: we might consider using OptimizationTier1Instrumented if the previous Tier0
+ // made it to Tier1-OSR.
+ }
+ }
+ }
+#endif
+
+ ILCodeVersion ilCodeVersion = currentNativeCodeVersion.GetILCodeVersion();
+ _ASSERTE(!ilCodeVersion.HasAnyOptimizedNativeCodeVersion(currentNativeCodeVersion));
+ hr = ilCodeVersion.AddNativeCodeVersion(pMethodDesc, nextTier, &t1NativeCodeVersion);
if (FAILED(hr))
{
ThrowHR(hr);
@@ -992,7 +1039,7 @@ CORJIT_FLAGS TieredCompilationManager::GetJitFlags(PrepareCodeConfig *config)
_ASSERTE(config != nullptr);
_ASSERTE(
!config->WasTieringDisabledBeforeJitting() ||
- config->GetCodeVersion().GetOptimizationTier() != NativeCodeVersion::OptimizationTier0);
+ config->GetCodeVersion().IsFinalTier());
CORJIT_FLAGS flags;
@@ -1015,9 +1062,25 @@ CORJIT_FLAGS TieredCompilationManager::GetJitFlags(PrepareCodeConfig *config)
NativeCodeVersion::OptimizationTier newOptimizationTier;
if (!methodDesc->RequestedAggressiveOptimization())
{
+ NativeCodeVersion::OptimizationTier currentTier = nativeCodeVersion.GetOptimizationTier();
+
if (g_pConfig->TieredCompilation_QuickJit())
{
- _ASSERTE(nativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ if (currentTier == NativeCodeVersion::OptimizationTier::OptimizationTier0Instrumented)
+ {
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
+ return flags;
+ }
+
+ if (currentTier == NativeCodeVersion::OptimizationTier::OptimizationTier1Instrumented)
+ {
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER1);
+ return flags;
+ }
+
+ _ASSERTE(!nativeCodeVersion.IsFinalTier());
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
return flags;
}
@@ -1040,13 +1103,24 @@ CORJIT_FLAGS TieredCompilationManager::GetJitFlags(PrepareCodeConfig *config)
switch (nativeCodeVersion.GetOptimizationTier())
{
+ case NativeCodeVersion::OptimizationTier0Instrumented:
+ _ASSERT(g_pConfig->TieredCompilation_QuickJit());
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
+ break;
+
+ case NativeCodeVersion::OptimizationTier1Instrumented:
+ _ASSERT(g_pConfig->TieredCompilation_QuickJit());
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER1);
+ break;
+
case NativeCodeVersion::OptimizationTier0:
if (g_pConfig->TieredCompilation_QuickJit())
{
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
break;
}
-
nativeCodeVersion.SetOptimizationTier(NativeCodeVersion::OptimizationTierOptimized);
goto Optimized;
diff --git a/src/coreclr/vm/tieredcompilation.h b/src/coreclr/vm/tieredcompilation.h
index 4ad624a3f7f1c2..bf078dbc2979e8 100644
--- a/src/coreclr/vm/tieredcompilation.h
+++ b/src/coreclr/vm/tieredcompilation.h
@@ -42,7 +42,7 @@ class TieredCompilationManager
public:
void HandleCallCountingForFirstCall(MethodDesc* pMethodDesc);
bool TrySetCodeEntryPointAndRecordMethodForCallCounting(MethodDesc* pMethodDesc, PCODE codeEntryPoint);
- void AsyncPromoteToTier1(NativeCodeVersion tier0NativeCodeVersion, bool *createTieringBackgroundWorkerRef);
+ void AsyncPromoteToTier1(NativeCodeVersion currentNativeCodeVersion, bool *createTieringBackgroundWorkerRef);
static CORJIT_FLAGS GetJitFlags(PrepareCodeConfig *config);
#if !defined(DACCESS_COMPILE) && defined(_DEBUG)
diff --git a/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.cs b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.cs
new file mode 100644
index 00000000000000..f3705502e9967c
--- /dev/null
+++ b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.cs
@@ -0,0 +1,33 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+// A smoke test for all DOTNET_TieredPGO strategies
+class Program : IDisposable
+{
+ static int Main()
+ {
+ Program p = new();
+ for (int i = 0; i < 100; i++)
+ {
+ HotLoop(p);
+ Thread.Sleep(40); // cold loop
+ }
+ return 100;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void HotLoop(IDisposable d)
+ {
+ for (int i = 0; i < 100000; i++) // hot loop
+ d?.Dispose();
+ }
+
+ public void Dispose() => Test();
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void Test() { }
+}
\ No newline at end of file
diff --git a/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.csproj b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.csproj
new file mode 100644
index 00000000000000..0d7ec3d6e64703
--- /dev/null
+++ b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.csproj
@@ -0,0 +1,21 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
\ No newline at end of file