diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext-opt.png b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext-opt.png
new file mode 100644
index 00000000000000..c795c7d067b674
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext-opt.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext.png b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext.png
new file mode 100644
index 00000000000000..780227d2a3f483
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-Plaintext.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-ilsize-histogram1.png b/docs/design/features/DynamicPgo-InstrumentedTiers-ilsize-histogram1.png
new file mode 100644
index 00000000000000..9eb74ee70a2414
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-ilsize-histogram1.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers-msft-service.png b/docs/design/features/DynamicPgo-InstrumentedTiers-msft-service.png
new file mode 100644
index 00000000000000..be6e94e8d826f4
Binary files /dev/null and b/docs/design/features/DynamicPgo-InstrumentedTiers-msft-service.png differ
diff --git a/docs/design/features/DynamicPgo-InstrumentedTiers.md b/docs/design/features/DynamicPgo-InstrumentedTiers.md
new file mode 100644
index 00000000000000..b7786f37d6575a
--- /dev/null
+++ b/docs/design/features/DynamicPgo-InstrumentedTiers.md
@@ -0,0 +1,663 @@
+# Instrumented Tiers
+
+[#70941](https://github.com/dotnet/runtime/pull/70941) introduced separate tiers that focus instrumentation on hot code only. This addresses the following problems:
+1) R2R code should still benefit from Dynamic PGO despite not being instrumented in the first place
+2) Instrumentation overhead in Tier0 should not slow down startup
+
+To address these problems the following workflow was introduced:
+
+```mermaid
+flowchart
+ prestub(.NET Function) -->|Compilation| hasAO{"Marked with
[AggressiveOpts]?"}
+ hasAO-->|Yes|tier1ao["JIT to Tier1
(no dynamic profile data)"]
+ hasAO-->|No|hasR2R
+ hasR2R{"Is prejitted (R2R)?"} -->|No| tier000
+
+ tier000["JIT to Tier0
(not optimized, not instrumented,
with patchpoints)"]-->|Running...|ishot555
+ ishot555{"Is hot?
(called >30 times)"}
+ ishot555-.->|No,
keep running...|ishot555
+ ishot555-->|Yes|tier0
+
+ hasR2R -->|Yes| R2R
+ R2R["Use R2R code
(optimized, not instrumented,
no patchpoints)"] -->|Running...|ishot1
+ ishot1{"Is hot?
(called >30 times)"}-.->|No,
keep running...|ishot1
+ ishot1--->|"Yes"|tier1inst
+
+ tier0["JIT to Tier0Instrumented
(not optimized, instrumented,
with patchpoints)"]-->|Running...|ishot5
+ tier1pgo2["JIT to Tier1
(optimized with profile data)"]
+
+ tier1inst["JIT to Tier1Instrumented
(optimized, instrumented,
no patchpoints)"]
+ tier1inst-->|Running...|ishot5
+ ishot5{"Is hot?
(called >30 times)"}-->|Yes|tier1pgo2
+ ishot5-.->|No,
keep running...|ishot5
+```
+(_VS Code doesn't render mermaid diagrams out of the box; consider installing an extension_)
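+
+The `[AggressiveOpts]` branch above refers to methods marked with `MethodImplOptions.AggressiveOptimization`: such methods bypass tiering entirely, so they are never instrumented and never receive a dynamic profile. A minimal example:
+
+```csharp
+using System.Runtime.CompilerServices;
+
+class Hashing
+{
+    // Jitted straight to optimized code on first call: no Tier0, no call
+    // counting, no instrumentation - and therefore no Dynamic PGO data.
+    [MethodImpl(MethodImplOptions.AggressiveOptimization)]
+    public static int Combine(int hash, int value) => (hash * 31) ^ value;
+}
+```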
+
+Now, any code is eligible for Dynamic PGO if it's hot enough. It's easiest to explain with a concrete example:
+
+```csharp
+class Program : IDisposable
+{
+ static int Main()
+ {
+ Program p = new();
+ for (int i = 0; i < 500; i++)
+ {
+ HotLoop(p);
+ Thread.Sleep(40); // cold loop
+ }
+
+ Console.ReadKey();
+ return 100;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void HotLoop(IDisposable d)
+ {
+ for (int i = 0; i < 500000; i++) // hot loop
+ d?.Dispose();
+ }
+
+ public void Dispose() => Test();
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void Test() { }
+}
+```
+
+The method we'll be looking at is `HotLoop`. The method itself contains a hot loop (to show how this feature interacts with OSR), but the whole method is also expected to be promoted to Tier1, since it too is invoked in a loop (the cold loop). The method also makes a virtual call to showcase GDV (guarded devirtualization).
+
+# Case 1: `HotLoop` is prejitted (R2R)
+
+Let's see what happens when the method we're inspecting has an AOT version on start:
+
+1) When we start the app, the VM picks up the R2R'd version of `HotLoop`, which looks like this:
+
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; ReadyToRun compilation
+; optimized code
+; No PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ 33FF xor edi, edi
+ ;; size=2 bbWeight=1 PerfScore 0.25
+G_M43040_IG03: ;; offset=000BH
+ 4885F6 test rsi, rsi
+ 740D je SHORT G_M43040_IG05
+ ;; size=5 bbWeight=4 PerfScore 5.00
+G_M43040_IG04: ;; offset=0010H
+ 488BCE mov rcx, rsi
+ 4C8D1D00000000 lea r11, [(reloc 0x4000000000420270)]
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=13 bbWeight=2 PerfScore 7.50
+G_M43040_IG05: ;; offset=001DH
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CE4 jl SHORT G_M43040_IG03
+ ;; size=10 bbWeight=4 PerfScore 6.00
+G_M43040_IG06: ;; offset=0027H
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=1 PerfScore 2.25
+; Total bytes of code 46
+```
+
+As we can see from the codegen: it's optimized, it's not instrumented (we never instrument R2R'd code - it would increase the binary size by quite a lot), and it has no patchpoints for OSR (since it's already optimized). Technically, it could have been optimized with Static PGO, but that is presumably rare in the real world due to its complexity, so the virtual call here is left non-devirtualized.
+
+2) `HotLoop` is invoked >30 times, meaning it's likely a hot method, so the VM "promotes" it to Tier1Instrumented:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; optimized code
+; instrumented for collecting profile data
+; No PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ FF05F9FE5500 inc dword ptr [(reloc 0x7ffd5edb4948)]
+ 33FF xor edi, edi
+ EB3B jmp SHORT G_M43040_IG05
+ ;; size=10 bbWeight=1 PerfScore 5.25
+G_M43040_IG03: ;; offset=0013H
+ FF05F3FE5500 inc dword ptr [(reloc 0x7ffd5edb494c)]
+ 4885F6 test rsi, rsi
+ 7428 je SHORT G_M43040_IG04
+ FF05ECFE5500 inc dword ptr [(reloc 0x7ffd5edb4950)]
+ 488BCE mov rcx, rsi
+ 48BA5849DB5EFD7F0000 mov rdx, 0x7FFD5EDB4958
+ E81ACB105F call CORINFO_HELP_CLASSPROFILE32
+ 488BCE mov rcx, rsi
+ 49BB5000595EFD7F0000 mov r11, 0x7FFD5E590050 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=51 bbWeight=2 PerfScore 24.50
+G_M43040_IG04: ;; offset=0046H
+ FF0514FF5500 inc dword ptr [(reloc 0x7ffd5edb49a0)]
+ FFC7 inc edi
+ ;; size=8 bbWeight=2 PerfScore 6.50
+G_M43040_IG05: ;; offset=004EH
+ FF0510FF5500 inc dword ptr [(reloc 0x7ffd5edb49a4)]
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CB7 jl SHORT G_M43040_IG03
+ ;; size=14 bbWeight=8 PerfScore 34.00
+G_M43040_IG06: ;; offset=005CH
+ FF0506FF5500 inc dword ptr [(reloc 0x7ffd5edb49a8)]
+ ;; size=6 bbWeight=1 PerfScore 3.00
+G_M43040_IG07: ;; offset=0062H
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=1 PerfScore 2.25
+; Total bytes of code 105
+```
+
+We had to instrument **optimized** code here to mitigate two issues:
+1) We don't want to see a significant (even if temporary) performance degradation after fast R2R code
+2) Unoptimized code tends to spawn a lot of unnecessary new jit compilations because it doesn't inline anything, even simple properties
+
+The downside is that the profile is less accurate and inlinees are not instrumented.
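+
+The listing above contains the two kinds of probes the JIT emits: plain block counters (the `inc dword ptr [(reloc ...)]` instructions) and a class profile probe (`CORINFO_HELP_CLASSPROFILE32`) in front of the virtual call. Conceptually, the class probe behaves like the following sketch (the type names and fixed-size table are illustrative; the real helper records class handles into a per-call-site table that the JIT reads back when building the Tier1 version):
+
+```csharp
+using System;
+using System.Linq;
+
+// Illustrative model of a class-profile probe at one virtual call site.
+sealed class ClassProfile
+{
+    private readonly Type?[] _table = new Type?[8]; // small sample table
+    private int _index;
+
+    // Called on every execution of the probed call site.
+    public void Record(object receiver)
+    {
+        // Cheap, racy sampling is acceptable for profiling purposes.
+        _table[_index] = receiver.GetType();
+        _index = (_index + 1) % _table.Length;
+    }
+
+    // At Tier1 the JIT picks the dominant type (if any) and guards on it (GDV).
+    public Type? DominantType() => _table
+        .Where(t => t != null)
+        .GroupBy(t => t)
+        .OrderByDescending(g => g.Count())
+        .Select(g => g.Key)
+        .FirstOrDefault();
+}
+```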
+
+3) The new code version of `HotLoop` is also invoked >30 times, leading to the final promotion to Tier1:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; optimized code
+; optimized using profile data
+; with Dynamic PGO: edge weights are invalid, and fgCalledCount is 48
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ 33FF xor edi, edi
+ 4885F6 test rsi, rsi
+ 7424 je SHORT G_M43040_IG05
+ 48B9A023C861FD7F0000 mov rcx, 0x7FFD61C823A0 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 7515 jne SHORT G_M43040_IG05
+ ;; size=22 bbWeight=1 PerfScore 5.75
+G_M43040_IG03: ;; offset=001FH
+ 488BCE mov rcx, rsi
+ FF1550771B00 call [Program:Test():this]
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CED jl SHORT G_M43040_IG03
+ ;; size=19 bbWeight=484693.69 PerfScore 2302295.02
+G_M43040_IG04: ;; offset=0032H
+ EB27 jmp SHORT G_M43040_IG07
+ ;; size=2 bbWeight=1 PerfScore 2.00
+G_M43040_IG05: ;; offset=0034H
+ 4885F6 test rsi, rsi
+ 7418 je SHORT G_M43040_IG06
+ 48B9A023C861FD7F0000 mov rcx, 0x7FFD61C823A0 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 751A jne SHORT G_M43040_IG08
+ 488BCE mov rcx, rsi
+ FF1527771B00 call [Program:Test():this]
+ ;; size=29 bbWeight=4895.90 PerfScore 42839.09
+G_M43040_IG06: ;; offset=0051H
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CD9 jl SHORT G_M43040_IG05
+ ;; size=10 bbWeight=4895.90 PerfScore 7343.84
+G_M43040_IG07: ;; offset=005BH
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=0.98 PerfScore 2.20
+G_M43040_IG08: ;; offset=0062H
+ 488BCE mov rcx, rsi
+ 49BB10007E61FD7F0000 mov r11, 0x7FFD617E0010 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ EBDD jmp SHORT G_M43040_IG06
+; Total bytes of code 116
+```
+The codegen looks a bit bulky, but if we look closer we'll see that the loop was cloned to get a fast version with a devirtualized call inside (see `G_M43040_IG03`) and with the type guard hoisted out of the loop; a C# sketch of the equivalent logic follows the diagram below. To summarize what happened with `HotLoop`, we can take a look at this part of the diagram:
+```mermaid
+flowchart
+ hasR2R("...") -->|Yes| R2R
+ R2R["Use R2R code
(optimized, not instrumented,
no patchpoints)"] -->|Running...|ishot1
+ ishot1{"Is hot?
(called >30 times)"}-.->|No,
keep running...|ishot1
+ ishot1--->|"Yes"|tier1inst
+ tier1pgo2["JIT to Tier1
(optimized with profile data)"]
+ tier1inst["JIT to Tier1Instrumented
(optimized, instrumented,
no patchpoints)"]
+ tier1inst-->|Running...|ishot5
+ ishot5{"Is hot?
(called >30 times)"}-->|Yes|tier1pgo2
+ ishot5-.->|No,
keep running...|ishot5
+```
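+
+For intuition, the cloned code behaves roughly like the C# below (a sketch of the shape, not the JIT's exact output; it reuses the `Program` type from the sample at the top of this document):
+
+```csharp
+static void HotLoop_Tier1(IDisposable d)
+{
+    if (d != null && d.GetType() == typeof(Program))
+    {
+        Program p = (Program)d;
+        // Fast cloned loop: the guard was checked once, outside the loop,
+        // so each iteration makes a direct call (further inlined into Test()).
+        for (int i = 0; i < 500000; i++)
+            p.Dispose();
+    }
+    else
+    {
+        // Fallback loop: the guard is re-checked on every iteration.
+        for (int i = 0; i < 500000; i++)
+        {
+            if (d != null && d.GetType() == typeof(Program))
+                ((Program)d).Dispose();
+            else
+                d?.Dispose();
+        }
+    }
+}
+```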
+
+
+# Case 2: `HotLoop` is not initially prejitted
+
+This case is a bit more complicated since it involves OSR.
+
+1) Since no R2R version of `HotLoop` exists, the VM has to ask the JIT to compile a Tier0 version of it as fast as it can:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-0 compilation
+; MinOpts code
+G_M43040_IG01: ;; offset=0000H
+ 55 push rbp
+ 4883EC70 sub rsp, 112
+ 488D6C2470 lea rbp, [rsp+70H]
+ 33C0 xor eax, eax
+ 8945C4 mov dword ptr [rbp-3CH], eax
+ 48894D10 mov gword ptr [rbp+10H], rcx
+ ;; size=19 bbWeight=1 PerfScore 4.00
+G_M43040_IG02: ;; offset=0013H
+ 33C9 xor ecx, ecx
+ 894DC4 mov dword ptr [rbp-3CH], ecx
+ C745B8E8030000 mov dword ptr [rbp-48H], 0x3E8
+ EB20 jmp SHORT G_M43040_IG05
+ ;; size=14 bbWeight=1 PerfScore 4.25
+G_M43040_IG03: ;; offset=0021H
+ 48837D1000 cmp gword ptr [rbp+10H], 0
+ 7411 je SHORT G_M43040_IG04
+ 488B4D10 mov rcx, gword ptr [rbp+10H]
+ 49BB90027E61FD7F0000 mov r11, 0x7FFD617E0290 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=24 bbWeight=1 PerfScore 7.25
+G_M43040_IG04: ;; offset=0039H
+ 8B45C4 mov eax, dword ptr [rbp-3CH]
+ FFC0 inc eax
+ 8945C4 mov dword ptr [rbp-3CH], eax
+ ;; size=8 bbWeight=1 PerfScore 2.25
+G_M43040_IG05: ;; offset=0041H
+ 8B4DB8 mov ecx, dword ptr [rbp-48H]
+ FFC9 dec ecx
+ 894DB8 mov dword ptr [rbp-48H], ecx
+ 837DB800 cmp dword ptr [rbp-48H], 0
+ 7F0E jg SHORT G_M43040_IG07
+ ;; size=14 bbWeight=1 PerfScore 5.25
+G_M43040_IG06: ;; offset=004FH
+ 488D4DB8 lea rcx, [rbp-48H]
+ BA11000000 mov edx, 17
+ E8338F045F call CORINFO_HELP_PATCHPOINT
+ ;; size=14 bbWeight=0.01 PerfScore 0.02
+G_M43040_IG07: ;; offset=005DH
+ 817DC420A10700 cmp dword ptr [rbp-3CH], 0x7A120
+ 7CBB jl SHORT G_M43040_IG03
+ ;; size=9 bbWeight=1 PerfScore 3.00
+G_M43040_IG08: ;; offset=0066H
+ 4883C470 add rsp, 112
+ 5D pop rbp
+ C3 ret
+ ;; size=6 bbWeight=1 PerfScore 1.75
+; Total bytes of code 108
+```
+
+The codegen is unoptimized, has patchpoints for OSR, and carries no instrumentation (to avoid spending time on methods that will never make it to Tier1 - as practice shows, only 10-20% of methods do).
+
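+The counter-and-helper pattern in `G_M43040_IG05`/`G_M43040_IG06` above is the patchpoint mechanism. Logically, it boils down to the following sketch (the start value 0x3E8 = 1000 comes from `DOTNET_TC_OnStackReplacement_InitialCounter`; `OnPatchpointHit` is an illustrative stand-in for `CORINFO_HELP_PATCHPOINT`):
+
+```csharp
+static void HotLoop_Tier0(System.IDisposable d)
+{
+    int patchpointCounter = 1000; // DOTNET_TC_OnStackReplacement_InitialCounter
+    for (int i = 0; i < 500000; i++)
+    {
+        if (--patchpointCounter <= 0)
+        {
+            // The helper counts hits; once the patchpoint is hot enough it
+            // jits an OSR continuation and transfers control into it mid-loop.
+            OnPatchpointHit(ref patchpointCounter);
+        }
+        d?.Dispose();
+    }
+}
+
+// Illustrative stand-in; the real helper may never return to the Tier0 loop.
+static void OnPatchpointHit(ref int counter) => counter = 1000;
+```
+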
+2) Its loop body triggers OSR after `DOTNET_TC_OnStackReplacement_InitialCounter` iterations (see `jitconfigvalues.h`):
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; OSR variant for entry point 0x11
+; optimized code
+; No PGO data
+G_M43040_IG01: ;; offset=0000H
+ 4883EC38 sub rsp, 56
+ 4889BC24A8000000 mov qword ptr [rsp+A8H], rdi
+ 4889B424A0000000 mov qword ptr [rsp+A0H], rsi
+ 488BB424C0000000 mov rsi, gword ptr [rsp+C0H]
+ 8B7C2474 mov edi, dword ptr [rsp+74H]
+ ;; size=32 bbWeight=1 PerfScore 6.25
+G_M43040_IG02: ;; offset=0020H
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D1F jge SHORT G_M43040_IG06
+ ;; size=8 bbWeight=1 PerfScore 1.25
+G_M43040_IG03: ;; offset=0028H
+ 4885F6 test rsi, rsi
+ 7410 je SHORT G_M43040_IG05
+ ;; size=5 bbWeight=4 PerfScore 5.00
+G_M43040_IG04: ;; offset=002DH
+ 488BCE mov rcx, rsi
+ 49BB98027E61FD7F0000 mov r11, 0x7FFD617E0298 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=16 bbWeight=2 PerfScore 7.00
+G_M43040_IG05: ;; offset=003DH
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CE1 jl SHORT G_M43040_IG03
+ ;; size=10 bbWeight=4 PerfScore 6.00
+G_M43040_IG06: ;; offset=0047H
+ 4881C4A0000000 add rsp, 160
+ 5E pop rsi
+ 5F pop rdi
+ 5D pop rbp
+ C3 ret
+ ;; size=11 bbWeight=1 PerfScore 2.75
+; Total bytes of code 82
+```
+
+Now the loop is faster thanks to optimizations, but it's still neither instrumented nor devirtualized. In theory, we could start instrumenting at least the loop body at this stage, but it's left as is for now; see the notes below.
+
+3) `HotLoop` itself is invoked >30 times, which triggers promotion to Tier0Instrumented:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-0 compilation
+; MinOpts code
+; instrumented for collecting profile data
+G_M43040_IG01: ;; offset=0000H
+ 55 push rbp
+ 4881EC80000000 sub rsp, 128
+ 488DAC2480000000 lea rbp, [rsp+80H]
+ 33C0 xor eax, eax
+ 488945A8 mov qword ptr [rbp-58H], rax
+ C5D857E4 vxorps xmm4, xmm4
+ C5F97F65B0 vmovdqa xmmword ptr [rbp-50H], xmm4
+ 488945C0 mov qword ptr [rbp-40H], rax
+ 48894D10 mov gword ptr [rbp+10H], rcx
+ ;; size=39 bbWeight=1 PerfScore 7.33
+G_M43040_IG02: ;; offset=0027H
+ FF05A3846000 inc dword ptr [(reloc 0x7ffd6214fc10)]
+ 33C9 xor ecx, ecx
+ 894DC4 mov dword ptr [rbp-3CH], ecx
+ C745B8E8030000 mov dword ptr [rbp-48H], 0x3E8
+ EB55 jmp SHORT G_M43040_IG05
+ ;; size=20 bbWeight=1 PerfScore 7.25
+G_M43040_IG03: ;; offset=003BH
+ FF0593846000 inc dword ptr [(reloc 0x7ffd6214fc14)]
+ 48837D1000 cmp gword ptr [rbp+10H], 0
+ 743A je SHORT G_M43040_IG04
+ FF058A846000 inc dword ptr [(reloc 0x7ffd6214fc18)]
+ 488B4D10 mov rcx, gword ptr [rbp+10H]
+ 48894DB0 mov gword ptr [rbp-50H], rcx
+ 488B4DB0 mov rcx, gword ptr [rbp-50H]
+ 48BA20FC1462FD7F0000 mov rdx, 0x7FFD6214FC20
+ E8E79D045F call CORINFO_HELP_CLASSPROFILE32
+ 488B4DB0 mov rcx, gword ptr [rbp-50H]
+ 48894DA8 mov gword ptr [rbp-58H], rcx
+ 488B4DA8 mov rcx, gword ptr [rbp-58H]
+ 49BBA0027E61FD7F0000 mov r11, 0x7FFD617E02A0 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ ;; size=71 bbWeight=1 PerfScore 19.50
+G_M43040_IG04: ;; offset=0082H
+ FF05A0846000 inc dword ptr [(reloc 0x7ffd6214fc68)]
+ 8B45C4 mov eax, dword ptr [rbp-3CH]
+ FFC0 inc eax
+ 8945C4 mov dword ptr [rbp-3CH], eax
+ ;; size=14 bbWeight=1 PerfScore 5.25
+G_M43040_IG05: ;; offset=0090H
+ 8B4DB8 mov ecx, dword ptr [rbp-48H]
+ FFC9 dec ecx
+ 894DB8 mov dword ptr [rbp-48H], ecx
+ 837DB800 cmp dword ptr [rbp-48H], 0
+ 7F0E jg SHORT G_M43040_IG07
+ ;; size=14 bbWeight=1 PerfScore 5.25
+G_M43040_IG06: ;; offset=009EH
+ 488D4DB8 lea rcx, [rbp-48H]
+ BA11000000 mov edx, 17
+ E8248C045F call CORINFO_HELP_PATCHPOINT
+ ;; size=14 bbWeight=0.01 PerfScore 0.02
+G_M43040_IG07: ;; offset=00ACH
+ FF057A846000 inc dword ptr [(reloc 0x7ffd6214fc6c)]
+ 817DC420A10700 cmp dword ptr [rbp-3CH], 0x7A120
+ 7C80 jl SHORT G_M43040_IG03
+ FF056F846000 inc dword ptr [(reloc 0x7ffd6214fc70)]
+ ;; size=21 bbWeight=1 PerfScore 9.00
+G_M43040_IG08: ;; offset=00C1H
+ 4881C480000000 add rsp, 128
+ 5D pop rbp
+ C3 ret
+ ;; size=9 bbWeight=1 PerfScore 1.75
+; Total bytes of code 202
+```
+Now the whole method is compiled to Tier0 with instrumentation and patchpoints, and with no optimizations.
+We decided to promote hot Tier0 code to Tier0Instrumented without optimizations for the following reasons:
+1) Going from Tier0 to Tier0Instrumented doesn't cause a noticeable performance regression
+2) Tier0Instrumented is faster to compile
+3) Its profile is more accurate
+
+In this specific case we could consider using Tier1Instrumented instead, since the previous code version already had a faster loop thanks to Tier1-OSR. But OSR events are rare, and we don't want to produce a less accurate profile than we had before https://github.com/dotnet/runtime/pull/70941, so it's left as is. We might reconsider this once instrumentation of optimized code produces a more accurate profile that includes inlinees.
+
+4) The loop of `HotLoop` triggered OSR once again:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; OSR variant for entry point 0x11
+; optimized code
+; optimized using profile data
+; with Dynamic PGO: edge weights are invalid, and fgCalledCount is 9999
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
+G_M43040_IG01: ;; offset=0000H
+ 4883EC38 sub rsp, 56
+ 4889BC24B8000000 mov qword ptr [rsp+B8H], rdi
+ 4889B424B0000000 mov qword ptr [rsp+B0H], rsi
+ 488BB424D0000000 mov rsi, gword ptr [rsp+D0H]
+ 8BBC2484000000 mov edi, dword ptr [rsp+84H]
+ ;; size=35 bbWeight=1 PerfScore 6.25
+G_M43040_IG02: ;; offset=0023H
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D50 jge SHORT G_M43040_IG06
+ 4885F6 test rsi, rsi
+ 7424 je SHORT G_M43040_IG04
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 7515 jne SHORT G_M43040_IG04
+ ;; size=28 bbWeight=1 PerfScore 6.75
+G_M43040_IG03: ;; offset=003FH
+ 488BCE mov rcx, rsi
+ FF15605A1500 call [Program:Test():this]
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D29 jge SHORT G_M43040_IG06
+ EBEB jmp SHORT G_M43040_IG03
+ ;; size=21 bbWeight=0.99 PerfScore 6.68
+G_M43040_IG04: ;; offset=0054H
+ 4885F6 test rsi, rsi
+ 7418 je SHORT G_M43040_IG05
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 751E jne SHORT G_M43040_IG07
+ 488BCE mov rcx, rsi
+ FF15375A1500 call [Program:Test():this]
+ ;; size=29 bbWeight=0.01 PerfScore 0.09
+G_M43040_IG05: ;; offset=0071H
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CD9 jl SHORT G_M43040_IG04
+ ;; size=10 bbWeight=0.01 PerfScore 0.02
+G_M43040_IG06: ;; offset=007BH
+ 4881C4B0000000 add rsp, 176
+ 5E pop rsi
+ 5F pop rdi
+ 5D pop rbp
+ C3 ret
+ ;; size=11 bbWeight=0 PerfScore 0.00
+G_M43040_IG07: ;; offset=0086H
+ 488BCE mov rcx, rsi
+ 49BBA8027E61FD7F0000 mov r11, 0x7FFD617E02A8 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ EBD9 jmp SHORT G_M43040_IG05
+ ;; size=18 bbWeight=0 PerfScore 0.00
+; Total bytes of code 152
+```
+We ended up with a very fast version of the method whose optimized loop `G_M43040_IG03` makes a devirtualized call on each iteration without any guards. The parts of the method outside the loop still run the unoptimized Tier0 codegen.
+
+5) The `HotLoop` method is invoked 30 more times and triggers the final promotion to the last tier:
+```asm
+; Assembly listing for method Program:HotLoop(System.IDisposable)
+; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
+; Tier-1 compilation
+; optimized code
+; optimized using profile data
+; with Dynamic PGO: edge weights are invalid, and fgCalledCount is 48
+; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
+G_M43040_IG01: ;; offset=0000H
+ 57 push rdi
+ 56 push rsi
+ 4883EC28 sub rsp, 40
+ 488BF1 mov rsi, rcx
+ ;; size=9 bbWeight=1 PerfScore 2.50
+G_M43040_IG02: ;; offset=0009H
+ 33FF xor edi, edi
+ 4885F6 test rsi, rsi
+ 7424 je SHORT G_M43040_IG04
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 7515 jne SHORT G_M43040_IG04
+ ;; size=22 bbWeight=1 PerfScore 5.75
+G_M43040_IG03: ;; offset=001FH
+ 488BCE mov rcx, rsi
+ FF15C0591500 call [Program:Test():this]
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7D29 jge SHORT G_M43040_IG06
+ EBEB jmp SHORT G_M43040_IG03
+ ;; size=21 bbWeight=1158.09 PerfScore 7817.13
+G_M43040_IG04: ;; offset=0034H
+ 4885F6 test rsi, rsi
+ 7418 je SHORT G_M43040_IG05
+ 48B9C86CCC61FD7F0000 mov rcx, 0x7FFD61CC6CC8 ; Program
+ 48390E cmp qword ptr [rsi], rcx
+ 751A jne SHORT G_M43040_IG07
+ 488BCE mov rcx, rsi
+ FF1597591500 call [Program:Test():this]
+ ;; size=29 bbWeight=11.70 PerfScore 102.36
+G_M43040_IG05: ;; offset=0051H
+ FFC7 inc edi
+ 81FF20A10700 cmp edi, 0x7A120
+ 7CD9 jl SHORT G_M43040_IG04
+ ;; size=10 bbWeight=11.70 PerfScore 17.55
+G_M43040_IG06: ;; offset=005BH
+ 4883C428 add rsp, 40
+ 5E pop rsi
+ 5F pop rdi
+ C3 ret
+ ;; size=7 bbWeight=0 PerfScore 0.00
+G_M43040_IG07: ;; offset=0062H
+ 488BCE mov rcx, rsi
+ 49BBB0027E61FD7F0000 mov r11, 0x7FFD617E02B0 ; code for System.IDisposable:Dispose
+ 41FF13 call [r11]System.IDisposable:Dispose():this
+ EBDD jmp SHORT G_M43040_IG05
+ ;; size=18 bbWeight=0 PerfScore 0.00
+; Total bytes of code 116
+```
+Again, to summarize the workflow for the non-prejitted case, let's take a look at this branch of the diagram (OSR details are omitted to showcase the most common path):
+
+```mermaid
+flowchart
+ hasR2R("...") -->tier000
+ tier000["JIT to Tier0
(not optimized, not instrumented,
with patchpoints)"]-->|Running...|ishot555
+ ishot555{"Is hot?
(called >30 times)"}
+ ishot555-.->|No,
keep running...|ishot555
+ ishot555-->|Yes|tier0
+ tier0["JIT to Tier0Instrumented
(not optimized, instrumented,
with patchpoints)"]-->|Running...|ishot5
+ tier1pgo2["JIT to Tier1
(optimized with profile data)"]
+ ishot5{"Is hot?
(called >30 times)"}-->|Yes|tier1pgo2
+ ishot5-.->|No,
keep running...|ishot5
+```
+
+It's worth noting that we analyzed the worst case (in terms of working set) involving OSR; normally (in 99.8% of cases) we end up with only three code versions for hot code:
+1) Tier0/R2R
+2) Instrumented Tier (with or without optimizations)
+3) Tier1 optimized with profile
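+
+In terms of the runtime's `NativeCodeVersion::OptimizationTier` values (extended in `codeversion.h` by this change), the full set of tiers now looks like this - mirrored here as C# purely for illustration, with informal comments:
+
+```csharp
+enum OptimizationTier
+{
+    OptimizationTier0,             // quick jitted, no probes
+    OptimizationTier1,             // final optimized tier
+    OptimizationTier1OSR,          // optimized on-stack-replacement variant
+    OptimizationTierOptimized,     // optimized, but may do less than Tier1
+    OptimizationTier0Instrumented, // new: unoptimized + profile probes
+    OptimizationTier1Instrumented, // new: optimized + profile probes
+}
+```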
+
+# Working Set Impact
+
+The general rule of thumb is that only 10-20% of methods make it to Tier1, and about 40-60% of all methods are less than 8 bytes of IL (e.g., getters/setters), so with this approach we effectively double the size of Tier1 code (including call counting stubs, etc.). How bad is that compared to the overall working set of various apps? Let's consider these two examples:
+
+## 1) A large web app (internal Microsoft service)
+
+| Metric | Number of methods | Share, % | Total size, MB | Share, % |
+|------------------|-------------------|----------|----------------|----------|
+| **Tier0** | 115862 | 59.36% | 60.06 | 83.89% |
+| **Tier1** | 30942 | 15.85% | 8.22 | 11.48% |
+| **FullOpts** | 48384 | 24.79% | 3.26 | 4.55% |
+| **Contains OSR** | 55 | 0.03% | 0.06 | 0.08% |
+| **Total jitted** | 195188 | 100.00% | 71.60 | 100.00% |
+
+![](DynamicPgo-InstrumentedTiers-ilsize-histogram1.png)
+
+In this app, Tier1 code occupies 8.22 MB in the loader heap (plus a few more megabytes for call counting stubs, jump-stubs, etc.), meaning that the instrumented tier is expected to add a similar amount (~13 MB). The total working set of the service is 10 GB, so instrumented tiers contribute ~0.1% of it. We're also adding ~30k new jit compilations, which we can fully compensate for with the https://github.com/dotnet/runtime/issues/76402 work, avoiding potential problems with overly long queues of methods pending call counting installation or promotion to Tier1.
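+
+As a back-of-envelope version of that estimate (the stub overhead value here is an assumption for illustration):
+
+```csharp
+using System;
+
+double tier1CodeMb  = 8.22;        // Tier1 code in the loader heap (table above)
+double stubsMb      = 4.5;         // assumed: call counting stubs, jump-stubs, etc.
+double instrTierMb  = tier1CodeMb + stubsMb;             // ~12.7 MB added
+double workingSetMb = 10 * 1024.0;                       // 10 GB total working set
+double sharePct     = instrTierMb / workingSetMb * 100;  // ~0.12%
+Console.WriteLine($"{instrTierMb:F1} MB ~= {sharePct:F2}% of working set");
+```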
+
+## 2) A desktop OSS application [AvaloniaILSpy](https://github.com/icsharpcode/AvaloniaILSpy)
+
+`ReadyToRun=0`:
+
+| Metric | Number of methods | % | Total size, MB | % |
+|------------------|-------------------|----------|----------------|----------|
+| **Tier0** | 19968 | 79.09% | 4.58 | 84.69% |
+| **Tier1** | 4978 | 19.72% | **0.75** | 13.90% |
+| **FullOpts** | 300 | 1.19% | 0.08 | 1.39% |
+| **OSR** | 2 | 0.01% | 0.00 | 0.02% |
+| | | | | |
+| **Total** | 25248 | 100.00% | 5.41 | 100.00% |
+
+`ReadyToRun=1`:
+
+| Metric | Number of methods | % | Total size, MB | % |
+|------------------|-------------------|----------|----------------|----------|
+| **Tier0** | 4713 | 62.45% | 0.84 | 58.34% |
+| **Tier1** | 2516 | 33.34% | 0.56 | 38.75% |
+| **FullOpts** | 318 | 4.21% | 0.04 | 2.92% |
+| **OSR** | 0 | 0.00% | 0.00 | 0.00% |
+| | | | | |
+| **Total** | 7547 | 100.00% | 1.44 | 100.00% |
+
+In the case of AvaloniaILSpy, instrumented tiers add around 1 MB (stubs included) to the total working set and around 5k new jit compilations.
+
+# Start Time and Performance Impact
+
+## TechEmpower
+
+Overall, instrumented tiers are expected to improve startup speed when Dynamic PGO is enabled and to improve steady-state performance (e.g. latency/throughput) for prejitted code. A good example demonstrating both is the following TechEmpower benchmark (plaintext-plaintext):
+
+![](DynamicPgo-InstrumentedTiers-Plaintext.png)
+
+Legend:
+* Red - `DOTNET_TieredPGO=0`, `DOTNET_ReadyToRun=1`
+* Black - `DOTNET_TieredPGO=1`, `DOTNET_ReadyToRun=1`
+* Yellow - `DOTNET_TieredPGO=1`, `DOTNET_ReadyToRun=0`
+
+The yellow line provides the highest level of performance (RPS) by sacrificing startup speed (and, hence, the time it takes to process the first request). This happens because the benchmark is quite simple and most of its code is prejitted, so we can only instrument it if we completely drop R2R and compile everything from scratch. It also explains why the black line (Dynamic PGO enabled while still relying on R2R) didn't show much improvement. With the separate instrumented tiers for hot R2R code we achieve the "yellow" level of performance while maintaining the same startup speed as before. Also, for the mode where we have to compile a lot of code to Tier0, switching to the "instrument only hot Tier0 code" strategy shows a ~8% time-to-first-request reduction across all TE benchmarks.
+
+![](DynamicPgo-InstrumentedTiers-Plaintext-opt.png)
+(_Predicted results according to local runs_)
+
+## AvaloniaILSpy
+
+For this experiment we modified the app's source code to emit an event once the view is completely loaded, in order to measure the real start time:
+
+| Mode | Start time |
+|----------------------------|------------|
+| R2R=0 | 2.03s |
+| R2R=0, PGO=1 | 2.26s |
+| R2R=0, PGO=1, Instr. Tiers | 2.03s |
+
+As we can see, instrumented tiers help mitigate the start-time regression from Dynamic PGO.
+
+## Microsoft internal service
+
+Throughput of the service after startup:
+
+![](DynamicPgo-InstrumentedTiers-msft-service.png)
+
+X axis - time in seconds after start; Y axis - throughput in MB/s.
+
+Here, Dynamic PGO without instrumented tiers (red line) can't show any benefit because the service is prejitted, and prejitted code doesn't benefit from Dynamic PGO. Instrumented tiers address that by instrumenting hot R2R code to achieve the best performance, hence the higher throughput (green line).
\ No newline at end of file
diff --git a/docs/design/features/DynamicPgo.md b/docs/design/features/DynamicPgo.md
index 464e87950cbece..2bd227d9618b8e 100644
--- a/docs/design/features/DynamicPgo.md
+++ b/docs/design/features/DynamicPgo.md
@@ -257,9 +257,9 @@ If we confidently could identify the top N% of methods (say 5%) then one could i
R2R methods bypass Tier0 and so don't get instrumentation in the current TieredPGO prototype. We probably don't want to instrument the code in the R2R image. And many of these R2R methods are key framework methods that are important for performance. So we need to find a way to get data for these methods.
There are a few basic ideas:
-* Leverage IBC. If there is IBC data in the R2R image then we can make that data available to the JIT. It may not be as relevant as in-process collected data, but it's quite likely better than synthetic data or no data.
-* Sampled instrumentation for R2R methods. Produce an instrumented version and run it every so often before the method gets promoted to Tier1. This may be costly, especially if we have to use unoptimized methods for instrumentation, as we'll do quite a bit of extra jitting.
-* Make R2R methods go through Tier0 on their way to Tier1. Likely introduces an unacceptable perf hit.
+1) Leverage IBC. If there is IBC data in the R2R image then we can make that data available to the JIT. It may not be as relevant as in-process collected data, but it's quite likely better than synthetic data or no data.
+2) Sampled instrumentation for R2R methods. Produce an instrumented version and run it every so often before the method gets promoted to Tier1. This may be costly, especially if we have to use unoptimized methods for instrumentation, as we'll do quite a bit of extra jitting.
+3) Make R2R methods go through a separate instrumentation tier on their way to Tier1, see [DynamicPgo-InstrumentedTiers.md](DynamicPgo-InstrumentedTiers.md) prototype.
#### Dynamic PGO, QuickJitForLoops, OSR
diff --git a/src/coreclr/debug/daccess/request.cpp b/src/coreclr/debug/daccess/request.cpp
index 88c1c03d695f9d..7f48fc6819c82a 100644
--- a/src/coreclr/debug/daccess/request.cpp
+++ b/src/coreclr/debug/daccess/request.cpp
@@ -1112,6 +1112,12 @@ HRESULT ClrDataAccess::GetTieredVersions(
case NativeCodeVersion::OptimizationTierOptimized:
nativeCodeAddrs[count].OptimizationTier = DacpTieredVersionData::OptimizationTier_Optimized;
break;
+ case NativeCodeVersion::OptimizationTier0Instrumented:
+ nativeCodeAddrs[count].OptimizationTier = DacpTieredVersionData::OptimizationTier_QuickJittedInstrumented;
+ break;
+ case NativeCodeVersion::OptimizationTier1Instrumented:
+ nativeCodeAddrs[count].OptimizationTier = DacpTieredVersionData::OptimizationTier_OptimizedTier1Instrumented;
+ break;
}
}
else if (pMD->IsJitOptimizationDisabled())
diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h
index 81cd74e76250a0..02c0d8487cfd4a 100644
--- a/src/coreclr/inc/clrconfigvalues.h
+++ b/src/coreclr/inc/clrconfigvalues.h
@@ -606,6 +606,12 @@ RETAIL_CONFIG_STRING_INFO(INTERNAL_PGODataPath, W("PGODataPath"), "Read/Write PG
RETAIL_CONFIG_DWORD_INFO(INTERNAL_ReadPGOData, W("ReadPGOData"), 0, "Read PGO data")
RETAIL_CONFIG_DWORD_INFO(INTERNAL_WritePGOData, W("WritePGOData"), 0, "Write PGO data")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TieredPGO, W("TieredPGO"), 0, "Instrument Tier0 code and make counts available to Tier1")
+
+// TieredPGO_InstrumentOnlyHotCode values:
+//
+// 0) Instrument all IL-only code, R2R'd code is never instrumented
+// 1) Instrument only hot IL-only and hot R2R code (use optimizations in the instrumented tier for hot R2R and no optimizations for hot IL-only)
+RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TieredPGO_InstrumentOnlyHotCode, W("TieredPGO_InstrumentOnlyHotCode"), 1, "Strategy for TieredPGO, see comments in clrconfigvalues.h")
#endif
///
diff --git a/src/coreclr/inc/dacprivate.h b/src/coreclr/inc/dacprivate.h
index 1e345810445c8a..5d920fd0da4905 100644
--- a/src/coreclr/inc/dacprivate.h
+++ b/src/coreclr/inc/dacprivate.h
@@ -610,6 +610,8 @@ struct MSLAYOUT DacpTieredVersionData
OptimizationTier_OptimizedTier1,
OptimizationTier_ReadyToRun,
OptimizationTier_OptimizedTier1OSR,
+ OptimizationTier_QuickJittedInstrumented,
+ OptimizationTier_OptimizedTier1Instrumented,
};
CLRDATA_ADDRESS NativeCodeAddr;
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 9184f102ce7e4e..1ad5cb944ad8f4 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -4101,13 +4101,13 @@ const char* Compiler::compGetTieringName(bool wantShortName) const
}
else if (tier1)
{
- if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_OSR))
+ if (opts.IsOSR())
{
return instrumenting ? "Instrumented Tier1-OSR" : "Tier1-OSR";
}
else
{
- return "Tier1";
+ return instrumenting ? "Instrumented Tier1" : "Tier1";
}
}
else if (opts.OptimizationEnabled())
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 05dfb0bfe30111..2564aee7644166 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9181,6 +9181,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
}
#endif
+ bool IsInstrumented() const
+ {
+ return jitFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR);
+ }
+
+ bool IsInstrumentedOptimized() const
+ {
+ return IsInstrumented() && jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1);
+ }
+
// true if we should use the PINVOKE_{BEGIN,END} helpers instead of generating
// PInvoke transitions inline. Normally used by R2R, but also used when generating a reverse pinvoke frame, as
// the current logic for frame setup initializes and pushes
diff --git a/src/coreclr/jit/fginline.cpp b/src/coreclr/jit/fginline.cpp
index 55443f8d0a2b70..e43c9ac980e8d8 100644
--- a/src/coreclr/jit/fginline.cpp
+++ b/src/coreclr/jit/fginline.cpp
@@ -561,7 +561,8 @@ class SubstitutePlaceholdersAndDevirtualizeWalker : public GenTreeVisitor<SubstitutePlaceholdersAndDevirtualizeWalker>
JITDUMP(" ... found foldable jtrue at [%06u] in " FMT_BB "\n", m_compiler->dspTreeID(tree),
block->bbNum);
- noway_assert((block->bbNext->countOfInEdges() > 0) && (block->bbJumpDest->countOfInEdges() > 0));
+
+ noway_assert(!m_compiler->fgComputePredsDone);
// We have a constant operand, and should have the all clear to optimize.
// Update side effects on the tree, assert there aren't any, and bash to nop.
@@ -570,36 +571,20 @@ class SubstitutePlaceholdersAndDevirtualizeWalker : public GenTreeVisitor<SubstitutePlaceholdersAndDevirtualizeWalker>
tree->gtBashToNOP();
m_madeChanges = true;
- BasicBlock* bNotTaken = nullptr;
-
- if (condTree->AsIntCon()->gtIconVal != 0)
+ if (!condTree->IsIntegralConst(0))
{
block->bbJumpKind = BBJ_ALWAYS;
- bNotTaken = block->bbNext;
}
else
{
block->bbJumpKind = BBJ_NONE;
- bNotTaken = block->bbJumpDest;
- }
-
- m_compiler->fgRemoveRefPred(bNotTaken, block);
-
- // If that was the last ref, a subsequent flow-opt pass
- // will clean up the now-unreachable bNotTaken, and any
- // other transitively unreachable blocks.
- if (bNotTaken->bbRefs == 0)
- {
- JITDUMP("... it looks like " FMT_BB " is now unreachable!\n", bNotTaken->bbNum);
}
}
}
else
{
- const var_types retType = tree->TypeGet();
- GenTree* foldedTree = m_compiler->gtFoldExpr(tree);
- *pTree = foldedTree;
- m_madeChanges = true;
+ *pTree = m_compiler->gtFoldExpr(tree);
+ m_madeChanges = true;
}
}
};
diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp
index a0f82765b6b40c..5c2b9fa3b96901 100644
--- a/src/coreclr/jit/fgprofile.cpp
+++ b/src/coreclr/jit/fgprofile.cpp
@@ -383,7 +383,7 @@ void BlockCountInstrumentor::Prepare(bool preImport)
//
// If we see any, we need to adjust our instrumentation pattern.
//
- if (m_comp->opts.IsOSR() && ((m_comp->optMethodFlags & OMF_HAS_TAILCALL_SUCCESSOR) != 0))
+ if (m_comp->opts.IsInstrumentedOptimized() && ((m_comp->optMethodFlags & OMF_HAS_TAILCALL_SUCCESSOR) != 0))
{
JITDUMP("OSR + PGO + potential tail call --- preparing to relocate block probes\n");
@@ -1887,8 +1887,11 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
(JitConfig.TC_PartialCompilation() > 0);
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && mayHavePatchpoints;
- const bool osrMethod = opts.IsOSR();
- const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod;
+ const bool isOptimized = opts.IsInstrumentedOptimized();
+ const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !isOptimized;
+
+ // TODO-TP: Don't give up on edge profiling for optimized code, currently it has issues
+ // such as unexpected trees near tail calls
if (useEdgeProfiles)
{
@@ -1899,7 +1902,7 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
JITDUMP("Using block profiling, because %s\n",
(JitConfig.JitEdgeProfiling() == 0)
? "edge profiles disabled"
- : prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints");
+ : prejit ? "prejitting" : isOptimized ? "tier1 instrumented" : "tier0 with patchpoints");
fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
}
diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp
index bfd042fd102e83..5c90d1efc81520 100644
--- a/src/coreclr/jit/importercalls.cpp
+++ b/src/coreclr/jit/importercalls.cpp
@@ -1288,7 +1288,7 @@ var_types Compiler::impImportCall(OPCODE opcode,
// have to check for anything that might introduce a recursive tail call.
// * We only instrument root method blocks in OSR methods,
//
- if (opts.IsOSR() && !compIsForInlining())
+ if ((opts.IsInstrumentedOptimized() || opts.IsOSR()) && !compIsForInlining())
{
// If a root method tail call candidate block is not a BBJ_RETURN, it should have a unique
// BBJ_RETURN successor. Mark that successor so we can handle it specially during profile
@@ -1312,9 +1312,9 @@ var_types Compiler::impImportCall(OPCODE opcode,
// Only schedule importation if we're not currently importing.
//
- if (mustImportEntryBlock && (compCurBB != fgEntryBB))
+ if (opts.IsOSR() && mustImportEntryBlock && (compCurBB != fgEntryBB))
{
- JITDUMP("\nOSR: inlineable or recursive tail call [%06u] in the method, so scheduling " FMT_BB
+ JITDUMP("\ninlineable or recursive tail call [%06u] in the method, so scheduling " FMT_BB
" for importation\n",
dspTreeID(call), fgEntryBB->bbNum);
impImportBlockPending(fgEntryBB);
@@ -6290,7 +6290,7 @@ bool Compiler::impConsiderCallProbe(GenTreeCall* call, IL_OFFSET ilOffset)
return false;
}
- assert(opts.OptimizationDisabled() || opts.IsOSR());
+ assert(opts.OptimizationDisabled() || opts.IsInstrumentedOptimized());
assert(!compIsForInlining());
// During importation, optionally flag this block as one that
diff --git a/src/coreclr/vm/callcounting.cpp b/src/coreclr/vm/callcounting.cpp
index 671eb8f018d5bc..a49a2ace6032ca 100644
--- a/src/coreclr/vm/callcounting.cpp
+++ b/src/coreclr/vm/callcounting.cpp
@@ -574,7 +574,7 @@ bool CallCountingManager::SetCodeEntryPoint(
// For a default code version that is not tier 0, call counting will have been disabled by this time (checked
// below). Avoid the redundant and not-insignificant expense of GetOptimizationTier() on a default code version.
!activeCodeVersion.IsDefaultVersion() &&
- activeCodeVersion.GetOptimizationTier() != NativeCodeVersion::OptimizationTier0
+ activeCodeVersion.IsFinalTier()
) ||
!g_pConfig->TieredCompilation_CallCounting())
{
@@ -602,7 +602,7 @@ bool CallCountingManager::SetCodeEntryPoint(
return true;
}
- _ASSERTE(activeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!activeCodeVersion.IsFinalTier());
// If the tiering delay is active, postpone further work
if (GetAppDomain()
@@ -649,7 +649,7 @@ bool CallCountingManager::SetCodeEntryPoint(
}
else
{
- _ASSERTE(activeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!activeCodeVersion.IsFinalTier());
// If the tiering delay is active, postpone further work
if (GetAppDomain()
@@ -659,7 +659,7 @@ bool CallCountingManager::SetCodeEntryPoint(
return true;
}
- CallCount callCountThreshold = (CallCount)g_pConfig->TieredCompilation_CallCountThreshold();
+ CallCount callCountThreshold = g_pConfig->TieredCompilation_CallCountThreshold();
_ASSERTE(callCountThreshold != 0);
NewHolder callCountingInfoHolder = new CallCountingInfo(activeCodeVersion, callCountThreshold);
@@ -780,7 +780,7 @@ PCODE CallCountingManager::OnCallCountThresholdReached(TransitionBlock *transiti
// used going forward under appropriate locking to synchronize further with deletion.
GCX_PREEMP_THREAD_EXISTS(CURRENT_THREAD);
- _ASSERTE(codeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!codeVersion.IsFinalTier());
codeEntryPoint = codeVersion.GetNativeCode();
do
diff --git a/src/coreclr/vm/codeversion.cpp b/src/coreclr/vm/codeversion.cpp
index bd7fce5d9d7c4d..9bc971033915e0 100644
--- a/src/coreclr/vm/codeversion.cpp
+++ b/src/coreclr/vm/codeversion.cpp
@@ -151,7 +151,11 @@ NativeCodeVersion::OptimizationTier NativeCodeVersionNode::GetOptimizationTier()
void NativeCodeVersionNode::SetOptimizationTier(NativeCodeVersion::OptimizationTier tier)
{
LIMITED_METHOD_CONTRACT;
- _ASSERTE(tier >= m_optTier);
+
+ _ASSERTE(
+ tier == m_optTier ||
+ (m_optTier != NativeCodeVersion::OptimizationTier::OptimizationTier1 &&
+ m_optTier != NativeCodeVersion::OptimizationTier::OptimizationTierOptimized));
m_optTier = tier;
}
@@ -333,6 +337,13 @@ NativeCodeVersion::OptimizationTier NativeCodeVersion::GetOptimizationTier() con
}
}
+bool NativeCodeVersion::IsFinalTier() const
+{
+ LIMITED_METHOD_DAC_CONTRACT;
+ OptimizationTier tier = GetOptimizationTier();
+ return tier == OptimizationTier1 || tier == OptimizationTierOptimized;
+}
+
#ifndef DACCESS_COMPILE
void NativeCodeVersion::SetOptimizationTier(OptimizationTier tier)
{
@@ -808,7 +819,7 @@ bool ILCodeVersion::HasAnyOptimizedNativeCodeVersion(NativeCodeVersion tier0Nati
_ASSERTE(!tier0NativeCodeVersion.IsNull());
_ASSERTE(tier0NativeCodeVersion.GetILCodeVersion() == *this);
_ASSERTE(tier0NativeCodeVersion.GetMethodDesc()->IsEligibleForTieredCompilation());
- _ASSERTE(tier0NativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!tier0NativeCodeVersion.IsFinalTier());
NativeCodeVersionCollection nativeCodeVersions = GetNativeCodeVersions(tier0NativeCodeVersion.GetMethodDesc());
for (auto itEnd = nativeCodeVersions.End(), it = nativeCodeVersions.Begin(); it != itEnd; ++it)
@@ -1708,9 +1719,7 @@ PCODE CodeVersionManager::PublishVersionableCodeIfNecessary(
{
#ifdef FEATURE_TIERED_COMPILATION
_ASSERTE(!config->ShouldCountCalls() || pMethodDesc->IsEligibleForTieredCompilation());
- _ASSERTE(
- !config->ShouldCountCalls() ||
- activeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!config->ShouldCountCalls() || !activeVersion.IsFinalTier());
if (config->ShouldCountCalls()) // the generated code was at a tier that is call-counted
{
// This is the first call to a call-counted code version of the method
diff --git a/src/coreclr/vm/codeversion.h b/src/coreclr/vm/codeversion.h
index d83bfa29c2ea6f..66de4ba27257a4 100644
--- a/src/coreclr/vm/codeversion.h
+++ b/src/coreclr/vm/codeversion.h
@@ -71,15 +71,19 @@ class NativeCodeVersion
BOOL SetNativeCodeInterlocked(PCODE pCode, PCODE pExpected = NULL);
#endif
+ // NOTE: Don't change existing values to avoid breaking changes in event tracing
enum OptimizationTier
{
OptimizationTier0,
OptimizationTier1,
OptimizationTier1OSR,
OptimizationTierOptimized, // may do less optimizations than tier 1
+ OptimizationTier0Instrumented,
+ OptimizationTier1Instrumented,
};
#ifdef FEATURE_TIERED_COMPILATION
OptimizationTier GetOptimizationTier() const;
+ bool IsFinalTier() const;
#ifndef DACCESS_COMPILE
void SetOptimizationTier(OptimizationTier tier);
#endif
diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp
index 883d602b1f2309..f438a31838fa14 100644
--- a/src/coreclr/vm/eeconfig.cpp
+++ b/src/coreclr/vm/eeconfig.cpp
@@ -239,6 +239,7 @@ HRESULT EEConfig::Init()
#if defined(FEATURE_PGO)
fTieredPGO = false;
+ tieredPGO_InstrumentOnlyHotCode = false;
#endif
#if defined(FEATURE_READYTORUN)
@@ -699,10 +700,6 @@ HRESULT EEConfig::sync()
dwSleepOnExit = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_SleepOnExit);
-#if defined(FEATURE_PGO)
- fTieredPGO = Configuration::GetKnobBooleanValue(W("System.Runtime.TieredPGO"), CLRConfig::EXTERNAL_TieredPGO);
-#endif
-
#if defined(FEATURE_TIERED_COMPILATION)
fTieredCompilation = Configuration::GetKnobBooleanValue(W("System.Runtime.TieredCompilation"), CLRConfig::EXTERNAL_TieredCompilation);
if (fTieredCompilation)
@@ -784,6 +781,20 @@ HRESULT EEConfig::sync()
}
#endif
+#if defined(FEATURE_PGO)
+ fTieredPGO = Configuration::GetKnobBooleanValue(W("System.Runtime.TieredPGO"), CLRConfig::EXTERNAL_TieredPGO);
+
+ // Also, consider DynamicPGO enabled if WritePGOData is set
+ fTieredPGO |= CLRConfig::GetConfigValue(CLRConfig::INTERNAL_WritePGOData) != 0;
+ tieredPGO_InstrumentOnlyHotCode = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_TieredPGO_InstrumentOnlyHotCode) == 1;
+
+ // We need quick jit for TieredPGO
+ if (!fTieredCompilation_QuickJit)
+ {
+ fTieredPGO = false;
+ }
+#endif
+
#if defined(FEATURE_ON_STACK_REPLACEMENT)
dwOSR_HitLimit = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_OSR_HitLimit);
dwOSR_CounterBump = CLRConfig::GetConfigValue(CLRConfig::INTERNAL_OSR_CounterBump);
diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h
index 4651cf1bc84e0f..684f04181fa3b9 100644
--- a/src/coreclr/vm/eeconfig.h
+++ b/src/coreclr/vm/eeconfig.h
@@ -92,6 +92,7 @@ class EEConfig
#if defined(FEATURE_PGO)
bool TieredPGO(void) const { LIMITED_METHOD_CONTRACT; return fTieredPGO; }
+ bool TieredPGO_InstrumentOnlyHotCode(void) const { LIMITED_METHOD_CONTRACT; return tieredPGO_InstrumentOnlyHotCode; }
#endif
#if defined(FEATURE_READYTORUN)
@@ -658,6 +659,7 @@ class EEConfig
#if defined(FEATURE_PGO)
bool fTieredPGO;
+ bool tieredPGO_InstrumentOnlyHotCode;
#endif
#if defined(FEATURE_READYTORUN)
diff --git a/src/coreclr/vm/interpreter.cpp b/src/coreclr/vm/interpreter.cpp
index d30b9934e10d70..1210ed5453a390 100644
--- a/src/coreclr/vm/interpreter.cpp
+++ b/src/coreclr/vm/interpreter.cpp
@@ -1802,7 +1802,7 @@ void Interpreter::JitMethodIfAppropriate(InterpreterMethodInfo* interpMethInfo,
CodeVersionManager::LockHolder _lockHolder;
NativeCodeVersion activeCodeVersion = md->GetCodeVersionManager()->GetActiveILCodeVersion(md).GetActiveNativeCodeVersion(md);
ILCodeVersion ilCodeVersion = activeCodeVersion.GetILCodeVersion();
- if (activeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0 &&
+ if (!activeCodeVersion.IsFinalTier() &&
!ilCodeVersion.HasAnyOptimizedNativeCodeVersion(activeCodeVersion))
{
tieredCompilationManager->AsyncPromoteToTier1(activeCodeVersion, &scheduleTieringBackgroundWork);
diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp
index 7f9eb048926049..3e0cd970d97969 100644
--- a/src/coreclr/vm/jitinterface.cpp
+++ b/src/coreclr/vm/jitinterface.cpp
@@ -12823,23 +12823,6 @@ CORJIT_FLAGS GetCompileFlags(MethodDesc * ftn, CORJIT_FLAGS flags, CORINFO_METHO
#ifdef FEATURE_PGO
- // Instrument, if
- //
- // * We're writing pgo data and we're jitting at Tier0.
- // * Tiered PGO is enabled and we're jitting at Tier0.
- // * Tiered PGO is enabled and we are jitting an OSR method.
- //
- if ((CLRConfig::GetConfigValue(CLRConfig::INTERNAL_WritePGOData) > 0)
- && flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0))
- {
- flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
- }
- else if ((g_pConfig->TieredPGO())
- && (flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0) || flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_OSR)))
- {
- flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
- }
-
if (CLRConfig::GetConfigValue(CLRConfig::INTERNAL_ReadPGOData) > 0)
{
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBOPT);
diff --git a/src/coreclr/vm/method.hpp b/src/coreclr/vm/method.hpp
index 0edff0f4ff3650..6df3d0042e3343 100644
--- a/src/coreclr/vm/method.hpp
+++ b/src/coreclr/vm/method.hpp
@@ -1993,6 +1993,8 @@ class PrepareCodeConfig
QuickJitted,
OptimizedTier1,
OptimizedTier1OSR,
+ InstrumentedTier,
+ InstrumentedTierOptimized,
Count
};
diff --git a/src/coreclr/vm/prestub.cpp b/src/coreclr/vm/prestub.cpp
index 0aba1852ab40f6..4b4373ac40e818 100644
--- a/src/coreclr/vm/prestub.cpp
+++ b/src/coreclr/vm/prestub.cpp
@@ -364,9 +364,9 @@ PCODE MethodDesc::PrepareILBasedCode(PrepareCodeConfig* pConfig)
if (codeVersion.IsDefaultVersion())
{
pConfig->GetMethodDesc()->GetLoaderAllocator()->GetCallCountingManager()->DisableCallCounting(codeVersion);
- _ASSERTE(codeVersion.GetOptimizationTier() != NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(codeVersion.IsFinalTier());
}
- else if (codeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0)
+ else if (!codeVersion.IsFinalTier())
{
codeVersion.SetOptimizationTier(NativeCodeVersion::OptimizationTierOptimized);
}
@@ -457,7 +457,7 @@ PCODE MethodDesc::GetPrecompiledCode(PrepareCodeConfig* pConfig, bool shouldTier
#ifdef FEATURE_TIERED_COMPILATION
if (shouldCountCalls)
{
- _ASSERTE(pConfig->GetCodeVersion().GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!pConfig->GetCodeVersion().IsFinalTier());
pConfig->SetShouldCountCalls();
}
#endif
@@ -1225,6 +1225,12 @@ PrepareCodeConfig::JitOptimizationTier PrepareCodeConfig::GetJitOptimizationTier
case NativeCodeVersion::OptimizationTierOptimized:
return JitOptimizationTier::Optimized;
+ case NativeCodeVersion::OptimizationTier0Instrumented:
+ return JitOptimizationTier::InstrumentedTier;
+
+ case NativeCodeVersion::OptimizationTier1Instrumented:
+ return JitOptimizationTier::InstrumentedTierOptimized;
+
default:
UNREACHABLE();
}
@@ -1247,6 +1253,8 @@ const char *PrepareCodeConfig::GetJitOptimizationTierStr(PrepareCodeConfig *conf
case JitOptimizationTier::QuickJitted: return "QuickJitted";
case JitOptimizationTier::OptimizedTier1: return "OptimizedTier1";
case JitOptimizationTier::OptimizedTier1OSR: return "OptimizedTier1OSR";
+ case JitOptimizationTier::InstrumentedTier: return "InstrumentedTier";
+ case JitOptimizationTier::InstrumentedTierOptimized: return "InstrumentedTierOptimized";
default:
UNREACHABLE();
@@ -1296,6 +1304,7 @@ bool PrepareCodeConfig::FinalizeOptimizationTierForTier0LoadOrJit()
NativeCodeVersion::OptimizationTier previousOptimizationTier = GetCodeVersion().GetOptimizationTier();
_ASSERTE(
previousOptimizationTier == NativeCodeVersion::OptimizationTier0 ||
+ previousOptimizationTier == NativeCodeVersion::OptimizationTier0Instrumented ||
previousOptimizationTier == NativeCodeVersion::OptimizationTierOptimized);
#endif // _DEBUG
diff --git a/src/coreclr/vm/tieredcompilation.cpp b/src/coreclr/vm/tieredcompilation.cpp
index 3b15100a34fb66..dcff50459c8b4a 100644
--- a/src/coreclr/vm/tieredcompilation.cpp
+++ b/src/coreclr/vm/tieredcompilation.cpp
@@ -111,6 +111,22 @@ NativeCodeVersion::OptimizationTier TieredCompilationManager::GetInitialOptimiza
return NativeCodeVersion::OptimizationTierOptimized;
}
+#ifdef FEATURE_PGO
+ if (g_pConfig->TieredPGO())
+ {
+ // Initial tier for R2R is always just OptimizationTier0
+ // For ILOnly it depends on TieredPGO_InstrumentOnlyHotCode:
+ // 1 - OptimizationTier0, as we don't want to instrument the initial version (only hot Tier0 code is instrumented)
+ // 0 - OptimizationTier0Instrumented - instrument all ILOnly code
+ if (g_pConfig->TieredPGO_InstrumentOnlyHotCode() ||
+ ExecutionManager::IsReadyToRunCode(pMethodDesc->GetNativeCode()))
+ {
+ return NativeCodeVersion::OptimizationTier0;
+ }
+ return NativeCodeVersion::OptimizationTier0Instrumented;
+ }
+#endif
+
return NativeCodeVersion::OptimizationTier0;
#else
return NativeCodeVersion::OptimizationTierOptimized;
@@ -237,7 +253,7 @@ bool TieredCompilationManager::TrySetCodeEntryPointAndRecordMethodForCallCountin
}
void TieredCompilationManager::AsyncPromoteToTier1(
- NativeCodeVersion tier0NativeCodeVersion,
+ NativeCodeVersion currentNativeCodeVersion,
bool *createTieringBackgroundWorkerRef)
{
CONTRACTL
@@ -249,8 +265,8 @@ void TieredCompilationManager::AsyncPromoteToTier1(
CONTRACTL_END;
_ASSERTE(CodeVersionManager::IsLockOwnedByCurrentThread());
- _ASSERTE(!tier0NativeCodeVersion.IsNull());
- _ASSERTE(tier0NativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ _ASSERTE(!currentNativeCodeVersion.IsNull());
+ _ASSERTE(!currentNativeCodeVersion.IsFinalTier());
_ASSERTE(createTieringBackgroundWorkerRef != nullptr);
NativeCodeVersion t1NativeCodeVersion;
@@ -261,10 +277,41 @@ void TieredCompilationManager::AsyncPromoteToTier1(
// particular version of the IL code regardless of any changes that may
// occur between now and when jitting completes. If the IL does change in that
// interval the new code entry won't be activated.
- MethodDesc *pMethodDesc = tier0NativeCodeVersion.GetMethodDesc();
- ILCodeVersion ilCodeVersion = tier0NativeCodeVersion.GetILCodeVersion();
- _ASSERTE(!ilCodeVersion.HasAnyOptimizedNativeCodeVersion(tier0NativeCodeVersion));
- hr = ilCodeVersion.AddNativeCodeVersion(pMethodDesc, NativeCodeVersion::OptimizationTier1, &t1NativeCodeVersion);
+ MethodDesc *pMethodDesc = currentNativeCodeVersion.GetMethodDesc();
+
+ NativeCodeVersion::OptimizationTier nextTier = NativeCodeVersion::OptimizationTier1;
+
+#ifdef FEATURE_PGO
+ if (g_pConfig->TieredPGO())
+ {
+ if (currentNativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0 &&
+ g_pConfig->TieredPGO_InstrumentOnlyHotCode())
+ {
+ if (ExecutionManager::IsReadyToRunCode(currentNativeCodeVersion.GetNativeCode()))
+ {
+ // We definitely don't want to use unoptimized instrumentation tier for hot R2R:
+ // 1) It will produce a lot of new compilations for small methods which were inlined in R2R
+ // 2) Noticeable performance regression from fast R2R to slow instrumented Tier0
+ nextTier = NativeCodeVersion::OptimizationTier1Instrumented;
+ }
+ else
+ {
+ // For ILOnly it's fine to use unoptimized instrumented tier:
+ // 1) No new compilations since previous tier already triggered them
+ // 2) Better profile since we'll be able to instrument inlinees
+ // 3) Unoptimized instrumented tier is faster to produce and wire up
+ nextTier = NativeCodeVersion::OptimizationTier0Instrumented;
+
+ // NOTE: we might consider using OptimizationTier1Instrumented if the previous Tier0
+ // made it to Tier1-OSR.
+ }
+ }
+ }
+#endif
+
+ ILCodeVersion ilCodeVersion = currentNativeCodeVersion.GetILCodeVersion();
+ _ASSERTE(!ilCodeVersion.HasAnyOptimizedNativeCodeVersion(currentNativeCodeVersion));
+ hr = ilCodeVersion.AddNativeCodeVersion(pMethodDesc, nextTier, &t1NativeCodeVersion);
if (FAILED(hr))
{
ThrowHR(hr);
@@ -992,7 +1039,7 @@ CORJIT_FLAGS TieredCompilationManager::GetJitFlags(PrepareCodeConfig *config)
_ASSERTE(config != nullptr);
_ASSERTE(
!config->WasTieringDisabledBeforeJitting() ||
- config->GetCodeVersion().GetOptimizationTier() != NativeCodeVersion::OptimizationTier0);
+ config->GetCodeVersion().IsFinalTier());
CORJIT_FLAGS flags;
@@ -1015,9 +1062,25 @@ CORJIT_FLAGS TieredCompilationManager::GetJitFlags(PrepareCodeConfig *config)
NativeCodeVersion::OptimizationTier newOptimizationTier;
if (!methodDesc->RequestedAggressiveOptimization())
{
+ NativeCodeVersion::OptimizationTier currentTier = nativeCodeVersion.GetOptimizationTier();
+
if (g_pConfig->TieredCompilation_QuickJit())
{
- _ASSERTE(nativeCodeVersion.GetOptimizationTier() == NativeCodeVersion::OptimizationTier0);
+ if (currentTier == NativeCodeVersion::OptimizationTier::OptimizationTier0Instrumented)
+ {
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
+ return flags;
+ }
+
+ if (currentTier == NativeCodeVersion::OptimizationTier::OptimizationTier1Instrumented)
+ {
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER1);
+ return flags;
+ }
+
+ _ASSERTE(!nativeCodeVersion.IsFinalTier());
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
return flags;
}
@@ -1040,13 +1103,24 @@ CORJIT_FLAGS TieredCompilationManager::GetJitFlags(PrepareCodeConfig *config)
switch (nativeCodeVersion.GetOptimizationTier())
{
+ case NativeCodeVersion::OptimizationTier0Instrumented:
+ _ASSERT(g_pConfig->TieredCompilation_QuickJit());
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
+ break;
+
+ case NativeCodeVersion::OptimizationTier1Instrumented:
+ _ASSERT(g_pConfig->TieredCompilation_QuickJit());
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
+ flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER1);
+ break;
+
case NativeCodeVersion::OptimizationTier0:
if (g_pConfig->TieredCompilation_QuickJit())
{
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_TIER0);
break;
}
-
nativeCodeVersion.SetOptimizationTier(NativeCodeVersion::OptimizationTierOptimized);
goto Optimized;
diff --git a/src/coreclr/vm/tieredcompilation.h b/src/coreclr/vm/tieredcompilation.h
index 4ad624a3f7f1c2..bf078dbc2979e8 100644
--- a/src/coreclr/vm/tieredcompilation.h
+++ b/src/coreclr/vm/tieredcompilation.h
@@ -42,7 +42,7 @@ class TieredCompilationManager
public:
void HandleCallCountingForFirstCall(MethodDesc* pMethodDesc);
bool TrySetCodeEntryPointAndRecordMethodForCallCounting(MethodDesc* pMethodDesc, PCODE codeEntryPoint);
- void AsyncPromoteToTier1(NativeCodeVersion tier0NativeCodeVersion, bool *createTieringBackgroundWorkerRef);
+ void AsyncPromoteToTier1(NativeCodeVersion currentNativeCodeVersion, bool *createTieringBackgroundWorkerRef);
static CORJIT_FLAGS GetJitFlags(PrepareCodeConfig *config);
#if !defined(DACCESS_COMPILE) && defined(_DEBUG)
diff --git a/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.cs b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.cs
new file mode 100644
index 00000000000000..f3705502e9967c
--- /dev/null
+++ b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.cs
@@ -0,0 +1,33 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+// A smoke test for all DOTNET_TieredPGO strategies
+class Program : IDisposable
+{
+ static int Main()
+ {
+ Program p = new();
+ for (int i = 0; i < 100; i++)
+ {
+ HotLoop(p);
+ Thread.Sleep(40); // cold loop
+ }
+ return 100;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static void HotLoop(IDisposable d)
+ {
+ for (int i = 0; i < 100000; i++) // hot loop
+ d?.Dispose();
+ }
+
+ public void Dispose() => Test();
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ void Test() { }
+}
\ No newline at end of file
diff --git a/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.csproj b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.csproj
new file mode 100644
index 00000000000000..0d7ec3d6e64703
--- /dev/null
+++ b/src/tests/JIT/PGO/InstrumentedTiers/InstrumentedTiers.csproj
@@ -0,0 +1,21 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
\ No newline at end of file