Skip to content

Conversation

@github-actions
Copy link

No description provided.

tannergooding pushed a commit that referenced this pull request Mar 19, 2025
* JIT: Introduce `LclVarDsc::lvIsMultiRegDest`

With recent work to expand returned promoted locals into `FIELD_LIST`
the only "whole references" of promoted locals we should see are those
stored from a multi-reg node. This is the only knowledge the backend
should need for correctness purposes, so introduce a bit to track this
property, and switch the backend to check this instead.

The existing `lvIsMultiRegRet` is essentially this + whether the local
is returned. We should be able to remove this, but it is currently used
for some heuristics in old promotion, so keep it around for now.

* JIT: Add some more constant folding in lowering

Add folding for shifts and certain binops that are now getting produced
late due to returned `FIELD_LIST` nodes.

win-arm64 example:
```csharp
[MethodImpl(MethodImplOptions.NoInlining)]
static ValueTask<byte> Foo()
{
    return new ValueTask<byte>(123);
}
```

```diff
 G_M17084_IG02:  ;; offset=0x0008
             mov     x0, xzr
-            mov     w1, #1
-            mov     w2, wzr
-            mov     w3, #123
-            orr     w2, w2, w3,  LSL #16
-            orr     w1, w2, w1,  LSL #24
-						;; size=24 bbWeight=1 PerfScore 4.00
+            mov     w1, #0x17B0000
+						;; size=8 bbWeight=1 PerfScore 1.00
```

* Feedback
tannergooding pushed a commit that referenced this pull request May 29, 2025
Add support for using bitwise operations to reconstruct registers passed
into calls from multiple promoted fields. Remove the IR invariant that
`FIELD_LIST` args must always map cleanly to registers; instead, any
`FIELD_LIST` is allowed for register-only arguments before lowering, and
lowering takes care to normalize them into a handled shape.

`fgTryMorphStructArg` is changed to take advantage of this by now
producing `FIELD_LIST` even when a promoted arg does not match the
target ABI. Support in physical promotion will be added in a follow up.

Split arguments are not handled and retain the old IR invariant of
requiring registers and stack slots to map cleanly from `FIELD_LIST`.

win-x64 examples:
```csharp
static void Foo(int x)
{
    Use<int?>(x);
    Use<int?>(5);
    Use<int?>(null);
}
```

```diff
 G_M7200_IG02:  ;; offset=0x0004
-       mov      byte  ptr [rsp+0x20], 1
-       mov      dword ptr [rsp+0x24], ecx
-       mov      rcx, qword ptr [rsp+0x20]
+       mov      ecx, ecx
+       shl      rcx, 32
+       or       rcx, 1
        call     [Program:Bar(System.Nullable`1[int])]
-       mov      dword ptr [rsp+0x24], 5
-       mov      rcx, qword ptr [rsp+0x20]
+       mov      rcx, 0x500000001
        call     [Program:Bar(System.Nullable`1[int])]
-       mov      byte  ptr [rsp+0x20], 0
        xor      ecx, ecx
-       mov      dword ptr [rsp+0x24], ecx
-       mov      rcx, qword ptr [rsp+0x20]
-						;; size=55 bbWeight=1 PerfScore 14.25
+						;; size=34 bbWeight=1 PerfScore 7.50

 G_M7200_IG03:
        add      rsp, 40
        tail.jmp [Program:Bar(System.Nullable`1[int])]
 						;; size=10 bbWeight=1 PerfScore 2.25
```

```csharp
static void Foo(int x, float y)
{
    Use((x, y));
}
```

```diff
 G_M42652_IG01:  ;; offset=0x0000
-       push     rax
-						;; size=1 bbWeight=1 PerfScore 1.00
+						;; size=0 bbWeight=1 PerfScore 0.00

 G_M42652_IG02:
-       mov      dword ptr [rsp], ecx
-       vmovss   dword ptr [rsp+0x04], xmm1
-       mov      rcx, qword ptr [rsp]
+       vmovd    eax, xmm1
+       shl      rax, 32
+       mov      ecx, ecx
+       or       rcx, rax
 						;; size=13 bbWeight=1 PerfScore 3.00

 G_M42652_IG03:
-       add      rsp, 8
        tail.jmp [Program:Use[System.ValueTuple`2[int,float]](System.ValueTuple`2[int,float])]
```

A win-arm64 example:
```csharp
static void Foo(int[] arr)
{
    Use(arr.AsMemory());
}
```

```diff
 G_M33990_IG01:
-            stp     fp, lr, [sp, #-0x20]!
+            stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
-            str     xzr, [fp, #0x10]	// [V03 tmp2]
-						;; size=12 bbWeight=1 PerfScore 2.50
+						;; size=8 bbWeight=1 PerfScore 1.50

 G_M33990_IG02:
             cbz     x0, G_M33990_IG04
 						;; size=4 bbWeight=1 PerfScore 1.00

 G_M33990_IG03:
-            str     x0, [fp, #0x10]	// [V07 tmp6]
-            str     wzr, [fp, #0x18]	// [V08 tmp7]
-            ldr     w0, [x0, #0x08]
-            str     w0, [fp, #0x1C]	// [V09 tmp8]
+            ldr     w1, [x0, #0x08]
             b       G_M33990_IG05
-						;; size=20 bbWeight=0.50 PerfScore 3.50
+						;; size=8 bbWeight=0.50 PerfScore 2.00

 G_M33990_IG04:
-            str     xzr, [fp, #0x10]	// [V07 tmp6]
-            str     xzr, [fp, #0x18]
-						;; size=8 bbWeight=0.50 PerfScore 1.00
+            mov     x0, xzr
+            mov     w1, wzr
+						;; size=8 bbWeight=0.50 PerfScore 0.50

 G_M33990_IG05:
-            ldp     x0, x1, [fp, #0x10]	// [V03 tmp2], [V03 tmp2+0x08]
-            movz    x2, #0xD920      // code for Program:Use[System.Memory`1[int]](System.Memory`1[int])
-            movk    x2, #0x4590 LSL #16
+            mov     w1, w1
+            lsl     x1, x1, #32
+            movz    x2, #0xD950      // code for Program:Use[System.Memory`1[int]](System.Memory`1[int])
+            movk    x2, #0x4592 LSL #16
             movk    x2, #0x7FFE LSL #32
             ldr     x2, [x2]
-						;; size=20 bbWeight=1 PerfScore 7.50
+						;; size=24 bbWeight=1 PerfScore 6.00

 G_M33990_IG06:
-            ldp     fp, lr, [sp], #0x20
+            ldp     fp, lr, [sp], #0x10
             br      x2
 						;; size=8 bbWeight=1 PerfScore 2.00
-; Total bytes of code: 72
+; Total bytes of code: 60
```
@tannergooding tannergooding deleted the update-chrome-version-6992136271 branch July 1, 2025 14:40
tannergooding pushed a commit that referenced this pull request Dec 2, 2025
Enable on arm64 the optimization already used on x64, where LCLHEAP is
cleared via a STORE_BLK inserted in Lower.

```cs
static void Test128() => Consume(stackalloc char[128]);
```
was:
```asm
            stp     xzr, xzr, [sp, #-0x10]!
            stp     xzr, xzr, [sp, #-0xF0]!
            stp     xzr, xzr, [sp, #0x10]
            stp     xzr, xzr, [sp, #0x20]
            stp     xzr, xzr, [sp, #0x30]
            stp     xzr, xzr, [sp, #0x40]
            stp     xzr, xzr, [sp, #0x50]
            stp     xzr, xzr, [sp, #0x60]
            stp     xzr, xzr, [sp, #0x70]
            stp     xzr, xzr, [sp, #0x80]
            stp     xzr, xzr, [sp, #0x90]
            stp     xzr, xzr, [sp, #0xA0]
            stp     xzr, xzr, [sp, #0xB0]
            stp     xzr, xzr, [sp, #0xC0]
            stp     xzr, xzr, [sp, #0xD0]
            stp     xzr, xzr, [sp, #0xE0]
```
now:
```asm
            movi    v16.16b, #0
            stp     q16, q16, [x0]
            stp     q16, q16, [x0, #0x20]
            stp     q16, q16, [x0, #0x40]
            stp     q16, q16, [x0, #0x60]
            stp     q16, q16, [x0, #0x80]
            stp     q16, q16, [x0, #0xA0]
            stp     q16, q16, [x0, #0xC0]
            stp     q16, q16, [x0, #0xE0]
```

Also, for larger sizes the previous logic used to emit a slow loop (e.g.
1024 bytes):
```asm
            mov     w0, #0x400
G_M30953_IG03:
            stp     xzr, xzr, [sp, #-0x10]!
            subs    x0, x0, #16
            bne     G_M30953_IG03
```
Now it will emit a call to `CORINFO_HELP_MEMZERO`


[Benchmarks.](https://github.com/EgorBot/runtime-utils/issues/553)

```cs
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;

public class Benchmarks
{
    [Benchmark] public void Stackalloc64() => Consume(stackalloc byte[64]);
    [Benchmark] public void Stackalloc128() => Consume(stackalloc byte[128]);
    [Benchmark] public void Stackalloc256() => Consume(stackalloc byte[256]);
    [Benchmark] public void Stackalloc512() => Consume(stackalloc byte[512]);
    [Benchmark] public void Stackalloc1024() => Consume(stackalloc byte[1024]);
    [Benchmark] public void Stackalloc16384() => Consume(stackalloc byte[16384]);

    [MethodImpl(MethodImplOptions.NoInlining)]
    static void Consume(Span<byte> x){}
}
```

| Method | Toolchain | Mean | Error | Ratio |
|---------------- |---------- |-----------:|----------:|------:|
| Stackalloc64    | Main |   3.425 ns | 0.0004 ns |  1.00 |
| Stackalloc64    | PR |   2.559 ns | 0.0008 ns |  0.75 |
| | | | | |
| Stackalloc128   | Main |   3.999 ns | 0.0002 ns |  1.00 |
| Stackalloc128   | PR |   2.404 ns | 0.0003 ns |  0.60 |
| | | | | |
| Stackalloc256   | Main |   5.431 ns | 0.0005 ns |  1.00 |
| Stackalloc256   | PR |   2.754 ns | 0.0003 ns |  0.51 |
| | | | | |
| Stackalloc512   | Main |  12.661 ns | 0.2744 ns |  1.00 |
| Stackalloc512   | PR |   7.423 ns | 0.0008 ns |  0.59 |
| | | | | |
| Stackalloc1024  | Main |  24.958 ns | 0.5326 ns |  1.00 |
| Stackalloc1024  | PR |  14.031 ns | 0.0040 ns |  0.56 |
| | | | | |
| Stackalloc16384 | Main | 374.899 ns | 0.0130 ns |  1.00 |
| Stackalloc16384 | PR | 111.029 ns | 1.2123 ns |  0.30 |

---------

Co-authored-by: Jakob Botsch Nielsen <[email protected]>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants