-
-
Notifications
You must be signed in to change notification settings - Fork 893
Add SSE2 version of Mean16x4 #1814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
765f5a2
8b8871b
984971e
0c96e37
e8c0d2c
3c9c1bb
0ca9d43
9ab9e75
1418e53
9e143ef
3cfa040
84732bf
50013d7
f0cb89e
1452ba0
7d8225b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,12 +4,20 @@ | |
| using System; | ||
| using System.Buffers.Binary; | ||
| using System.Runtime.CompilerServices; | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| using System.Runtime.Intrinsics; | ||
| using System.Runtime.Intrinsics.X86; | ||
| #endif | ||
|
|
||
| // ReSharper disable InconsistentNaming | ||
| namespace SixLabors.ImageSharp.Formats.Webp.Lossy | ||
| { | ||
| internal static class LossyUtils | ||
| internal static unsafe class LossyUtils | ||
| { | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create(0x00ff).AsByte(); | ||
| #endif | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16); | ||
|
|
||
|
|
@@ -801,6 +809,64 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride, | |
| FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); | ||
| } | ||
|
|
||
| public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp) | ||
| { | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| if (Sse2.IsSupported) | ||
| { | ||
| #pragma warning disable SA1503 // Braces should not be omitted | ||
| tmp.Clear(); | ||
| fixed (byte* inputPtr = input) | ||
| fixed (ushort* tmpPtr = tmp) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same recommendations to avoid pinning. Alternatively you can pin |
||
| { | ||
| Vector128<byte> a0 = Sse2.LoadVector128(inputPtr); | ||
| Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps); | ||
| Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2)); | ||
| Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3)); | ||
| Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte | ||
| Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); | ||
| Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); | ||
| Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); | ||
| Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte | ||
| Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask); | ||
| Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask); | ||
| Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask); | ||
| Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); | ||
| Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); | ||
| Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); | ||
| Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); | ||
| Vector128<int> e0 = Sse2.Add(d0, d1); | ||
| Vector128<int> e1 = Sse2.Add(d2, d3); | ||
| Vector128<int> f0 = Sse2.Add(e0, e1); | ||
| Sse2.Store(tmpPtr, f0.AsUInt16()); | ||
| } | ||
| #pragma warning restore SA1503 // Braces should not be omitted | ||
|
|
||
| dc[0] = (uint)(tmp[1] + tmp[0]); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks to me like if you reverse these span assignments you'll cut out 9 of 12 bounds checks. |
||
| dc[1] = (uint)(tmp[3] + tmp[2]); | ||
| dc[2] = (uint)(tmp[5] + tmp[4]); | ||
| dc[3] = (uint)(tmp[7] + tmp[6]); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't this the same as […]? I'm afraid 12 span indexer bounds checks have a measurable impact here. All of them seem unnecessary, since […] is […]
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes it is the same as |
||
| } | ||
| else | ||
| #endif | ||
| { | ||
| for (int k = 0; k < 4; k++) | ||
| { | ||
| uint avg = 0; | ||
| for (int y = 0; y < 4; y++) | ||
| { | ||
| for (int x = 0; x < 4; x++) | ||
| { | ||
| avg += input[x + (y * WebpConstants.Bps)]; | ||
| } | ||
| } | ||
|
|
||
| dc[k] = avg; | ||
| input = input.Slice(4); // go to next 4x4 block. | ||
| } | ||
| } | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static uint LoadUv(byte u, byte v) => | ||
| (uint)(u | (v << 16)); // We process u and v together stashed into 32bit(16bit each). | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really needed? We overwrite the contents in the end.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think you are right, it's not needed.