-
-
Notifications
You must be signed in to change notification settings - Fork 893
Add sse2 version of inverse transform #1819
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
6e8def1
26d3063
835ecea
5968de8
5c0b598
b6a20af
6039d2a
abcbc4c
18ecb06
6e548b5
a201e8a
b7059ae
544319e
5074ee6
16bb94f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,11 @@ | |
| using System; | ||
| using System.Buffers.Binary; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Runtime.InteropServices; | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| using System.Runtime.Intrinsics; | ||
| using System.Runtime.Intrinsics.X86; | ||
| #endif | ||
|
|
||
| namespace SixLabors.ImageSharp.Formats.Webp.Lossy | ||
| { | ||
|
|
@@ -60,6 +65,12 @@ internal static class Vp8Encoding | |
|
|
||
| public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; | ||
|
|
||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16(); | ||
|
|
||
| public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16(); | ||
| #endif | ||
|
|
||
| static Vp8Encoding() | ||
| { | ||
| for (int i = -255; i <= 255 + 255; i++) | ||
|
|
@@ -68,12 +79,199 @@ static Vp8Encoding() | |
| } | ||
| } | ||
|
|
||
| // Transforms (Paragraph 14.4) | ||
| // Does one or two inverse transforms. | ||
| public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch) | ||
| { | ||
| ITransformOne(reference, input, dst, scratch); | ||
| if (doTwo) | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| if (Sse2.IsSupported) | ||
| { | ||
| ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); | ||
| // This implementation makes use of 16-bit fixed point versions of two | ||
| // multiply constants: | ||
| // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | ||
| // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 | ||
| // | ||
| // To be able to use signed 16-bit integers, we use the following trick to | ||
| // have constants within range: | ||
| // - Associated constants are obtained by subtracting the 16-bit fixed point | ||
| // version of one: | ||
| // k = K - (1 << 16) => K = k + (1 << 16) | ||
| // K1 = 85267 => k1 = 20091 | ||
| // K2 = 35468 => k2 = -30068 | ||
| // - The multiplication of a variable by a constant become the sum of the | ||
| // variable and the multiplication of that variable by the associated | ||
| // constant: | ||
| // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x | ||
|
|
||
| // Load and concatenate the transform coefficients (we'll do two inverse | ||
| // transforms in parallel). In the case of only one inverse transform, the | ||
| // second half of the vectors will just contain random value we'll never | ||
| // use nor store. | ||
| ref short inputRef = ref MemoryMarshal.GetReference(input); | ||
| var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0); | ||
| var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0); | ||
| var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0); | ||
| var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0); | ||
|
|
||
| // a00 a10 a20 a30 x x x x | ||
| // a01 a11 a21 a31 x x x x | ||
| // a02 a12 a22 a32 x x x x | ||
| // a03 a13 a23 a33 x x x x | ||
| if (doTwo) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's better to avoid giving unnecessary extra work to the branch predictor, and refactor One hidden cost is that assembly code having branches on a runtime constant will be longer, and needs more cache lines, which are expensive to load. Alternative "fun" solution, sort of template metaprogramming in C#, that will eliminate branches at JIT time: ITransform<TDoTwo>(...) {
...
if (typeof(TDoTwo) == typeof(DoTwo_True)) {
...
}
else {
...
}
}Can't decide if it's worth it or better to just duplicate the code, and maybe create reusable helper methods for shared bits. |
||
| { | ||
| var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0); | ||
| var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0); | ||
| var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0); | ||
| var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0); | ||
|
|
||
| in0 = Sse2.UnpackLow(in0, inb0); | ||
| in1 = Sse2.UnpackLow(in1, inb1); | ||
| in2 = Sse2.UnpackLow(in2, inb2); | ||
| in3 = Sse2.UnpackLow(in3, inb3); | ||
|
|
||
| // a00 a10 a20 a30 b00 b10 b20 b30 | ||
| // a01 a11 a21 a31 b01 b11 b21 b31 | ||
| // a02 a12 a22 a32 b02 b12 b22 b32 | ||
| // a03 a13 a23 a33 b03 b13 b23 b33 | ||
| } | ||
|
|
||
| // Vertical pass and subsequent transpose. | ||
| // First pass, c and d calculations are longer because of the "trick" multiplications. | ||
| Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); | ||
| Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); | ||
|
|
||
| // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 | ||
| Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); | ||
| Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); | ||
| Vector128<long> c3 = Sse2.Subtract(in1, in3); | ||
| Vector128<short> c4 = Sse2.Subtract(c1, c2); | ||
| Vector128<short> c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); | ||
|
|
||
| // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 | ||
| Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); | ||
| Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); | ||
| Vector128<long> d3 = Sse2.Add(in1, in3); | ||
| Vector128<short> d4 = Sse2.Add(d1, d2); | ||
| Vector128<short> d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); | ||
|
|
||
| // Second pass. | ||
| Vector128<short> tmp0 = Sse2.Add(a, d); | ||
| Vector128<short> tmp1 = Sse2.Add(b, c); | ||
| Vector128<short> tmp2 = Sse2.Subtract(b, c); | ||
| Vector128<short> tmp3 = Sse2.Subtract(a, d); | ||
|
|
||
| // Transpose the two 4x4. | ||
| LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3); | ||
|
|
||
| // Horizontal pass and subsequent transpose. | ||
| // First pass, c and d calculations are longer because of the "trick" multiplications. | ||
| var four = Vector128.Create((short)4); | ||
| Vector128<short> dc = Sse2.Add(t0.AsInt16(), four); | ||
| a = Sse2.Add(dc, t2.AsInt16()); | ||
| b = Sse2.Subtract(dc, t2.AsInt16()); | ||
|
|
||
| // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 | ||
| c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); | ||
| c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); | ||
| c3 = Sse2.Subtract(t1, t3); | ||
| c4 = Sse2.Subtract(c1, c2); | ||
| c = Sse2.Add(c3.AsInt16(), c4); | ||
|
|
||
| // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 | ||
| d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); | ||
| d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); | ||
| d3 = Sse2.Add(t1, t3); | ||
| d4 = Sse2.Add(d1, d2); | ||
| d = Sse2.Add(d3.AsInt16(), d4); | ||
|
|
||
| // Second pass. | ||
| tmp0 = Sse2.Add(a, d); | ||
| tmp1 = Sse2.Add(b, c); | ||
| tmp2 = Sse2.Subtract(b, c); | ||
| tmp3 = Sse2.Subtract(a, d); | ||
| Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); | ||
| Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); | ||
| Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); | ||
| Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); | ||
|
|
||
| // Transpose the two 4x4. | ||
| LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); | ||
|
|
||
| // Add inverse transform to 'ref' and store. | ||
| // Load the reference(s). | ||
| Vector128<byte> ref0 = Vector128<byte>.Zero; | ||
| Vector128<byte> ref1 = Vector128<byte>.Zero; | ||
| Vector128<byte> ref2 = Vector128<byte>.Zero; | ||
| Vector128<byte> ref3 = Vector128<byte>.Zero; | ||
| ref byte referenceRef = ref MemoryMarshal.GetReference(reference); | ||
| if (doTwo) | ||
| { | ||
| // Load eight bytes/pixels per line. | ||
| ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte(); | ||
| ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); | ||
| ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); | ||
| ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); | ||
| } | ||
| else | ||
| { | ||
| // Load four bytes/pixels per line. | ||
| ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte(); | ||
| ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); | ||
| ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); | ||
| ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); | ||
| } | ||
|
|
||
| // Convert to 16b. | ||
| ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero); | ||
| ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero); | ||
| ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero); | ||
| ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero); | ||
|
|
||
| // Add the inverse transform(s). | ||
| Vector128<ushort> ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); | ||
| Vector128<ushort> ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); | ||
| Vector128<ushort> ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); | ||
| Vector128<ushort> ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); | ||
|
|
||
| // Unsigned saturate to 8b. | ||
| ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); | ||
| ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); | ||
| ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); | ||
| ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); | ||
|
|
||
| // Unsigned saturate to 8b. | ||
| if (doTwo) | ||
| { | ||
| // Store eight bytes/pixels per line. | ||
| ref byte outputRef = ref MemoryMarshal.GetReference(dst); | ||
| Unsafe.As<byte, Vector128<byte>>(ref outputRef) = ref0; | ||
| Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; | ||
| Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; | ||
| Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; | ||
| } | ||
| else | ||
| { | ||
| // Store four bytes/pixels per line. | ||
| int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); | ||
| int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); | ||
| int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); | ||
| int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); | ||
|
|
||
| ref byte outputRef = ref MemoryMarshal.GetReference(dst); | ||
| Unsafe.As<byte, int>(ref outputRef) = output0; | ||
| Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; | ||
| Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; | ||
| Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; | ||
| } | ||
| } | ||
| else | ||
| #endif | ||
| { | ||
| ITransformOne(reference, input, dst, scratch); | ||
| if (doTwo) | ||
| { | ||
| ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.