Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ITransform now always does two transforms
  • Loading branch information
brianpopow committed Nov 12, 2021
commit 544319e9ea8689e6f257c03e7990136bbfaad53e
6 changes: 3 additions & 3 deletions src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
LossyUtils.TransformWht(dcTmp, tmp, scratch);
for (n = 0; n < 16; n += 2)
{
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch);
}

return nz;
Expand All @@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
Vp8Encoding.FTransform(src, reference, tmp, scratch);
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch);

return nz;
}
Expand Down Expand Up @@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc

for (n = 0; n < 8; n += 2)
{
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch);
}

return nz << 16;
Expand Down
277 changes: 187 additions & 90 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

using System;
using System.Buffers.Binary;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
Expand All @@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
/// <summary>
/// Methods for encoding a VP8 frame.
/// </summary>
internal static unsafe class Vp8Encoding
internal static class Vp8Encoding
{
private const int KC1 = 20091 + (1 << 16);

Expand Down Expand Up @@ -83,8 +82,8 @@ static Vp8Encoding()
}

// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
// Does two inverse transforms.
public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
Expand Down Expand Up @@ -120,23 +119,20 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (doTwo)
{
var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);

in0 = Sse2.UnpackLow(in0, inb0);
in1 = Sse2.UnpackLow(in1, inb1);
in2 = Sse2.UnpackLow(in2, inb2);
in3 = Sse2.UnpackLow(in3, inb3);

// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);

in0 = Sse2.UnpackLow(in0, inb0);
in1 = Sse2.UnpackLow(in1, inb1);
in2 = Sse2.UnpackLow(in2, inb2);
in3 = Sse2.UnpackLow(in3, inb3);

// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33

// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Expand Down Expand Up @@ -206,22 +202,12 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
if (doTwo)
{
// Load eight bytes/pixels per line.
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
}
else
{
// Load four bytes/pixels per line.
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
}

// Load eight bytes/pixels per line.
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();

// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
Expand All @@ -243,72 +229,183 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte

// Unsigned saturate to 8b.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
if (doTwo)
{
// Store eight bytes/pixels per line.
Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
}
else
{
// Store four bytes/pixels per line.
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());

Unsafe.As<byte, int>(ref outputRef) = output0;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
}

// Store eight bytes/pixels per line.
Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
Comment on lines +187 to +189
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the first 3 calls, it's possible and better to avoid GetLower. The second store will overwrite the upper 8 bytes written by the first store, and so on:

Suggested change
Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
Unsafe.As<byte, Vector128<byte>>(ref outputRef) = ref0;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2;

If you guarantee at the call site that dst has a padding of 8 bytes, you can also avoid it at the last call.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that will work. I only want to write 8 bytes from each ref vector.
The first write will be at position 0, the second at 32 (note the Unsafe.Add() with WebpConstants.Bps, WebpConstants.Bps is 32), the third at 64 and the last at 96.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I hadn't noticed that Bps is 32; never mind then.

Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
}
else
#endif
{
ITransformOne(reference, input, dst, scratch);
if (doTwo)
{
ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
}
ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
}
}

public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
for (i = 0; i < 4; i++)
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
// vertical pass.
int a = input[0] + input[8];
int b = input[0] - input[8];
int c = Mul(input[4], KC2) - Mul(input[12], KC1);
int d = Mul(input[4], KC1) + Mul(input[12], KC2);
tmp[0] = a + d;
tmp[1] = b + c;
tmp[2] = b - c;
tmp[3] = a - d;
tmp = tmp.Slice(4);
input = input.Slice(1);
}
// Load and concatenate the transform coefficients (we'll do two inverse
// transforms in parallel). In the case of only one inverse transform, the
// second half of the vectors will just contain random value we'll never
// use nor store.
ref short inputRef = ref MemoryMarshal.GetReference(input);
var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0);
var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0);
var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0);
var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0);

tmp = scratch;
for (i = 0; i < 4; i++)
// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x

// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());

// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);

// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);

// Second pass.
Vector128<short> tmp0 = Sse2.Add(a, d);
Vector128<short> tmp1 = Sse2.Add(b, c);
Vector128<short> tmp2 = Sse2.Subtract(b, c);
Vector128<short> tmp3 = Sse2.Subtract(a, d);

// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
a = Sse2.Add(dc, t2.AsInt16());
b = Sse2.Subtract(dc, t2.AsInt16());

// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
c4 = Sse2.Subtract(c1, c2);
c = Sse2.Add(c3, c4);

// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
d4 = Sse2.Add(d1, d2);
d = Sse2.Add(d3, d4);

// Second pass.
tmp0 = Sse2.Add(a, d);
tmp1 = Sse2.Add(b, c);
tmp2 = Sse2.Subtract(b, c);
tmp3 = Sse2.Subtract(a, d);
Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);

// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);

// Add inverse transform to 'ref' and store.
// Load the reference(s).
Vector128<byte> ref0 = Vector128<byte>.Zero;
Vector128<byte> ref1 = Vector128<byte>.Zero;
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);

// Load four bytes/pixels per line.
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();

// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);

// Add the inverse transform(s).
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());

// Unsigned saturate to 8b.
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);

// Unsigned saturate to 8b.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);

// Store four bytes/pixels per line.
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());

Unsafe.As<byte, int>(ref outputRef) = output0;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
}
else
#endif
{
// horizontal pass.
int dc = tmp[0] + 4;
int a = dc + tmp[8];
int b = dc - tmp[8];
int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
Store(dst, reference, 0, i, a + d);
Store(dst, reference, 1, i, b + c);
Store(dst, reference, 2, i, b - c);
Store(dst, reference, 3, i, a - d);
tmp = tmp.Slice(1);
int i;
Span<int> tmp = scratch.Slice(0, 16);
for (i = 0; i < 4; i++)
{
// vertical pass.
int a = input[0] + input[8];
int b = input[0] - input[8];
int c = Mul(input[4], KC2) - Mul(input[12], KC1);
int d = Mul(input[4], KC1) + Mul(input[12], KC2);
tmp[0] = a + d;
tmp[1] = b + c;
tmp[2] = b - c;
tmp[3] = a - d;
tmp = tmp.Slice(4);
input = input.Slice(1);
}

tmp = scratch;
for (i = 0; i < 4; i++)
{
// horizontal pass.
int dc = tmp[0] + 4;
int a = dc + tmp[8];
int b = dc - tmp[8];
int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
Store(dst, reference, 0, i, a + d);
Store(dst, reference, 1, i, b + c);
Store(dst, reference, 2, i, b - c);
Store(dst, reference, 3, i, a - d);
tmp = tmp.Slice(1);
}
}
}

Expand Down
Loading