Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Make Mean16x4 static and move to LossyUtils
  • Loading branch information
brianpopow committed Nov 7, 2021
commit 8b8871b3ba75581ee2ff5f3fcb294bd640743136
68 changes: 67 additions & 1 deletion src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,20 @@
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
internal static class LossyUtils
internal static unsafe class LossyUtils
{
#if SUPPORTS_RUNTIME_INTRINSICS
// Mask that keeps the low byte of every 16-bit lane (0x00ff per lane), used by Mean16x4.
// The cast to short is required: a bare 0x00ff literal is an int and would select
// Vector128.Create(int), producing the byte pattern FF 00 00 00 and silently dropping
// byte 2 of every 4-byte group from the block sums.
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif

/// <summary>
/// Forwards <paramref name="a"/> and <paramref name="b"/> to GetSse with a width and height of 16.
/// </summary>
/// <param name="a">First sample buffer.</param>
/// <param name="b">Second sample buffer.</param>
/// <returns>The value returned by GetSse for the 16x16 region.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b)
{
    const int blockSize = 16;
    return GetSse(a, b, blockSize, blockSize);
}

Expand Down Expand Up @@ -801,6 +809,64 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh);
}

public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
#pragma warning disable SA1503 // Braces should not be omitted
tmp.Clear();
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really needed? We override the contents in the end.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think you are right; it's not needed.

fixed (byte* inputPtr = input)
fixed (ushort* tmpPtr = tmp)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same recommendations to avoid pinning. Alternatively you can pin YuvIn before the for (k = 0; k < 16; k += 4) loop, and pass pointers to the method.

{
Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));
Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
Vector128<int> e0 = Sse2.Add(d0, d1);
Vector128<int> e1 = Sse2.Add(d2, d3);
Vector128<int> f0 = Sse2.Add(e0, e1);
Sse2.Store(tmpPtr, f0.AsUInt16());
}
#pragma warning restore SA1503 // Braces should not be omitted

dc[0] = (uint)(tmp[1] + tmp[0]);
Copy link
Copy Markdown
Member

@JimBobSquarePants JimBobSquarePants Nov 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks to me like if you reverse these span assignments you'll cut out 9 of 12 bounds checks.

dc[1] = (uint)(tmp[3] + tmp[2]);
dc[2] = (uint)(tmp[5] + tmp[4]);
dc[3] = (uint)(tmp[7] + tmp[6]);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same as _mm_hadd_epi16 aka. Ssse3.HorizontalAdd?

I'm afraid 12 span indexer bounds checks have a measurable impact here. All of them seem unnecessary, since tmp is always of size 16 and dc is always of size 4. If we can't find any matching HorizontalAdd for this, maybe we should consider passing tmp as a pointer and indexing dc with Unsafe.* stuff.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is the same as Ssse3.HorizontalAdd. Good catch.

}
else
#endif
{
for (int k = 0; k < 4; k++)
{
uint avg = 0;
for (int y = 0; y < 4; y++)
{
for (int x = 0; x < 4; x++)
{
avg += input[x + (y * WebpConstants.Bps)];
}
}

dc[k] = avg;
input = input.Slice(4); // go to next 4x4 block.
}
}
}

/// <summary>
/// Packs a u and a v sample into one 32-bit value so both can be processed together:
/// <paramref name="u"/> occupies the low 16 bits and <paramref name="v"/> the high 16 bits.
/// </summary>
/// <param name="u">The u sample, stored in bits 0..15.</param>
/// <param name="v">The v sample, stored in bits 16..31.</param>
/// <returns>The packed value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static uint LoadUv(byte u, byte v)
{
    int high = v << 16;
    int packed = high | u;
    return (uint)packed;
}
Expand Down
72 changes: 3 additions & 69 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,14 @@
// Licensed under the Apache License, Version 2.0.

using System;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
/// <summary>
/// Iterator structure to iterate through macroblocks, pointing to the
/// right neighbouring data (samples, predictions, contexts, ...)
/// </summary>
internal unsafe class Vp8EncIterator
internal class Vp8EncIterator
{
public const int YOffEnc = 0;

Expand All @@ -33,10 +29,6 @@ internal unsafe class Vp8EncIterator

private readonly int mbh;

#if SUPPORTS_RUNTIME_INTRINSICS
// Mask that keeps the low byte of every 16-bit lane (0x00ff per lane), used by Mean16x4.
// The cast to short is required: a bare 0x00ff literal is an int and would select
// Vector128.Create(int), producing the byte pattern FF 00 00 00 and silently dropping
// byte 2 of every 4-byte group from the block sums.
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif

/// <summary>
/// Stride of the prediction plane(=4*mbw + 1).
/// </summary>
Expand Down Expand Up @@ -371,10 +363,10 @@ public int FastMbAnalyze(int quality)
uint m2;
for (k = 0; k < 16; k += 4)
{
this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
}

for (m = 0, m2 = 0, k = 0; k < 16; ++k)
for (m = 0, m2 = 0, k = 0; k < 16; k++)
{
m += dc[k];
m2 += dc[k] * dc[k];
Expand Down Expand Up @@ -832,64 +824,6 @@ public void BytesToNz()
this.Nz[this.nzIdx] = nz;
}

/// <summary>
/// Computes the sum of the samples of each of the four 4x4 blocks that make up a 16x4 strip.
/// Despite the name, no division is performed: the raw sums are written to <paramref name="dc"/>.
/// </summary>
/// <param name="input">Samples of the strip; consecutive rows are <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="dc">Receives the four sums, one per 4x4 block, in left-to-right order. Needs at least 4 elements.</param>
/// <param name="tmp">Scratch buffer the SSE2 path uses to spill eight 16-bit partial sums. Needs at least 8 elements for that path.</param>
private void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    // The pointer loads and the store below are not bounds checked, so the vector path is
    // only taken when the spans cover everything it touches: 16 bytes on each of 4 rows and
    // 8 spilled lanes. Anything shorter goes through the bounds-checked scalar loop instead
    // of reading or writing past the end of a span.
    if (Sse2.IsSupported && input.Length >= (WebpConstants.Bps * 3) + 16 && tmp.Length >= 8)
    {
#pragma warning disable SA1503 // Braces should not be omitted
        // NOTE(review): likely redundant, since the store overwrites every lane that is read
        // afterwards; kept so callers observe the same scratch contents as before.
        tmp.Clear();
        fixed (byte* inputPtr = input)
        fixed (ushort* tmpPtr = tmp)
        {
            // One 16-byte row of the strip per register.
            Vector128<byte> a0 = Sse2.LoadVector128(inputPtr);
            Vector128<byte> a1 = Sse2.LoadVector128(inputPtr + WebpConstants.Bps);
            Vector128<byte> a2 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 2));
            Vector128<byte> a3 = Sse2.LoadVector128(inputPtr + (WebpConstants.Bps * 3));

            // hi byte of every 16-bit lane.
            Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8);
            Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
            Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
            Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);

            // lo byte of every 16-bit lane (relies on Mean16x4Mask being 0x00ff in each 16-bit lane).
            Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask);
            Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
            Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
            Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);

            // 32-bit adds are fine here: each 16-bit lane ends up holding at most
            // 8 * 255 = 2040, so no carry ever crosses into the neighbouring lane.
            Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
            Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
            Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
            Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
            Vector128<int> e0 = Sse2.Add(d0, d1);
            Vector128<int> e1 = Sse2.Add(d2, d3);
            Vector128<int> f0 = Sse2.Add(e0, e1);
            Sse2.Store(tmpPtr, f0.AsUInt16());
        }
#pragma warning restore SA1503 // Braces should not be omitted

        // Each block sum is the sum of two adjacent 16-bit lanes. The highest indices are
        // touched first so the JIT can elide the range checks on the lower ones.
        dc[3] = (uint)(tmp[7] + tmp[6]);
        dc[2] = (uint)(tmp[5] + tmp[4]);
        dc[1] = (uint)(tmp[3] + tmp[2]);
        dc[0] = (uint)(tmp[1] + tmp[0]);
    }
    else
#endif
    {
        for (int k = 0; k < 4; k++)
        {
            uint avg = 0;
            for (int y = 0; y < 4; y++)
            {
                for (int x = 0; x < 4; x++)
                {
                    avg += input[x + (y * WebpConstants.Bps)];
                }
            }

            dc[k] = avg;
            input = input.Slice(4); // go to next 4x4 block.
        }
    }
}

private void ImportBlock(Span<byte> src, int srcStride, Span<byte> dst, int w, int h, int size)
{
int dstIdx = 0;
Expand Down