Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Use less permutes and more multiply/add
  • Loading branch information
JimBobSquarePants committed Oct 23, 2020
commit 3ae4b024a0a979e46d38c92557c32e431c5ed05a
2 changes: 2 additions & 0 deletions src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ public static class HwIntrinsics
{
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };

public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };

/// <summary>
/// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,13 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
var chromaOffset = Vector256.Create(-halfValue);
var scale = Vector256.Create(1 / maxValue);
var rCrMult = Vector256.Create(1.402F);
var gCbMult = Vector256.Create(0.344136F);
var gCrMult = Vector256.Create(0.714136F);
var gCbMult = Vector256.Create(-0.344136F);
var gCrMult = Vector256.Create(-0.714136F);
var bCbMult = Vector256.Create(1.772F);

// Used for packing.
Vector4 vo = Vector4.One;
Vector128<float> valpha = Unsafe.As<Vector4, Vector128<float>>(ref vo);
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32);
var va = Vector256.Create(1F);
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

// Walking 8 elements at one step:
Expand All @@ -87,58 +86,36 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);

y = Avx2.PermuteVar8x32(y, vcontrol);
cb = Avx2.PermuteVar8x32(cb, vcontrol);
cr = Avx2.PermuteVar8x32(cr, vcontrol);

// r = y + (1.402F * cr);
// g = y - (0.344136F * cb) - (0.714136F * cr);
// b = y + (1.772F * cb);
// Adding & multiplying 8 elements at one time:
Vector256<float> r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult);
Vector256<float> g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult));
Vector256<float> g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
Vector256<float> b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);

// TODO: We should be savving to RGBA not Vector4
r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale);
g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale);
b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale);

// Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the
// expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
//
// Left side.
Vector256<float> r0 = Avx.InsertVector128(
r,
Unsafe.As<Vector256<float>, Vector128<float>>(ref g),
1);

Vector256<float> r1 = Avx.InsertVector128(
b,
valpha,
1);

// Right side
Vector256<float> r2 = Avx.InsertVector128(
Unsafe.Add(ref Unsafe.As<Vector256<float>, Vector128<float>>(ref r), 1).ToVector256(),
Unsafe.Add(ref Unsafe.As<Vector256<float>, Vector128<float>>(ref g), 1),
1);

Vector256<float> r3 = Avx.InsertVector128(
Unsafe.Add(ref Unsafe.As<Vector256<float>, Vector128<float>>(ref b), 1).ToVector256(),
valpha,
1);

// Split into separate rows
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackHigh(r0, r1);

// Deinterleave and set
Vector256<float> vte = Avx.UnpackLow(r, b);
Vector256<float> vto = Avx.UnpackLow(g, va);

ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
destination = Avx2.PermuteVar8x32(t0, vcontrol);
Unsafe.Add(ref destination, 1) = Avx2.PermuteVar8x32(t2, vcontrol);

// Repeat for right side.
Vector256<float> t4 = Avx.UnpackLow(r2, r3);
Vector256<float> t6 = Avx.UnpackHigh(r2, r3);
destination = Avx.UnpackLow(vte, vto);
Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);

vte = Avx.UnpackHigh(r, b);
vto = Avx.UnpackHigh(g, va);

Unsafe.Add(ref destination, 2) = Avx2.PermuteVar8x32(t4, vcontrol);
Unsafe.Add(ref destination, 3) = Avx2.PermuteVar8x32(t6, vcontrol);
Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
}
#else
ref Vector<float> yBase =
Expand Down