Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add 4 channel byte shuffling
  • Loading branch information
JimBobSquarePants committed Oct 26, 2020
commit f659bc39501f89d16ec22937f1312beb6f3e424c
122 changes: 120 additions & 2 deletions src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ public static class HwIntrinsics
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
/// using the control and store the results in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of float</param>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4ChannelReduce(
Expand Down Expand Up @@ -58,6 +58,46 @@ public static void Shuffle4ChannelReduce(
}
}

/// <summary>
/// Shuffles 8-bit integers within 128-bit lanes in <paramref name="source"/>
/// using the control and stores the results in <paramref name="dest"/>.
/// Only the leading part of the spans whose length is a whole multiple of the
/// hardware vector size is processed. When anything was processed, both spans are
/// advanced past it so that, on return, they cover only the unprocessed remainder.
/// When neither AVX2 nor SSSE3 is available the spans are left untouched.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4ChannelReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (Avx2.IsSupported || Ssse3.IsSupported)
    {
        // The width must be chosen with the same check the vectorized Shuffle4Channel
        // uses to pick its code path (AVX2, not merely AVX). Otherwise hardware with
        // AVX but without AVX2 would run the 128-bit path while the remainder is
        // computed for 256-bit vectors, leaving more bytes than necessary to the
        // scalar fallback.
        int vectorSize = Avx2.IsSupported
            ? Vector256<byte>.Count
            : Vector128<byte>.Count;

        int remainder = ImageMaths.ModuloP2(source.Length, vectorSize);
        int adjustedCount = source.Length - remainder;

        if (adjustedCount > 0)
        {
            Shuffle4Channel(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount),
                control);

            // Hand the unprocessed tail back to the caller.
            source = source.Slice(adjustedCount);
            dest = dest.Slice(adjustedCount);
        }
    }
}

[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Channel(
ReadOnlySpan<float> source,
Expand Down Expand Up @@ -98,6 +138,84 @@ private static void Shuffle4Channel(
}
}

/// <summary>
/// Shuffles the bytes of <paramref name="source"/> into <paramref name="dest"/> one
/// hardware vector at a time, using AVX2 when it is supported and SSSE3 otherwise.
/// The number of vectors processed is derived from the length of <paramref name="dest"/>;
/// bytes beyond the last whole vector are not touched.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control used to select or build the shuffle mask.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Channel(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    if (!Avx2.IsSupported)
    {
        // Ssse3: 128-bit vectors.
        int count128 = dest.Length / Vector128<byte>.Count;

        // Well-known controls have ready-made masks; anything else is built on the stack.
        Vector128<byte> mask128;
        if (control == Shuffle.WXYZ)
        {
            mask128 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_128));
        }
        else if (control == Shuffle.XYZW)
        {
            mask128 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(Shuffle.XYZW_128));
        }
        else if (control == Shuffle.ZYXW)
        {
            mask128 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_128));
        }
        else
        {
            Span<byte> custom128 = stackalloc byte[Vector128<byte>.Count];
            Shuffle.MmShuffleSpan(ref custom128, control);
            mask128 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(custom128));
        }

        ref Vector128<byte> src128 =
            ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));

        ref Vector128<byte> dst128 =
            ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));

        for (int v = 0; v < count128; v++)
        {
            Unsafe.Add(ref dst128, v) = Ssse3.Shuffle(Unsafe.Add(ref src128, v), mask128);
        }

        return;
    }

    // Avx2: 256-bit vectors.
    int count256 = dest.Length / Vector256<byte>.Count;

    Vector256<byte> mask256;
    if (control == Shuffle.WXYZ)
    {
        mask256 = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_256));
    }
    else if (control == Shuffle.XYZW)
    {
        mask256 = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(Shuffle.XYZW_256));
    }
    else if (control == Shuffle.ZYXW)
    {
        mask256 = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_256));
    }
    else
    {
        Span<byte> custom256 = stackalloc byte[Vector256<byte>.Count];
        Shuffle.MmShuffleSpan(ref custom256, control);
        mask256 = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(custom256));
    }

    ref Vector256<byte> src256 =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source));

    ref Vector256<byte> dst256 =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    for (int v = 0; v < count256; v++)
    {
        Vector256<byte> lanes = Unsafe.Add(ref src256, v);
        Unsafe.Add(ref dst256, v) = Avx2.Shuffle(lanes, mask256);
    }
}

/// <summary>
/// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
/// </summary>
Expand Down
115 changes: 80 additions & 35 deletions src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ internal static partial class SimdUtils
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
/// using the control and store the results in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of float</param>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel(
Expand All @@ -38,14 +38,43 @@ public static void Shuffle4Channel(
}
}

/// <summary>
/// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
/// using the control and store the results in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel(
ReadOnlySpan<byte> source,
Span<byte> dest,
byte control)
{
VerifyShuffleSpanInput(source, dest);

// TODO: There doesn't seem to be any APIs for
// System.Numerics that allow shuffling.
#if SUPPORTS_RUNTIME_INTRINSICS
HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
#endif

// Deal with the remainder:
if (source.Length > 0)
{
ShuffleRemainder4Channel(source, dest, control);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The non-SIMD shuffling has optimized implementations in PixelConverter, although I'm unsure which runtimes support those optimizations.

Copy link
Member Author

@JimBobSquarePants JimBobSquarePants Oct 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I saw that, but there's a limited set of shuffles (though we could maybe T4 template it?). With this API we could potentially expose the ability to use any shuffle combination, for example XXXX. We can still use the optimizations in the explicit pixel converters though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It really depends on where we want to get in the end, and what the overall pipelines would look like. (See my main comment.)

Copy link
Member

@antonfirsov antonfirsov Oct 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to avoid regression on non-x86 and/or non 3.1+, I think we have 2 options now:

  1. Handle the cases right in the PixelOperations method:
            public override void FromRgba32(Configuration configuration, ReadOnlySpan<Rgba32> sourcePixels, Span<Argb32> destinationPixels)
            {
                Guard.NotNull(configuration, nameof(configuration));
                Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));

#if SUPPORTS_RUNTIME_INTRINSICS
                if (Avx.IsSupported || Sse.IsSupported)
                {
                    var source = MemoryMarshal.Cast<Rgba32, byte>(sourcePixels);
                    var dest = MemoryMarshal.Cast<Argb32, byte>(destinationPixels);

                    SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WXYZ);
                }
                else
                
#else
                {
                    ref uint sourceRef = ref Unsafe.As<Rgba32, uint>(ref MemoryMarshal.GetReference(sourcePixels));
                    ref uint destRef = ref Unsafe.As<Argb32, uint>(ref MemoryMarshal.GetReference(destinationPixels));

                    for (int i = 0; i < sourcePixels.Length; i++)
                    {
                        uint sp = Unsafe.Add(ref sourceRef, i);
                        Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToArgb32(sp);
                    }
                }
#endif
            }
  2. Add a switch statement to ShuffleRemainder4Channel to handle well-known cases for control with the code in PixelConverter.

What is the best depends on the plans to handle the 4->3 and the 3->4 cases, so we have consistency in the resulting code. See #1404 (comment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2 is probably better

Copy link
Member

@antonfirsov antonfirsov Oct 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there is one other option, you probably meant this:
3. T4 multiple variants of Shuffle4Channel, so we can avoid the switch-branch, but it's quite a lot of work, and I'm unsure if the perf gains will justify it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There you go... I haven't wired them up yet, but I've optimized our existing shuffles so there's no slowdown on older .NET Core.

About 4x faster than they were previously.

Method Job EnvironmentVariables Count Mean Error StdDev Ratio RatioSD Gen 0 Gen 1 Gen 2 Allocated
Shuffle4Channel AVX Empty 128 21.56 ns 0.454 ns 0.651 ns 1.00 0.00 - - - -
Shuffle4Channel No HwIntrinsics COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 128 19.50 ns 0.252 ns 0.224 ns 0.89 0.03 - - - -
Shuffle4Channel SSE COMPlus_EnableAVX=0 128 17.66 ns 0.125 ns 0.111 ns 0.81 0.03 - - - -
Shuffle4Channel AVX Empty 256 24.49 ns 0.284 ns 0.252 ns 1.00 0.00 - - - -
Shuffle4Channel No HwIntrinsics COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 256 38.02 ns 0.783 ns 1.219 ns 1.51 0.04 - - - -
Shuffle4Channel SSE COMPlus_EnableAVX=0 256 29.11 ns 0.317 ns 0.281 ns 1.19 0.02 - - - -
Shuffle4Channel AVX Empty 512 29.75 ns 0.233 ns 0.218 ns 1.00 0.00 - - - -
Shuffle4Channel No HwIntrinsics COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 512 79.31 ns 1.252 ns 1.171 ns 2.67 0.05 - - - -
Shuffle4Channel SSE COMPlus_EnableAVX=0 512 29.84 ns 0.603 ns 0.592 ns 1.00 0.02 - - - -
Shuffle4Channel AVX Empty 1024 38.62 ns 0.244 ns 0.228 ns 1.00 0.00 - - - -
Shuffle4Channel No HwIntrinsics COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 1024 150.61 ns 0.293 ns 0.245 ns 3.90 0.02 - - - -
Shuffle4Channel SSE COMPlus_EnableAVX=0 1024 46.57 ns 0.606 ns 0.567 ns 1.21 0.02 - - - -
Shuffle4Channel AVX Empty 2048 59.99 ns 0.352 ns 0.312 ns 1.00 0.00 - - - -
Shuffle4Channel No HwIntrinsics COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 2048 283.23 ns 0.869 ns 0.770 ns 4.72 0.02 - - - -
Shuffle4Channel SSE COMPlus_EnableAVX=0 2048 85.03 ns 1.708 ns 1.899 ns 1.42 0.04 - - - -

}
}

[MethodImpl(InliningOptions.ColdPath)]
public static void ShuffleRemainder4Channel(
ReadOnlySpan<float> source,
Span<float> dest,
public static void ShuffleRemainder4Channel<T>(
ReadOnlySpan<T> source,
Span<T> dest,
byte control)
where T : struct
{
ref float sBase = ref MemoryMarshal.GetReference(source);
ref float dBase = ref MemoryMarshal.GetReference(dest);
ref T sBase = ref MemoryMarshal.GetReference(source);
ref T dBase = ref MemoryMarshal.GetReference(dest);
Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

for (int i = 0; i < source.Length; i += 4)
Expand All @@ -58,7 +87,8 @@ public static void ShuffleRemainder4Channel(
}

[Conditional("DEBUG")]
private static void VerifyShuffleSpanInput(ReadOnlySpan<float> source, Span<float> dest)
private static void VerifyShuffleSpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
where T : struct
{
DebugGuard.IsTrue(
source.Length == dest.Length,
Expand All @@ -77,49 +107,64 @@ public static class Shuffle
public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0;
public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2;

public static ReadOnlySpan<byte> WXYZ_128 => MmShuffleByte128(2, 1, 0, 3);
public static ReadOnlySpan<byte> WXYZ_128 => MmShuffleSpan128(WXYZ);

public static ReadOnlySpan<byte> XYZW_128 => MmShuffleByte128(3, 2, 1, 0);
public static ReadOnlySpan<byte> XYZW_128 => MmShuffleSpan128(XYZW);

public static ReadOnlySpan<byte> ZYXW_128 => MmShuffleByte128(3, 0, 1, 2);
public static ReadOnlySpan<byte> ZYXW_128 => MmShuffleSpan128(ZYXW);

public static ReadOnlySpan<byte> WXYZ_256 => MmShuffleByte256(2, 1, 0, 3);
public static ReadOnlySpan<byte> WXYZ_256 => MmShuffleSpan256(WXYZ);

public static ReadOnlySpan<byte> XYZW_256 => MmShuffleByte256(3, 2, 1, 0);
public static ReadOnlySpan<byte> XYZW_256 => MmShuffleSpan256(XYZW);

public static ReadOnlySpan<byte> ZYXW_256 => MmShuffleByte256(3, 0, 1, 2);
public static ReadOnlySpan<byte> ZYXW_256 => MmShuffleSpan256(ZYXW);

private static byte[] MmShuffleByte128(int p3, int p2, int p1, int p0)
private static ReadOnlySpan<byte> MmShuffleSpan128(byte control)
{
byte[] result = new byte[16];

for (int i = 0; i < result.Length; i += 4)
{
result[i] = (byte)(p0 + i);
result[i + 1] = (byte)(p1 + i);
result[i + 2] = (byte)(p2 + i);
result[i + 3] = (byte)(p3 + i);
}
Span<byte> buffer = new byte[16];
MmShuffleSpan(ref buffer, control);
return buffer;
}

return result;
private static ReadOnlySpan<byte> MmShuffleSpan256(byte control)
{
Span<byte> buffer = new byte[32];
MmShuffleSpan(ref buffer, control);
return buffer;
}

private static byte[] MmShuffleByte256(int p3, int p2, int p1, int p0)
[MethodImpl(InliningOptions.ShortMethod)]
public static byte MmShuffle(int p3, int p2, int p1, int p0)
=> (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0);

[MethodImpl(InliningOptions.ShortMethod)]
public static void MmShuffleSpan(ref Span<byte> span, byte control)
{
byte[] result = new byte[32];
InverseMmShuffle(
control,
out int p3,
out int p2,
out int p1,
out int p0);

for (int i = 0; i < result.Length; i += 4)
ref byte spanBase = ref MemoryMarshal.GetReference(span);

for (int i = 0; i < span.Length; i += 4)
{
result[i] = (byte)(p0 + i);
result[i + 1] = (byte)(p1 + i);
result[i + 2] = (byte)(p2 + i);
result[i + 3] = (byte)(p3 + i);
Unsafe.Add(ref spanBase, i) = (byte)(p0 + i);
Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i);
Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i);
Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i);
}

return result;
}

public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0)
[MethodImpl(InliningOptions.ShortMethod)]
public static void InverseMmShuffle(
byte control,
out int p3,
out int p2,
out int p1,
out int p0)
{
p3 = control >> 6 & 0x3;
p2 = control >> 4 & 0x3;
Expand Down
68 changes: 68 additions & 0 deletions tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using BenchmarkDotNet.Attributes;

namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
/// <summary>
/// Benchmarks <c>SimdUtils.Shuffle4Channel</c> over byte buffers of
/// <see cref="Count"/> bytes using the <c>WXYZ</c> shuffle control.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class ShuffleByte4Channel
{
    private byte[] source;
    private byte[] destination;

    /// <summary>
    /// Gets or sets the length, in bytes, of the source and destination buffers.
    /// </summary>
    [Params(128, 256, 512, 1024, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Allocates both buffers and fills the source with pseudo-random bytes.
    /// The generator is seeded with <see cref="Count"/>, so the input is
    /// reproducible for a given buffer length.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int length = this.Count;
        var random = new Random(length);

        this.source = new byte[length];
        random.NextBytes(this.source);
        this.destination = new byte[length];
    }

    /// <summary>
    /// Shuffles the whole source buffer into the destination buffer.
    /// </summary>
    [Benchmark]
    public void Shuffle4Channel()
        => SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ);
}

// 2020-10-26
// ##########
//
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
// Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
// .NET Core SDK = 5.0.100-rc.2.20479.15
//
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
// AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
// No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
// SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
//
// Runtime=.NET Core 3.1
//
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
// |---------------- |---------------- |-------------------------------------------------- |------ |----------:|----------:|----------:|------:|--------:|-------:|------:|------:|----------:|
// | Shuffle4Channel | AVX | Empty | 128 | 33.57 ns | 0.694 ns | 1.268 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.97 ns | 0.940 ns | 1.045 ns | 1.94 | 0.10 | - | - | - | - |
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 27.23 ns | 0.338 ns | 0.300 ns | 0.84 | 0.04 | 0.0095 | - | - | 40 B |
// | | | | | | | | | | | | | |
// | Shuffle4Channel | AVX | Empty | 256 | 34.57 ns | 0.295 ns | 0.276 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 124.62 ns | 0.257 ns | 0.228 ns | 3.60 | 0.03 | - | - | - | - |
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 32.22 ns | 0.106 ns | 0.099 ns | 0.93 | 0.01 | 0.0095 | - | - | 40 B |
// | | | | | | | | | | | | | |
// | Shuffle4Channel | AVX | Empty | 512 | 40.41 ns | 0.826 ns | 0.848 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 251.65 ns | 0.440 ns | 0.412 ns | 6.23 | 0.13 | - | - | - | - |
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 41.54 ns | 0.128 ns | 0.114 ns | 1.03 | 0.02 | 0.0095 | - | - | 40 B |
// | | | | | | | | | | | | | |
// | Shuffle4Channel | AVX | Empty | 1024 | 51.54 ns | 0.156 ns | 0.121 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 493.66 ns | 1.316 ns | 1.231 ns | 9.58 | 0.04 | - | - | - | - |
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 61.45 ns | 0.216 ns | 0.181 ns | 1.19 | 0.00 | 0.0095 | - | - | 40 B |
// | | | | | | | | | | | | | |
// | Shuffle4Channel | AVX | Empty | 2048 | 76.85 ns | 0.176 ns | 0.138 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 985.64 ns | 11.396 ns | 10.103 ns | 12.84 | 0.15 | - | - | - | - |
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 106.13 ns | 0.335 ns | 0.297 ns | 1.38 | 0.01 | 0.0095 | - | - | 40 B |
}
Loading