diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index f252864476..d7511fddac 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -280,7 +280,7 @@ public void AddInPlace(float value)
}
/// <summary>
- /// Quantize input block, apply zig-zag ordering and store result as 16bit integers.
+ /// Quantize input block, transpose, apply zig-zag ordering and store as <see cref="Block8x8"/>.
/// </summary>
/// <param name="block">Source block.</param>
/// <param name="dest">Destination block.</param>
@@ -291,19 +291,19 @@ public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8
if (Avx2.IsSupported)
{
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
- ZigZag.ApplyZigZagOrderingAvx2(ref dest);
+ ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
}
else if (Ssse3.IsSupported)
{
MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
- ZigZag.ApplyZigZagOrderingSsse3(ref dest);
+ ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest);
}
else
#endif
{
for (int i = 0; i < Size; i++)
{
- int idx = ZigZag.ZigZagOrder[i];
+ int idx = ZigZag.TransposingOrder[i];
float quantizedVal = block[idx] * qt[idx];
quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f;
dest[i] = (short)quantizedVal;
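
> The scalar fallback above reads through `ZigZag.TransposingOrder`, which folds the deferred transpose into the scan itself. A minimal, self-contained sketch of that relationship (an assumption, not taken from the library: `TransposingOrder[i]` should equal the classic JPEG zig-zag index with its row and column swapped, because the block now arrives in transposed layout):

```csharp
using System;

static class TransposingZigZagSketch
{
    // Classic zig-zag scan order from the JPEG specification.
    static readonly byte[] ZigZagOrder =
    {
         0,  1,  8, 16,  9,  2,  3, 10,
        17, 24, 32, 25, 18, 11,  4,  5,
        12, 19, 26, 33, 40, 48, 41, 34,
        27, 20, 13,  6,  7, 14, 21, 28,
        35, 42, 49, 56, 57, 50, 43, 36,
        29, 22, 15, 23, 30, 37, 44, 51,
        58, 59, 52, 45, 38, 31, 39, 46,
        53, 60, 61, 54, 47, 55, 62, 63,
    };

    // Row-major index (row * 8 + col) becomes (col * 8 + row) after transposition.
    static byte Transpose(byte idx) => (byte)(((idx & 7) << 3) | (idx >> 3));

    static void Main()
    {
        // Reading a transposed block at transposingOrder[i] yields the same
        // coefficient the classic scan reads from the untransposed block.
        byte[] transposingOrder = Array.ConvertAll(ZigZagOrder, Transpose);
        Console.WriteLine(string.Join(", ", transposingOrder));
    }
}
```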
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index 94864005ec..8acc4b6269 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -29,11 +29,10 @@ private static void FDCT8x8_Avx(ref Block8x8F block)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
- // First pass - process rows
- block.TransposeInplace();
+ // First pass - process columns
FDCT8x8_1D_Avx(ref block);
- // Second pass - process columns
+ // Second pass - process rows
block.TransposeInplace();
FDCT8x8_1D_Avx(ref block);
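
> The swapped comments reflect the dropped leading transpose: the first 1D pass now runs down the columns of the raw input, so the finished block comes out transposed. A self-contained sketch of why that holds, with a naive 1D DCT standing in for `FDCT8x8_1D_Avx` (all names and the harness are illustrative, not library code):

```csharp
using System;

static class PassOrderSketch
{
    const int N = 8;

    // Naive 1D DCT-II applied to each column in place; a stand-in for the
    // vectorized kernel, which also transforms along columns.
    static void Fdct1DColumns(double[,] b)
    {
        var tmp = new double[N];
        for (int c = 0; c < N; c++)
        {
            for (int k = 0; k < N; k++)
            {
                double s = 0;
                for (int n = 0; n < N; n++)
                    s += b[n, c] * Math.Cos(Math.PI / N * (n + 0.5) * k);
                tmp[k] = s;
            }
            for (int k = 0; k < N; k++) b[k, c] = tmp[k];
        }
    }

    static void Transpose(double[,] b)
    {
        for (int r = 0; r < N; r++)
            for (int c = r + 1; c < N; c++)
                (b[r, c], b[c, r]) = (b[c, r], b[r, c]);
    }

    static void Main()
    {
        var rng = new Random(42);
        var a = new double[N, N];
        for (int r = 0; r < N; r++)
            for (int c = 0; c < N; c++)
                a[r, c] = rng.Next(-128, 128);
        var b = (double[,])a.Clone();

        // Old order: transpose, 1D, transpose, 1D -> standard orientation.
        Transpose(a); Fdct1DColumns(a); Transpose(a); Fdct1DColumns(a);

        // New order: 1D, transpose, 1D -> transposed result.
        Fdct1DColumns(b); Transpose(b); Fdct1DColumns(b);

        // b should equal a transposed, up to floating-point rounding.
        double maxDiff = 0;
        for (int r = 0; r < N; r++)
            for (int c = 0; c < N; c++)
                maxDiff = Math.Max(maxDiff, Math.Abs(a[r, c] - b[c, r]));
        Console.WriteLine($"max |old[r,c] - new[c,r]| = {maxDiff}");
    }
}
```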
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index c27ad5b82b..e1bcff30f3 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -92,6 +92,11 @@ public static void AdjustToFDCT(ref Block8x8F quantTable)
tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i));
tableRef = ref Unsafe.Add(ref tableRef, 1);
}
+
+ // Spectral macroblocks are not transposed before quantization;
+ // the transpose is done after quantization, at the zig-zag stage,
+ // so we must transpose the quantization table.
+ quantTable.TransposeInplace();
}
/// <summary>
@@ -133,14 +138,9 @@ public static void TransformFDCT(ref Block8x8F block)
}
else
#endif
- if (Vector.IsHardwareAccelerated)
{
FDCT_Vector4(ref block);
}
- else
- {
- FDCT_Scalar(ref block);
- }
}
/// <summary>
@@ -217,136 +217,17 @@ static void IDCT8x4_Vector4(ref Vector4 vecRef)
}
}
- /// <summary>
- /// Apply 2D floating point FDCT inplace using scalar operations.
- /// </summary>
- /// <remarks>
- /// Ported from libjpeg-turbo <see href="https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c"/>.
- /// </remarks>
- /// <param name="block">Input block.</param>
- private static void FDCT_Scalar(ref Block8x8F block)
- {
- const int dctSize = 8;
-
- float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- float tmp10, tmp11, tmp12, tmp13;
- float z1, z2, z3, z4, z5, z11, z13;
-
- // First pass - process rows
- ref float blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
- for (int ctr = 7; ctr >= 0; ctr--)
- {
- tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
- tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
- tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
- tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
- tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
- tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
- tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
- tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);
-
- // Even part
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
- Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
- Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;
-
- z1 = (tmp12 + tmp13) * 0.707106781f;
- Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
- Unsafe.Add(ref blockRef, 6) = tmp13 - z1;
-
- // Odd part
- tmp10 = tmp4 + tmp5;
- tmp11 = tmp5 + tmp6;
- tmp12 = tmp6 + tmp7;
-
- z5 = (tmp10 - tmp12) * 0.382683433f;
- z2 = (0.541196100f * tmp10) + z5;
- z4 = (1.306562965f * tmp12) + z5;
- z3 = tmp11 * 0.707106781f;
-
- z11 = tmp7 + z3;
- z13 = tmp7 - z3;
-
- Unsafe.Add(ref blockRef, 5) = z13 + z2;
- Unsafe.Add(ref blockRef, 3) = z13 - z2;
- Unsafe.Add(ref blockRef, 1) = z11 + z4;
- Unsafe.Add(ref blockRef, 7) = z11 - z4;
-
- blockRef = ref Unsafe.Add(ref blockRef, dctSize);
- }
-
- // Second pass - process columns
- blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
- for (int ctr = 7; ctr >= 0; ctr--)
- {
- tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
- tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
- tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
- tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
- tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
- tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
- tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
- tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);
-
- // Even part
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
- Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
- Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;
-
- z1 = (tmp12 + tmp13) * 0.707106781f;
- Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
- Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;
-
- // Odd part
- tmp10 = tmp4 + tmp5;
- tmp11 = tmp5 + tmp6;
- tmp12 = tmp6 + tmp7;
-
- z5 = (tmp10 - tmp12) * 0.382683433f;
- z2 = (0.541196100f * tmp10) + z5;
- z4 = (1.306562965f * tmp12) + z5;
- z3 = tmp11 * 0.707106781f;
-
- z11 = tmp7 + z3;
- z13 = tmp7 - z3;
-
- Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
- Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
- Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
- Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;
-
- blockRef = ref Unsafe.Add(ref blockRef, 1);
- }
- }
-
/// <summary>
/// Apply floating point FDCT inplace using <see cref="Vector4"/> API.
/// </summary>
- /// <remarks>
- /// This implementation must be called only if hardware supports 4
- /// floating point numbers vector. Otherwise explicit scalar
- /// <see cref="FDCT_Scalar"/> implementation is faster
- /// because it does not rely on block transposition.
- /// </remarks>
/// <param name="block">Input block.</param>
public static void FDCT_Vector4(ref Block8x8F block)
{
- DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
-
- // First pass - process rows
- block.TransposeInplace();
+ // First pass - process columns
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);
- // Second pass - process columns
+ // Second pass - process rows
block.TransposeInplace();
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);
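
> Why transposing the quantization table in `AdjustToFDCT` is sufficient: element-wise multiplication commutes with transposition, so quantizing the transposed spectral block with a transposed table yields exactly the transpose of the conventional product, which the transposing zig-zag scan then reads back in standard order. A small sketch of that identity on flat 8x8 arrays (names are hypothetical, not library API):

```csharp
using System;

static class QuantTableTransposeSketch
{
    // Row-major index (r * 8 + c) maps to (c * 8 + r) under transposition.
    static int T(int i) => ((i & 7) << 3) | (i >> 3);

    static void Main()
    {
        var rng = new Random(7);
        var block = new float[64]; // spectral block, standard orientation
        var qt = new float[64];    // reciprocal quantization table
        for (int i = 0; i < 64; i++)
        {
            block[i] = rng.Next(-1024, 1024);
            qt[i] = 1f / rng.Next(1, 100);
        }

        // Transposed copies: TransformFDCT now leaves the block like blockT,
        // and AdjustToFDCT produces qtT.
        var blockT = new float[64];
        var qtT = new float[64];
        for (int i = 0; i < 64; i++)
        {
            blockT[T(i)] = block[i];
            qtT[T(i)] = qt[i];
        }

        // The two pipelines agree at every coefficient index, so scanning
        // blockT/qtT with TransposingOrder[i] == T(ZigZagOrder[i]) reproduces
        // the old ZigZagOrder output exactly.
        for (int idx = 0; idx < 64; idx++)
        {
            float oldVal = block[idx] * qt[idx];
            float newVal = blockT[T(idx)] * qtT[T(idx)];
            if (oldVal != newVal) Console.WriteLine($"mismatch at {idx}");
        }
        Console.WriteLine("identity holds");
    }
}
```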
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
index 6577739c1a..850de26c30 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -3,6 +3,7 @@
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
+using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -18,120 +19,138 @@ internal static partial class ZigZag
#pragma warning restore SA1309
/// <summary>
- /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
+ /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
- // row0
- 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
- _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
- _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
-
- // row1
- _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
- 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
- _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
-
- // row2
- _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
- _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
-
- // row3
- _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
- _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
- _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
- 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
-
- // row4
- _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
- _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
- _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
-
- // row5
- _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
- 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
-
- // row6
- _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
- _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
- 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
-
- // row7
- 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
- _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
+#pragma warning disable SA1515
+ /* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
+ // A
+ 0, 1, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
+ // B
+ _, _, 0, 1, _, _, _, _, 2, 3, _, _, _, _, _, _,
+ // C
+ _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3,
+
+ /* row1 - B2 A3 A4 B3 C2 D1 E0 F0 */
+ // A
+ _, _, 6, 7, 8, 9, _, _, _, _, _, _, _, _, _, _,
+ // B
+ 4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _, _, _,
+
+ /* row2 - E1 D2 C3 B4 A5 A6 B5 C4 */
+ // A
+ _, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, _, _,
+ // B
+ _, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11, _, _,
+ // C
+ _, _, _, _, 6, 7, _, _, _, _, _, _, _, _, 8, 9,
+
+ /* row3 - D3 E2 F1 G0 H0 G1 F2 E3 */
+ // E
+ _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 6, 7,
+ // F
+ _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _,
+ // G
+ _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _,
+
+ /* row4 - D4 C5 B6 A7 B7 C6 D5 E4 */
+ // B
+ _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _,
+ // C
+ _, _, 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _,
+ // D
+ 8, 9, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _,
+
+ /* row5 - F3 G2 H1 H2 G3 F4 E5 D6 */
+ // F
+ 6, 7, _, _, _, _, _, _, _, _, 8, 9, _, _, _, _,
+ // G
+ _, _, 4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _,
+ // H
+ _, _, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
+
+ /* row6 - C7 D7 E6 F5 G4 H3 H4 G5 */
+ // G
+ _, _, _, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11,
+ // H
+ _, _, _, _, _, _, _, _, _, _, 6, 7, 8, 9, _, _,
+
+ /* row7 - F6 E7 F7 G6 H5 H6 G7 H7 */
+ // F
+ 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
+ // G
+ _, _, _, _, _, _, 12, 13, _, _, _, _, 14, 15, _, _,
+ // H
+ _, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
+#pragma warning restore SA1515
};
/// <summary>
- /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
+ /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
{
- // 01_AB/01_EF/23_CD - cross-lane
- 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,
-
- // 01_AB - inner-lane
- 0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,
-
- // 01_CD/23_GH - cross-lane
- 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,
-
- // 01_CD - inner-lane
- _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,
-
- // 01_EF - inner-lane
- _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
-
- // 23_AB/45_CD/67_EF - cross-lane
- 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
-
- // 23_AB - inner-lane
- 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,
-
- // 23_CD - inner-lane
- _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,
-
- // 23_EF - inner-lane
- _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
-
- // 23_GH - inner-lane
- _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
-
- // 45_AB - inner-lane
- _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,
-
- // 45_CD - inner-lane
- _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,
-
- // 45_EF - cross-lane
- 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
-
- // 45_EF - inner-lane
- 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,
-
- // 45_GH - inner-lane
- _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
-
- // 67_CD - inner-lane
- _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
-
- // 67_EF - inner-lane
- _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,
-
- // 67_GH - inner-lane
- 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
+#pragma warning disable SA1515
+ /* 01 */
+ // [cr] crln_01_AB_CD
+ 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
+ // (in) AB
+ 0, 1, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 2, 3, 4, 5, 14, 15, _, _, _, _, _, _, _, _,
+ // (in) CD
+ _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _, _, _, _, _, 0, 1, 10, 11, _, _, _, _,
+ // [cr] crln_01_23_EF_23_CD
+ 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
+ // (in) EF
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9,
+
+ /* 23 */
+ // [cr] crln_23_AB_23_45_GH
+ 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
+ // (in) AB
+ _, _, _, _, _, _, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+ // (in) CDe
+ _, _, 12, 13, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+ // (in) EF
+ 2, 3, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 6, 7,
+ // (in) GH
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _,
+
+ /* 45 */
+ // (in) AB
+ _, _, _, _, 12, 13, 6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+ // [cr] crln_45_67_CD_45_EF
+ 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
+ // (in) CD
+ 8, 9, 2, 3, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 12, 13,
+ // (in) EF
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 2, 3, _, _,
+ // (in) GH
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, 12, 13, 6, 7, _, _, _, _, _, _,
+
+ /* 67 */
+ // (in) CD
+ 6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+ // [cr] crln_67_EF_67_GH
+ 2, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
+ // (in) EF
+ _, _, _, _, 4, 5, 14, 15, _, _, _, _, _, _, _, _, 8, 9, 2, 3, 10, 11, _, _, _, _, _, _, _, _, _, _,
+ // (in) GH
+ _, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
+#pragma warning restore SA1515
};
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
- public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
+ public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
{
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
- fixed (byte* maskPtr = SseShuffleMasks)
+ fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
{
Vector128<byte> rowA = block.V0.AsByte();
Vector128<byte> rowB = block.V1.AsByte();
@@ -142,73 +161,69 @@ public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
Vector128<byte> rowG = block.V6.AsByte();
Vector128<byte> rowH = block.V7.AsByte();
- // row0 - A0 A1 B0 C0 B1 A2 A3 B2
- Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
- Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
- Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
- Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
- row0 = Sse2.Or(row0, rowC0);
-
- // row1 - C1 D0 E0 D1 C2 B3 A4 A5
- Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
- Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
- Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
- Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
- row1 = Sse2.Or(row1, rowD1);
- row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
- row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();
-
- // row2
- Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
- Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
- Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
- row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
- row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
- row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
- row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();
-
- // row3
- Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16();
- Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16();
- Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
- Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
- row3 = Sse2.Or(row3, rowC3);
- Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
- Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
- row3 = Sse2.Or(row3, rowD3);
-
- // row4
- Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
- Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
- Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
- Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
- row4 = Sse2.Or(row4, rowG4);
- Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
- row4 = Sse2.Or(row4, rowH4);
-
- // row5
- Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
- Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
- Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
- row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
- row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
- row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
- row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();
-
- // row6
- Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
- Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
- Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
- Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
- row6 = Sse2.Or(row6, rowH6);
- row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
- row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();
-
- // row7
- Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
- Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
- Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
- row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();
+ // row0 - A0 B0 A1 A2 B1 C0 D0 C1
+ Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
+ Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
+ Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
+ Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
+ row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
+
+ // row1 - B2 A3 A4 B3 C2 D1 E0 F0
+ Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
+ Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
+ Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
+ row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
+ row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
+ row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
+ row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
+
+ // row2 - E1 D2 C3 B4 A5 A6 B5 C4
+ Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
+ Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
+ Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
+ Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
+ row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
+ row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
+
+ // row3 - D3 E2 F1 G0 H0 G1 F2 E3
+ Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
+ Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
+ Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
+ Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
+ row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
+ row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
+
+ // row4 - D4 C5 B6 A7 B7 C6 D5 E4
+ Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
+ Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
+ Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
+ Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
+ row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
+ row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
+
+ // row5 - F3 G2 H1 H2 G3 F4 E5 D6
+ Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
+ Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
+ Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
+ Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
+ row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
+ row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
+
+ // row6 - C7 D7 E6 F5 G4 H3 H4 G5
+ Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
+ Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
+ Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
+ row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
+ row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
+ row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
+ row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
+
+ // row7 - F6 E7 F7 G6 H5 H6 G7 H7
+ Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
+ Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
+ Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
+ Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
+ row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
block.V0 = row0;
block.V1 = row1;
@@ -225,69 +240,61 @@ public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
/// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
- public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
+ public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
- fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
+ fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(AvxShuffleMasks))
{
- Vector256<byte> rowsAB = block.V01.AsByte();
- Vector256<byte> rowsCD = block.V23.AsByte();
- Vector256<byte> rowsEF = block.V45.AsByte();
- Vector256<byte> rowsGH = block.V67.AsByte();
-
- // rows 0 1
- Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
- Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
+ Vector256<byte> rowAB = block.V01.AsByte();
+ Vector256<byte> rowCD = block.V23.AsByte();
+ Vector256<byte> rowEF = block.V45.AsByte();
+ Vector256<byte> rowGH = block.V67.AsByte();
+
+ /* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
+ Vector256<int> crln_01_AB_CD = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
+ Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_01_AB_CD).AsByte();
row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
-
- Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
- Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
- row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
-
- Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
- Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
-
- Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
-
- // rows 2 3
- Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
- Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
- Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
-
- Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
+ Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_AB_CD).AsByte();
+ row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (2 * 32))).AsByte();
+ Vector256<int> crln_01_23_EF_23_CD = Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)).AsInt32();
+ Vector256<byte> row01_23_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_01_23_EF_23_CD).AsByte();
+ Vector256<byte> row01_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
+
+ Vector256<byte> row01 = Avx2.Or(row01_AB, Avx2.Or(row01_CD, row01_EF));
+
+ /* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
+ Vector256<int> crln_23_AB_23_45_GH = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
+ Vector256<byte> row23_45_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_23_AB_23_45_GH).AsByte();
+ Vector256<byte> row23_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
+ Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_23_EF_23_CD).AsByte();
row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
-
- Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
-
- Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
- Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
+ Vector256<byte> row23_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
+ Vector256<byte> row23_45_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_23_AB_23_45_GH).AsByte();
+ Vector256<byte> row23_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32))).AsByte();
Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
- // rows 4 5
- Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
- Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
- Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
-
- Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
- Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
- row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
-
- Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
+ /* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
+ Vector256<byte> row45_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32))).AsByte();
+ Vector256<int> crln_45_67_CD_45_EF = Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsInt32();
+ Vector256<byte> row45_67_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_45_67_CD_45_EF).AsByte();
+ Vector256<byte> row45_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (12 * 32))).AsByte();
+ Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_45_67_CD_45_EF).AsByte();
+ row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32))).AsByte();
+ Vector256<byte> row45_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32))).AsByte();
Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
- // rows 6 7
- Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
-
- Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
- row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
-
- Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
- row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
+ /* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
+ Vector256<byte> row67_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32))).AsByte();
+ Vector256<int> crln_67_EF_67_GH = Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsInt32();
+ Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_67_EF_67_GH).AsByte();
+ row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32))).AsByte();
+ Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_67_EF_67_GH).AsByte();
+ row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (18 * 32))).AsByte();
- Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
+ Vector256<byte> row67 = Avx2.Or(row67_CD, Avx2.Or(row67_EF, row67_GH));
block.V01 = row01.AsInt16();
block.V23 = row23.AsInt16();
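
> A note on the mask tables above: every 16-byte group (SSE) and 32-byte `(in)` group (AVX) is a PSHUFB/VPSHUFB control vector, and `_` denotes a byte constant with its high bit set, which forces the corresponding output lane to zero so partially shuffled rows can be merged with a plain `Or`. A small standalone demonstration (the `0x80` sentinel here is an assumption mirroring the SA1309-suppressed constant in the real file):

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class PshufbSketch
{
    const byte _ = 0x80; // assumed sentinel: high bit set => lane zeroed

    static void Main()
    {
        if (!Ssse3.IsSupported)
        {
            Console.WriteLine("SSSE3 not supported on this machine.");
            return;
        }

        Vector128<byte> row = Vector128.Create(
            (byte)10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25);

        // Pick bytes 0-1 into lanes 0-1 and bytes 4-5 into lanes 6-7; zero the rest.
        Vector128<byte> mask = Vector128.Create(
            (byte)0, 1, _, _, _, _, 4, 5, _, _, _, _, _, _, _, _);

        Vector128<byte> shuffled = Ssse3.Shuffle(row, mask);
        Console.WriteLine(shuffled);
        // Zeroed lanes let partial results from different rows be OR-ed together,
        // exactly as the row0..row7 compositions above do.
    }
}
```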
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index ae7e81254b..9576cbd3c8 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -220,7 +220,7 @@ static void RunTest(string srcSeedSerialized, string qtSeedSerialized)
// Reference implementation quantizes given block via division
Block8x8 expected = default;
- ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
+ ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.TransposingOrder);
// Actual current implementation quantizes given block via multiplication
// With quantization table reciprocal
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 36570ce55a..9c467a1cc9 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -135,10 +135,9 @@ static void RunTest(string serialized)
FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
srcBlock.MultiplyInPlace(ref dequantMatrix);
+ // testee
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
-
- // IDCT calculation
FastFloatingPointDCT.TransformIDCT(ref srcBlock);
float[] actualDest = srcBlock.ToArray();
@@ -180,7 +179,10 @@ static void RunTest(string serialized)
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
// testee
+ // The second transpose is normally done by the Quantize step;
+ // do it manually here to match the reference implementation
FastFloatingPointDCT.TransformFDCT(ref block);
+ block.TransposeInplace();
// Part of the IDCT calculations is fused into the quantization step
// We must multiply input block with adjusted no-quantization matrix
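
> The FDCT test change follows the same logic as the rest of the patch: `TransformFDCT` now returns a transposed block and relies on the `Quantize` step to compensate, so a test comparing against the non-transposing reference has to add the transpose itself. Schematically (a hedged sketch of the flow; `CreateRandomBlock`, `ReferenceFdct`, and `AssertEqualish` are hypothetical placeholders, not test API):

```csharp
// Hedged sketch of the test flow after this change.
Block8x8F block = CreateRandomBlock();             // hypothetical helper

float[] expected = ReferenceFdct(block.ToArray()); // standard orientation

FastFloatingPointDCT.TransformFDCT(ref block);     // leaves the block transposed
block.TransposeInplace();                          // compensate, as Quantize would

AssertEqualish(expected, block.ToArray());         // hypothetical tolerance assert
```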