From 822c931867aeaf88b3d226b6d70fe8f06abaf413 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 30 Sep 2022 09:46:51 -0700
Subject: [PATCH] Ensure NI_Vector256_GetLower has the right simdSize and
 improve codegen slightly

---
 src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 15 +++++++++++++--
 src/coreclr/jit/hwintrinsicxarch.cpp        |  3 ++-
 src/coreclr/jit/lowerxarch.cpp              |  2 +-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 107705fa5e5729..8bd12f54d8d1df 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1093,12 +1093,23 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
         {
             if (op1->isContained() || op1->isUsedFromSpillTemp())
             {
-                genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1);
+                // We want to always emit the EA_16BYTE version here.
+                //
+                // For ToVector256Unsafe the upper bits don't matter and for GetLower we
+                // only actually need the lower 16 bytes, so we can just be "more efficient".
+
+                genHWIntrinsic_R_RM(node, ins, EA_16BYTE, targetReg, op1);
             }
             else
             {
+                // We want to always emit the EA_32BYTE version here.
+                //
+                // For ToVector256Unsafe the upper bits don't matter and this allows same
+                // register moves to be elided. For GetLower we're getting a Vector128 and
+                // so the upper bits aren't impactful either, allowing the same.
+
                 // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
-                emit->emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
+                emit->emitIns_Mov(INS_movaps, EA_32BYTE, targetReg, op1Reg, /* canSkip */ true);
             }
             break;
         }
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index bee8e827f3a042..616af4bee949d5 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1318,11 +1318,12 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
                     op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode(0xD8), NI_AVX2_Permute4x64,
                                                    simdOtherJitType, simdSize);
 
-                    simdSize = 16;
                     simdType = TYP_SIMD16;
 
                     op1 = gtNewSimdHWIntrinsicNode(simdType, op1, NI_Vector256_GetLower, simdBaseJitType,
                                                    simdSize);
+
+                    simdSize = 16;
                 }
                 break;
             }
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 07ec567c511a41..cbead10c82926e 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -2624,7 +2624,7 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
             // ...
             // op1 = op1.GetLower();
 
-            tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, 16);
+            tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize);
             BlockRange().InsertBefore(node, tmp1);
             LowerNode(tmp1);
         }
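
For context on why the codegen half of this change is safe: on x86, reading the lower 128 bits of a YMM register needs no instruction at all (an XMM register simply aliases the lower half of the corresponding YMM register), and widening an XMM value to 256 bits leaves the upper bits undefined. A minimal standalone C++ sketch of the same idea, using the analogous immintrin.h cast intrinsics; this is illustration only, not code from the patch, and the variable names are invented. Compile with -mavx:

    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        // "GetLower" analog: narrowing 256 -> 128 bits is a pure type cast.
        // _mm256_castps256_ps128 generates no instruction, which mirrors why
        // the JIT can skip the reg-to-reg movaps entirely when
        // targetReg == op1Reg (the canSkip argument above).
        __m256 wide  = _mm256_set1_ps(2.0f);
        __m128 lower = _mm256_castps256_ps128(wide);

        // "ToVector256Unsafe" analog: widening 128 -> 256 bits is also a free
        // cast, with the upper 128 bits left undefined. That is why the upper
        // bits "don't matter" in the comments added by this patch.
        __m128 narrow     = _mm_set1_ps(3.0f);
        __m256 unsafeWide = _mm256_castps128_ps256(narrow);

        printf("%f %f\n", _mm_cvtss_f32(lower), _mm256_cvtss_f32(unsafeWide));
        return 0;
    }

The same reasoning explains the EA size choices: a contained GetLower operand only ever has its lower 16 bytes consumed, so the EA_16BYTE form of the load suffices, while the EA_32BYTE reg-to-reg form lets same-register moves be elided without changing observable behavior.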