From 4bf42a793d0595133fd1888065b353ae3691e6e2 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Tue, 11 Apr 2023 02:57:42 +0200
Subject: [PATCH] Revert "Revert "Add SIMD to LowerCallMemcmp (#84530)" (#84595)"

---
 src/coreclr/jit/gentree.cpp |  6 ++--
 src/coreclr/jit/lower.cpp   | 68 ++++++++++++++++++++++++++++++-------
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 5e1a168883002c..27073d655726dc 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -7384,9 +7384,9 @@ GenTree* Compiler::gtNewZeroConNode(var_types type)
 #ifdef FEATURE_SIMD
     if (varTypeIsSIMD(type))
     {
-        GenTreeVecCon* allBitsSet = gtNewVconNode(type);
-        allBitsSet->gtSimdVal     = simd_t::Zero();
-        return allBitsSet;
+        GenTreeVecCon* vecCon = gtNewVconNode(type);
+        vecCon->gtSimdVal     = simd_t::Zero();
+        return vecCon;
     }
 #endif // FEATURE_SIMD
 
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 87ea617fff8d4f..456fb7b9c3589f 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1902,8 +1902,20 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
     {
         GenTree* lArg = call->gtArgs.GetUserArgByIndex(0)->GetNode();
         GenTree* rArg = call->gtArgs.GetUserArgByIndex(1)->GetNode();
-        // TODO: Add SIMD path for [16..128] via GT_HWINTRINSIC nodes
-        if (cnsSize <= 16)
+
+        ssize_t MaxUnrollSize = 16;
+#ifdef FEATURE_SIMD
+        MaxUnrollSize = 32;
+#ifdef TARGET_XARCH
+        if (comp->compOpportunisticallyDependsOn(InstructionSet_Vector256))
+        {
+            MaxUnrollSize = 64;
+        }
+// TODO-XARCH-AVX512: Consider enabling this for AVX512
+#endif
+#endif
+
+        if (cnsSize <= MaxUnrollSize)
         {
             unsigned loadWidth = 1 << BitOperations::Log2((unsigned)cnsSize);
             var_types loadType;
@@ -1919,11 +1931,25 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
             {
                 loadType = TYP_INT;
             }
-            else if ((loadWidth == 8) || (loadWidth == 16))
+            else if ((loadWidth == 8) || (MaxUnrollSize == 16))
             {
                 loadWidth = 8;
                 loadType  = TYP_LONG;
             }
+#ifdef FEATURE_SIMD
+            else if ((loadWidth == 16) || (MaxUnrollSize == 32))
+            {
+                loadWidth = 16;
+                loadType  = TYP_SIMD16;
+            }
+#ifdef TARGET_XARCH
+            else if ((loadWidth == 32) || (MaxUnrollSize == 64))
+            {
+                loadWidth = 32;
+                loadType  = TYP_SIMD32;
+            }
+#endif // TARGET_XARCH
+#endif // FEATURE_SIMD
             else
             {
                 unreached();
@@ -1932,8 +1958,26 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
 
             GenTree* result = nullptr;
 
+            auto newBinaryOp = [](Compiler* comp, genTreeOps oper, var_types type, GenTree* op1,
+                                  GenTree* op2) -> GenTree* {
+#ifdef FEATURE_SIMD
+                if (varTypeIsSIMD(op1))
+                {
+                    if (GenTree::OperIsCmpCompare(oper))
+                    {
+                        assert(type == TYP_INT);
+                        return comp->gtNewSimdCmpOpAllNode(oper, TYP_BOOL, op1, op2, CORINFO_TYPE_NATIVEUINT,
+                                                           genTypeSize(op1));
+                    }
+                    return comp->gtNewSimdBinOpNode(oper, op1->TypeGet(), op1, op2, CORINFO_TYPE_NATIVEUINT,
+                                                    genTypeSize(op1));
+                }
+#endif
+                return comp->gtNewOperNode(oper, type, op1, op2);
+            };
+
             // loadWidth == cnsSize means a single load is enough for both args
-            if ((loadWidth == (unsigned)cnsSize) && (loadWidth <= 8))
+            if (loadWidth == (unsigned)cnsSize)
             {
                 // We're going to emit something like the following:
                 //
@@ -1943,7 +1987,7 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
                 //
                 GenTree* lIndir = comp->gtNewIndir(loadType, lArg);
                 GenTree* rIndir = comp->gtNewIndir(loadType, rArg);
-                result          = comp->gtNewOperNode(GT_EQ, TYP_INT, lIndir, rIndir);
+                result          = newBinaryOp(comp, GT_EQ, TYP_INT, lIndir, rIndir);
 
                 BlockRange().InsertAfter(lArg, lIndir);
                 BlockRange().InsertAfter(rArg, rIndir);
@@ -1990,17 +2034,17 @@ GenTree* Lowering::LowerCallMemcmp(GenTreeCall* call)
                 //
                 GenTree* l1Indir   = comp->gtNewIndir(loadType, lArgUse.Def());
                 GenTree* r1Indir   = comp->gtNewIndir(loadType, rArgUse.Def());
-                GenTree* lXor      = comp->gtNewOperNode(GT_XOR, actualLoadType, l1Indir, r1Indir);
+                GenTree* lXor      = newBinaryOp(comp, GT_XOR, actualLoadType, l1Indir, r1Indir);
                 GenTree* l2Offs    = comp->gtNewIconNode(cnsSize - loadWidth, TYP_I_IMPL);
-                GenTree* l2AddOffs = comp->gtNewOperNode(GT_ADD, lArg->TypeGet(), lArgClone, l2Offs);
+                GenTree* l2AddOffs = newBinaryOp(comp, GT_ADD, lArg->TypeGet(), lArgClone, l2Offs);
                 GenTree* l2Indir   = comp->gtNewIndir(loadType, l2AddOffs);
                 GenTree* r2Offs    = comp->gtCloneExpr(l2Offs); // offset is the same
-                GenTree* r2AddOffs = comp->gtNewOperNode(GT_ADD, rArg->TypeGet(), rArgClone, r2Offs);
+                GenTree* r2AddOffs = newBinaryOp(comp, GT_ADD, rArg->TypeGet(), rArgClone, r2Offs);
                 GenTree* r2Indir   = comp->gtNewIndir(loadType, r2AddOffs);
-                GenTree* rXor      = comp->gtNewOperNode(GT_XOR, actualLoadType, l2Indir, r2Indir);
-                GenTree* resultOr  = comp->gtNewOperNode(GT_OR, actualLoadType, lXor, rXor);
-                GenTree* zeroCns   = comp->gtNewIconNode(0, actualLoadType);
-                result             = comp->gtNewOperNode(GT_EQ, TYP_INT, resultOr, zeroCns);
+                GenTree* rXor      = newBinaryOp(comp, GT_XOR, actualLoadType, l2Indir, r2Indir);
+                GenTree* resultOr  = newBinaryOp(comp, GT_OR, actualLoadType, lXor, rXor);
+                GenTree* zeroCns   = comp->gtNewZeroConNode(actualLoadType);
+                result             = newBinaryOp(comp, GT_EQ, TYP_INT, resultOr, zeroCns);
 
                 BlockRange().InsertAfter(rArgClone, l1Indir, r1Indir, l2Offs, l2AddOffs);
                 BlockRange().InsertAfter(l2AddOffs, l2Indir, r2Offs, r2AddOffs, r2Indir);
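
Editor's note: to make the two-load branch above easier to follow, here is a minimal C++ sketch of the shape of code this lowering produces, written with plain SSE2 intrinsics. It is an illustration of the technique only, not code from the JIT or its emitted assembly; the function name Memcmp24EqualsSketch and the fixed size 24 are assumptions chosen for the example.

    // Illustrative sketch: the overlapping-loads + XOR/OR + compare-to-zero
    // pattern that LowerCallMemcmp builds (as GenTree nodes) for a
    // constant-size memcmp(a, b, 24) == 0 with 16-byte (TYP_SIMD16) loads.
    #include <emmintrin.h> // SSE2
    #include <cstdint>

    bool Memcmp24EqualsSketch(const uint8_t* a, const uint8_t* b)
    {
        // l1Indir/r1Indir: load bytes [0..15] of each argument.
        __m128i l1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a));
        __m128i r1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b));

        // l2Offs/r2Offs = cnsSize - loadWidth = 24 - 16 = 8: the second pair
        // of loads covers bytes [8..23], overlapping the first pair so the
        // whole buffer is compared with no scalar tail.
        __m128i l2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a + 8));
        __m128i r2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + 8));

        // GT_XOR per pair, then GT_OR of the two: non-zero iff any byte differs.
        __m128i diff = _mm_or_si128(_mm_xor_si128(l1, r1), _mm_xor_si128(l2, r2));

        // GT_EQ against gtNewZeroConNode(actualLoadType): every lane must be
        // zero, the "compare all" that gtNewSimdCmpOpAllNode expresses above.
        __m128i eq = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
        return _mm_movemask_epi8(eq) == 0xFFFF;
    }

The overlap trick is why loadWidth is rounded down to a power of two: any cnsSize up to 2 * loadWidth is covered by exactly two loads per argument, and a single GT_OR of the two XORs feeds one final comparison with zero.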