diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 4d18a72d2adb1a..7cb024e744e539 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -491,14 +491,38 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
             if (vecCon->IsAllBitsSet())
             {
-                if ((attr != EA_32BYTE) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+                switch (attr)
                 {
+                    case EA_8BYTE:
+                    case EA_16BYTE:
+                    {
+                        emit->emitIns_R_R(INS_pcmpeqd, attr, targetReg, targetReg);
+                        return;
+                    }
 #if defined(FEATURE_SIMD)
-                    emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
-#else
-                    emit->emitIns_R_R(INS_pcmpeqd, attr, targetReg, targetReg);
+                    case EA_32BYTE:
+                    {
+                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+                        {
+                            emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+                            return;
+                        }
+                        break;
+                    }
+
+                    case EA_64BYTE:
+                    {
+                        assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F));
+                        emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
+                                                   static_cast<int8_t>(0xFF));
+                        return;
+                    }
 #endif // FEATURE_SIMD
-                    break;
+
+                    default:
+                    {
+                        unreached();
+                    }
                 }
             }
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 4d31097b5ad4a7..757f5eb2045581 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -17752,6 +17752,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_vpsllvq:
         case INS_vpsrlvd:
         case INS_vpsrlvq:
+        case INS_vpternlogd:
            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
            result.insLatency += PERFSCORE_LATENCY_1C;
            break;
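The EA_64BYTE case relies on how vpternlogd consumes its immediate: imm8 is an 8-entry truth table indexed by the corresponding bits of the three sources, so 0xFF maps every input combination to 1 and fills the register with ones without needing an AVX2-style pcmpeqd. A minimal scalar model of that behavior (TernLog32 is an illustrative helper, not JIT code):

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of one 32-bit lane of VPTERNLOGD: for each bit position the
// three source bits (a, b, c) form a 3-bit index into the imm8 truth table.
static uint32_t TernLog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
    uint32_t result = 0;
    for (int bit = 0; bit < 32; bit++)
    {
        int index = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        result |= static_cast<uint32_t>((imm >> index) & 1) << bit;
    }
    return result;
}

int main()
{
    // imm == 0xFF selects 1 for every index, so any inputs produce all-bits-set.
    printf("0x%08X\n", TernLog32(0x12345678, 0x00000000, 0xDEADBEEF, 0xFF));
    return 0;
}
```

Because the table entries are all ones, the inputs are irrelevant, which is why the same target register can safely serve as all three sources.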
diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp
index 9c33405a67f0f1..3a0d9ea52bb23c 100644
--- a/src/coreclr/jit/fgbasic.cpp
+++ b/src/coreclr/jit/fgbasic.cpp
@@ -1195,7 +1195,9 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed
                     case NI_Vector256_Create:
                     case NI_Vector512_Create:
                     case NI_Vector256_CreateScalar:
+                    case NI_Vector512_CreateScalar:
                     case NI_Vector256_CreateScalarUnsafe:
+                    case NI_Vector512_CreateScalarUnsafe:
                     case NI_VectorT256_CreateBroadcast:
                     case NI_X86Base_BitScanForward:
                     case NI_X86Base_X64_BitScanForward:
@@ -1519,6 +1521,9 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed
                     case NI_Vector256_get_AllBitsSet:
                     case NI_Vector256_get_One:
                     case NI_Vector256_get_Zero:
+                    case NI_Vector512_get_AllBitsSet:
+                    case NI_Vector512_get_One:
+                    case NI_Vector512_get_Zero:
                     case NI_VectorT256_get_AllBitsSet:
                     case NI_VectorT256_get_One:
                     case NI_VectorT256_get_Zero:
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 16cae1bde464e7..957fdd63dd860c 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -3743,6 +3743,9 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
         case NI_Vector256_Create:
         case NI_Vector256_CreateScalar:
         case NI_Vector256_CreateScalarUnsafe:
+        case NI_Vector512_Create:
+        case NI_Vector512_CreateScalar:
+        case NI_Vector512_CreateScalarUnsafe:
 #elif defined(TARGET_ARM64)
         case NI_Vector64_Create:
         case NI_Vector64_CreateScalar:
@@ -19246,6 +19249,7 @@ bool GenTree::isContainableHWIntrinsic() const
 
             case NI_Vector128_CreateScalarUnsafe:
             case NI_Vector256_CreateScalarUnsafe:
+            case NI_Vector512_CreateScalarUnsafe:
             {
                 // These HWIntrinsic operations are contained as part of scalar ops
                 return true;
@@ -21478,6 +21482,10 @@ GenTree* Compiler::gtNewSimdCreateScalarNode(
     {
         hwIntrinsicID = NI_Vector256_CreateScalar;
     }
+    else if (simdSize == 64)
+    {
+        hwIntrinsicID = NI_Vector512_CreateScalar;
+    }
 #elif defined(TARGET_ARM64)
     if (simdSize == 8)
     {
@@ -21618,6 +21626,10 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(
     {
         hwIntrinsicID = NI_Vector256_CreateScalarUnsafe;
     }
+    else if (simdSize == 64)
+    {
+        hwIntrinsicID = NI_Vector512_CreateScalarUnsafe;
+    }
 #elif defined(TARGET_ARM64)
     if (simdSize == 8)
     {
@@ -21911,15 +21923,7 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode(
     // We don't guarantee a non-temporal load will actually occur, so fallback
     // to regular aligned loads if the required ISA isn't supported.
 
-    if (simdSize == 64)
-    {
-        if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
-        {
-            intrinsic     = NI_AVX512F_LoadAlignedVector512NonTemporal;
-            isNonTemporal = true;
-        }
-    }
-    else if (simdSize == 32)
+    if (simdSize == 32)
     {
         if (compOpportunisticallyDependsOn(InstructionSet_AVX2))
         {
@@ -21932,6 +21936,14 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode(
             intrinsic = NI_AVX_LoadAlignedVector256;
         }
     }
+    else if (simdSize == 64)
+    {
+        if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+        {
+            intrinsic     = NI_AVX512F_LoadAlignedVector512NonTemporal;
+            isNonTemporal = true;
+        }
+    }
     else if (compOpportunisticallyDependsOn(InstructionSet_SSE41))
     {
         intrinsic = NI_SSE41_LoadAlignedVector128NonTemporal;
@@ -23173,16 +23185,16 @@ GenTree* Compiler::gtNewSimdStoreAlignedNode(
 
     NamedIntrinsic intrinsic = NI_Illegal;
 
-    if (simdSize == 64)
-    {
-        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
-        intrinsic = NI_AVX512F_StoreAligned;
-    }
-    else if (simdSize == 32)
+    if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         intrinsic = NI_AVX_StoreAligned;
     }
+    else if (simdSize == 64)
+    {
+        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+        intrinsic = NI_AVX512F_StoreAligned;
+    }
     else if (simdBaseType != TYP_FLOAT)
     {
         intrinsic = NI_SSE2_StoreAligned;
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 5ab36443360b8f..495847c1025be7 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -6355,7 +6355,9 @@ struct GenTreeVecCon : public GenTree
             case NI_Vector256_Create:
             case NI_Vector512_Create:
             case NI_Vector256_CreateScalar:
+            case NI_Vector512_CreateScalar:
             case NI_Vector256_CreateScalarUnsafe:
+            case NI_Vector512_CreateScalarUnsafe:
 #elif defined(TARGET_ARM64)
             case NI_Vector64_Create:
             case NI_Vector64_CreateScalar:
@@ -6371,7 +6373,8 @@ struct GenTreeVecCon : public GenTree
                 // CreateScalar leaves the upper bits as zero
 
 #if defined(TARGET_XARCH)
-                if ((intrinsic != NI_Vector128_CreateScalar) && (intrinsic != NI_Vector256_CreateScalar))
+                if ((intrinsic != NI_Vector128_CreateScalar) && (intrinsic != NI_Vector256_CreateScalar) &&
+                    (intrinsic != NI_Vector512_CreateScalar))
 #elif defined(TARGET_ARM64)
                 if ((intrinsic != NI_Vector64_CreateScalar) && (intrinsic != NI_Vector128_CreateScalar))
 #endif
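The distinction enforced in the gentree.h hunk is that CreateScalar zeroes the upper elements while CreateScalarUnsafe leaves them undefined, so only the former constrains which vector constants the node may fold to. A rough model of the folded constant for a 32-bit element (Simd64 and the helper are illustrative stand-ins, not the JIT's simd64_t handling):

```cpp
#include <cstdint>
#include <cstring>

// 64-byte vector constant, loosely modeled on the JIT's simd64_t.
struct Simd64
{
    uint32_t u32[16];
};

// CreateScalar may only fold to a constant whose upper elements are zero;
// CreateScalarUnsafe can fold any value into element 0 because the upper
// elements are unspecified and codegen is free to leave garbage there.
static Simd64 FoldCreateScalar(uint32_t value)
{
    Simd64 result;
    memset(&result, 0, sizeof(result)); // the upper 15 elements must be zero
    result.u32[0] = value;
    return result;
}
```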
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 77a7ab22dc19cc..350a2a38edca95 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -901,6 +901,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
     {
         case NI_Vector128_CreateScalarUnsafe:
         case NI_Vector256_CreateScalarUnsafe:
+        case NI_Vector512_CreateScalarUnsafe:
         {
             if (varTypeIsIntegral(baseType))
             {
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 042f57891a4d5d..054273c147d73b 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -237,7 +237,11 @@ HARDWARE_INTRINSIC(Vector256, Xor,
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // Vector512 Intrinsics
 HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(Vector512, get_AllBitsSet, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Vector512, get_One, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
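For the new CreateScalarUnsafe row, the ten per-base-type instruction slots resolve to movd for the eight integral types, movss for float, and movsdsse2 for double. A simplified selector showing that mapping (the enums are stand-ins, not the JIT's actual types):

```cpp
// Stand-ins for the JIT's instruction and base-type enums.
enum Ins
{
    INS_MOVD,      // integral scalar into the low vector element
    INS_MOVSS,     // float scalar
    INS_MOVSD_SSE2 // double scalar
};

enum BaseType
{
    TYP_INT,
    TYP_LONG,
    TYP_FLOAT,
    TYP_DOUBLE
    // ...the remaining integral base types behave like TYP_INT here
};

static Ins SelectCreateScalarUnsafeIns(BaseType baseType)
{
    switch (baseType)
    {
        case TYP_FLOAT:
            return INS_MOVSS;
        case TYP_DOUBLE:
            return INS_MOVSD_SSE2;
        default:
            return INS_MOVD; // all integral base types share one instruction
    }
}
```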
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index 7feb5336ca0e5c..a0ddeb10460acc 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1088,6 +1088,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_CreateScalar:
         case NI_Vector256_CreateScalar:
+        case NI_Vector512_CreateScalar:
         {
             assert(sig->numArgs == 1);
 
@@ -1108,6 +1109,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_CreateScalarUnsafe:
         case NI_Vector256_CreateScalarUnsafe:
+        case NI_Vector512_CreateScalarUnsafe:
         {
             assert(sig->numArgs == 1);
 
@@ -1403,6 +1405,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_get_AllBitsSet:
         case NI_Vector256_get_AllBitsSet:
+        case NI_Vector512_get_AllBitsSet:
         {
             assert(sig->numArgs == 0);
             retNode = gtNewAllBitsSetConNode(retType);
@@ -1411,6 +1414,7 @@
 
         case NI_Vector128_get_One:
         case NI_Vector256_get_One:
+        case NI_Vector512_get_One:
         {
             assert(sig->numArgs == 0);
             retNode = gtNewOneConNode(retType, simdBaseType);
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index c5c8c1924ea3fb..8da608dd356467 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -101,7 +101,7 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id)
     static char buf[4][TEMP_BUFFER_LEN];
     const char* retbuf;
 
-    if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) &&
+    if (GetEmitter()->IsVexOrEvexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) &&
         !GetEmitter()->IsKInstruction(ins))
     {
         sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName);
@@ -700,6 +700,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op)
         {
             case NI_Vector128_CreateScalarUnsafe:
             case NI_Vector256_CreateScalarUnsafe:
+            case NI_Vector512_CreateScalarUnsafe:
             {
                 // The hwintrinsic should be contained and its
                 // op1 should be either contained or spilled. This
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index e089f6195f0de1..69bb4e243909c0 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -654,6 +654,7 @@ INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE,
 INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None)
 INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
 INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
 INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
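The instr.cpp hunk widens the display-name check from VEX-only to VEX-or-EVEX, so the new EVEX-only entry, stored as "pternlogd" in instrsxarch.h, is printed with the conventional "v" prefix in disassembly. A sketch of the effect (simplified; the real check also excludes BMI and mask-register instructions):

```cpp
#include <cstdio>

// Prepend "v" for VEX/EVEX-encoded instructions, mirroring how the table
// entry "pternlogd" is displayed as "vpternlogd".
static void PrintInsDisplayName(const char* insName, bool isVexOrEvexEncoded)
{
    char buf[32];
    snprintf(buf, sizeof(buf), "%s%s", isVexOrEvexEncoded ? "v" : "", insName);
    printf("%s\n", buf);
}

int main()
{
    PrintInsDisplayName("pternlogd", true); // prints: vpternlogd
    return 0;
}
```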
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index afcbe5d82c1860..499245632bbacf 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1038,6 +1038,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_Vector512_Create:
         case NI_Vector128_CreateScalar:
         case NI_Vector256_CreateScalar:
+        case NI_Vector512_CreateScalar:
         {
             // We don't directly support the Vector128.Create or Vector256.Create methods in codegen
             // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect
@@ -1934,9 +1935,10 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
     GenTree* tmp2 = nullptr;
     GenTree* tmp3 = nullptr;
 
-    bool   isConstant     = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal);
-    bool   isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar);
-    size_t argCnt         = node->GetOperandCount();
+    bool isConstant     = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal);
+    bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar) ||
+                          (intrinsicId == NI_Vector512_CreateScalar);
+    size_t argCnt = node->GetOperandCount();
 
     if (isConstant)
     {
@@ -6680,6 +6682,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
         {
             case NI_Vector128_CreateScalarUnsafe:
             case NI_Vector256_CreateScalarUnsafe:
+            case NI_Vector512_CreateScalarUnsafe:
             {
                 if (!varTypeIsIntegral(childNode->TypeGet()))
                 {
@@ -6826,6 +6829,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
         {
             case NI_Vector128_CreateScalarUnsafe:
             case NI_Vector256_CreateScalarUnsafe:
+            case NI_Vector512_CreateScalarUnsafe:
             {
                 if (!supportsSIMDScalarLoads)
                 {
@@ -7055,7 +7059,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     NamedIntrinsic childNodeId = childNode->GetHWIntrinsicId();
 
                     if ((childNodeId == NI_Vector128_CreateScalarUnsafe) ||
-                        (childNodeId == NI_Vector256_CreateScalarUnsafe))
+                        (childNodeId == NI_Vector256_CreateScalarUnsafe) ||
+                        (childNodeId == NI_Vector512_CreateScalarUnsafe))
                     {
                         // We have a very special case of BroadcastScalarToVector(CreateScalarUnsafe(op1))
                         //
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index c57e64981f69db..78bab951c3f99f 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -2019,6 +2019,7 @@ static GenTree* SkipContainedCreateScalarUnsafe(GenTree* node)
     {
         case NI_Vector128_CreateScalarUnsafe:
         case NI_Vector256_CreateScalarUnsafe:
+        case NI_Vector512_CreateScalarUnsafe:
        {
             return hwintrinsic->Op(1);
         }
@@ -2127,6 +2128,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
             case NI_Vector128_ToScalar:
             case NI_Vector256_CreateScalarUnsafe:
             case NI_Vector256_ToScalar:
+            case NI_Vector512_CreateScalarUnsafe:
             {
                 assert(numArgs == 1);
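For context on the lsraxarch.cpp hunk, SkipContainedCreateScalarUnsafe lets register building look through a contained CreateScalarUnsafe wrapper and consume the underlying scalar operand directly; the change simply teaches it the Vector512 variant. A structural sketch under simplified node types (not the JIT's real GenTree shape):

```cpp
// Simplified stand-in for a GenTreeHWIntrinsic node.
struct Node
{
    bool  isContainedCreateScalarUnsafe;
    Node* op1; // the wrapped scalar operand
};

// When a CreateScalarUnsafe node is contained, look through it so the
// consuming scalar instruction takes the underlying operand directly,
// avoiding a separate register for the wrapper node itself.
static Node* SkipContainedCreateScalarUnsafe(Node* node)
{
    if (node->isContainedCreateScalarUnsafe)
    {
        return node->op1;
    }
    return node;
}
```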