Adding AVX512F APIs for And, AndNot, Load, Or, Store, and Xor
tannergooding committed Mar 16, 2023
commit 4b82857b9bb762d266beae005404ed745404ecc3
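For context, a minimal C# usage sketch of the bitwise surface this commit wires up. This is not part of the diff; it assumes the managed Avx512F class exposes And/AndNot/Or/Xor over Vector512&lt;T&gt; in the usual System.Runtime.Intrinsics.X86 shape, with exact signatures living in the managed ref assemblies rather than in these JIT files.

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class BitwiseSelectExample
    {
        // Computes (a & mask) | (b & ~mask); for Vector512<int> the JIT is expected
        // to emit the EVEX forms recognized below (vpandd, vpandnd, vpord).
        static Vector512<int> Select(Vector512<int> mask, Vector512<int> a, Vector512<int> b)
        {
            if (!Avx512F.IsSupported)
            {
                throw new PlatformNotSupportedException();
            }

            Vector512<int> taken    = Avx512F.And(mask, a);
            Vector512<int> notTaken = Avx512F.AndNot(mask, b); // AndNot computes ~left & right
            return Avx512F.Or(taken, notTaken);
        }
    }
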
36 changes: 18 additions & 18 deletions src/coreclr/jit/emitxarch.cpp
@@ -287,8 +287,8 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const
// Since we are not using k registers yet, this will have no impact on correctness but will affect things
// once
// k registers are used (as that is the point of the "break out operand type" of these instructions)
// case INS_movdqa: // INS_movdqa32, INS_movdqa64.
// case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64.
// case INS_movdqa: // INS_vmovdqa32, INS_vmovdqa64.
// case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_vmovdqu32, INS_vmovdqu64.
// case INS_pand: // INS_vpandd, INS_vpandq.
// case INS_pandn: // INS_vpandnd, INS_vpandnq.
// case INS_por: // INS_vpord, INS_vporq.
@@ -5934,13 +5934,13 @@ bool emitter::IsMovInstruction(instruction ins)
case INS_movaps:
case INS_movd:
case INS_movdqa:
case INS_movdqa32:
case INS_movdqa64:
case INS_vmovdqa32:
case INS_vmovdqa64:
case INS_movdqu:
case INS_movdqu8:
case INS_movdqu16:
case INS_movdqu32:
case INS_movdqu64:
case INS_vmovdqu32:
case INS_vmovdqu64:
case INS_movsdsse2:
case INS_movss:
case INS_movsx:
@@ -6083,12 +6083,12 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
break;
}

case INS_movdqa32:
case INS_movdqa64:
case INS_vmovdqa32:
case INS_vmovdqa64:
case INS_movdqu8:
case INS_movdqu16:
case INS_movdqu32:
case INS_movdqu64:
case INS_vmovdqu32:
case INS_vmovdqu64:
{
// These EVEX instructions merge/mask based on k-register
// TODO-XArch-AVX512 : Handle merge/masks scenarios once k-mask support is added for these.
@@ -6299,13 +6299,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN
case INS_movapd:
case INS_movaps:
case INS_movdqa:
case INS_movdqa32:
case INS_movdqa64:
case INS_vmovdqa32:
case INS_vmovdqa64:
case INS_movdqu:
case INS_movdqu8:
case INS_movdqu16:
case INS_movdqu32:
case INS_movdqu64:
case INS_vmovdqu32:
case INS_vmovdqu64:
case INS_movsdsse2:
case INS_movss:
case INS_movupd:
@@ -17538,13 +17538,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;

case INS_movdqa:
case INS_movdqa32:
case INS_movdqa64:
case INS_vmovdqa32:
case INS_vmovdqa64:
case INS_movdqu:
case INS_movdqu8:
case INS_movdqu16:
case INS_movdqu32:
case INS_movdqu64:
case INS_vmovdqu32:
case INS_vmovdqu64:
case INS_movaps:
case INS_movups:
case INS_movapd:
4 changes: 0 additions & 4 deletions src/coreclr/jit/emitxarch.h
@@ -306,9 +306,7 @@ bool IsWEvexOpcodeExtension(const instrDesc* id)
case INS_vfnmsub231sd:
case INS_unpcklpd:
case INS_vpermilpdvar:
case INS_movdqa64:
case INS_movdqu16:
case INS_movdqu64:
case INS_vinsertf64x4:
case INS_vinserti64x4:
{
@@ -424,9 +422,7 @@ bool IsWEvexOpcodeExtension(const instrDesc* id)
case INS_vpdpbusds:
case INS_vpdpwssds:
case INS_vpermilpsvar:
case INS_movdqa32:
case INS_movdqu8:
case INS_movdqu32:
case INS_vinsertf32x8:
case INS_vinserti32x8:
{
8 changes: 5 additions & 3 deletions src/coreclr/jit/hwintrinsiclistxarch.h
@@ -225,7 +225,7 @@ HARDWARE_INTRINSIC(Vector256, StoreUnsafe,
HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu8, INS_movdqu8, INS_movdqu16, INS_movdqu16, INS_movdqu32, INS_movdqu32, INS_movdqu64, INS_movdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu8, INS_movdqu8, INS_movdqu16, INS_movdqu16, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
@@ -754,10 +754,12 @@ HARDWARE_INTRINSIC(AVX512F, And,
HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnd, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX512F, LoadVector512, 64, 1, {INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_vpord, INS_vpord, INS_vpord, INS_vpord, INS_vpord, INS_vpord, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AVX512F, StoreAligned, 64, 2, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX512F, Store, 64, 2, {INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu32, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX512F, StoreAligned, 64, 2, {INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa32, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX512F, StoreAlignedNonTemporal, 64, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, {INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxord, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)

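As a rough illustration of how the table entries above surface in managed code, a usage sketch follows. It is not part of the diff: the pointer-based LoadVector512/Store overloads are assumed to mirror the existing Avx/Avx2 pattern, and the instruction mapping shown in the comments follows the table rows added in this commit.

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class StreamXorExample
    {
        // XORs 'source' into 'destination', 16 ints (one 512-bit vector) at a time.
        // For int lanes, LoadVector512/Store map to vmovdqu32 and Xor to vpxord;
        // 'count' is assumed to be a non-negative multiple of 16.
        static unsafe void XorInto(int* destination, int* source, int count)
        {
            for (int i = 0; i < count; i += 16)
            {
                Vector512<int> d = Avx512F.LoadVector512(destination + i);
                Vector512<int> s = Avx512F.LoadVector512(source + i);
                Avx512F.Store(destination + i, Avx512F.Xor(d, s));
            }
        }
    }
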
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1685,6 +1685,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
case NI_SSE_LoadVector128:
case NI_SSE2_LoadVector128:
case NI_AVX_LoadVector256:
case NI_AVX512F_LoadVector512:
case NI_Vector128_Load:
case NI_Vector256_Load:
case NI_Vector512_Load:
@@ -2093,6 +2094,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
case NI_SSE_Store:
case NI_SSE2_Store:
case NI_AVX_Store:
case NI_AVX512F_Store:
{
assert(retType == TYP_VOID);
assert(sig->numArgs == 2);
24 changes: 12 additions & 12 deletions src/coreclr/jit/instrsxarch.h
@@ -649,19 +649,19 @@ INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BA
INST3(FIRST_AVX512F_INSTRUCTION, "FIRST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
INST3(movdqa32, "movdqa32", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None)
INST3(movdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None)
INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None)
INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None)
INST3(vpandd, "pandd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W0) // Packed bit-wise AND of two xmm regs
INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W1) // Packed bit-wise AND of two xmm regs
INST3(vpandnd, "pandnd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W0) // Packed bit-wise AND NOT of two xmm regs
INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W1) // Packed bit-wise AND NOT of two xmm regs
INST3(vpord, "pord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W0) // Packed bit-wise OR of two xmm regs
INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W1) // Packed bit-wise OR of two xmm regs
INST3(vmovdqa32, "movdqa32", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0)
INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1)
INST3(vmovdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0)
INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1)
INST3(vpandd, "pandd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
INST3(vpandnd, "pandnd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
INST3(vpord, "pord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction)
INST3(vpxord, "pxord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W0) // Packed bit-wise XOR of two xmm regs
INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction|REX_W1) // Packed bit-wise XOR of two xmm regs
INST3(vpxord, "pxord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | REX_W0 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)

INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)